Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
-# vLLM V1 User Guide
+# vLLM V1
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).

--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
 # SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference
 with the correct prompt format on audio language models.
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 import os
 from dataclasses import asdict
 from typing import NamedTuple, Optional
@@ -22,7 +23,7 @@ audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
 question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
-    2: "What sport and what nursery rhyme are referenced?"
+    2: "What sport and what nursery rhyme are referenced?",
 }
@@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
 # MiniCPM-O
 def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
    model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
@@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
        limit_mm_per_prompt={"audio": audio_count},
    )
-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    audio_placeholder = "(<audio>./</audio>)" * audio_count
    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
-    messages = [{
+    messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}]
-        'role': 'user',
+    prompt = tokenizer.apply_chat_template(
-        'content': f'{audio_placeholder}\n{question}'
+        messages,
-    }]
+        tokenize=False,
-    prompt = tokenizer.apply_chat_template(messages,
+        add_generation_prompt=True,
-                                           tokenize=False,
+        chat_template=audio_chat_template,
-                                           add_generation_prompt=True,
+    )
-                                           chat_template=audio_chat_template)
    return ModelRequestData(
        engine_args=engine_args,
@@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    speech_lora_path = os.path.join(model_path, "speech-lora")
-    placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
+    placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)])
    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
@@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
        limit_mm_per_prompt={"audio": audio_count},
    )
-    audio_in_prompt = "".join([
+    audio_in_prompt = "".join(
-        f"Audio {idx+1}: "
+        [
-        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
+            f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
-    ])
+            for idx in range(audio_count)
+        ]
+    )
-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+    prompt = (
-              "<|im_start|>user\n"
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>user\n"
-              "<|im_start|>assistant\n")
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
    return ModelRequestData(
        engine_args=engine_args,
@@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int):
        limit_mm_per_prompt={"audio": audio_count},
    )
-    audio_in_prompt = "".join([
+    audio_in_prompt = "".join(
-        "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
+        ["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)]
-    ])
+    )
    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
-        "generating text and speech.")
+        "generating text and speech."
+    )
-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
+    prompt = (
-              "<|im_start|>user\n"
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>user\n"
-              "<|im_start|>assistant\n")
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
@@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [{
+    messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
-        'role': 'user',
+    prompt = tokenizer.apply_chat_template(
-        'content': "<|audio|>\n" * audio_count + question
+        messages, tokenize=False, add_generation_prompt=True
-    }]
+    )
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
    engine_args = EngineArgs(
        model=model_name,
@@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
 # Whisper
 def run_whisper(question: str, audio_count: int) -> ModelRequestData:
-    assert audio_count == 1, (
+    assert audio_count == 1, "Whisper only support single audio input per prompt"
-        "Whisper only support single audio input per prompt")
    model_name = "openai/whisper-large-v3-turbo"
    prompt = "<|startoftranscript|>"
@@ -252,27 +254,33 @@ model_example_map = {
 def parse_args():
    parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
+        description="Demo on using vLLM for offline inference with "
-        'audio language models')
+        "audio language models"
-    parser.add_argument('--model-type',
+    )
-                        '-m',
+    parser.add_argument(
-                        type=str,
+        "--model-type",
-                        default="ultravox",
+        "-m",
-                        choices=model_example_map.keys(),
+        type=str,
-                        help='Huggingface "model_type".')
+        default="ultravox",
-    parser.add_argument('--num-prompts',
+        choices=model_example_map.keys(),
-                        type=int,
+        help='Huggingface "model_type".',
-                        default=1,
+    )
-                        help='Number of prompts to run.')
+    parser.add_argument(
-    parser.add_argument("--num-audios",
+        "--num-prompts", type=int, default=1, help="Number of prompts to run."
-                        type=int,
+    )
-                        default=1,
+    parser.add_argument(
-                        choices=[0, 1, 2],
+        "--num-audios",
-                        help="Number of audio items per prompt.")
+        type=int,
-    parser.add_argument("--seed",
+        default=1,
-                        type=int,
+        choices=[0, 1, 2],
-                        default=None,
+        help="Number of audio items per prompt.",
-                        help="Set the seed when initializing `vllm.LLM`.")
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
    return parser.parse_args()
@@ -283,29 +291,30 @@ def main(args):
        raise ValueError(f"Model type {model} is not supported.")
    audio_count = args.num_audios
-    req_data = model_example_map[model](question_per_audio_count[audio_count],
+    req_data = model_example_map[model](
-                                        audio_count)
+        question_per_audio_count[audio_count], audio_count
+    )
    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)
    # We set temperature to 0.2 so that outputs can differ even when
    # all prompts in a batch are identical.
-    sampling_params = SamplingParams(temperature=0.2,
+    sampling_params = SamplingParams(
-                                     max_tokens=64,
+        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
-                                     stop_token_ids=req_data.stop_token_ids)
+    )
    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
-                asset.audio_and_sample_rate
+                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
-                for asset in audio_assets[:audio_count]
            ]
        }
@@ -315,8 +324,9 @@ def main(args):
        # Batch inference
        inputs = [inputs] * args.num_prompts
    # Add LoRA request if applicable
-    lora_request = (req_data.lora_requests *
+    lora_request = (
-                    args.num_prompts if req_data.lora_requests else None)
+        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
+    )
    outputs = llm.generate(
        inputs,

--- a/docs/source/features/automatic_prefix_caching.md
+++ b/docs/source/features/automatic_prefix_caching.md
-(automatic-prefix-caching)=
+# SPDX-License-Identifier: Apache-2.0
+"""
-# Automatic Prefix Caching
+Demonstration script for Automatic Prefix Caching (APC) in vLLM.
-## Introduction
-Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
+Automatic Prefix Caching (APC) allows the vLLM engine to reuse cached
+KV (key-value) pairs from previous prompts if a new query shares the same
+prefix. This reduces redundant computation and improves inference speed.
-:::{note}
+To enable APC, set `enable_prefix_caching=True` when initializing the
-Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
+vLLM engine.
-:::
-## Enabling APC in vLLM
+This script uses a long Markdown table as the shared prompt prefix and
+compares the generation time for two queries that share the same prefix
+but ask different questions.
-Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
+Run:
+python examples/offline_inference/automatic_prefix_caching.py
+"""
-```python
 import time
-from vllm import LLM, SamplingParams
+from vllm import LLM, SamplingParams
+# ruff: noqa: E501
 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+LONG_PROMPT = (
+    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+    + """
 | ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
@@ -54,6 +59,7 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
 | 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
 | 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
 """
+)
 def get_generation_time(llm, sampling_params, prompts):
@@ -62,41 +68,35 @@ def get_generation_time(llm, sampling_params, prompts):
    output = llm.generate(prompts, sampling_params=sampling_params)
    end_time = time.time()
    # print the output and generation time
+    print("-" * 30)
    print(f"Output: {output[0].outputs[0].text}")
    print(f"Generation time: {end_time - start_time} seconds.")
+    print("-" * 30)
-# set enable_prefix_caching=True to enable APC
+def main():
-llm = LLM(
+    # set enable_prefix_caching=True to enable APC
-    model='lmsys/longchat-13b-16k',
+    llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True)
-    enable_prefix_caching=True
-)
-sampling_params = SamplingParams(temperature=0, max_tokens=100)
-# Querying the age of John Doe
-get_generation_time(
-    llm,
-    sampling_params,
-    LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-)
-# Querying the age of Zack Blue
-# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
-get_generation_time(
-    llm,
-    sampling_params,
-    LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
-)
-```
-## Example workloads
+    sampling_params = SamplingParams(temperature=0, max_tokens=100)
-We describe two example workloads, where APC can provide huge performance benefit:
+    # Querying the age of John Doe
+    get_generation_time(
+        llm,
+        sampling_params,
+        LONG_PROMPT
+        + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+    )
- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
+    # Querying the age of Zack Blue
- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
+    # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
+    get_generation_time(
+        llm,
+        sampling_params,
+        LONG_PROMPT
+        + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
+    )
-## Limits
-APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused).
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -56,22 +56,12 @@ def main(args: dict):
    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
-        {
+        {"role": "system", "content": "You are a helpful assistant"},
-            "role": "system",
+        {"role": "user", "content": "Hello"},
-            "content": "You are a helpful assistant"
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
-        },
-        {
-            "role": "user",
-            "content": "Hello"
-        },
-        {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?"
-        },
        {
            "role": "user",
-            "content":
+            "content": "Write an essay about the importance of higher education.",
-            "Write an essay about the importance of higher education.",
        },
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)

--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -10,9 +10,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
+    parser.set_defaults(
-                        task="classify",
+        model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
-                        enforce_eager=True)
+    )
    return parser.parse_args()
@@ -36,10 +36,11 @@ def main(args: Namespace):
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        probs = output.outputs.probs
-        probs_trimmed = ((str(probs[:16])[:-1] +
+        probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
-                          ", ...]") if len(probs) > 16 else probs)
+        print(
-        print(f"Prompt: {prompt!r} \n"
+            f"Prompt: {prompt!r} \n"
-              f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
+            f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
+        )
        print("-" * 60)

--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -10,9 +10,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
+    parser.set_defaults(
-                        task="embed",
+        model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True
-                        enforce_eager=True)
+    )
    return parser.parse_args()
@@ -36,10 +36,10 @@ def main(args: Namespace):
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
+        embeds_trimmed = (
-                           ", ...]") if len(embeds) > 16 else embeds)
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
-        print(f"Prompt: {prompt!r} \n"
+        )
-              f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
        print("-" * 60)

--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -10,9 +10,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
+    parser.set_defaults(
-                        task="score",
+        model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
-                        enforce_eager=True)
+    )
    return parser.parse_args()

--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
@@ -17,12 +17,14 @@ Ray Data provides functionality for:
 Learn more about Ray Data's LLM integration:
 https://docs.ray.io/en/latest/data/working-with-llms.html
 """
 import ray
 from packaging.version import Version
 from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
-assert Version(ray.__version__) >= Version(
+assert Version(ray.__version__) >= Version("2.44.1"), (
-    "2.44.1"), "Ray version must be at least 2.44.1"
+    "Ray version must be at least 2.44.1"
+)
 # Uncomment to reduce clutter in stdout
 # ray.init(log_to_driver=False)
@@ -53,20 +55,18 @@ config = vLLMEngineProcessorConfig(
 vllm_processor = build_llm_processor(
    config,
    preprocess=lambda row: dict(
-        messages=[{
+        messages=[
-            "role": "system",
+            {"role": "system", "content": "You are a bot that responds with haikus."},
-            "content": "You are a bot that responds with haikus."
+            {"role": "user", "content": row["text"]},
-        }, {
+        ],
-            "role": "user",
-            "content": row["text"]
-        }],
        sampling_params=dict(
            temperature=0.3,
            max_tokens=250,
-        )),
+        ),
+    ),
    postprocess=lambda row: dict(
        answer=row["generated_text"],
-        **row  # This will return all the original columns in the dataset.
+        **row,  # This will return all the original columns in the dataset.
    ),
 )

--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -50,87 +50,93 @@ model_name = "mistralai/Mistral-7B-Instruct-v0.3"
 # or any other mistral model with function calling ability
 sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
-llm = LLM(model=model_name,
+llm = LLM(
-          tokenizer_mode="mistral",
+    model=model_name,
-          config_format="mistral",
+    tokenizer_mode="mistral",
-          load_format="mistral")
+    config_format="mistral",
+    load_format="mistral",
+)
 def generate_random_id(length=9):
    characters = string.ascii_letters + string.digits
-    random_id = ''.join(random.choice(characters) for _ in range(length))
+    random_id = "".join(random.choice(characters) for _ in range(length))
    return random_id
 # simulate an API that can be called
-def get_current_weather(city: str, state: str, unit: 'str'):
+def get_current_weather(city: str, state: str, unit: "str"):
-    return (f"The weather in {city}, {state} is 85 degrees {unit}. It is "
+    return (
-            "partly cloudly, with highs in the 90's.")
+        f"The weather in {city}, {state} is 85 degrees {unit}. It is "
+        "partly cloudly, with highs in the 90's."
+    )
 tool_functions = {"get_current_weather": get_current_weather}
-tools = [{
+tools = [
-    "type": "function",
+    {
-    "function": {
+        "type": "function",
-        "name": "get_current_weather",
+        "function": {
-        "description": "Get the current weather in a given location",
+            "name": "get_current_weather",
-        "parameters": {
+            "description": "Get the current weather in a given location",
-            "type": "object",
+            "parameters": {
-            "properties": {
+                "type": "object",
-                "city": {
+                "properties": {
-                    "type":
+                    "city": {
-                    "string",
+                        "type": "string",
-                    "description":
+                        "description": "The city to find the weather for, e.g. 'San Francisco'",
-                    "The city to find the weather for, e.g. 'San Francisco'"
+                    },
+                    "state": {
+                        "type": "string",
+                        "description": "the two-letter abbreviation for the state that the city is"
+                        " in, e.g. 'CA' which would mean 'California'",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
                },
-                "state": {
+                "required": ["city", "state", "unit"],
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
-                },
-                "unit": {
-                    "type": "string",
-                    "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
            },
-            "required": ["city", "state", "unit"]
+        },
-        }
    }
-}]
+]
-messages = [{
+messages = [
-    "role":
+    {
-    "user",
+        "role": "user",
-    "content":
+        "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
-    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+    }
-}]
+]
 outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
 output = outputs[0].outputs[0].text.strip()
 # append the assistant message
-messages.append({
+messages.append(
-    "role": "assistant",
+    {
-    "content": output,
+        "role": "assistant",
-})
+        "content": output,
+    }
+)
 # Now parse the model's output and execute the requested tool calls, simulating an API call
 # with the function defined above.
 tool_calls = json.loads(output)
 tool_answers = [
-    tool_functions[call['name']](**call['arguments']) for call in tool_calls
+    tool_functions[call["name"]](**call["arguments"]) for call in tool_calls
 ]
 # append the answer as a tool message and let the LLM give you an answer
-messages.append({
+messages.append(
-    "role": "tool",
+    {
-    "content": "\n\n".join(tool_answers),
+        "role": "tool",
-    "tool_call_id": generate_random_id(),
+        "content": "\n\n".join(tool_answers),
-})
+        "tool_call_id": generate_random_id(),
+    }
+)
 outputs = llm.chat(messages, sampling_params, tools=tools)

--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -27,6 +27,7 @@ Multi-node:
                    --master-addr=10.99.48.128 \
                    --master-port=13345
 """
 import os
 from time import sleep
@@ -36,46 +37,46 @@ from vllm.utils import get_open_port
 def parse_args():
    import argparse
    parser = argparse.ArgumentParser(description="Data Parallel Inference")
-    parser.add_argument("--model",
+    parser.add_argument(
-                        type=str,
+        "--model",
-                        default="ibm-research/PowerMoE-3b",
+        type=str,
-                        help="Model name or path")
+        default="ibm-research/PowerMoE-3b",
-    parser.add_argument("--dp-size",
+        help="Model name or path",
-                        type=int,
+    )
-                        default=2,
+    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
-                        help="Data parallel size")
+    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
-    parser.add_argument("--tp-size",
+    parser.add_argument(
-                        type=int,
+        "--node-size", type=int, default=1, help="Total number of nodes"
-                        default=2,
+    )
-                        help="Tensor parallel size")
+    parser.add_argument(
-    parser.add_argument("--node-size",
+        "--node-rank", type=int, default=0, help="Rank of the current node"
-                        type=int,
+    )
-                        default=1,
+    parser.add_argument(
-                        help="Total number of nodes")
+        "--master-addr", type=str, default="", help="Master node IP address"
-    parser.add_argument("--node-rank",
+    )
-                        type=int,
+    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
-                        default=0,
+    parser.add_argument(
-                        help="Rank of the current node")
+        "--enforce-eager", action="store_true", help="Enforce eager mode execution."
-    parser.add_argument("--master-addr",
+    )
-                        type=str,
+    parser.add_argument(
-                        default="",
+        "--trust-remote-code", action="store_true", help="Trust remote code."
-                        help="Master node IP address")
+    )
-    parser.add_argument("--master-port",
-                        type=int,
-                        default=0,
-                        help="Master node port")
-    parser.add_argument("--enforce-eager",
-                        action='store_true',
-                        help="Enforce eager mode execution.")
-    parser.add_argument("--trust-remote-code",
-                        action='store_true',
-                        help="Trust remote code.")
    return parser.parse_args()
-def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
+def main(
-         dp_master_port, GPUs_per_dp_rank, enforce_eager, trust_remote_code):
+    model,
+    dp_size,
+    local_dp_rank,
+    global_dp_rank,
+    dp_master_ip,
+    dp_master_port,
+    GPUs_per_dp_rank,
+    enforce_eager,
+    trust_remote_code,
+):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
    os.environ["VLLM_DP_SIZE"] = str(dp_size)
@@ -110,9 +111,9 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
    # Since we are doing data parallel, every rank can have different
    # sampling params. Here we set a different max_tokens for even and
    # odd ranks for demonstration.
-    sampling_params = SamplingParams(temperature=0.8,
+    sampling_params = SamplingParams(
-                                     top_p=0.95,
+        temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
-                                     max_tokens=[16, 20][global_dp_rank % 2])
+    )
    # Create an LLM.
    llm = LLM(
@@ -130,15 +131,16 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
            break
        prompt = output.prompt
        generated_text = output.outputs[0].text
-        print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
+        print(
-              f"Generated text: {generated_text!r}")
+            f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
+            f"Generated text: {generated_text!r}"
+        )
    # Give engines time to pause their processing loops before exiting.
    sleep(1)
 if __name__ == "__main__":
    args = parse_args()
    dp_size = args.dp_size
@@ -160,20 +162,29 @@ if __name__ == "__main__":
    procs = []
    for local_dp_rank, global_dp_rank in enumerate(
-            range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)):
+        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
-        proc = Process(target=main,
+    ):
-                       args=(args.model, dp_size, local_dp_rank,
+        proc = Process(
-                             global_dp_rank, dp_master_ip, dp_master_port,
+            target=main,
-                             tp_size, args.enforce_eager,
+            args=(
-                             args.trust_remote_code))
+                args.model,
+                dp_size,
+                local_dp_rank,
+                global_dp_rank,
+                dp_master_ip,
+                dp_master_port,
+                tp_size,
+                args.enforce_eager,
+                args.trust_remote_code,
+            ),
+        )
        proc.start()
        procs.append(proc)
    exit_code = 0
    for proc in procs:
        proc.join(timeout=300)
        if proc.exitcode is None:
-            print(f"Killing process {proc.pid} that "
+            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
-                  f"didn't stop within 5 minutes.")
            proc.kill()
            exit_code = 1
        elif proc.exitcode:

--- a/examples/offline_inference/disaggregated-prefill-v1/README.md
+++ b/examples/offline_inference/disaggregated-prefill-v1/README.md
@@ -5,5 +5,6 @@ This example contains scripts that demonstrate disaggregated prefill in the offl
 ## Files
 - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
+  - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
 - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
 - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -3,35 +3,48 @@
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
-# Read prompts from output.txt
-prompts = []
+def read_prompts():
-try:
+    """Read prompts from output.txt"""
-    with open("output.txt") as f:
+    prompts = []
-        for line in f:
+    try:
-            prompts.append(line.strip())
+        with open("output.txt") as f:
-    print(f"Loaded {len(prompts)} prompts from output.txt")
+            for line in f:
-except FileNotFoundError:
+                prompts.append(line.strip())
-    print("Error: output.txt file not found")
+        print(f"Loaded {len(prompts)} prompts from output.txt")
-    exit(-1)
+        return prompts
+    except FileNotFoundError:
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+        print("Error: output.txt file not found")
+        exit(-1)
-llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-          enforce_eager=True,
-          gpu_memory_utilization=0.8,
+def main():
-          max_num_batched_tokens=64,
+    prompts = read_prompts()
-          max_num_seqs=16,
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-          kv_transfer_config=KVTransferConfig(
-              kv_connector="SharedStorageConnector",
+    llm = LLM(
-              kv_role="kv_both",
+        model="meta-llama/Llama-3.2-1B-Instruct",
-              kv_connector_extra_config={
+        enforce_eager=True,
-                  "shared_storage_path": "local_storage"
+        gpu_memory_utilization=0.8,
-              }))  #, max_model_len=2048, max_num_batched_tokens=2048)
+        max_num_batched_tokens=64,
+        max_num_seqs=16,
-# 1ST generation (prefill instance)
+        kv_transfer_config=KVTransferConfig(
-outputs = llm.generate(prompts, sampling_params)
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
-for output in outputs:
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
-    prompt = output.prompt
+        ),
-    generated_text = output.outputs[0].text
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Generation on the decode instance (KV state is loaded from `local_storage`)
+    outputs = llm.generate(prompts, sampling_params)
+    print("-" * 30)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -3,42 +3,55 @@
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
-context = "Hi " * 1000
-context2 = "Hey " * 500
+def read_prompts():
-prompts = [
+    context = "Hi " * 1000
-    context + "Hello, my name is",
+    context2 = "Hey " * 500
-    context + "The capital of France is",
+    return [
-    context2 + "Your name is",
+        context + "Hello, my name is",
-    context2 + "The capital of China is",
+        context + "The capital of France is",
-]
+        context2 + "Your name is",
+        context2 + "The capital of China is",
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+    ]
-llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-          enforce_eager=True,
+def main():
-          gpu_memory_utilization=0.8,
+    prompts = read_prompts()
-          kv_transfer_config=KVTransferConfig(
-              kv_connector="SharedStorageConnector",
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
-              kv_role="kv_both",
-              kv_connector_extra_config={
+    llm = LLM(
-                  "shared_storage_path": "local_storage"
+        model="meta-llama/Llama-3.2-1B-Instruct",
-              }))  #, max_model_len=2048, max_num_batched_tokens=2048)
+        enforce_eager=True,
+        gpu_memory_utilization=0.8,
-# 1ST generation (prefill instance)
+        kv_transfer_config=KVTransferConfig(
-outputs = llm.generate(
+            kv_connector="SharedStorageConnector",
-    prompts,
+            kv_role="kv_both",
-    sampling_params,
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
-)
+        ),
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)
-new_prompts = []
-for output in outputs:
+    # 1ST generation (prefill instance)
-    prompt = output.prompt
+    outputs = llm.generate(
-    generated_text = output.outputs[0].text
+        prompts,
-    new_prompts.append(prompt + generated_text)
+        sampling_params,
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    )
-# Write new_prompts to output.txt
+    new_prompts = []
-with open("output.txt", "w") as f:
+    print("-" * 30)
-    for prompt in new_prompts:
+    for output in outputs:
-        f.write(prompt + "\n")
+        prompt = output.prompt
-print(f"Saved {len(new_prompts)} prompts to output.txt")
+        generated_text = output.outputs[0].text
+        new_prompts.append(prompt + generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)
+    # Write new_prompts to output.txt
+    with open("output.txt", "w") as f:
+        for prompt in new_prompts:
+            f.write(prompt + "\n")
+    print(f"Saved {len(new_prompts)} prompts to output.txt")
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -4,6 +4,7 @@ This file demonstrates the example usage of disaggregated prefilling
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and then transfer the KV cache between them.
 """
 import os
 import time
 from multiprocessing import Event, Process
@@ -32,17 +33,21 @@ def run_prefill(prefill_done):
    # This instance is the prefill node (kv_producer, rank 0).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
-    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
+    ktc = KVTransferConfig(
-                           kv_role="kv_producer",
+        kv_connector="PyNcclConnector",
-                           kv_rank=0,
+        kv_role="kv_producer",
-                           kv_parallel_size=2)
+        kv_rank=0,
+        kv_parallel_size=2,
+    )
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 48GB
    # memory. You may need to adjust the value to fit your GPU.
-    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    llm = LLM(
-              kv_transfer_config=ktc,
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-              max_model_len=2000,
+        kv_transfer_config=ktc,
-              gpu_memory_utilization=0.8)
+        max_model_len=2000,
+        gpu_memory_utilization=0.8,
+    )
    llm.generate(prompts, sampling_params)
    print("Prefill node is finished.")
@@ -72,17 +77,21 @@ def run_decode(prefill_done):
    # This instance is the decode node (kv_consumer, rank 1).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
-    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
+    ktc = KVTransferConfig(
-                           kv_role="kv_consumer",
+        kv_connector="PyNcclConnector",
-                           kv_rank=1,
+        kv_role="kv_consumer",
-                           kv_parallel_size=2)
+        kv_rank=1,
+        kv_parallel_size=2,
+    )
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 48GB
    # memory. You may need to adjust the value to fit your GPU.
-    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    llm = LLM(
-              kv_transfer_config=ktc,
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-              max_model_len=2000,
+        kv_transfer_config=ktc,
-              gpu_memory_utilization=0.8)
+        max_model_len=2000,
+        gpu_memory_utilization=0.8,
+    )
    # Wait for the producer to start the pipe
    print("Waiting for prefill node to finish...")
@@ -99,8 +108,8 @@ def run_decode(prefill_done):
 def main():
    prefill_done = Event()
-    prefill_process = Process(target=run_prefill, args=(prefill_done, ))
+    prefill_process = Process(target=run_prefill, args=(prefill_done,))
-    decode_process = Process(target=run_decode, args=(prefill_done, ))
+    decode_process = Process(target=run_decode, args=(prefill_done,))
    # Start prefill node
    prefill_process.start()

--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -6,6 +6,7 @@ import os
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Vector
 def load_prompts(dataset_path, num_prompts):
@@ -20,9 +21,7 @@ def load_prompts(dataset_path, num_prompts):
            print(f"Error reading dataset: {e}")
            return []
    else:
-        prompts = [
+        prompts = ["The future of AI is", "The president of the United States is"]
-            "The future of AI is", "The president of the United States is"
-        ]
    return prompts[:num_prompts]
@@ -33,34 +32,32 @@ def parse_args():
        "--dataset",
        type=str,
        default="./examples/data/gsm8k.jsonl",
-        help="downloaded from the eagle repo " \
+        help="downloaded from the eagle repo "
-        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/",
+    )
+    parser.add_argument(
+        "--method", type=str, default="eagle", choices=["eagle", "eagle3"]
    )
-    parser.add_argument("--method",
-                        type=str,
-                        default='eagle',
-                        choices=['eagle', 'eagle3'])
    parser.add_argument("--max_num_seqs", type=int, default=8)
    parser.add_argument("--num_prompts", type=int, default=80)
    parser.add_argument("--num_spec_tokens", type=int, default=2)
    parser.add_argument("--tp", type=int, default=1)
    parser.add_argument("--draft_tp", type=int, default=1)
-    parser.add_argument("--enforce_eager", action='store_true')
+    parser.add_argument("--enforce_eager", action="store_true")
-    parser.add_argument("--enable_chunked_prefill", action='store_true')
+    parser.add_argument("--enable_chunked_prefill", action="store_true")
    parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
    parser.add_argument("--temp", type=float, default=0)
    return parser.parse_args()
 def main():
    args = parse_args()
    model_dir = "meta-llama/Llama-3.1-8B-Instruct"
-    if args.method == 'eagle':
+    if args.method == "eagle":
        eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
-    elif args.method == 'eagle3':
+    elif args.method == "eagle3":
        eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
    else:
        raise ValueError(f"unknown method: {args.method}")
@@ -72,11 +69,9 @@ def main():
    prompts = load_prompts(args.dataset, args.num_prompts)
    prompt_ids = [
-        tokenizer.apply_chat_template([{
+        tokenizer.apply_chat_template(
-            "role": "user",
+            [{"role": "user", "content": prompt}], add_generation_prompt=True
-            "content": prompt
+        )
-        }],
-                                      add_generation_prompt=True)
        for prompt in prompts
    ]
@@ -102,8 +97,7 @@ def main():
    sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
-    outputs = llm.generate(prompt_token_ids=prompt_ids,
+    outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params)
-                           sampling_params=sampling_params)
    # print the generated text
    for output in outputs:
@@ -112,27 +106,33 @@ def main():
        print(f"generated text: {output.outputs[0].text}")
        print("-" * 50)
-    if not hasattr(outputs, "metrics") or outputs.metrics is None:
+    try:
+        metrics = llm.get_metrics()
+    except AssertionError:
+        print("Metrics are not supported in the V0 engine.")
        return
-    # calculate the average number of accepted tokens per forward pass, +1 is
+    num_drafts = num_accepted = 0
-    # to account for the token from the target model that's always going to be
+    acceptance_counts = [0] * args.num_spec_tokens
-    # accepted
+    for metric in metrics:
-    acceptance_counts = [0] * (args.num_spec_tokens + 1)
+        if metric.name == "vllm:spec_decode_num_drafts":
-    for output in outputs:
+            assert isinstance(metric, Counter)
-        for step, count in enumerate(
+            num_drafts += metric.value
-                output.metrics.spec_token_acceptance_counts):
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
-            acceptance_counts[step] += count
+            assert isinstance(metric, Counter)
+            num_accepted += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                acceptance_counts[pos] += metric.values[pos]
    print("-" * 50)
-    print(f"mean acceptance length (including bonus tokens): \
+    print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}")
-        {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}")
    print("-" * 50)
    # print acceptance at each token position
    for i in range(len(acceptance_counts)):
-        print(f"acceptance at token {i}:"
+        print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}")
-              f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
 if __name__ == "__main__":

--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -10,9 +10,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
+    parser.set_defaults(
-                        task="embed",
+        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
-                        trust_remote_code=True)
+    )
    return parser.parse_args()
@@ -41,11 +41,14 @@ def main(args: Namespace):
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
+        embeds_trimmed = (
-                           ", ...]") if len(embeds) > 16 else embeds)
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
-        print(f"Prompt: {prompt!r} \n"
+        )
-              f"Embeddings for text matching: {embeds_trimmed} "
+        print(
-              f"(size={len(embeds)})")
+            f"Prompt: {prompt!r} \n"
+            f"Embeddings for text matching: {embeds_trimmed} "
+            f"(size={len(embeds)})"
+        )
        print("-" * 60)

--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -10,9 +10,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
+    parser.set_defaults(
-                        task="embed",
+        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
-                        trust_remote_code=True)
+    )
    return parser.parse_args()
@@ -39,11 +39,10 @@ def main(args: Namespace):
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
+        embeds_trimmed = (
-                           ", ...]") if len(embeds) > 16 else embeds)
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
-        print(f"Prompt: {prompt!r} \n"
+        )
-              f"Embeddings: {embeds_trimmed} "
+        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
-              f"(size={len(embeds)})")
        print("-" * 60)

--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
 # SPDX-License-Identifier: Apache-2.0
-'''
+"""
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART
-'''
+"""
 from vllm import LLM, SamplingParams
-from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
+from vllm.inputs import (
-                         TokensPrompt, zip_enc_dec_prompts)
+    ExplicitEncoderDecoderPrompt,
+    TextPrompt,
+    TokensPrompt,
+    zip_enc_dec_prompts,
+)
 def create_prompts(tokenizer):
@@ -18,8 +22,9 @@ def create_prompts(tokenizer):
    # - Helpers for building prompts
    text_prompt_raw = "Hello, my name is"
    text_prompt = TextPrompt(prompt="The president of the United States is")
-    tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
+    tokens_prompt = TokensPrompt(
-        prompt="The capital of France is"))
+        prompt_token_ids=tokenizer.encode(prompt="The capital of France is")
+    )
    # - Pass a single prompt to encoder/decoder model
    #   (implicitly encoder input prompt);
    #   decoder input prompt is assumed to be None
@@ -57,14 +62,19 @@ def create_prompts(tokenizer):
    #   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
    #   instances
    zipped_prompt_list = zip_enc_dec_prompts(
-        ['An encoder prompt', 'Another encoder prompt'],
+        ["An encoder prompt", "Another encoder prompt"],
-        ['A decoder prompt', 'Another decoder prompt'])
+        ["A decoder prompt", "Another decoder prompt"],
+    )
    # - Let's put all of the above example prompts together into one list
    #   which we will pass to the encoder/decoder LLM.
    return [
-        single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
+        single_text_prompt_raw,
-        enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
+        single_text_prompt,
+        single_tokens_prompt,
+        enc_dec_prompt1,
+        enc_dec_prompt2,
+        enc_dec_prompt3,
    ] + zipped_prompt_list
@@ -85,10 +95,12 @@ def print_outputs(outputs):
        prompt = output.prompt
        encoder_prompt = output.encoder_prompt
        generated_text = output.outputs[0].text
-        print(f"Output {i+1}:")
+        print(f"Output {i + 1}:")
-        print(f"Encoder prompt: {encoder_prompt!r}\n"
+        print(
-              f"Decoder prompt: {prompt!r}\n"
+            f"Encoder prompt: {encoder_prompt!r}\n"
-              f"Generated text: {generated_text!r}")
+            f"Decoder prompt: {prompt!r}\n"
+            f"Generated text: {generated_text!r}"
+        )
        print("-" * 50)

--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -3,6 +3,7 @@
 This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
 import time
 from collections.abc import Sequence
 from dataclasses import asdict
@@ -30,18 +31,14 @@ def run_florence2():
    )
    prompts = [
-        {   # implicit prompt with task token
+        {  # implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
-            "multi_modal_data": {
+            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
-                "image": ImageAsset("stop_sign").pil_image
-            },
        },
-        {   # explicit encoder/decoder prompt
+        {  # explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
-                "multi_modal_data": {
+                "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
-                    "image": ImageAsset("cherry_blossom").pil_image
-                },
            },
            "decoder_prompt": "",
        },
@@ -63,20 +60,20 @@ def run_mllama():
    )
    prompts = [
-        {   # Implicit prompt
+        {  # Implicit prompt
-            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",   # noqa: E501
+            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
-        {   # Explicit prompt
+        {  # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
-            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",   # noqa: E501
+            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]
@@ -96,13 +93,13 @@ def run_whisper():
    )
    prompts = [
-        {   # Test implicit prompt
+        {  # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
-        {   # Test explicit encoder/decoder prompt
+        {  # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
@@ -110,7 +107,7 @@ def run_whisper():
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
-        }
+        },
    ]
    return ModelRequestData(
@@ -128,18 +125,23 @@ model_example_map = {
 def parse_args():
    parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
+        description="Demo on using vLLM for offline inference with "
-        'vision language models for text generation')
+        "vision language models for text generation"
-    parser.add_argument('--model-type',
+    )
-                        '-m',
+    parser.add_argument(
-                        type=str,
+        "--model-type",
-                        default="mllama",
+        "-m",
-                        choices=model_example_map.keys(),
+        type=str,
-                        help='Huggingface "model_type".')
+        default="mllama",
-    parser.add_argument("--seed",
+        choices=model_example_map.keys(),
-                        type=int,
+        help='Huggingface "model_type".',
-                        default=None,
+    )
-                        help="Set the seed when initializing `vllm.LLM`.")
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
    return parser.parse_args()
@@ -153,7 +155,8 @@ def main(args):
    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)
@@ -179,8 +182,7 @@ def main(args):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
-        print(f"Decoder prompt: {prompt!r}, "
+        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")
-              f"Generated text: {generated_text!r}")
    duration = time.time() - start

--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -3,6 +3,7 @@
 This file demonstrates using the `LLMEngine`
 for processing prompts with various sampling parameters.
 """
 import argparse
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -12,24 +13,26 @@ from vllm.utils import FlexibleArgumentParser
 def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
-        ("A robot may not injure a human being",
+        (
-         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
+            "A robot may not injure a human being",
-        ("To be or not to be,",
+            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1),
-         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
+        ),
-        ("What is the meaning of life?",
+        (
-         SamplingParams(n=2,
+            "To be or not to be,",
-                        temperature=0.8,
+            SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2),
-                        top_p=0.95,
+        ),
-                        frequency_penalty=0.1)),
+        (
+            "What is the meaning of life?",
+            SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1),
+        ),
    ]
-def process_requests(engine: LLMEngine,
+def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
-                     test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0
-    print('-' * 50)
+    print("-" * 50)
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
@@ -41,7 +44,7 @@ def process_requests(engine: LLMEngine,
        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)
-                print('-' * 50)
+                print("-" * 50)
 def initialize_engine(args: argparse.Namespace) -> LLMEngine:
@@ -52,7 +55,8 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
 def parse_args():
    parser = FlexibleArgumentParser(
-        description='Demo on using the LLMEngine class directly')
+        description="Demo on using the LLMEngine class directly"
+    )
    parser = EngineArgs.add_cli_args(parser)
    return parser.parse_args()
@@ -64,6 +68,6 @@ def main(args: argparse.Namespace):
    process_requests(engine, test_prompts)
-if __name__ == '__main__':
+if __name__ == "__main__":
    args = parse_args()
    main(args)