Merge tag 'v0.9.1' into v0.9.1-dev

4c676e3d · zhuwenwen · b4c4464d · b6553be1 · 4c676e3d · 4c676e3d
Commit 4c676e3d authored Jun 20, 2025 by zhuwenwen
17 changed files
--- a/examples/offline_inference/basic/basic.py
+++ b/examples/offline_inference/basic/basic.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm import LLM, SamplingParams


--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
@@ -7,9 +8,8 @@ from vllm.utils import FlexibleArgumentParser
 def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
-    engine_group = parser.add_argument_group("Engine arguments")
-    EngineArgs.add_cli_args(engine_group)
-    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
@@ -57,22 +57,12 @@ def main(args: dict):

    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello"
-        },
-        {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
        {
            "role": "user",
-            "content":
-            "Write an essay about the importance of higher education.",
+            "content": "Write an essay about the importance of higher education.",
        },
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)

--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from argparse import Namespace

@@ -10,9 +11,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
-                        task="classify",
-                        enforce_eager=True)
+    parser.set_defaults(
+        model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
+    )
    return parser.parse_args()


@@ -36,10 +37,11 @@ def main(args: Namespace):
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        probs = output.outputs.probs
-        probs_trimmed = ((str(probs[:16])[:-1] +
-                          ", ...]") if len(probs) > 16 else probs)
-        print(f"Prompt: {prompt!r} \n"
-              f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
+        probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
+        print(
+            f"Prompt: {prompt!r} \n"
+            f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
+        )
        print("-" * 60)



--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from argparse import Namespace

@@ -10,9 +11,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
-                        task="embed",
-                        enforce_eager=True)
+    parser.set_defaults(
+        model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True
+    )
    return parser.parse_args()


@@ -36,10 +37,10 @@ def main(args: Namespace):
    print("\nGenerated Outputs:\n" + "-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
-                           ", ...]") if len(embeds) > 16 else embeds)
-        print(f"Prompt: {prompt!r} \n"
-              f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+        embeds_trimmed = (
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
+        )
+        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
        print("-" * 60)



--- a/examples/offline_inference/basic/generate.py
+++ b/examples/offline_inference/basic/generate.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
@@ -7,9 +8,8 @@ from vllm.utils import FlexibleArgumentParser
 def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
-    engine_group = parser.add_argument_group("Engine arguments")
-    EngineArgs.add_cli_args(engine_group)
-    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)

--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from argparse import Namespace

@@ -10,9 +11,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
-                        task="score",
-                        enforce_eager=True)
+    parser.set_defaults(
+        model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
+    )
    return parser.parse_args()



--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use Ray Data for data parallel batch inference.

@@ -17,12 +18,14 @@ Ray Data provides functionality for:
 Learn more about Ray Data's LLM integration:
 https://docs.ray.io/en/latest/data/working-with-llms.html
 """
+
 import ray
 from packaging.version import Version
 from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig

-assert Version(ray.__version__) >= Version(
-    "2.44.1"), "Ray version must be at least 2.44.1"
+assert Version(ray.__version__) >= Version("2.44.1"), (
+    "Ray version must be at least 2.44.1"
+)

 # Uncomment to reduce clutter in stdout
 # ray.init(log_to_driver=False)
@@ -53,20 +56,18 @@ config = vLLMEngineProcessorConfig(
 vllm_processor = build_llm_processor(
    config,
    preprocess=lambda row: dict(
-        messages=[{
-            "role": "system",
-            "content": "You are a bot that responds with haikus."
-        }, {
-            "role": "user",
-            "content": row["text"]
-        }],
+        messages=[
+            {"role": "system", "content": "You are a bot that responds with haikus."},
+            {"role": "user", "content": row["text"]},
+        ],
        sampling_params=dict(
            temperature=0.3,
            max_tokens=250,
-        )),
+        ),
+    ),
    postprocess=lambda row: dict(
        answer=row["generated_text"],
-        **row  # This will return all the original columns in the dataset.
+        **row,  # This will return all the original columns in the dataset.
    ),
 )


--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # ruff: noqa
 import json
@@ -50,27 +51,32 @@ model_name = "mistralai/Mistral-7B-Instruct-v0.3"
 # or any other mistral model with function calling ability

 sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
-llm = LLM(model=model_name,
+llm = LLM(
+    model=model_name,
    tokenizer_mode="mistral",
    config_format="mistral",
-          load_format="mistral")
+    load_format="mistral",
+)


 def generate_random_id(length=9):
    characters = string.ascii_letters + string.digits
-    random_id = ''.join(random.choice(characters) for _ in range(length))
+    random_id = "".join(random.choice(characters) for _ in range(length))
    return random_id


 # simulate an API that can be called
-def get_current_weather(city: str, state: str, unit: 'str'):
-    return (f"The weather in {city}, {state} is 85 degrees {unit}. It is "
-            "partly cloudly, with highs in the 90's.")
+def get_current_weather(city: str, state: str, unit: "str"):
+    return (
+        f"The weather in {city}, {state} is 85 degrees {unit}. It is "
+        "partly cloudly, with highs in the 90's."
+    )


-tool_funtions = {"get_current_weather": get_current_weather}
+tool_functions = {"get_current_weather": get_current_weather}

-tools = [{
+tools = [
+    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
@@ -79,58 +85,59 @@ tools = [{
                "type": "object",
                "properties": {
                    "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, e.g. 'San Francisco'"
+                        "type": "string",
+                        "description": "The city to find the weather for, e.g. 'San Francisco'",
                    },
                    "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
+                        "type": "string",
+                        "description": "the two-letter abbreviation for the state that the city is"
+                        " in, e.g. 'CA' which would mean 'California'",
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
        },
-            "required": ["city", "state", "unit"]
-        }
    }
-}]
+]

-messages = [{
-    "role":
-    "user",
-    "content":
-    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
-}]
+messages = [
+    {
+        "role": "user",
+        "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
+    }
+]

 outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
 output = outputs[0].outputs[0].text.strip()

 # append the assistant message
-messages.append({
+messages.append(
+    {
        "role": "assistant",
        "content": output,
-})
+    }
+)

 # let's now actually parse and execute the model's output simulating an API call by using the
 # above defined function
 tool_calls = json.loads(output)
 tool_answers = [
-    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
+    tool_functions[call["name"]](**call["arguments"]) for call in tool_calls
 ]

 # append the answer as a tool message and let the LLM give you an answer
-messages.append({
+messages.append(
+    {
        "role": "tool",
        "content": "\n\n".join(tool_answers),
        "tool_call_id": generate_random_id(),
-})
+    }
+)

 outputs = llm.chat(messages, sampling_params, tools=tools)


--- a/examples/offline_inference/context_extension.py
+++ b/examples/offline_inference/context_extension.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script demonstrates how to extend the context length
+of a Qwen model using the YARN method (rope_scaling)
+and run a simple chat example.
+
+Usage:
+    python examples/offline_inference/context_extension.py
+"""
+
+from vllm import LLM, SamplingParams
+
+
+def create_llm():
+    rope_theta = 1000000
+    original_max_position_embeddings = 32768
+    factor = 4.0
+
+    # Use yarn to extend context
+    hf_overrides = {
+        "rope_theta": rope_theta,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": factor,
+            "original_max_position_embeddings": original_max_position_embeddings,
+        },
+        "max_model_len": int(original_max_position_embeddings * factor),
+    }
+
+    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
+    return llm
+
+
+def run_llm_chat(llm):
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=128,
+    )
+
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    return outputs
+
+
+def print_outputs(outputs):
+    print("\nGenerated Outputs:\n" + "-" * 80)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\n")
+        print(f"Generated text: {generated_text!r}")
+        print("-" * 80)
+
+
+def main():
+    llm = create_llm()
+    outputs = run_llm_chat(llm)
+    print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Usage:
 Single node:
@@ -27,6 +28,7 @@ Multi-node:
                    --master-addr=10.99.48.128 \
                    --master-port=13345
 """
+
 import os
 from time import sleep

@@ -36,40 +38,46 @@ from vllm.utils import get_open_port

 def parse_args():
    import argparse
+
    parser = argparse.ArgumentParser(description="Data Parallel Inference")
-    parser.add_argument("--model",
+    parser.add_argument(
+        "--model",
        type=str,
        default="ibm-research/PowerMoE-3b",
-                        help="Model name or path")
-    parser.add_argument("--dp-size",
-                        type=int,
-                        default=2,
-                        help="Data parallel size")
-    parser.add_argument("--tp-size",
-                        type=int,
-                        default=2,
-                        help="Tensor parallel size")
-    parser.add_argument("--node-size",
-                        type=int,
-                        default=1,
-                        help="Total number of nodes")
-    parser.add_argument("--node-rank",
-                        type=int,
-                        default=0,
-                        help="Rank of the current node")
-    parser.add_argument("--master-addr",
-                        type=str,
-                        default="",
-                        help="Master node IP address")
-    parser.add_argument("--master-port",
-                        type=int,
-                        default=0,
-                        help="Master node port")
+        help="Model name or path",
+    )
+    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
+    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
+    parser.add_argument(
+        "--node-size", type=int, default=1, help="Total number of nodes"
+    )
+    parser.add_argument(
+        "--node-rank", type=int, default=0, help="Rank of the current node"
+    )
+    parser.add_argument(
+        "--master-addr", type=str, default="", help="Master node IP address"
+    )
+    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
+    parser.add_argument(
+        "--enforce-eager", action="store_true", help="Enforce eager mode execution."
+    )
+    parser.add_argument(
+        "--trust-remote-code", action="store_true", help="Trust remote code."
+    )
    return parser.parse_args()


-def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
-         dp_master_port, GPUs_per_dp_rank):
+def main(
+    model,
+    dp_size,
+    local_dp_rank,
+    global_dp_rank,
+    dp_master_ip,
+    dp_master_port,
+    GPUs_per_dp_rank,
+    enforce_eager,
+    trust_remote_code,
+):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
    os.environ["VLLM_DP_SIZE"] = str(dp_size)
@@ -90,10 +98,14 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
    # with DP, each rank should process different prompts.
    # usually all the DP ranks process a full dataset,
    # and each rank processes a different part of the dataset.
-    promts_per_rank = len(prompts) // dp_size
-    start = global_dp_rank * promts_per_rank
-    end = start + promts_per_rank
-    prompts = prompts[start:end]
+    floor = len(prompts) // dp_size
+    remainder = len(prompts) % dp_size
+
+    # Distribute prompts into even groups.
+    def start(rank):
+        return rank * floor + min(rank, remainder)
+
+    prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)]
    if len(prompts) == 0:
        # if any rank has no prompts to process,
        # we need to set a placeholder prompt
@@ -104,15 +116,18 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
    # since we are doing data parallel, every rank can have different
    # sampling params. here we set different max_tokens for different
    # ranks for demonstration.
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=[16, 20][global_dp_rank % 2])
+    sampling_params = SamplingParams(
+        temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
+    )

    # Create an LLM.
-    llm = LLM(model=model,
+    llm = LLM(
+        model=model,
        tensor_parallel_size=GPUs_per_dp_rank,
-              enforce_eager=True,
-              enable_expert_parallel=True)
+        enforce_eager=enforce_eager,
+        enable_expert_parallel=True,
+        trust_remote_code=trust_remote_code,
+    )
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for i, output in enumerate(outputs):
@@ -121,15 +136,16 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
            break
        prompt = output.prompt
        generated_text = output.outputs[0].text
-        print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
-              f"Generated text: {generated_text!r}")
+        print(
+            f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
+            f"Generated text: {generated_text!r}"
+        )

    # Give engines time to pause their processing loops before exiting.
    sleep(1)


 if __name__ == "__main__":
-
    args = parse_args()

    dp_size = args.dp_size
@@ -151,19 +167,29 @@ if __name__ == "__main__":

    procs = []
    for local_dp_rank, global_dp_rank in enumerate(
-            range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)):
-        proc = Process(target=main,
-                       args=(args.model, dp_size, local_dp_rank,
-                             global_dp_rank, dp_master_ip, dp_master_port,
-                             tp_size))
+        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
+    ):
+        proc = Process(
+            target=main,
+            args=(
+                args.model,
+                dp_size,
+                local_dp_rank,
+                global_dp_rank,
+                dp_master_ip,
+                dp_master_port,
+                tp_size,
+                args.enforce_eager,
+                args.trust_remote_code,
+            ),
+        )
        proc.start()
        procs.append(proc)
    exit_code = 0
    for proc in procs:
        proc.join(timeout=300)
        if proc.exitcode is None:
-            print(f"Killing process {proc.pid} that "
-                  f"didn't stop within 5 minutes.")
+            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
            proc.kill()
            exit_code = 1
        elif proc.exitcode:

--- a/examples/offline_inference/disaggregated-prefill-v1/README.md
+++ b/examples/offline_inference/disaggregated-prefill-v1/README.md
+# Disaggregated Prefill V1
+
+This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
+
+## Files
+
+- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
+  - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
+- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
+- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig

-# Read prompts from output.txt
-prompts = []
-try:
+
+def read_prompts():
+    """Read prompts from output.txt"""
+    prompts = []
+    try:
        with open("output.txt") as f:
            for line in f:
                prompts.append(line.strip())
        print(f"Loaded {len(prompts)} prompts from output.txt")
-except FileNotFoundError:
+        return prompts
+    except FileNotFoundError:
        print("Error: output.txt file not found")
        exit(-1)

-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

-llm = LLM(
+def main():
+    prompts = read_prompts()
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+
+    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        max_num_batched_tokens=64,
        max_num_seqs=16,
-    kv_transfer_config=KVTransferConfig.from_cli(
-        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
-        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
-    ))  #, max_model_len=2048, max_num_batched_tokens=2048)
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        ),
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)

-# 1ST generation (prefill instance)
-outputs = llm.generate(prompts, sampling_params)
+    # 1ST generation (prefill instance)
+    outputs = llm.generate(prompts, sampling_params)

-for output in outputs:
+    print("-" * 30)
+    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig

-context = "Hi " * 1000
-context2 = "Hey " * 500
-prompts = [
+
+def read_prompts():
+    context = "Hi " * 1000
+    context2 = "Hey " * 500
+    return [
        context + "Hello, my name is",
        context + "The capital of France is",
        context2 + "Your name is",
        context2 + "The capital of China is",
-]
+    ]
+
+
+def main():
+    prompts = read_prompts()

-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

-llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        gpu_memory_utilization=0.8,
-          kv_transfer_config=KVTransferConfig.from_cli(
-              '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
-              '"kv_connector_extra_config": '
-              '{"shared_storage_path": "local_storage"}}')
-          )  #, max_model_len=2048, max_num_batched_tokens=2048)
-
-# 1ST generation (prefill instance)
-outputs = llm.generate(
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        ),
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)
+
+    # 1ST generation (prefill instance)
+    outputs = llm.generate(
        prompts,
        sampling_params,
-)
+    )

-new_prompts = []
-for output in outputs:
+    new_prompts = []
+    print("-" * 30)
+    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        new_prompts.append(prompt + generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)

-# Write new_prompts to output.txt
-with open("output.txt", "w") as f:
+    # Write new_prompts to output.txt
+    with open("output.txt", "w") as f:
        for prompt in new_prompts:
            f.write(prompt + "\n")
-print(f"Saved {len(new_prompts)} prompts to output.txt")
+    print(f"Saved {len(new_prompts)} prompts to output.txt")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/disaggregated-prefill-v1/run.sh
+++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh
 rm -rf local_storage/
-rm output.txt

-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
-VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
+if [ -f "output.txt" ]; then
+    rm output.txt
+fi
+
+# The directory of current script
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This file demonstrates the example usage of disaggregated prefilling
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and then transfer the KV cache between them.
 """
+
 import os
 import time
 from multiprocessing import Event, Process
@@ -32,16 +34,21 @@ def run_prefill(prefill_done):
    # This instance is the prefill node (kv_producer, rank 0).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+    ktc = KVTransferConfig(
+        kv_connector="PyNcclConnector",
+        kv_role="kv_producer",
+        kv_rank=0,
+        kv_parallel_size=2,
    )

    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
-    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        kv_transfer_config=ktc,
        max_model_len=2000,
-              gpu_memory_utilization=0.8)
+        gpu_memory_utilization=0.8,
+    )

    llm.generate(prompts, sampling_params)
    print("Prefill node is finished.")
@@ -71,16 +78,21 @@ def run_decode(prefill_done):
    # This instance is the decode node (kv_consumer, rank 1).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+    ktc = KVTransferConfig(
+        kv_connector="PyNcclConnector",
+        kv_role="kv_consumer",
+        kv_rank=1,
+        kv_parallel_size=2,
    )

    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
-    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        kv_transfer_config=ktc,
        max_model_len=2000,
-              gpu_memory_utilization=0.8)
+        gpu_memory_utilization=0.8,
+    )

    # Wait for the producer to start the pipe
    print("Waiting for prefill node to finish...")
@@ -97,8 +109,8 @@ def run_decode(prefill_done):

 def main():
    prefill_done = Event()
-    prefill_process = Process(target=run_prefill, args=(prefill_done, ))
-    decode_process = Process(target=run_decode, args=(prefill_done, ))
+    prefill_process = Process(target=run_prefill, args=(prefill_done,))
+    decode_process = Process(target=run_decode, args=(prefill_done,))

    # Start prefill node
    prefill_process.start()

--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json
 import os
@@ -6,6 +7,7 @@ import os
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Vector


 def load_prompts(dataset_path, num_prompts):
@@ -20,9 +22,7 @@ def load_prompts(dataset_path, num_prompts):
            print(f"Error reading dataset: {e}")
            return []
    else:
-        prompts = [
-            "The future of AI is", "The president of the United States is"
-        ]
+        prompts = ["The future of AI is", "The president of the United States is"]

    return prompts[:num_prompts]

@@ -33,27 +33,35 @@ def parse_args():
        "--dataset",
        type=str,
        default="./examples/data/gsm8k.jsonl",
-        help="downloaded from the eagle repo " \
-        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+        help="downloaded from the eagle repo "
+        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/",
+    )
+    parser.add_argument(
+        "--method", type=str, default="eagle", choices=["eagle", "eagle3"]
    )
    parser.add_argument("--max_num_seqs", type=int, default=8)
    parser.add_argument("--num_prompts", type=int, default=80)
    parser.add_argument("--num_spec_tokens", type=int, default=2)
    parser.add_argument("--tp", type=int, default=1)
    parser.add_argument("--draft_tp", type=int, default=1)
-    parser.add_argument("--enforce_eager", action='store_true')
-    parser.add_argument("--enable_chunked_prefill", action='store_true')
+    parser.add_argument("--enforce_eager", action="store_true")
+    parser.add_argument("--enable_chunked_prefill", action="store_true")
    parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
    parser.add_argument("--temp", type=float, default=0)
    return parser.parse_args()


 def main():
-
    args = parse_args()

    model_dir = "meta-llama/Llama-3.1-8B-Instruct"
+
+    if args.method == "eagle":
+        eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
+    elif args.method == "eagle3":
        eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+    else:
+        raise ValueError(f"unknown method: {args.method}")

    max_model_len = 2048

@@ -62,11 +70,9 @@ def main():
    prompts = load_prompts(args.dataset, args.num_prompts)

    prompt_ids = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      add_generation_prompt=True)
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}], add_generation_prompt=True
+        )
        for prompt in prompts
    ]

@@ -81,7 +87,7 @@ def main():
        max_num_seqs=args.max_num_seqs,
        gpu_memory_utilization=0.8,
        speculative_config={
-            "method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
+            "method": args.method,
            "model": eagle_dir,
            "num_speculative_tokens": args.num_spec_tokens,
            "draft_tensor_parallel_size": args.draft_tp,
@@ -92,30 +98,42 @@ def main():

    sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)

-    outputs = llm.generate(prompt_token_ids=prompt_ids,
-                           sampling_params=sampling_params)
+    outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params)

-    if not hasattr(outputs, "metrics") or outputs.metrics is None:
+    # print the generated text
+    for output in outputs:
+        print("-" * 50)
+        print(f"prompt: {output.prompt}")
+        print(f"generated text: {output.outputs[0].text}")
+        print("-" * 50)
+
+    try:
+        metrics = llm.get_metrics()
+    except AssertionError:
+        print("Metrics are not supported in the V0 engine.")
        return

-    # calculate the average number of accepted tokens per forward pass, +1 is
-    # to account for the token from the target model that's always going to be
-    # accepted
-    acceptance_counts = [0] * (args.num_spec_tokens + 1)
-    for output in outputs:
-        for step, count in enumerate(
-                output.metrics.spec_token_acceptance_counts):
-            acceptance_counts[step] += count
+    num_drafts = num_accepted = 0
+    acceptance_counts = [0] * args.num_spec_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
+            assert isinstance(metric, Counter)
+            num_accepted += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                acceptance_counts[pos] += metric.values[pos]

    print("-" * 50)
-    print(f"mean acceptance length: \
-        {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
+    print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}")
    print("-" * 50)

    # print acceptance at each token position
    for i in range(len(acceptance_counts)):
-        print(f"acceptance at token {i}:"
-              f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
+        print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}")


 if __name__ == "__main__":

--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from argparse import Namespace

@@ -10,9 +11,9 @@ def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
-                        task="embed",
-                        trust_remote_code=True)
+    parser.set_defaults(
+        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
+    )
    return parser.parse_args()


@@ -41,11 +42,14 @@ def main(args: Namespace):
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
-                           ", ...]") if len(embeds) > 16 else embeds)
-        print(f"Prompt: {prompt!r} \n"
+        embeds_trimmed = (
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
+        )
+        print(
+            f"Prompt: {prompt!r} \n"
            f"Embeddings for text matching: {embeds_trimmed} "
-              f"(size={len(embeds)})")
+            f"(size={len(embeds)})"
+        )
        print("-" * 60)