Merge tag 'v0.8.5' into v0.8.5-dev

dcb5624a · zhuwenwen · 55880ca2 · ba41cc90 · dcb5624a · dcb5624a
Commit dcb5624a authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to use Ray Data for data parallel batch inference.
+Ray Data is a data processing framework that can handle large datasets
+and integrates tightly with vLLM for data-parallel inference.
+As of Ray 2.44, Ray Data has a native integration with
+vLLM (under ray.data.llm).
+Ray Data provides functionality for:
+* Reading and writing to cloud storage (S3, GCS, etc.)
+* Automatic sharding and load-balancing across a cluster
+* Optimized configuration of vLLM using continuous batching
+* Compatibility with tensor/pipeline parallel inference
+Learn more about Ray Data's LLM integration:
+https://docs.ray.io/en/latest/data/working-with-llms.html
+"""
+import ray
+from packaging.version import Version
+from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
+assert Version(ray.__version__) >= Version(
+    "2.44.1"), "Ray version must be at least 2.44.1"
+# Uncomment to reduce clutter in stdout
+# ray.init(log_to_driver=False)
+# ray.data.DataContext.get_current().enable_progress_bars = False
+# Read one text file from S3. Ray Data supports reading multiple files
+# from cloud storage (such as JSONL, Parquet, CSV, binary format).
+ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
+print(ds.schema())
+size = ds.count()
+print(f"Size of dataset: {size} prompts")
+# Configure vLLM engine.
+config = vLLMEngineProcessorConfig(
+    model_source="unsloth/Llama-3.1-8B-Instruct",
+    engine_kwargs={
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4096,
+        "max_model_len": 16384,
+    },
+    concurrency=1,  # set the number of parallel vLLM replicas
+    batch_size=64,
+)
+# Create a Processor object, which will be used to
+# do batch inference on the dataset
+vllm_processor = build_llm_processor(
+    config,
+    preprocess=lambda row: dict(
+        messages=[{
+            "role": "system",
+            "content": "You are a bot that responds with haikus."
+        }, {
+            "role": "user",
+            "content": row["text"]
+        }],
+        sampling_params=dict(
+            temperature=0.3,
+            max_tokens=250,
+        )),
+    postprocess=lambda row: dict(
+        answer=row["generated_text"],
+        **row  # This will return all the original columns in the dataset.
+    ),
+)
+ds = vllm_processor(ds)
+# Peek first 10 results.
+# NOTE: This is for local testing and debugging. For production use case,
+# one should write full result out as shown below.
+outputs = ds.take(limit=10)
+for output in outputs:
+    prompt = output["prompt"]
+    generated_text = output["generated_text"]
+    print(f"Prompt: {prompt!r}")
+    print(f"Generated text: {generated_text!r}")
+# Write inference output data out as Parquet files to S3.
+# Multiple files would be written to the output destination,
+# and each task would write one or more files separately.
+#
+# ds.write_parquet("s3://<your-output-bucket>")
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -34,6 +34,40 @@ from vllm import LLM, SamplingParams
 from vllm.utils import get_open_port
+def parse_args():
+    """Parse the command-line options of the data-parallel example.
+
+    Returns:
+        The namespace produced by ``parser.parse_args()``, carrying
+        ``model``, ``dp_size``, ``tp_size``, ``node_size``, ``node_rank``,
+        ``master_addr`` and ``master_port``.
+    """
+    import argparse
+    parser = argparse.ArgumentParser(description="Data Parallel Inference")
+    parser.add_argument("--model",
+                        type=str,
+                        default="ibm-research/PowerMoE-3b",
+                        help="Model name or path")
+    parser.add_argument("--dp-size",
+                        type=int,
+                        default=2,
+                        help="Data parallel size")
+    parser.add_argument("--tp-size",
+                        type=int,
+                        default=2,
+                        help="Tensor parallel size")
+    parser.add_argument("--node-size",
+                        type=int,
+                        default=1,
+                        help="Total number of nodes")
+    parser.add_argument("--node-rank",
+                        type=int,
+                        default=0,
+                        help="Rank of the current node")
+    parser.add_argument("--master-addr",
+                        type=str,
+                        default="",
+                        help="Master node IP address")
+    parser.add_argument("--master-port",
+                        type=int,
+                        default=0,
+                        help="Master node port")
+    return parser.parse_args()
 def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
         dp_master_port, GPUs_per_dp_rank):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
@@ -95,37 +129,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
 if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="Data Parallel Inference")
+    args = parse_args()
-    parser.add_argument("--model",
-                        type=str,
-                        default="ibm-research/PowerMoE-3b",
-                        help="Model name or path")
-    parser.add_argument("--dp-size",
-                        type=int,
-                        default=2,
-                        help="Data parallel size")
-    parser.add_argument("--tp-size",
-                        type=int,
-                        default=2,
-                        help="Tensor parallel size")
-    parser.add_argument("--node-size",
-                        type=int,
-                        default=1,
-                        help="Total number of nodes")
-    parser.add_argument("--node-rank",
-                        type=int,
-                        default=0,
-                        help="Rank of the current node")
-    parser.add_argument("--master-addr",
-                        type=str,
-                        default="",
-                        help="Master node IP address")
-    parser.add_argument("--master-port",
-                        type=int,
-                        default=0,
-                        help="Master node port")
-    args = parser.parse_args()
    dp_size = args.dp_size
    tp_size = args.tp_size

--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+# SPDX-License-Identifier: Apache-2.0
+import sys
+from vllm import LLM, SamplingParams
+from vllm.config import KVTransferConfig
+# Read prompts from output.txt, one prompt per line (the file is produced
+# by prefill_example.py). Only the file access sits inside the try block so
+# that no unrelated error can be mistaken for a missing file.
+try:
+    with open("output.txt") as f:
+        prompts = [line.strip() for line in f]
+except FileNotFoundError:
+    print("Error: output.txt file not found")
+    # sys.exit rather than the interactive-only exit() helper from `site`,
+    # which is not guaranteed to exist in every interpreter setup.
+    sys.exit(-1)
+print(f"Loaded {len(prompts)} prompts from output.txt")
+sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+llm = LLM(
+    model="meta-llama/Llama-3.2-1B-Instruct",
+    enforce_eager=True,
+    gpu_memory_utilization=0.8,
+    max_num_batched_tokens=64,
+    max_num_seqs=16,
+    kv_transfer_config=KVTransferConfig.from_cli(
+        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
+        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
+    ))  #, max_model_len=2048, max_num_batched_tokens=2048)
+# Decode instance: generate from the prompts loaded from output.txt
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+# SPDX-License-Identifier: Apache-2.0
+from vllm import LLM, SamplingParams
+from vllm.config import KVTransferConfig
+# Two long repeated prefixes; each one is shared by a pair of prompts below.
+context = "Hi " * 1000
+context2 = "Hey " * 500
+prompts = [
+    context + "Hello, my name is",
+    context + "The capital of France is",
+    context2 + "Your name is",
+    context2 + "The capital of China is",
+]
+# max_tokens=1: this run only needs to process the prompts, so generation
+# is capped at a single token.
+sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+# The connector is configured with the same shared_storage_path
+# ("local_storage") that decode_example.py uses.
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+          enforce_eager=True,
+          gpu_memory_utilization=0.8,
+          kv_transfer_config=KVTransferConfig.from_cli(
+              '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
+              '"kv_connector_extra_config": '
+              '{"shared_storage_path": "local_storage"}}')
+          )  #, max_model_len=2048, max_num_batched_tokens=2048)
+# 1ST generation (prefill instance)
+outputs = llm.generate(
+    prompts,
+    sampling_params,
+)
+# Each saved prompt is the original prompt followed by the text generated
+# for it in this run.
+new_prompts = []
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    new_prompts.append(prompt + generated_text)
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+# Write new_prompts to output.txt, one prompt per line
+with open("output.txt", "w") as f:
+    for prompt in new_prompts:
+        f.write(prompt + "\n")
+print(f"Saved {len(new_prompts)} prompts to output.txt")
--- a/examples/offline_inference/disaggregated-prefill-v1/run.sh
+++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh
+# Start from a clean state: remove the shared storage directory used by the
+# KV connector and the prompt file left by a previous run. -f keeps a first
+# run (when output.txt does not exist yet) from reporting an error.
+rm -rf local_storage/
+rm -f output.txt
+# prefill_example.py writes output.txt; decode_example.py reads it back.
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
+VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -95,7 +95,7 @@ def run_decode(prefill_done):
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == "__main__":
+def main():
    prefill_done = Event()
    prefill_process = Process(target=run_prefill, args=(prefill_done, ))
    decode_process = Process(target=run_decode, args=(prefill_done, ))
@@ -109,3 +109,7 @@ if __name__ == "__main__":
    # Terminate the prefill node when decode is finished
    decode_process.join()
    prefill_process.terminate()
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/distributed.py
+++ b/examples/offline_inference/distributed.py
-# SPDX-License-Identifier: Apache-2.0
-"""
-This example shows how to use Ray Data for running offline batch inference
-distributively on a multi-nodes cluster.
-Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
-"""
-from typing import Any
-import numpy as np
-import ray
-from packaging.version import Version
-from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-from vllm import LLM, SamplingParams
-assert Version(ray.__version__) >= Version(
-    "2.22.0"), "Ray version must be at least 2.22.0"
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-# Set tensor parallelism per instance.
-tensor_parallel_size = 1
-# Set number of instances. Each instance will use tensor_parallel_size GPUs.
-num_instances = 1
-# Create a class to do batch inference.
-class LLMPredictor:
-    def __init__(self):
-        # Create an LLM.
-        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-                       tensor_parallel_size=tensor_parallel_size)
-    def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]:
-        # Generate texts from the prompts.
-        # The output is a list of RequestOutput objects that contain the prompt,
-        # generated text, and other information.
-        outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt: list[str] = []
-        generated_text: list[str] = []
-        for output in outputs:
-            prompt.append(output.prompt)
-            generated_text.append(' '.join([o.text for o in output.outputs]))
-        return {
-            "prompt": prompt,
-            "generated_text": generated_text,
-        }
-# Read one text file from S3. Ray Data supports reading multiple files
-# from cloud storage (such as JSONL, Parquet, CSV, binary format).
-ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
-# For tensor_parallel_size > 1, we need to create placement groups for vLLM
-# to use. Every actor has to have its own placement group.
-def scheduling_strategy_fn():
-    # One bundle per tensor parallel worker
-    pg = ray.util.placement_group(
-        [{
-            "GPU": 1,
-            "CPU": 1
-        }] * tensor_parallel_size,
-        strategy="STRICT_PACK",
-    )
-    return dict(scheduling_strategy=PlacementGroupSchedulingStrategy(
-        pg, placement_group_capture_child_tasks=True))
-resources_kwarg: dict[str, Any] = {}
-if tensor_parallel_size == 1:
-    # For tensor_parallel_size == 1, we simply set num_gpus=1.
-    resources_kwarg["num_gpus"] = 1
-else:
-    # Otherwise, we have to set num_gpus=0 and provide
-    # a function that will create a placement group for
-    # each instance.
-    resources_kwarg["num_gpus"] = 0
-    resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn
-# Apply batch inference for all input data.
-ds = ds.map_batches(
-    LLMPredictor,
-    # Set the concurrency to the number of LLM instances.
-    concurrency=num_instances,
-    # Specify the batch size for inference.
-    batch_size=32,
-    **resources_kwarg,
-)
-# Peek first 10 results.
-# NOTE: This is for local testing and debugging. For production use case,
-# one should write full result out as shown below.
-outputs = ds.take(limit=10)
-for output in outputs:
-    prompt = output["prompt"]
-    generated_text = output["generated_text"]
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-# Write inference output data out as Parquet files to S3.
-# Multiple files would be written to the output destination,
-# and each task would write one or more files separately.
-#
-# ds.write_parquet("s3://<your-output-bucket>")
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -27,7 +27,7 @@ def load_prompts(dataset_path, num_prompts):
    return prompts[:num_prompts]
-def main():
+def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
@@ -45,10 +45,15 @@ def main():
    parser.add_argument("--enable_chunked_prefill", action='store_true')
    parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
    parser.add_argument("--temp", type=float, default=0)
-    args = parser.parse_args()
+    return parser.parse_args()
+def main():
-    model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+    args = parse_args()
-    eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
+    model_dir = "meta-llama/Llama-3.1-8B-Instruct"
+    eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
    max_model_len = 2048
@@ -76,7 +81,7 @@ def main():
        max_num_seqs=args.max_num_seqs,
        gpu_memory_utilization=0.8,
        speculative_config={
-            "method": "eagle",
+            "method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
            "model": eagle_dir,
            "num_speculative_tokens": args.num_spec_tokens,
            "draft_tensor_parallel_size": args.draft_tp,
@@ -90,6 +95,9 @@ def main():
    outputs = llm.generate(prompt_token_ids=prompt_ids,
                           sampling_params=sampling_params)
+    if not hasattr(outputs, "metrics") or outputs.metrics is None:
+        return
    # calculate the average number of accepted tokens per forward pass, +1 is
    # to account for the token from the target model that's always going to be
    # accepted
@@ -104,6 +112,11 @@ def main():
        {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
    print("-" * 50)
+    # print acceptance at each token position
+    for i in range(len(acceptance_counts)):
+        print(f"acceptance at token {i}:"
+              f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
 if __name__ == "__main__":
    main()
--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
+def parse_args():
+    """Build the engine CLI parser with this example's defaults and parse it.
+
+    Every option registered by ``EngineArgs.add_cli_args`` is accepted; the
+    defaults for ``model``, ``task`` and ``trust_remote_code`` are set for
+    this example.
+    """
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="jinaai/jina-embeddings-v3",
+                        task="embed",
+                        trust_remote_code=True)
+    return parser.parse_args()
 def main(args: Namespace):
    # Sample prompts.
    prompts = [
@@ -40,11 +50,5 @@ def main(args: Namespace):
 if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
+    args = parse_args()
-    parser = EngineArgs.add_cli_args(parser)
-    # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
-                        task="embed",
-                        trust_remote_code=True)
-    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs, PoolingParams
 from vllm.utils import FlexibleArgumentParser
+def parse_args():
+    """Build the engine CLI parser with this example's defaults and parse it.
+
+    Every option registered by ``EngineArgs.add_cli_args`` is accepted; the
+    defaults for ``model``, ``task`` and ``trust_remote_code`` are set for
+    this example.
+    """
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="jinaai/jina-embeddings-v3",
+                        task="embed",
+                        trust_remote_code=True)
+    return parser.parse_args()
 def main(args: Namespace):
    # Sample prompts.
    prompts = [
@@ -38,11 +48,5 @@ def main(args: Namespace):
 if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
+    args = parse_args()
-    parser = EngineArgs.add_cli_args(parser)
-    # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
-                        task="embed",
-                        trust_remote_code=True)
-    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -8,94 +8,112 @@ from vllm import LLM, SamplingParams
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         TokensPrompt, zip_enc_dec_prompts)
-dtype = "float"
+def create_prompts(tokenizer):
-# Create a BART encoder/decoder model instance
+    # Test prompts
-llm = LLM(
+    #
-    model="facebook/bart-large-cnn",
+    # This section shows all of the valid ways to prompt an
-    dtype=dtype,
+    # encoder/decoder model.
-)
+    #
+    # - Helpers for building prompts
-# Get BART tokenizer
+    text_prompt_raw = "Hello, my name is"
-tokenizer = llm.llm_engine.get_tokenizer_group()
+    text_prompt = TextPrompt(prompt="The president of the United States is")
+    tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
-# Test prompts
+        prompt="The capital of France is"))
-#
+    # - Pass a single prompt to encoder/decoder model
-# This section shows all of the valid ways to prompt an
+    #   (implicitly encoder input prompt);
-# encoder/decoder model.
+    #   decoder input prompt is assumed to be None
-#
-# - Helpers for building prompts
+    single_text_prompt_raw = text_prompt_raw  # Pass a string directly
-text_prompt_raw = "Hello, my name is"
+    single_text_prompt = text_prompt  # Pass a TextPrompt
-text_prompt = TextPrompt(prompt="The president of the United States is")
+    single_tokens_prompt = tokens_prompt  # Pass a TokensPrompt
-tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
-    prompt="The capital of France is"))
+    # ruff: noqa: E501
-# - Pass a single prompt to encoder/decoder model
+    # - Pass explicit encoder and decoder input prompts within one data structure.
-#   (implicitly encoder input prompt);
+    #   Encoder and decoder prompts can both independently be text or tokens, with
-#   decoder input prompt is assumed to be None
+    #   no requirement that they be the same prompt type. Some example prompt-type
+    #   combinations are shown below, note that these are not exhaustive.
-single_text_prompt_raw = text_prompt_raw  # Pass a string directly
-single_text_prompt = text_prompt  # Pass a TextPrompt
+    enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
-single_tokens_prompt = tokens_prompt  # Pass a TokensPrompt
+        # Pass encoder prompt string directly, &
+        # pass decoder prompt tokens
-# - Pass explicit encoder and decoder input prompts within one data structure.
+        encoder_prompt=single_text_prompt_raw,
-#   Encoder and decoder prompts can both independently be text or tokens, with
+        decoder_prompt=single_tokens_prompt,
-#   no requirement that they be the same prompt type. Some example prompt-type
+    )
-#   combinations are shown below, note that these are not exhaustive.
+    enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
+        # Pass TextPrompt to encoder, and
-enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
+        # pass decoder prompt string directly
-    # Pass encoder prompt string directly, &
+        encoder_prompt=single_text_prompt,
-    # pass decoder prompt tokens
+        decoder_prompt=single_text_prompt_raw,
-    encoder_prompt=single_text_prompt_raw,
+    )
-    decoder_prompt=single_tokens_prompt,
+    enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
-)
+        # Pass encoder prompt tokens directly, and
-enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
+        # pass TextPrompt to decoder
-    # Pass TextPrompt to encoder, and
+        encoder_prompt=single_tokens_prompt,
-    # pass decoder prompt string directly
+        decoder_prompt=single_text_prompt,
-    encoder_prompt=single_text_prompt,
+    )
-    decoder_prompt=single_text_prompt_raw,
-)
+    # - Finally, here's a useful helper function for zipping encoder and
-enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
+    #   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
-    # Pass encoder prompt tokens directly, and
+    #   instances
-    # pass TextPrompt to decoder
+    zipped_prompt_list = zip_enc_dec_prompts(
-    encoder_prompt=single_tokens_prompt,
+        ['An encoder prompt', 'Another encoder prompt'],
-    decoder_prompt=single_text_prompt,
+        ['A decoder prompt', 'Another decoder prompt'])
-)
+    # - Let's put all of the above example prompts together into one list
-# - Finally, here's a useful helper function for zipping encoder and
+    #   which we will pass to the encoder/decoder LLM.
-#   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
+    return [
-#   instances
+        single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
-zipped_prompt_list = zip_enc_dec_prompts(
+        enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
-    ['An encoder prompt', 'Another encoder prompt'],
+    ] + zipped_prompt_list
-    ['A decoder prompt', 'Another decoder prompt'])
-# - Let's put all of the above example prompts together into one list
-#   which we will pass to the encoder/decoder LLM.
-prompts = [
-    single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
-    enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
-] + zipped_prompt_list
 # Create a sampling params object.
-sampling_params = SamplingParams(
+def create_sampling_params():
-    temperature=0,
+    return SamplingParams(
-    top_p=1.0,
+        temperature=0,
-    min_tokens=0,
+        top_p=1.0,
-    max_tokens=20,
+        min_tokens=0,
-)
+        max_tokens=20,
+    )
-# Generate output tokens from the prompts. The output is a list of
-# RequestOutput objects that contain the prompt, generated
-# text, and other information.
-outputs = llm.generate(prompts, sampling_params)
 # Print the outputs.
-print("-" * 50)
+def print_outputs(outputs):
-for i, output in enumerate(outputs):
-    prompt = output.prompt
-    encoder_prompt = output.encoder_prompt
-    generated_text = output.outputs[0].text
-    print(f"Output {i+1}:")
-    print(f"Encoder prompt: {encoder_prompt!r}\n"
-          f"Decoder prompt: {prompt!r}\n"
-          f"Generated text: {generated_text!r}")
    print("-" * 50)
+    for i, output in enumerate(outputs):
+        prompt = output.prompt
+        encoder_prompt = output.encoder_prompt
+        generated_text = output.outputs[0].text
+        print(f"Output {i+1}:")
+        print(f"Encoder prompt: {encoder_prompt!r}\n"
+              f"Decoder prompt: {prompt!r}\n"
+              f"Generated text: {generated_text!r}")
+        print("-" * 50)
+def main():
+    """Run the encoder/decoder example end to end.
+
+    Creates the BART model, builds the example prompts with its tokenizer,
+    generates completions and prints them.
+    """
+    dtype = "float"
+    # Create a BART encoder/decoder model instance
+    llm = LLM(
+        model="facebook/bart-large-cnn",
+        dtype=dtype,
+    )
+    # Get BART tokenizer
+    tokenizer = llm.llm_engine.get_tokenizer_group()
+    prompts = create_prompts(tokenizer)
+    sampling_params = create_sampling_params()
+    # Generate output tokens from the prompts. The output is a list of
+    # RequestOutput objects that contain the prompt, generated
+    # text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    print_outputs(outputs)
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple):
 def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
-        tokenizer="facebook/bart-large",
+        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
@@ -126,6 +126,23 @@ model_example_map = {
 }
+def parse_args():
+    """Parse the ``--model-type`` / ``-m`` and ``--seed`` options of the demo.
+
+    ``--model-type`` only accepts the keys of ``model_example_map``.
+    """
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="mllama",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+    return parser.parse_args()
 def main(args):
    model = args.model_type
    if model not in model_example_map:
@@ -148,6 +165,7 @@ def main(args):
        temperature=0,
        top_p=1.0,
        max_tokens=64,
+        skip_special_tokens=False,
    )
    start = time.time()
@@ -171,19 +189,5 @@ def main(args):
 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
+    args = parse_args()
-        description='Demo on using vLLM for offline inference with '
-        'vision language models for text generation')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="mllama",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
-    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    return LLMEngine.from_engine_args(engine_args)
+def parse_args():
+    """Parse the engine options registered by ``EngineArgs.add_cli_args``."""
+    parser = FlexibleArgumentParser(
+        description='Demo on using the LLMEngine class directly')
+    parser = EngineArgs.add_cli_args(parser)
+    return parser.parse_args()
 def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
+    args = parse_args()
-        description='Demo on using the LLMEngine class directly')
-    parser = EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -16,11 +16,11 @@ from vllm.sampling_params import SamplingParams
 # # Mistral format
 # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
 #   --tokenizer-mode mistral --config-format mistral --load-format mistral \
-#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
+#   --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
 #
 # # HF format
 # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
-#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
+#   --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
 # ```
 #
 # - Client:
@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
        config_format="mistral" if args.format == "mistral" else "auto",
        load_format="mistral" if args.format == "mistral" else "auto",
+        limit_mm_per_prompt={"image": 1},
        max_model_len=4096,
        max_num_seqs=2,
        tensor_parallel_size=2,
@@ -168,7 +169,7 @@ def run_advanced_demo(args: argparse.Namespace):
    print("-" * 50)
-def main():
+def parse_args():
    parser = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode.")
@@ -187,8 +188,11 @@ def main():
        '--disable-mm-preprocessor-cache',
        action='store_true',
        help='If True, disables caching of multi-modal preprocessor/mapper.')
+    return parser.parse_args()
-    args = parser.parse_args()
+def main():
+    args = parse_args()
    if args.mode == "simple":
        print("Running simple demo...")

--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -34,8 +34,7 @@ def time_generation(llm: LLM, prompts: list[str],
        print("-" * 50)
-if __name__ == "__main__":
+def main():
    template = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request.\n\n### Instruction:\n{}"
@@ -66,3 +65,7 @@ if __name__ == "__main__":
    )
    time_generation(llm, prompts, sampling_params, "With speculation")
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -417,6 +417,38 @@ def run_model(input_data,
    return pred_imgs
+def parse_args():
+    """Parse the command-line options of the Prithvi MAE inference example.
+
+    Returns:
+        The namespace produced by ``parser.parse_args()``, carrying
+        ``data_file``, ``output_dir``, ``input_indices`` and
+        ``rgb_outputs``. The ``__main__`` guard expands it with
+        ``main(**vars(args))``.
+    """
+    parser = argparse.ArgumentParser("MAE run inference", add_help=False)
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        default="./India_900498_S2Hand.tif",
+        help="Path to the file.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output",
+        help="Path to the directory where to save outputs.",
+    )
+    parser.add_argument(
+        "--input_indices",
+        default=[1, 2, 3, 8, 11, 12],
+        type=int,
+        nargs="+",
+        help=
+        "0-based indices of the six Prithvi channels to be selected from the  "
+        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
+    )
+    parser.add_argument(
+        "--rgb_outputs",
+        action="store_true",
+        help="If present, output files will only contain RGB channels. "
+        "Otherwise, all bands will be saved.",
+    )
+    # The parsed namespace must be returned: without it the function yields
+    # None and the caller's ``vars(args)`` raises TypeError.
+    return parser.parse_args()
 def main(
    data_file: str,
    output_dir: str,
@@ -496,35 +528,7 @@ def main(
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser("MAE run inference", add_help=False)
-    parser.add_argument(
+    args = parse_args()
-        "--data_file",
-        type=str,
-        default="./India_900498_S2Hand.tif",
-        help="Path to the file.",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="output",
-        help="Path to the directory where to save outputs.",
-    )
-    parser.add_argument(
-        "--input_indices",
-        default=[1, 2, 3, 8, 11, 12],
-        type=int,
-        nargs="+",
-        help=
-        "0-based indices of the six Prithvi channels to be selected from the  "
-        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
-    )
-    parser.add_argument(
-        "--rgb_outputs",
-        action="store_true",
-        help="If present, output files will only contain RGB channels. "
-        "Otherwise, all bands will be saved.",
-    )
-    args = parser.parse_args()
    main(**vars(args))
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -359,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
              f" in folder {context.save_chrome_traces_folder}")
-if __name__ == "__main__":
+def parse_args():
    parser = FlexibleArgumentParser(description="""
 Profile a model
@@ -449,7 +449,10 @@ Profile a model
    EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    return parser.parse_args()
+def main(args):
    context = ProfileContext(
        engine_args=EngineArgs.from_cli_args(args),
        **{
@@ -458,3 +461,8 @@ Profile a model
            if k in inspect.signature(ProfileContext).parameters
        })
    run_profile(context, csv_output=args.csv, json_output=args.json)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
+# Qwen2.5-Omni Offline Inference Examples
+This folder provides several example scripts showing how to run offline inference with Qwen2.5-Omni.
+## Thinker Only
+```bash
+# Audio + image + video
+python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities
+# Read vision and audio inputs from a single video file
+# NOTE: V1 engine does not support interleaved modalities yet.
+VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video
+# Multiple audios
+VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios
+```
+This script runs the thinker part of Qwen2.5-Omni and generates a text response.
+You can also test Qwen2.5-Omni on a single modality:
+```bash
+# Process audio inputs
+python examples/offline_inference/audio_language.py --model-type qwen2_5_omni
+# Process image inputs
+python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni
+# Process video inputs
+python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni
+```
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to use vLLM for running offline inference 
+with the correct prompt format on Qwen2.5-Omni (thinker only).
+"""
+from typing import NamedTuple
+import vllm.envs as envs
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm.utils import FlexibleArgumentParser
+class QueryResult(NamedTuple):
+    """Request payload and engine limits returned by a ``get_*_query`` builder."""
+    # Prompt dictionary that ``main`` passes to ``LLM.generate``.
+    inputs: dict
+    # Per-modality item counts that ``main`` forwards to
+    # ``LLM(limit_mm_per_prompt=...)``.
+    limit_mm_per_prompt: dict[str, int]
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+# System-turn text interpolated into the prompt of every query builder below.
+default_system = (
+    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
+    "Group, capable of perceiving auditory and visual inputs, as well as "
+    "generating text and speech.")
+def get_mixed_modalities_query() -> QueryResult:
+    """Build a request that combines one audio, one image and one video input.
+    Returns:
+        QueryResult whose ``inputs`` carry the chat-formatted prompt and the
+        three assets, and whose ``limit_mm_per_prompt`` allows one item of
+        each modality.
+    """
+    user_question = ("What is recited in the audio? What is the content of "
+                     "this image? Why is this video funny?")
+    # One placeholder group per modality, in the order audio, image, video.
+    segments = [
+        f"<|im_start|>system\n{default_system}<|im_end|>\n",
+        "<|im_start|>user\n",
+        "<|audio_bos|><|AUDIO|><|audio_eos|>",
+        "<|vision_bos|><|IMAGE|><|vision_eos|>",
+        "<|vision_bos|><|VIDEO|><|vision_eos|>",
+        f"{user_question}<|im_end|>\n",
+        "<|im_start|>assistant\n",
+    ]
+    prompt_text = "".join(segments)
+    audio_clip = AudioAsset("mary_had_lamb").audio_and_sample_rate
+    still_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+    video_frames = VideoAsset(name="sample_demo_1.mp4",
+                              num_frames=16).np_ndarrays
+    request = {
+        "prompt": prompt_text,
+        "multi_modal_data": {
+            "audio": audio_clip,
+            "image": still_image,
+            "video": video_frames,
+        },
+    }
+    per_prompt_limits = {"audio": 1, "image": 1, "video": 1}
+    return QueryResult(inputs=request,
+                       limit_mm_per_prompt=per_prompt_limits)
+def get_use_audio_in_video_query() -> QueryResult:
+    """Build a request that feeds a video together with its own audio track.
+    Returns:
+        QueryResult whose ``inputs`` enable ``use_audio_in_video`` through
+        ``mm_processor_kwargs`` and whose limits allow one audio and one
+        video item.
+    Raises:
+        AssertionError: if ``envs.VLLM_USE_V1`` is truthy.
+    """
+    user_question = ("Describe the content of the video, then convert what "
+                     "the baby say into text.")
+    prompt_text = "".join([
+        f"<|im_start|>system\n{default_system}<|im_end|>\n",
+        "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>",
+        f"{user_question}<|im_end|>\n",
+        "<|im_start|>assistant\n",
+    ])
+    # Both modalities come from the same asset: frames and the audio track.
+    video = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
+    soundtrack = video.get_audio(sampling_rate=16000)
+    v1_unsupported = ("V1 does not support use_audio_in_video. Please "
+                      "launch this example with `VLLM_USE_V1=0`.")
+    assert not envs.VLLM_USE_V1, v1_unsupported
+    request = {
+        "prompt": prompt_text,
+        "multi_modal_data": {
+            "video": video.np_ndarrays,
+            "audio": soundtrack,
+        },
+        "mm_processor_kwargs": {
+            "use_audio_in_video": True,
+        },
+    }
+    return QueryResult(inputs=request,
+                       limit_mm_per_prompt={
+                           "audio": 1,
+                           "video": 1
+                       })
+def get_multi_audios_query() -> QueryResult:
+    """Build a request that asks the model to compare two audio clips.
+    Returns:
+        QueryResult whose ``inputs`` carry two audio assets under the
+        ``"audio"`` key and whose limits allow two audio items per prompt.
+    """
+    user_question = "Are these two audio clips the same?"
+    # The prompt needs one audio placeholder group per clip.
+    audio_slot = "<|audio_bos|><|AUDIO|><|audio_eos|>"
+    prompt_text = "".join([
+        f"<|im_start|>system\n{default_system}<|im_end|>\n",
+        "<|im_start|>user\n",
+        audio_slot,
+        audio_slot,
+        f"{user_question}<|im_end|>\n",
+        "<|im_start|>assistant\n",
+    ])
+    clips = [
+        AudioAsset(asset_name).audio_and_sample_rate
+        for asset_name in ("winning_call", "mary_had_lamb")
+    ]
+    request = {
+        "prompt": prompt_text,
+        "multi_modal_data": {
+            "audio": clips,
+        },
+    }
+    return QueryResult(inputs=request, limit_mm_per_prompt={"audio": 2})
+# Maps each accepted --query-type value to the function that builds its query.
+query_map = {
+    "mixed_modalities": get_mixed_modalities_query,
+    "use_audio_in_video": get_use_audio_in_video_query,
+    "multi_audios": get_multi_audios_query,
+}
+def main(args):
+    """Run the selected query through Qwen2.5-Omni and print the generations.
+    Args:
+        args: parsed CLI namespace; ``query_type`` selects the builder in
+            ``query_map`` and ``seed`` is passed to ``LLM``.
+    """
+    build_query = query_map[args.query_type]
+    query = build_query()
+    engine = LLM(
+        model="Qwen/Qwen2.5-Omni-7B",
+        max_model_len=5632,
+        max_num_seqs=5,
+        limit_mm_per_prompt=query.limit_mm_per_prompt,
+        seed=args.seed,
+    )
+    # Temperature 0.2 lets outputs differ even when prompts are identical
+    # in batch inference.
+    params = SamplingParams(temperature=0.2, max_tokens=64)
+    for result in engine.generate(query.inputs, sampling_params=params):
+        print(result.outputs[0].text)
+def parse_args():
+    """Parse the command-line options of the thinker-only demo.
+    Kept separate from ``main`` so the script follows the same
+    ``parse_args()`` / ``main(args)`` layout as the other offline examples.
+    Returns:
+        Namespace with ``query_type`` (one of the ``query_map`` keys) and
+        ``seed`` (int or None).
+    """
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'audio language models')
+    parser.add_argument('--query-type',
+                        '-q',
+                        type=str,
+                        default="mixed_modalities",
+                        choices=query_map.keys(),
+                        help='Query type.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+    return parser.parse_args()
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -29,20 +29,23 @@ from pathlib import Path
 from vllm import LLM, EngineArgs
 from vllm.utils import FlexibleArgumentParser
-parser = FlexibleArgumentParser()
-EngineArgs.add_cli_args(parser)
+def parse_args():
+    """Parse engine arguments plus the sharded-checkpoint output options.
+    Returns:
+        Namespace with every ``EngineArgs`` CLI option and ``output``,
+        ``file_pattern`` and ``max_file_size``.
+    """
-parser.add_argument("--output",
+    parser = FlexibleArgumentParser()
-                    "-o",
+    EngineArgs.add_cli_args(parser)
-                    required=True,
+    parser.add_argument("--output",
-                    type=str,
+                        "-o",
-                    help="path to output checkpoint")
+                        required=True,
-parser.add_argument("--file-pattern",
+                        type=str,
-                    type=str,
+                        help="path to output checkpoint")
-                    help="string pattern of saved filenames")
+    parser.add_argument("--file-pattern",
-parser.add_argument("--max-file-size",
+                        type=str,
-                    type=str,
+                        help="string pattern of saved filenames")
-                    default=5 * 1024**3,
+    parser.add_argument("--max-file-size",
-                    help="max size (in bytes) of each safetensors file")
+                        # A byte count: parse as int so a user-supplied
+                        # value has the same type as the integer default.
+                        type=int,
+                        default=5 * 1024**3,
+                        help="max size (in bytes) of each safetensors file")
+    return parser.parse_args()
 def main(args):
@@ -87,5 +90,5 @@ def main(args):
 if __name__ == "__main__":
-    args = parser.parse_args()
+    args = parse_args()
    main(args)