ModelZoo / LLaMA_vllm · Commits

Commit 25f39502
Authored Aug 16, 2025 by laibao
Update README.md: change the Docker image version and the deep-learning library dependencies, and delete several example files to simplify the codebase.
Parent: 951558c2
Changes: 186 in total; this page shows 20 changed files with 1667 additions and 66 deletions (+1667, -66).
examples/offline_inference/context_extension.py (+68, -0)
examples/offline_inference/data_parallel.py (+216, -0)
examples/offline_inference/disaggregated-prefill-v1/README.md (+10, -0)
examples/offline_inference/disaggregated-prefill-v1/decode_example.py (+51, -0)
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py (+58, -0)
examples/offline_inference/disaggregated-prefill-v1/run.sh (+11, -0)
examples/offline_inference/disaggregated_prefill.py (+127, -0)
examples/offline_inference/embed_jina_embeddings_v3.py (+58, -0)
examples/offline_inference/embed_matryoshka_fy.py (+52, -0)
examples/offline_inference/encoder_decoder.py (+132, -0)
examples/offline_inference/encoder_decoder_multimodal.py (+196, -0)
examples/offline_inference/llm_engine_example.py (+74, -0)
examples/offline_inference/load_sharded_state.py (+94, -0)
examples/offline_inference/lora_with_quantization_inference.py (+136, -0)
examples/offline_inference/metrics.py (+50, -0)
examples/offline_inference/mistral-small.py (+73, -52)
examples/offline_inference/mlpspeculator.py (+29, -14)
examples/offline_inference/multilora_inference.py (+122, -0)
examples/offline_inference/neuron.py (+49, -0)
examples/offline_inference/neuron_eagle.py (+61, -0)
examples/offline_inference/context_extension.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
and run a simple chat example.

Usage:
    python examples/offline_inference/context_extension.py
"""

from vllm import LLM, SamplingParams


def create_llm():
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0

    # Use yarn to extend context
    hf_overrides = {
        "rope_theta": rope_theta,
        "rope_scaling": {
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        "max_model_len": int(original_max_position_embeddings * factor),
    }

    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
    return llm


def run_llm_chat(llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    return outputs


def print_outputs(outputs):
    print("\nGenerated Outputs:\n" + "-" * 80)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print("-" * 80)


def main():
    llm = create_llm()
    outputs = run_llm_chat(llm)
    print_outputs(outputs)


if __name__ == "__main__":
    main()
examples/offline_inference/data_parallel.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Usage:
Single node:
    python examples/offline_inference/data_parallel.py \
        --model="ibm-research/PowerMoE-3b" \
        --dp-size=2 \
        --tp-size=2

Multi-node:
    Node 0 (assume the node has ip of 10.99.48.128):
        python examples/offline_inference/data_parallel.py \
            --model="ibm-research/PowerMoE-3b" \
            --dp-size=2 \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=0 \
            --master-addr=10.99.48.128 \
            --master-port=13345
    Node 1:
        python examples/offline_inference/data_parallel.py \
            --model="ibm-research/PowerMoE-3b" \
            --dp-size=2 \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=1 \
            --master-addr=10.99.48.128 \
            --master-port=13345
"""

import os
from time import sleep

from vllm import LLM, SamplingParams
from vllm.utils import get_open_port


def parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="Data Parallel Inference")
    parser.add_argument(
        "--model",
        type=str,
        default="ibm-research/PowerMoE-3b",
        help="Model name or path",
    )
    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("--node-size", type=int, default=1, help="Total number of nodes")
    parser.add_argument("--node-rank", type=int, default=0, help="Rank of the current node")
    parser.add_argument("--master-addr", type=str, default="", help="Master node IP address")
    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
    parser.add_argument("--enforce-eager", action="store_true", help="Enforce eager mode execution.")
    parser.add_argument("--trust-remote-code", action="store_true", help="Trust remote code.")
    parser.add_argument(
        "--max-num-seqs",
        type=int,
        default=64,
        help=("Maximum number of sequences to be processed in a single iteration."),
    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.8,
        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
    return parser.parse_args()


def main(
    model,
    dp_size,
    local_dp_rank,
    global_dp_rank,
    dp_master_ip,
    dp_master_port,
    GPUs_per_dp_rank,
    enforce_eager,
    trust_remote_code,
    max_num_seqs,
    gpu_memory_utilization,
):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
    os.environ["VLLM_DP_SIZE"] = str(dp_size)
    os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
    os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)

    # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the
    # engine processes.

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ] * 100

    # with DP, each rank should process different prompts.
    # usually all the DP ranks process a full dataset,
    # and each rank processes a different part of the dataset.
    floor = len(prompts) // dp_size
    remainder = len(prompts) % dp_size

    # Distribute prompts into even groups.
    def start(rank):
        return rank * floor + min(rank, remainder)

    prompts = prompts[start(global_dp_rank):start(global_dp_rank + 1)]
    if len(prompts) == 0:
        # if any rank has no prompts to process,
        # we need to set a placeholder prompt
        prompts = ["Placeholder"]
    print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts")

    # Create a sampling params object.
    # since we are doing data parallel, every rank can have different
    # sampling params. here we set different max_tokens for different
    # ranks for demonstration.
    sampling_params = SamplingParams(
        temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
    )

    # Create an LLM.
    llm = LLM(
        model=model,
        tensor_parallel_size=GPUs_per_dp_rank,
        enforce_eager=enforce_eager,
        enable_expert_parallel=True,
        trust_remote_code=trust_remote_code,
        max_num_seqs=max_num_seqs,
        gpu_memory_utilization=gpu_memory_utilization,
    )
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for i, output in enumerate(outputs):
        if i >= 5:
            # print only 5 outputs
            break
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(
            f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
            f"Generated text: {generated_text!r}"
        )

    # Give engines time to pause their processing loops before exiting.
    sleep(1)


if __name__ == "__main__":
    args = parse_args()

    dp_size = args.dp_size
    tp_size = args.tp_size
    node_size = args.node_size
    node_rank = args.node_rank

    if node_size == 1:
        dp_master_ip = "127.0.0.1"
        dp_master_port = get_open_port()
    else:
        dp_master_ip = args.master_addr
        dp_master_port = args.master_port

    assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
    dp_per_node = dp_size // node_size

    from multiprocessing import Process

    procs = []
    for local_dp_rank, global_dp_rank in enumerate(
        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
    ):
        proc = Process(
            target=main,
            args=(
                args.model,
                dp_size,
                local_dp_rank,
                global_dp_rank,
                dp_master_ip,
                dp_master_port,
                tp_size,
                args.enforce_eager,
                args.trust_remote_code,
                args.max_num_seqs,
                args.gpu_memory_utilization,
            ),
        )
        proc.start()
        procs.append(proc)
    exit_code = 0
    for proc in procs:
        proc.join(timeout=300)
        if proc.exitcode is None:
            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
            proc.kill()
            exit_code = 1
        elif proc.exitcode:
            exit_code = proc.exitcode

    exit(exit_code)
examples/offline_inference/disaggregated-prefill-v1/README.md (new file, mode 100644)

# Disaggregated Prefill V1

This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.

## Files

- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
  - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
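A minimal invocation sketch for these scripts, assuming a working vLLM install and a single CUDA GPU (device 0, which `run.sh` pins via `CUDA_VISIBLE_DEVICES=0`):

```bash
# Run prefill then decode; launch run.sh from this directory so that
# local_storage/ and output.txt are created next to the scripts.
cd examples/offline_inference/disaggregated-prefill-v1
bash run.sh

# Afterwards the saved KV state lives in ./local_storage/ and the prefill
# prompts (plus their first generated token) are written to ./output.txt.
ls local_storage/
cat output.txt
```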
examples/offline_inference/disaggregated-prefill-v1/decode_example.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def read_prompts():
    """Read prompts from output.txt"""
    prompts = []
    try:
        with open("output.txt") as f:
            for line in f:
                prompts.append(line.strip())
        print(f"Loaded {len(prompts)} prompts from output.txt")
        return prompts
    except FileNotFoundError:
        print("Error: output.txt file not found")
        exit(-1)


def main():
    prompts = read_prompts()

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        max_num_batched_tokens=64,
        max_num_seqs=16,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
    )
    # , max_model_len=2048, max_num_batched_tokens=2048)

    # 1ST generation (prefill instance)
    outputs = llm.generate(prompts, sampling_params)

    print("-" * 30)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 30)


if __name__ == "__main__":
    main()
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def read_prompts():
    context = "Hi " * 1000
    context2 = "Hey " * 500
    return [
        context + "Hello, my name is",
        context + "The capital of France is",
        context2 + "Your name is",
        context2 + "The capital of China is",
    ]


def main():
    prompts = read_prompts()
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
    )
    # , max_model_len=2048, max_num_batched_tokens=2048)

    # 1ST generation (prefill instance)
    outputs = llm.generate(prompts, sampling_params)

    new_prompts = []
    print("-" * 30)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        new_prompts.append(prompt + generated_text)
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 30)

    # Write new_prompts to output.txt
    with open("output.txt", "w") as f:
        for prompt in new_prompts:
            f.write(prompt + "\n")
    print(f"Saved {len(new_prompts)} prompts to output.txt")


if __name__ == "__main__":
    main()
examples/offline_inference/disaggregated-prefill-v1/run.sh (new file, mode 100644)

rm -rf local_storage/

if [ -f "output.txt" ]; then
    rm output.txt
fi

# The directory of current script
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")

VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
examples/offline_inference/disaggregated_prefill.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and then transfer the KV cache between them.
"""

import os
import time
from multiprocessing import Event, Process

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def run_prefill(prefill_done):
    # We use GPU 0 for prefill node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # The prefill node receives two requests, while the decode node receives
    # three requests. So the decode node will only receive the KV Cache for
    # requests 1 and 3. The decode node will use the KV Cache of requests 1
    # and 3 and do prefilling on request 2.
    prompts = [
        "Hello, my name is",
        "Hi, your name is",
        # The decode node will actually "prefill" this request.
        "Tell me a very long story",
    ]
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    # Using PyNcclConnector to transmit KV caches between vLLM instances.
    # This instance is the prefill node (kv_producer, rank 0).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
    ktc = KVTransferConfig(
        kv_connector="PyNcclConnector",
        kv_role="kv_producer",
        kv_rank=0,
        kv_parallel_size=2,
    )

    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        kv_transfer_config=ktc,
        max_model_len=2000,
        gpu_memory_utilization=0.8,
    )

    llm.generate(prompts, sampling_params)
    print("Prefill node is finished.")
    prefill_done.set()

    # To keep the prefill node running in case the decode node is not done;
    # otherwise, the script might exit prematurely, causing incomplete decoding.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Script stopped by user.")


def run_decode(prefill_done):
    # We use GPU 1 for decode node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    prompts = [
        "Hello, my name is",
        "Hi, your name is",
        "Tell me a very long story",
    ]
    sampling_params = SamplingParams(temperature=0, top_p=0.95)

    # Using PyNcclConnector to transmit KV caches between vLLM instances.
    # This instance is the decode node (kv_consumer, rank 1).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
    ktc = KVTransferConfig(
        kv_connector="PyNcclConnector",
        kv_role="kv_consumer",
        kv_rank=1,
        kv_parallel_size=2,
    )

    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        kv_transfer_config=ktc,
        max_model_len=2000,
        gpu_memory_utilization=0.8,
    )

    # Wait for the producer to start the pipe
    print("Waiting for prefill node to finish...")
    prefill_done.wait()

    # At this point when the prefill_done is set, the kv-cache should have been
    # transferred to this decode node, so we can start decoding.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def main():
    prefill_done = Event()
    prefill_process = Process(target=run_prefill, args=(prefill_done,))
    decode_process = Process(target=run_decode, args=(prefill_done,))

    # Start prefill node
    prefill_process.start()
    # Start decode node
    decode_process.start()

    # Terminate the prefill node when decode is finished
    decode_process.join()
    prefill_process.terminate()


if __name__ == "__main__":
    main()
examples/offline_inference/embed_jina_embeddings_v3.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
    )
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Follow the white rabbit.",  # English
        "Sigue al conejo blanco.",  # Spanish
        "Suis le lapin blanc.",  # French
        "跟着白兔走。",  # Chinese
        "اتبع الأرنب الأبيض.",  # Arabic
        "Folge dem weißen Kaninchen.",  # German
    ]

    # Create an LLM.
    # You should pass task="embed" for embedding models
    model = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    # Only text matching task is supported for now. See #16120
    outputs = model.embed(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:")
    print("Only text matching task is supported for now. See #16120")
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
        embeds_trimmed = (
            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
        )
        print(
            f"Prompt: {prompt!r}\n"
            f"Embeddings for text matching: {embeds_trimmed} "
            f"(size={len(embeds)})"
        )
        print("-" * 60)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/embed_matryoshka_fy.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace

from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
    )
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Follow the white rabbit.",  # English
        "Sigue al conejo blanco.",  # Spanish
        "Suis le lapin blanc.",  # French
        "跟着白兔走。",  # Chinese
        "اتبع الأرنب الأبيض.",  # Arabic
        "Folge dem weißen Kaninchen.",  # German
    ]

    # Create an LLM.
    # You should pass task="embed" for embedding models
    model = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))

    # Print the outputs.
    print("\nGenerated Outputs:")
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
        embeds_trimmed = (
            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
        )
        print(f"Prompt: {prompt!r}\nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
        print("-" * 60)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/encoder_decoder.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
"""

from vllm import LLM, SamplingParams
from vllm.inputs import (
    ExplicitEncoderDecoderPrompt,
    TextPrompt,
    TokensPrompt,
    zip_enc_dec_prompts,
)


def create_prompts(tokenizer):
    # Test prompts
    #
    # This section shows all of the valid ways to prompt an
    # encoder/decoder model.
    #
    # - Helpers for building prompts
    text_prompt_raw = "Hello, my name is"
    text_prompt = TextPrompt(prompt="The president of the United States is")
    tokens_prompt = TokensPrompt(
        prompt_token_ids=tokenizer.encode(prompt="The capital of France is")
    )
    # - Pass a single prompt to encoder/decoder model
    #   (implicitly encoder input prompt);
    #   decoder input prompt is assumed to be None
    single_text_prompt_raw = text_prompt_raw  # Pass a string directly
    single_text_prompt = text_prompt  # Pass a TextPrompt
    single_tokens_prompt = tokens_prompt  # Pass a TokensPrompt

    # ruff: noqa: E501
    # - Pass explicit encoder and decoder input prompts within one data structure.
    #   Encoder and decoder prompts can both independently be text or tokens, with
    #   no requirement that they be the same prompt type. Some example prompt-type
    #   combinations are shown below, note that these are not exhaustive.
    enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
        # Pass encoder prompt string directly, &
        # pass decoder prompt tokens
        encoder_prompt=single_text_prompt_raw,
        decoder_prompt=single_tokens_prompt,
    )
    enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
        # Pass TextPrompt to encoder, and
        # pass decoder prompt string directly
        encoder_prompt=single_text_prompt,
        decoder_prompt=single_text_prompt_raw,
    )
    enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
        # Pass encoder prompt tokens directly, and
        # pass TextPrompt to decoder
        encoder_prompt=single_tokens_prompt,
        decoder_prompt=single_text_prompt,
    )

    # - Finally, here's a useful helper function for zipping encoder and
    #   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
    #   instances
    zipped_prompt_list = zip_enc_dec_prompts(
        ["An encoder prompt", "Another encoder prompt"],
        ["A decoder prompt", "Another decoder prompt"],
    )

    # - Let's put all of the above example prompts together into one list
    #   which we will pass to the encoder/decoder LLM.
    return [
        single_text_prompt_raw,
        single_text_prompt,
        single_tokens_prompt,
        enc_dec_prompt1,
        enc_dec_prompt2,
        enc_dec_prompt3,
    ] + zipped_prompt_list


# Create a sampling params object.
def create_sampling_params():
    return SamplingParams(
        temperature=0,
        top_p=1.0,
        min_tokens=0,
        max_tokens=20,
    )


# Print the outputs.
def print_outputs(outputs):
    print("-" * 50)
    for i, output in enumerate(outputs):
        prompt = output.prompt
        encoder_prompt = output.encoder_prompt
        generated_text = output.outputs[0].text
        print(f"Output {i + 1}:")
        print(
            f"Encoder prompt: {encoder_prompt!r}\n"
            f"Decoder prompt: {prompt!r}\n"
            f"Generated text: {generated_text!r}"
        )
        print("-" * 50)


def main():
    dtype = "float"

    # Create a BART encoder/decoder model instance
    llm = LLM(
        model="facebook/bart-large-cnn",
        dtype=dtype,
    )

    # Get BART tokenizer
    tokenizer = llm.llm_engine.get_tokenizer_group()

    prompts = create_prompts(tokenizer)
    sampling_params = create_sampling_params()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    print_outputs(outputs)


if __name__ == "__main__":
    main()
examples/offline_inference/encoder_decoder_multimodal.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""

import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: Sequence[PromptType]


def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
        },
        {  # explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
                "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
            },
            "decoder_prompt": "",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_mllama():
    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # Implicit prompt
            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        {  # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_whisper():
    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {  # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {  # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


model_example_map = {
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,
}


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="mllama",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
        skip_special_tokens=False,
    )

    start = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")

    duration = time.time() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/llm_engine_example.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates using the `LLMEngine`
for processing prompts with various sampling parameters.
"""

import argparse

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser


def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        (
            "A robot may not injure a human being",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1),
        ),
        (
            "To be or not to be,",
            SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2),
        ),
        (
            "What is the meaning of life?",
            SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1),
        ),
    ]


def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    print("-" * 50)
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

        request_outputs: list[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)
                print("-" * 50)


def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Initialize the LLMEngine from the command line arguments."""
    engine_args = EngineArgs.from_cli_args(args)
    return LLMEngine.from_engine_args(engine_args)


def parse_args():
    parser = FlexibleArgumentParser(description="Demo on using the LLMEngine class directly")
    parser = EngineArgs.add_cli_args(parser)
    return parser.parse_args()


def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
    test_prompts = create_test_prompts()
    process_requests(engine, test_prompts)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/load_sharded_state.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Validates the loading of a model saved with the sharded_state format.
This script demonstrates how to load a model that was previously saved
using save_sharded_state.py and validates it by running inference.

Example usage:
(First, save a sharded_state model)
python save_sharded_state.py \
    --model /path/to/load \
    --quantization deepspeedfp \
    --tensor-parallel-size 8 \
    --output /path/to/save/sharded/model

python load_sharded_state.py \
    --model /path/to/saved/sharded/model \
    --load-format sharded_state \
    --quantization deepspeedfp \
    --tensor-parallel-size 8 \
    --prompt "Hello, my name is" \
    --max-tokens 50
"""

import dataclasses

from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    # Add engine arguments
    EngineArgs.add_cli_args(parser)
    # Override default load_format for clarity
    parser.set_defaults(load_format="sharded_state")

    # Add validation arguments
    parser.add_argument(
        "--prompt", type=str, default="Hello, world!", help="Prompt for validation"
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=100,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Sampling temperature"
    )
    parser.add_argument(
        "--top-p", type=float, default=1.0, help="Top-p sampling parameter"
    )

    return parser.parse_args()


def main():
    args = parse_args()
    engine_args = EngineArgs.from_cli_args(args)

    print(f"Loading model from {engine_args.model} using format {engine_args.load_format}")
    print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")

    # Load the model using engine args
    llm = LLM(**dataclasses.asdict(engine_args))

    # Prepare sampling parameters
    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_p=args.top_p,
        max_tokens=args.max_tokens,
    )

    print("\nRunning inference:")
    print(f"Prompt: {args.prompt}")

    # Generate completion
    outputs = llm.generate(args.prompt, sampling_params)

    # Display generated text
    print("\nGenerated outputs:")
    for output in outputs:
        generated_text = output.outputs[0].text
        print("-" * 50)
        print(f"Full output: {args.prompt}{generated_text}")
        print("-" * 50)


if __name__ == "__main__":
    main()
examples/offline_inference/lora_with_quantization_inference.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.

Requires HuggingFace credentials for access.
"""

import gc
from typing import Optional

import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest


def create_test_prompts(
    lora_path: str,
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    return [
        # this is an example of using quantization without LoRA
        (
            "My name is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            None,
        ),
        # the next three examples use quantization with LoRA
        (
            "my name is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            LoRARequest("lora-test-1", 1, lora_path),
        ),
        (
            "The capital of USA is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            LoRARequest("lora-test-2", 1, lora_path),
        ),
        (
            "The capital of France is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            LoRARequest("lora-test-3", 1, lora_path),
        ),
    ]


def process_requests(
    engine: LLMEngine,
    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(
                str(request_id), prompt, sampling_params, lora_request=lora_request
            )
            request_id += 1

        request_outputs: list[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("----------------------------------------------------")
                print(f"Prompt: {request_output.prompt}")
                print(f"Output: {request_output.outputs[0].text}")


def initialize_engine(model: str, quantization: str, lora_repo: Optional[str]) -> LLMEngine:
    """Initialize the LLMEngine."""
    engine_args = EngineArgs(
        model=model,
        quantization=quantization,
        enable_lora=True,
        max_lora_rank=64,
        max_loras=4,
    )
    return LLMEngine.from_engine_args(engine_args)


def main():
    """Main function that sets up and runs the prompt processing."""
    test_configs = [
        # QLoRA (https://arxiv.org/abs/2305.14314)
        {
            "name": "qlora_inference_example",
            "model": "huggyllama/llama-7b",
            "quantization": "bitsandbytes",
            "lora_repo": "timdettmers/qlora-flan-7b",
        },
        {
            "name": "AWQ_inference_with_lora_example",
            "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
            "quantization": "awq",
            "lora_repo": "jashing/tinyllama-colorist-lora",
        },
        {
            "name": "GPTQ_inference_with_lora_example",
            "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
            "quantization": "gptq",
            "lora_repo": "jashing/tinyllama-colorist-lora",
        },
    ]

    for test_config in test_configs:
        print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~")
        engine = initialize_engine(
            test_config["model"], test_config["quantization"], test_config["lora_repo"]
        )
        lora_path = snapshot_download(repo_id=test_config["lora_repo"])
        test_prompts = create_test_prompts(lora_path)
        process_requests(engine, test_prompts)

        # Clean up the GPU memory for the next test
        del engine
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()
examples/offline_inference/metrics.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m", disable_log_stats=False)

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)

    # Dump all metrics
    for metric in llm.get_metrics():
        if isinstance(metric, Gauge):
            print(f"{metric.name} (gauge) = {metric.value}")
        elif isinstance(metric, Counter):
            print(f"{metric.name} (counter) = {metric.value}")
        elif isinstance(metric, Vector):
            print(f"{metric.name} (vector) = {metric.values}")
        elif isinstance(metric, Histogram):
            print(f"{metric.name} (histogram)")
            print(f"    sum = {metric.sum}")
            print(f"    count = {metric.count}")
            for bucket_le, value in metric.buckets.items():
                print(f"        {bucket_le} = {value}")


if __name__ == "__main__":
    main()
examples/offline_inference_pixtral.py → examples/offline_inference/mistral-small.py (renamed)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset

-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3.1
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# # Mistral format
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
+#   --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
+#
+# # HF format
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
# ```
#
# - Client:
...
@@ -21,7 +32,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
-#     "model": "mistralai/Pixtral-12B-2409",
+#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
#     "messages": [
#       {
#         "role": "user",
...
@@ -40,51 +51,61 @@ from vllm.sampling_params import SamplingParams
# python demo.py simple
# python demo.py advanced

# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
# These scripts have been tested on 2x L40 GPUs


-def run_simple_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
+def run_simple_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
-    llm = LLM(model=model_name, tokenizer_mode="mistral")
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
+        limit_mm_per_prompt={"image": 1},
+        max_model_len=4096,
+        max_num_seqs=2,
+        tensor_parallel_size=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
            ],
        },
    ]

    outputs = llm.chat(messages, sampling_params=sampling_params)

    print("-" * 50)
    print(outputs[0].outputs[0].text)
    print("-" * 50)


-def run_advanced_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
-    max_img_per_msg = 5
+def run_advanced_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    max_img_per_msg = 3
    max_tokens_per_img = 4096

    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
    llm = LLM(
        model=model_name,
-        tokenizer_mode="mistral",
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
        tensor_parallel_size=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompt = "Describe the following image."
...
@@ -95,25 +116,11 @@ def run_advanced_demo():
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": url_1}},
                {"type": "image_url", "image_url": {"url": url_2}},
            ],
        },
        {
...
@@ -127,23 +134,21 @@ def run_advanced_demo():
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": url_3}},
            ],
        },
    ]

    outputs = llm.chat(messages=messages, sampling_params=sampling_params)

    print("-" * 50)
    print(outputs[0].outputs[0].text)
    print("-" * 50)


-def main():
+def parse_args():
    parser = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode."
    )

    parser.add_argument(
        "mode",
...
@@ -151,14 +156,30 @@ def main():
        help="Specify the demo mode: 'simple' or 'advanced'",
    )
-    args = parser.parse_args()
+    parser.add_argument(
+        "--format",
+        choices=["mistral", "hf"],
+        default="mistral",
+        help="Specify the format of the model to load.",
+    )
+    parser.add_argument(
+        "--disable-mm-preprocessor-cache",
+        action="store_true",
+        help="If True, disables caching of multi-modal preprocessor/mapper.",
+    )
+    return parser.parse_args()


+def main():
+    args = parse_args()
    if args.mode == "simple":
        print("Running simple demo...")
-        run_simple_demo()
+        run_simple_demo(args)
    elif args.mode == "advanced":
        print("Running advanced demo...")
-        run_advanced_demo()
+        run_advanced_demo(args)

if __name__ == "__main__":
...
examples/offline_inference_mlpspeculator.py → examples/offline_inference/mlpspeculator.py (renamed)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the usage of text generation with an LLM model,
comparing the performance with and without speculative decoding.

Note that still not support `v1`:
    VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
"""

import gc
import time
-from typing import List

from vllm import LLM, SamplingParams


-def time_generation(llm: LLM, prompts: List[str],
-                    sampling_params: SamplingParams):
+def time_generation(
+    llm: LLM, prompts: list[str], sampling_params: SamplingParams, title: str
+):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # Warmup first
...
@@ -15,19 +25,22 @@ def time_generation(llm: LLM, prompts: List[str],
    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    end = time.time()
-    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    print("-" * 50)
+    print(title)
+    print("time: ", (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")
    print("-" * 50)


-if __name__ == "__main__":
+def main():
    template = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request.\n\n### Instruction:\n{}"
        "\n\n### Response:\n"
    )

    # Sample prompts.
    prompts = [
...
@@ -40,8 +53,7 @@ if __name__ == "__main__":
    # Create an LLM without spec decoding
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")

-    print("Without speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "Without speculation")

    del llm
    gc.collect()
...
@@ -49,10 +61,13 @@ if __name__ == "__main__":
    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-fms/llama-13b-accelerator",
-        # These are currently required for MLPSpeculator decoding
-        use_v2_block_manager=True,
+        speculative_config={
+            "model": "ibm-ai-platform/llama-13b-accelerator",
+        },
    )

-    print("With speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "With speculation")


+if __name__ == "__main__":
+    main()
examples/offline_inference/multilora_inference.py (new file, mode 100644; diff collapsed in this view)
examples/offline_inference/neuron.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_num_seqs=8,
        # The max_model_len and block_size arguments are required to be same as
        # max sequence length when targeting neuron device.
        # Currently, this is a known limitation in continuous batching support
        # in transformers-neuronx.
        # TODO(liangfu): Support paged-attention in transformers-neuronx.
        max_model_len=1024,
        block_size=1024,
        # ruff: noqa: E501
        # The device can be automatically detected when AWS Neuron SDK is installed.
        # The device argument can be either unspecified for automated detection,
        # or explicitly assigned.
        device="neuron",
        tensor_parallel_size=2,
    )

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)


if __name__ == "__main__":
    main()
examples/offline_inference/neuron_eagle.py (new file, mode 100644; diff collapsed in this view)