Commit 34c31a8d authored by laibao

Update README.md to include detailed information about GLM-4V-9B, its capabilities, model structure, algorithms, environment setup, inference instructions, and application scenarios.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(**vars(args))
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
print(
f"Prompt: {prompt!r} \n"
f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
)
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
max_model_len=1024,
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = (
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
)
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def create_parser():
parser = FlexibleArgumentParser()
# Add engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)
return parser
def main(args: dict):
# Pop arguments not used by LLM
max_tokens = args.pop("max_tokens")
temperature = args.pop("temperature")
top_p = args.pop("top_p")
top_k = args.pop("top_k")
# Create an LLM
llm = LLM(**args)
# Create a sampling params object
sampling_params = llm.get_default_sampling_params()
if max_tokens is not None:
sampling_params.max_tokens = max_tokens
if temperature is not None:
sampling_params.temperature = temperature
if top_p is not None:
sampling_params.top_p = top_p
if top_k is not None:
sampling_params.top_k = top_k
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__":
parser = create_parser()
args: dict = vars(parser.parse_args())
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for text_2, output in zip(texts_2, outputs):
score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} \nScore: {score}")
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use Ray Data for data parallel batch inference.
Ray Data is a data processing framework that can handle large datasets
and integrates tightly with vLLM for data-parallel inference.
As of Ray 2.44, Ray Data has a native integration with
vLLM (under ray.data.llm).
Ray Data provides functionality for:
* Reading and writing to cloud storage (S3, GCS, etc.)
* Automatic sharding and load-balancing across a cluster
* Optimized configuration of vLLM using continuous batching
* Compatibility with tensor/pipeline parallel inference.
Learn more about Ray Data's LLM integration:
https://docs.ray.io/en/latest/data/working-with-llms.html
"""
import ray
from packaging.version import Version
from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
assert Version(ray.__version__) >= Version("2.44.1"), (
"Ray version must be at least 2.44.1"
)
# Uncomment to reduce clutter in stdout
# ray.init(log_to_driver=False)
# ray.data.DataContext.get_current().enable_progress_bars = False
# Read one text file from S3. Ray Data supports reading multiple files
# from cloud storage (such as JSONL, Parquet, CSV, binary format).
ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
print(ds.schema())
size = ds.count()
print(f"Size of dataset: {size} prompts")
# Configure vLLM engine.
config = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4096,
"max_model_len": 16384,
},
concurrency=1, # set the number of parallel vLLM replicas
batch_size=64,
)
# Create a Processor object, which will be used to
# do batch inference on the dataset
vllm_processor = build_llm_processor(
config,
preprocess=lambda row: dict(
messages=[
{"role": "system", "content": "You are a bot that responds with haikus."},
{"role": "user", "content": row["text"]},
],
sampling_params=dict(
temperature=0.3,
max_tokens=250,
),
),
postprocess=lambda row: dict(
answer=row["generated_text"],
**row, # This will return all the original columns in the dataset.
),
)
ds = vllm_processor(ds)
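# Note: Ray Data executes lazily; the pipeline above only runs when results are
# consumed, e.g. via take() or write_parquet().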
# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use cases,
# write the full results out as shown below.
outputs = ds.take(limit=10)
for output in outputs:
prompt = output["prompt"]
generated_text = output["generated_text"]
print(f"Prompt: {prompt!r}")
print(f"Generated text: {generated_text!r}")
# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import json
import random
import string
from vllm import LLM
from vllm.sampling_params import SamplingParams
# This script is an offline demo for function calling
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Mistral-7B-Instruct-v0.3 --tokenizer-mode mistral --load-format mistral --config-format mistral
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
# "model": "mistralai/Mistral-7B-Instruct-v0.3"
# "messages": [
# {
# "role": "user",
# "content": [
# {"type" : "text", "text": "Describe this image in detail please."},
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
# {"type" : "text", "text": "and this one as well. Answer in French."},
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
# ]
# }
# ]
# }'
# ```
#
# Usage:
# python demo.py simple
# python demo.py advanced
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# or switch to "mistralai/Mistral-Nemo-Instruct-2407"
# or "mistralai/Mistral-Large-Instruct-2407"
# or any other mistral model with function calling ability
sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
)
def generate_random_id(length=9):
characters = string.ascii_letters + string.digits
random_id = "".join(random.choice(characters) for _ in range(length))
return random_id
# simulate an API that can be called
def get_current_weather(city: str, state: str, unit: str):
return (
f"The weather in {city}, {state} is 85 degrees {unit}. It is "
"partly cloudly, with highs in the 90's."
)
tool_functions = {"get_current_weather": get_current_weather}
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
}
]
outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
output = outputs[0].outputs[0].text.strip()
# append the assistant message
messages.append(
{
"role": "assistant",
"content": output,
}
)
# Parse the model's output and execute the corresponding tool call,
# simulating an API call with the function defined above.
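# The assistant's reply is expected to be a JSON list of tool calls, each with
# a "name" and an "arguments" object, roughly of the form (illustrative):
# [{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]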
tool_calls = json.loads(output)
tool_answers = [
tool_functions[call["name"]](**call["arguments"]) for call in tool_calls
]
# append the answer as a tool message and let the LLM give you an answer
messages.append(
{
"role": "tool",
"content": "\n\n".join(tool_answers),
"tool_call_id": generate_random_id(),
}
)
outputs = llm.chat(messages, sampling_params, tools=tools)
print(outputs[0].outputs[0].text.strip())
# yields
# 'The weather in Dallas, TX is 85 degrees fahrenheit. '
# 'It is partly cloudy, with highs in the 90's.'
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
and run a simple chat example.
Usage:
python examples/offline_inference/context_extension.py
"""
from vllm import LLM, SamplingParams
def create_llm():
rope_theta = 1000000
original_max_position_embeddings = 32768
factor = 4.0
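    # With factor=4.0, the usable context grows from 32768 to 32768 * 4 = 131072 tokens.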
# Use yarn to extend context
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": int(original_max_position_embeddings * factor),
}
llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
return llm
def run_llm_chat(llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=128,
)
conversation = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
]
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
return outputs
def print_outputs(outputs):
print("\nGenerated Outputs:\n" + "-" * 80)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n")
print(f"Generated text: {generated_text!r}")
print("-" * 80)
def main():
llm = create_llm()
outputs = run_llm_chat(llm)
print_outputs(outputs)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Usage:
Single node:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=0 \
--master-addr=10.99.48.128 \
--master-port=13345
Node 1:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=1 \
--master-addr=10.99.48.128 \
--master-port=13345
"""
import os
from time import sleep
from vllm import LLM, SamplingParams
from vllm.utils import get_open_port
def parse_args():
import argparse
parser = argparse.ArgumentParser(description="Data Parallel Inference")
parser.add_argument(
"--model",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path",
)
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
parser.add_argument(
"--node-size", type=int, default=1, help="Total number of nodes"
)
parser.add_argument(
"--node-rank", type=int, default=0, help="Rank of the current node"
)
parser.add_argument(
"--master-addr", type=str, default="", help="Master node IP address"
)
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
parser.add_argument(
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
)
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
type=int,
default=64,
help=("Maximum number of sequences to be processed in a single iteration."),
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
return parser.parse_args()
def main(
model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
trust_remote_code,
max_num_seqs,
gpu_memory_utilization,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
os.environ["VLLM_DP_SIZE"] = str(dp_size)
os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)
# CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the
# engine processes.
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 100
    # With DP, each rank should process a different set of prompts.
    # Usually all DP ranks together process the full dataset,
    # with each rank handling a different shard of it.
floor = len(prompts) // dp_size
remainder = len(prompts) % dp_size
# Distribute prompts into even groups.
def start(rank):
return rank * floor + min(rank, remainder)
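    # Example: with 400 prompts and dp_size=2, rank 0 gets prompts[0:200]
    # and rank 1 gets prompts[200:400].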
prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)]
if len(prompts) == 0:
# if any rank has no prompts to process,
# we need to set a placeholder prompt
prompts = ["Placeholder"]
print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts")
    # Create a sampling params object.
    # Since we are doing data parallel, every rank can have different
    # sampling params. Here we set a different max_tokens for different
    # ranks for demonstration.
sampling_params = SamplingParams(
temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
)
# Create an LLM.
llm = LLM(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=True,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
gpu_memory_utilization=gpu_memory_utilization,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
if i >= 5:
# print only 5 outputs
break
prompt = output.prompt
generated_text = output.outputs[0].text
print(
f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
f"Generated text: {generated_text!r}"
)
# Give engines time to pause their processing loops before exiting.
sleep(1)
if __name__ == "__main__":
args = parse_args()
dp_size = args.dp_size
tp_size = args.tp_size
node_size = args.node_size
node_rank = args.node_rank
if node_size == 1:
dp_master_ip = "127.0.0.1"
dp_master_port = get_open_port()
else:
dp_master_ip = args.master_addr
dp_master_port = args.master_port
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
dp_per_node = dp_size // node_size
from multiprocessing import Process
procs = []
for local_dp_rank, global_dp_rank in enumerate(
range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
):
proc = Process(
target=main,
args=(
args.model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
tp_size,
args.enforce_eager,
args.trust_remote_code,
args.max_num_seqs,
args.gpu_memory_utilization,
),
)
proc.start()
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join(timeout=300)
if proc.exitcode is None:
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
proc.kill()
exit_code = 1
elif proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)
# Disaggregated Prefill V1
This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
## Files
- `run.sh` - A helper script that runs `prefill_example.py` and `decode_example.py` sequentially (see the example invocation below).
- Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
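A minimal way to run the demo end to end (a sketch, assuming a single CUDA-capable GPU and that you start from the vLLM repository root):

```bash
cd examples/offline_inference/disaggregated-prefill-v1
bash run.sh
```

`run.sh` removes any previous `local_storage/` directory and `output.txt`, then runs `prefill_example.py` followed by `decode_example.py`, each with `VLLM_ENABLE_V1_MULTIPROCESSING=0` and `CUDA_VISIBLE_DEVICES=0`.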
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def read_prompts():
"""Read prompts from output.txt"""
prompts = []
try:
with open("output.txt") as f:
for line in f:
prompts.append(line.strip())
print(f"Loaded {len(prompts)} prompts from output.txt")
return prompts
except FileNotFoundError:
print("Error: output.txt file not found")
exit(-1)
def main():
prompts = read_prompts()
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),
) # , max_model_len=2048, max_num_batched_tokens=2048)
    # Decode: generate from the prompts saved by the prefill run, reusing the
    # KV cache that the SharedStorageConnector loads from local_storage/.
outputs = llm.generate(prompts, sampling_params)
print("-" * 30)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 30)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def read_prompts():
context = "Hi " * 1000
context2 = "Hey " * 500
return [
context + "Hello, my name is",
context + "The capital of France is",
context2 + "Your name is",
context2 + "The capital of China is",
]
def main():
prompts = read_prompts()
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),
) # , max_model_len=2048, max_num_batched_tokens=2048)
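    # With kv_role="kv_both", the SharedStorageConnector persists the KV blocks
    # computed during this prefill run under local_storage/, so that
    # decode_example.py can load them instead of recomputing the prefill.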
# 1ST generation (prefill instance)
outputs = llm.generate(
prompts,
sampling_params,
)
new_prompts = []
print("-" * 30)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
new_prompts.append(prompt + generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 30)
# Write new_prompts to output.txt
with open("output.txt", "w") as f:
for prompt in new_prompts:
f.write(prompt + "\n")
print(f"Saved {len(new_prompts)} prompts to output.txt")
if __name__ == "__main__":
main()
rm -rf local_storage/
if [ -f "output.txt" ]; then
rm output.txt
fi
# The directory of current script
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and then transfer the KV cache between them.
"""
import os
import time
from multiprocessing import Event, Process
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def run_prefill(prefill_done):
# We use GPU 0 for prefill node.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# The prefill node receives two requests, while the decode node receives
# three requests. So the decode node will only receive the KV Cache for
# requests 1 and 3. The decode node will use the KV Cache of requests 1
# and 3 and do prefilling on request 2.
    prompts = [
        "Hello, my name is",
        # The decode node will actually "prefill" this request:
        # "Hi, your name is",
        "Tell me a very long story",
    ]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
# Using PyNcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2,
)
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 48GB
    # memory. You may need to adjust the value to fit your GPU.
llm = LLM(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8,
)
llm.generate(prompts, sampling_params)
print("Prefill node is finished.")
prefill_done.set()
    # Keep the prefill node running until the decode node has finished;
    # otherwise, the script might exit prematurely and interrupt decoding.
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
print("Script stopped by user.")
def run_decode(prefill_done):
# We use GPU 1 for decode node.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
prompts = [
"Hello, my name is",
"Hi, your name is",
"Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)
# Using PyNcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2,
)
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 48GB
    # memory. You may need to adjust the value to fit your GPU.
llm = LLM(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8,
)
# Wait for the producer to start the pipe
print("Waiting for prefill node to finish...")
prefill_done.wait()
# At this point when the prefill_done is set, the kv-cache should have been
# transferred to this decode node, so we can start decoding.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():
prefill_done = Event()
prefill_process = Process(target=run_prefill, args=(prefill_done,))
decode_process = Process(target=run_decode, args=(prefill_done,))
# Start prefill node
prefill_process.start()
# Start decode node
decode_process.start()
# Terminate the prefill node when decode is finished
decode_process.join()
prefill_process.terminate()
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Follow the white rabbit.", # English
"Sigue al conejo blanco.", # Spanish
"Suis le lapin blanc.", # French
"跟着白兔走。", # Chinese
"اتبع الأرنب الأبيض.", # Arabic
"Folge dem weißen Kaninchen.", # German
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120
outputs = model.embed(prompts)
# Print the outputs.
print("\nGenerated Outputs:")
print("Only text matching task is supported for now. See #16120")
print("-" * 60)
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = (
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
)
print(
f"Prompt: {prompt!r} \n"
f"Embeddings for text matching: {embeds_trimmed} "
f"(size={len(embeds)})"
)
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Follow the white rabbit.", # English
"Sigue al conejo blanco.", # Spanish
"Suis le lapin blanc.", # French
"跟着白兔走。", # Chinese
"اتبع الأرنب الأبيض.", # Arabic
"Folge dem weißen Kaninchen.", # German
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
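    # PoolingParams(dimensions=32) requests 32-dimensional (Matryoshka-style)
    # embeddings; jina-embeddings-v3 supports variable output dimensions.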
outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
# Print the outputs.
print("\nGenerated Outputs:")
print("-" * 60)
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = (
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
)
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
print("-" * 60)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
"""
from vllm import LLM, SamplingParams
from vllm.inputs import (
ExplicitEncoderDecoderPrompt,
TextPrompt,
TokensPrompt,
zip_enc_dec_prompts,
)
def create_prompts(tokenizer):
# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw = "Hello, my name is"
text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(
prompt_token_ids=tokenizer.encode(prompt="The capital of France is")
)
# - Pass a single prompt to encoder/decoder model
# (implicitly encoder input prompt);
# decoder input prompt is assumed to be None
single_text_prompt_raw = text_prompt_raw # Pass a string directly
single_text_prompt = text_prompt # Pass a TextPrompt
single_tokens_prompt = tokens_prompt # Pass a TokensPrompt
# ruff: noqa: E501
# - Pass explicit encoder and decoder input prompts within one data structure.
# Encoder and decoder prompts can both independently be text or tokens, with
# no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below, note that these are not exhaustive.
enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt string directly, &
# pass decoder prompt tokens
encoder_prompt=single_text_prompt_raw,
decoder_prompt=single_tokens_prompt,
)
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
# Pass TextPrompt to encoder, and
# pass decoder prompt string directly
encoder_prompt=single_text_prompt,
decoder_prompt=single_text_prompt_raw,
)
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt tokens directly, and
# pass TextPrompt to decoder
encoder_prompt=single_tokens_prompt,
decoder_prompt=single_text_prompt,
)
# - Finally, here's a useful helper function for zipping encoder and
# decoder prompts together into a list of ExplicitEncoderDecoderPrompt
# instances
zipped_prompt_list = zip_enc_dec_prompts(
["An encoder prompt", "Another encoder prompt"],
["A decoder prompt", "Another decoder prompt"],
)
# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
return [
single_text_prompt_raw,
single_text_prompt,
single_tokens_prompt,
enc_dec_prompt1,
enc_dec_prompt2,
enc_dec_prompt3,
] + zipped_prompt_list
# Create a sampling params object.
def create_sampling_params():
return SamplingParams(
temperature=0,
top_p=1.0,
min_tokens=0,
max_tokens=20,
)
# Print the outputs.
def print_outputs(outputs):
print("-" * 50)
for i, output in enumerate(outputs):
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Output {i + 1}:")
print(
f"Encoder prompt: {encoder_prompt!r}\n"
f"Decoder prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}"
)
print("-" * 50)
def main():
dtype = "float"
# Create a BART encoder/decoder model instance
llm = LLM(
model="facebook/bart-large-cnn",
dtype=dtype,
)
# Get BART tokenizer
tokenizer = llm.llm_engine.get_tokenizer_group()
prompts = create_prompts(tokenizer)
sampling_params = create_sampling_params()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
print_outputs(outputs)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: Sequence[PromptType]
def run_florence2():
engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="Isotr0py/Florence-2-tokenizer",
max_num_seqs=8,
trust_remote_code=True,
limit_mm_per_prompt={"image": 1},
dtype="half",
)
prompts = [
{ # implicit prompt with task token
"prompt": "<DETAILED_CAPTION>",
"multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
},
{ # explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "Describe in detail what is shown in the image.",
"multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
},
"decoder_prompt": "",
},
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_mllama():
engine_args = EngineArgs(
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
dtype="half",
)
prompts = [
{ # Implicit prompt
"prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501
"multi_modal_data": {
"image": ImageAsset("stop_sign").pil_image,
},
},
{ # Explicit prompt
"encoder_prompt": {
"prompt": "<|image|>",
"multi_modal_data": {
"image": ImageAsset("stop_sign").pil_image,
},
},
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
},
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_whisper():
engine_args = EngineArgs(
model="openai/whisper-large-v3-turbo",
max_model_len=448,
max_num_seqs=16,
limit_mm_per_prompt={"audio": 1},
dtype="half",
)
prompts = [
{ # Test implicit prompt
"prompt": "<|startoftranscript|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|>",
},
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
model_example_map = {
"florence2": run_florence2,
"mllama": run_mllama,
"whisper": run_whisper,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"vision language models for text generation"
)
parser.add_argument(
"--model-type",
"-m",
type=str,
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".',
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args):
model = args.model_type
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
req_data = model_example_map[model]()
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
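    # e.g. for the whisper example this becomes {"image": 0, "video": 0, "audio": 1}.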
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
prompts = req_data.prompts
# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=64,
skip_special_tokens=False,
)
start = time.time()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")
duration = time.time() - start
print("Duration:", duration)
print("RPS:", len(prompts) / duration)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates using the `LLMEngine`
for processing prompts with various sampling parameters.
"""
import argparse
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser
def create_test_prompts() -> list[tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
(
"A robot may not injure a human being",
SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1),
),
(
"To be or not to be,",
SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2),
),
(
"What is the meaning of life?",
SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1),
),
]
def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
print("-" * 50)
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params = test_prompts.pop(0)
engine.add_request(str(request_id), prompt, sampling_params)
request_id += 1
request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print(request_output)
print("-" * 50)
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
"""Initialize the LLMEngine from the command line arguments."""
engine_args = EngineArgs.from_cli_args(args)
return LLMEngine.from_engine_args(engine_args)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using the LLMEngine class directly"
)
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
test_prompts = create_test_prompts()
process_requests(engine, test_prompts)
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Validates the loading of a model saved with the sharded_state format.
This script demonstrates how to load a model that was previously saved
using save_sharded_state.py and validates it by running inference.
Example usage:
(First, you need to save a sharded_state model)
python save_sharded_state.py \
--model /path/to/load \
--quantization deepspeedfp \
--tensor-parallel-size 8 \
    --output /path/to/save/sharded/model
python load_sharded_state.py \
--model /path/to/saved/sharded/model \
--load-format sharded_state \
--quantization deepspeedfp \
--tensor-parallel-size 8 \
--prompt "Hello, my name is" \
--max-tokens 50
"""
import dataclasses
from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
# Add engine arguments
EngineArgs.add_cli_args(parser)
# Override default load_format for clarity
parser.set_defaults(load_format="sharded_state")
# Add validation arguments
parser.add_argument(
"--prompt", type=str, default="Hello, world!", help="Prompt for validation"
)
parser.add_argument(
"--max-tokens",
type=int,
default=100,
help="Maximum number of tokens to generate",
)
parser.add_argument(
"--temperature", type=float, default=0.7, help="Sampling temperature"
)
parser.add_argument(
"--top-p", type=float, default=1.0, help="Top-p sampling parameter"
)
return parser.parse_args()
def main():
args = parse_args()
engine_args = EngineArgs.from_cli_args(args)
print(
f"Loading model from {engine_args.model} using format {engine_args.load_format}"
)
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
# Load the model using engine args
llm = LLM(**dataclasses.asdict(engine_args))
# Prepare sampling parameters
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
max_tokens=args.max_tokens,
)
print("\nRunning inference:")
print(f"Prompt: {args.prompt}")
# Generate completion
outputs = llm.generate(args.prompt, sampling_params)
# Display generated text
print("\nGenerated outputs:")
for output in outputs:
generated_text = output.outputs[0].text
print("-" * 50)
print(f"Full output: {args.prompt}{generated_text}")
print("-" * 50)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.
Requires HuggingFace credentials for access.
"""
import gc
from typing import Optional
import torch
from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest
def create_test_prompts(
lora_path: str,
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
return [
# this is an example of using quantization without LoRA
(
"My name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
None,
),
# the next three examples use quantization with LoRA
(
"my name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
LoRARequest("lora-test-1", 1, lora_path),
),
(
"The capital of USA is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
LoRARequest("lora-test-2", 1, lora_path),
),
(
"The capital of France is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
LoRARequest("lora-test-3", 1, lora_path),
),
]
def process_requests(
engine: LLMEngine,
test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params, lora_request = test_prompts.pop(0)
engine.add_request(
str(request_id), prompt, sampling_params, lora_request=lora_request
)
request_id += 1
request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print("----------------------------------------------------")
print(f"Prompt: {request_output.prompt}")
print(f"Output: {request_output.outputs[0].text}")
def initialize_engine(
model: str, quantization: str, lora_repo: Optional[str]
) -> LLMEngine:
"""Initialize the LLMEngine."""
engine_args = EngineArgs(
model=model,
quantization=quantization,
enable_lora=True,
max_lora_rank=64,
max_loras=4,
)
return LLMEngine.from_engine_args(engine_args)
def main():
"""Main function that sets up and runs the prompt processing."""
test_configs = [
# QLoRA (https://arxiv.org/abs/2305.14314)
{
"name": "qlora_inference_example",
"model": "huggyllama/llama-7b",
"quantization": "bitsandbytes",
"lora_repo": "timdettmers/qlora-flan-7b",
},
{
"name": "AWQ_inference_with_lora_example",
"model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
"quantization": "awq",
"lora_repo": "jashing/tinyllama-colorist-lora",
},
{
"name": "GPTQ_inference_with_lora_example",
"model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
"quantization": "gptq",
"lora_repo": "jashing/tinyllama-colorist-lora",
},
]
for test_config in test_configs:
print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~")
engine = initialize_engine(
test_config["model"], test_config["quantization"], test_config["lora_repo"]
)
lora_path = snapshot_download(repo_id=test_config["lora_repo"])
test_prompts = create_test_prompts(lora_path)
process_requests(engine, test_prompts)
# Clean up the GPU memory for the next test
del engine
gc.collect()
torch.cuda.empty_cache()
if __name__ == "__main__":
main()