Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/disaggregated_prefill_lmcache.py
+++ b/examples/offline_inference/disaggregated_prefill_lmcache.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This file demonstrates the example usage of disaggregated prefilling
+with LMCache.
+We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
+and launch an additional LMCache server.
+KV cache is transferred in the following manner: 
+vLLM prefill node -> LMCache server -> vLLM decode node.
+
+Note that `pip install lmcache` is needed to run this example.
+Learn more about LMCache in https://github.com/LMCache/LMCache.
+"""
+import os
+import subprocess
+import time
+from multiprocessing import Event, Process
+
+from lmcache.experimental.cache_engine import LMCacheEngineBuilder
+from lmcache.integration.vllm.utils import ENGINE_NAME
+
+from vllm import LLM, SamplingParams
+from vllm.config import KVTransferConfig
+
+# LMCache-related environment variables
+# The port to start LMCache server
+port = 8100
+# Use experimental features in LMCache
+os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
+# LMCache is set to use 256 tokens per chunk
+os.environ["LMCACHE_CHUNK_SIZE"] = "256"
+# Disable local CPU backend in LMCache
+os.environ["LMCACHE_LOCAL_CPU"] = "False"
+# Set local CPU memory buffer limit to 5.0 GB
+os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
+# Set the remote URL for LMCache server
+os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
+# Set the serializer/deserializer between vllm and LMCache server
+# `naive` indicates using raw bytes of the tensor without any compression
+os.environ["LMCACHE_REMOTE_SERDE"] = "naive"
+
+
+def run_prefill(prefill_done, prompts):
+    # We use GPU 0 for prefill node.
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+    )
+    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+    # memory. Reduce the value if your GPU has less memory.
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+              kv_transfer_config=ktc,
+              max_model_len=8000,
+              gpu_memory_utilization=0.8,
+              enforce_eager=True)
+
+    #llm.generate(prompts, sampling_params)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Generated text: {generated_text!r}")
+    print("Prefill node is finished.")
+    prefill_done.set()
+
+    # Clean up lmcache backend
+    LMCacheEngineBuilder.destroy(ENGINE_NAME)
+
+
+def run_decode(prefill_done, prompts, timeout=1):
+    # We use GPU 1 for decode node.
+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+    )
+    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+    # of memory. Reduce the value if your GPU has less memory.
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+              kv_transfer_config=ktc,
+              max_model_len=8000,
+              gpu_memory_utilization=0.8,
+              enforce_eager=True)
+
+    print("Waiting for prefill node to finish...")
+    prefill_done.wait()
+    time.sleep(timeout)
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Generated text: {generated_text!r}")
+
+    # Clean up lmcache backend
+    LMCacheEngineBuilder.destroy(ENGINE_NAME)
+
+
+def run_lmcache_server(port):
+    server_proc = subprocess.Popen([
+        "python", "-m", "lmcache.experimental.server", "localhost",
+        str(port)
+    ])
+    return server_proc
+
+
+if __name__ == "__main__":
+
+    prompts = [
+        "Hello, how are you?" * 1000,
+    ]
+
+    prefill_done = Event()
+    prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))
+    decode_process = Process(target=run_decode, args=(prefill_done, prompts))
+    lmcache_server_process = run_lmcache_server(port)
+
+    # Start prefill node
+    prefill_process.start()
+
+    # Start decode node
+    decode_process.start()
+
+    # Clean up the processes
+    decode_process.join()
+    prefill_process.terminate()
+    lmcache_server_process.terminate()
+    lmcache_server_process.wait()
--- a/examples/offline_inference/distributed.py
+++ b/examples/offline_inference/distributed.py
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """

-from typing import Any, Dict, List
+from typing import Any

 import numpy as np
 import ray
@@ -36,13 +36,13 @@ class LLMPredictor:
        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
                       tensor_parallel_size=tensor_parallel_size)

-    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
+    def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]:
        # Generate texts from the prompts.
        # The output is a list of RequestOutput objects that contain the prompt,
        # generated text, and other information.
        outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt: List[str] = []
-        generated_text: List[str] = []
+        prompt: list[str] = []
+        generated_text: list[str] = []
        for output in outputs:
            prompt.append(output.prompt)
            generated_text.append(' '.join([o.text for o in output.outputs]))
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
        pg, placement_group_capture_child_tasks=True))


-resources_kwarg: Dict[str, Any] = {}
+resources_kwarg: dict[str, Any] = {}
 if tensor_parallel_size == 1:
    # For tensor_parallel_size == 1, we simply set num_gpus=1.
    resources_kwarg["num_gpus"] = 1

--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import json
+import os
+
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--dataset",
+    type=str,
+    default="./examples/data/gsm8k.jsonl",
+    help="downloaded from the eagle repo " \
+    "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+)
+parser.add_argument("--max_num_seqs", type=int, default=8)
+parser.add_argument("--num_prompts", type=int, default=80)
+parser.add_argument("--num_spec_tokens", type=int, default=2)
+parser.add_argument("--tp", type=int, default=1)
+parser.add_argument("--draft_tp", type=int, default=1)
+parser.add_argument("--enforce_eager", action='store_true')
+parser.add_argument("--enable_chunked_prefill", action='store_true')
+parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
+parser.add_argument("--temp", type=float, default=0)
+
+args = parser.parse_args()
+
+print(args)
+
+model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
+
+max_model_len = 2048
+
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+if os.path.exists(args.dataset):
+    prompts = []
+    num_prompts = args.num_prompts
+    with open(args.dataset) as f:
+        for line in f:
+            data = json.loads(line)
+            prompts.append(data["turns"][0])
+else:
+    prompts = ["The future of AI is", "The president of the United States is"]
+
+prompts = prompts[:args.num_prompts]
+num_prompts = len(prompts)
+
+prompt_ids = [
+    tokenizer.apply_chat_template([{
+        "role": "user",
+        "content": prompt
+    }],
+                                  add_generation_prompt=True)
+    for prompt in prompts
+]
+
+llm = LLM(
+    model=model_dir,
+    trust_remote_code=True,
+    tensor_parallel_size=args.tp,
+    enable_chunked_prefill=args.enable_chunked_prefill,
+    max_num_batched_tokens=args.max_num_batched_tokens,
+    enforce_eager=args.enforce_eager,
+    max_model_len=max_model_len,
+    max_num_seqs=args.max_num_seqs,
+    gpu_memory_utilization=0.8,
+    speculative_model=eagle_dir,
+    num_speculative_tokens=args.num_spec_tokens,
+    speculative_draft_tensor_parallel_size=args.draft_tp,
+    speculative_max_model_len=max_model_len,
+    disable_log_stats=False,
+)
+
+sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
+
+outputs = llm.generate(prompt_token_ids=prompt_ids,
+                       sampling_params=sampling_params)
+
+# calculate the average number of accepted tokens per forward pass, +1 is
+# to account for the token from the target model that's always going to be
+# accepted
+acceptance_counts = [0] * (args.num_spec_tokens + 1)
+for output in outputs:
+    for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
+        acceptance_counts[step] += count
+
+print(f"mean acceptance length: \
+    {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to use vLLM for running offline inference with
+the explicit/implicit prompt format on enc-dec LMMs for text generation.
+"""
+import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple
+
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.utils import FlexibleArgumentParser
+
+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: Sequence[PromptType]
+
+
+def run_florence2():
+    engine_args = EngineArgs(
+        model="microsoft/Florence-2-large",
+        tokenizer="facebook/bart-large",
+        max_num_seqs=8,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # implicit prompt with task token
+            "prompt": "<DETAILED_CAPTION>",
+            "multi_modal_data": {
+                "image": ImageAsset("stop_sign").pil_image
+            },
+        },
+        {   # explicit encoder/decoder prompt
+            "encoder_prompt": {
+                "prompt": "Describe in detail what is shown in the image.",
+                "multi_modal_data": {
+                    "image": ImageAsset("cherry_blossom").pil_image
+                },
+            },
+            "decoder_prompt": "",
+        },
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+def run_mllama():
+    engine_args = EngineArgs(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # Implicit prompt
+            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",   # noqa: E501
+            "multi_modal_data": {
+                "image": ImageAsset("stop_sign").pil_image,
+            },
+        },
+        {   # Explicit prompt
+            "encoder_prompt": {
+                "prompt": "<|image|>",
+                "multi_modal_data": {
+                    "image": ImageAsset("stop_sign").pil_image,
+                },
+            },
+            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",   # noqa: E501
+        },
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+def run_whisper():
+    engine_args = EngineArgs(
+        model="openai/whisper-large-v3-turbo",
+        max_model_len=448,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"audio": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # Test implicit prompt
+            "prompt": "<|startoftranscript|>",
+            "multi_modal_data": {
+                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+            },
+        },
+        {   # Test explicit encoder/decoder prompt
+            "encoder_prompt": {
+                "prompt": "",
+                "multi_modal_data": {
+                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
+                },
+            },
+            "decoder_prompt": "<|startoftranscript|>",
+        }
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+model_example_map = {
+    "florence2": run_florence2,
+    "mllama": run_mllama,
+    "whisper": run_whisper,
+}
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
+    req_data = model_example_map[model]()
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    prompts = req_data.prompts
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(
+        temperature=0,
+        top_p=1.0,
+        max_tokens=64,
+    )
+
+    start = time.time()
+
+    # Generate output tokens from the prompts. The output is a list of
+    # RequestOutput objects that contain the prompt, generated
+    # text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Decoder prompt: {prompt!r}, "
+              f"Generated text: {generated_text!r}")
+
+    duration = time.time() - start
+
+    print("Duration:", duration)
+    print("RPS:", len(prompts) / duration)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="mllama",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+
+    args = parser.parse_args()
+    main(args)
--- a/examples/offline_inference/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
-# SPDX-License-Identifier: Apache-2.0
-'''
-Demonstrate prompting of text-to-text
-encoder/decoder models, specifically Florence-2
-'''
-# TODO(Isotr0py):
-# Move to offline_inference/vision_language.py
-# after porting vision backbone
-from vllm import LLM, SamplingParams
-
-dtype = "float"
-
-# Create a Florence-2 encoder/decoder model instance
-llm = LLM(
-    model="microsoft/Florence-2-base",
-    tokenizer="facebook/bart-base",
-    dtype=dtype,
-    trust_remote_code=True,
-)
-
-prompts = [
-    "<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>",
-    "<CAPTION_TO_PHRASE_GROUNDING>", "<OD>", "<DENSE_REGION_CAPTION>",
-    "<REGION_PROPOSAL>", "<OCR>", "<OCR_WITH_REGION>"
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(
-    temperature=0,
-    top_p=1.0,
-    min_tokens=0,
-    max_tokens=20,
-)
-
-# Generate output tokens from the prompts. The output is a list of
-# RequestOutput objects that contain the prompt, generated
-# text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    encoder_prompt = output.encoder_prompt
-    generated_text = output.outputs[0].text
-    print(f"Encoder prompt: {encoder_prompt!r}, "
-          f"Decoder prompt: {prompt!r}, "
-          f"Generated text: {generated_text!r}")
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
 # SPDX-License-Identifier: Apache-2.0

 import argparse
-from typing import List, Tuple

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.utils import FlexibleArgumentParser


-def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
+def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        ("A robot may not injure a human being",
@@ -16,7 +15,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
        ("What is the meaning of life?",
         SamplingParams(n=2,
-                        best_of=5,
                        temperature=0.8,
                        top_p=0.95,
                        frequency_penalty=0.1)),
@@ -24,7 +22,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:


 def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams]]):
+                     test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

@@ -34,7 +32,7 @@ def process_requests(engine: LLMEngine,
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

-        request_outputs: List[RequestOutput] = engine.step()
+        request_outputs: list[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:

--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
 """

 import gc
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch
 from huggingface_hub import snapshot_download
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
        lora_path: str
-) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    return [
        # this is an example of using quantization without LoRA
        ("My name is",
@@ -49,7 +49,7 @@ def create_test_prompts(


 def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams,
+                     test_prompts: list[tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
                               lora_request=lora_request)
            request_id += 1

-        request_outputs: List[RequestOutput] = engine.step()
+        request_outputs: list[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("----------------------------------------------------")
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
-                                 load_format="bitsandbytes",
                                 enable_lora=True,
                                 max_lora_rank=64)
    else:

--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/pixtral.py
@@ -6,14 +6,16 @@ import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams

-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3.1
 #
 # If you want to run a server/client setup, please follow this code:
 #
 # - Server:
 #
 # ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
+#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
@@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
 # --header 'Content-Type: application/json' \
 # --header 'Authorization: Bearer token' \
 # --data '{
-#     "model": "mistralai/Pixtral-12B-2409",
+#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
 #     "messages": [
 #       {
 #         "role": "user",
@@ -43,12 +45,20 @@ from vllm.sampling_params import SamplingParams
 #     python demo.py advanced


-def run_simple_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
+def run_simple_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    sampling_params = SamplingParams(max_tokens=8192)

-    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
-    llm = LLM(model=model_name, tokenizer_mode="mistral")
+    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"
@@ -76,8 +86,8 @@ def run_simple_demo():
    print(outputs[0].outputs[0].text)


-def run_advanced_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
+def run_advanced_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    max_img_per_msg = 5
    max_tokens_per_img = 4096

@@ -85,8 +95,11 @@ def run_advanced_demo():
    llm = LLM(
        model=model_name,
        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompt = "Describe the following image."
@@ -153,14 +166,19 @@ def main():
        help="Specify the demo mode: 'simple' or 'advanced'",
    )

+    parser.add_argument(
+        '--disable-mm-preprocessor-cache',
+        action='store_true',
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
+
    args = parser.parse_args()

    if args.mode == "simple":
        print("Running simple demo...")
-        run_simple_demo()
+        run_simple_demo(args)
    elif args.mode == "advanced":
        print("Running advanced demo...")
-        run_advanced_demo()
+        run_advanced_demo(args)


 if __name__ == "__main__":

--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -2,12 +2,11 @@

 import gc
 import time
-from typing import List

 from vllm import LLM, SamplingParams


-def time_generation(llm: LLM, prompts: List[str],
+def time_generation(llm: LLM, prompts: list[str],
                    sampling_params: SamplingParams):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
@@ -51,7 +50,9 @@ if __name__ == "__main__":
    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-ai-platform/llama-13b-accelerator",
+        speculative_config={
+            "model": "ibm-ai-platform/llama-13b-accelerator",
+        },
    )

    print("With speculation")

--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -6,7 +6,7 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """

-from typing import List, Optional, Tuple
+from typing import Optional

 from huggingface_hub import snapshot_download

@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
        lora_path: str
-) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Create a list of test prompts with their sampling parameters.

    2 requests for base model, 4 requests for the LoRA. We define 2
@@ -56,7 +56,7 @@ def create_test_prompts(


 def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams,
+                     test_prompts: list[tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
                               lora_request=lora_request)
            request_id += 1

-        request_outputs: List[RequestOutput] = engine.step()
+        request_outputs: list[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:

--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -21,7 +21,7 @@ import argparse
 import datetime
 import os
 import re
-from typing import List, Union
+from typing import Union

 import albumentations
 import numpy as np
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):


 def load_example(
-    file_paths: List[str],
-    mean: List[float] = None,
-    std: List[float] = None,
+    file_paths: list[str],
+    mean: list[float] = None,
+    std: list[float] = None,
    indices: Union[list[int], None] = None,
 ):
    """Build an input example by loading images in *file_paths*.

--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -5,8 +5,9 @@ import json
 import os
 import sys
 from argparse import RawTextHelpFormatter
+from collections.abc import Generator
 from dataclasses import asdict, dataclass
-from typing import Any, Dict, Generator, List, Optional, TypeAlias
+from typing import Any, Optional, TypeAlias

 import torch
 import tqdm
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
        return dtype


-OutputLen_NumReqs_Map: TypeAlias = Dict[int, int]
-def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
+OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
+def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \
      -> OutputLen_NumReqs_Map:
    """
    Given the number of requests, batch_size, and the number of requests
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
    Args:
        batch_size (int): Number of requests submitted for profile. This is
            args.batch_size.
-        step_requests (List[int]): step_requests[i] is the number of requests
+        step_requests (list[int]): step_requests[i] is the number of requests
            that the ith engine step should process.

    Returns:
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
    return ol_nr


-def determine_requests_per_step(context: ProfileContext) -> List[int]:
+def determine_requests_per_step(context: ProfileContext) -> list[int]:
    """
    Determine number of requests each engine step should process.
    If context.num_steps is set, then all engine steps process the
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
        context: ProfileContext object.

    Returns:
-        List[int]: Number of requests to process for all engine-steps. 
+        list[int]: Number of requests to process for all engine-steps. 
         output[i], contains the number of requests that the ith step
         should process.
    """
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
    for key, value in asdict(context).items():
        print(f"  {key} = {value}")

-    requests_per_step: List[int] = determine_requests_per_step(context)
+    requests_per_step: list[int] = determine_requests_per_step(context)

    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
        context.batch_size, requests_per_step)

--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -4,7 +4,6 @@ import argparse
 import dataclasses
 import os
 import time
-from typing import List

 import numpy as np
 import torch_xla.debug.profiler as xp
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_prompts: list[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]


--- a/examples/offline_inference/reproduciblity.py
+++ b/examples/offline_inference/reproduciblity.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+from vllm import LLM, SamplingParams
+
+# vLLM does not guarantee the reproducibility of the results by default,
+# for the sake of performance. You need to do the following to achieve
+# reproducible results:
+# 1. Turn off multiprocessing to make the scheduling deterministic.
+#    NOTE(woosuk): This is not needed and will be ignored for V0.
+os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+# 2. Fix the global seed for reproducibility. The default seed is None, which is
+# not reproducible.
+SEED = 42
+
+# NOTE(woosuk): Even with the above two settings, vLLM only provides
+# reproducibility when it runs on the same hardware and the same vLLM version.
+# Also, the online serving API (`vllm serve`) does not support reproducibility
+# because it is almost impossible to make the scheduling deterministic in the
+# online serving setting.
+
+llm = LLM(model="facebook/opt-125m", seed=SEED)
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -18,72 +18,11 @@ import ray
 import torch
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from rlhf_utils import stateless_init_process_group
 from transformers import AutoModelForCausalLM

 from vllm import LLM, SamplingParams
 from vllm.utils import get_ip, get_open_port
-from vllm.worker.worker import Worker
-
-
-def stateless_init_process_group(master_address, master_port, rank, world_size,
-                                 device):
-    """
-    vLLM provides `StatelessProcessGroup` to create a process group
-    without considering the global process group in torch.distributed.
-    It is recommended to create `StatelessProcessGroup`, and then initialize
-    the data-plane communication (NCCL) between external (train processes) 
-    and vLLM workers.
-    """
-    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
-    from vllm.distributed.utils import StatelessProcessGroup
-    pg = StatelessProcessGroup.create(host=master_address,
-                                      port=master_port,
-                                      rank=rank,
-                                      world_size=world_size)
-    pynccl = PyNcclCommunicator(pg, device=device)
-    return pynccl
-
-
-class MyWorker(Worker):
-    """
-    The `MyWorker` class inherits from `Worker` to provide custom functions.
-    For simplicity, we define the `MyWorker` class in this self-contained 
-    script. Normally, we should define the `MyWorker` class in a separate 
-    file and pass the qualified name of the class to the `worker_cls` 
-    parameter.
-    """
-
-    def init_weight_update_group(self, master_address, master_port,
-                                 rank_offset, world_size):
-        from vllm.distributed.parallel_state import get_world_group
-        rank = get_world_group().rank + rank_offset
-        self.model_update_group = stateless_init_process_group(
-            master_address,
-            master_port,
-            rank,
-            world_size,
-            self.device,
-        )
-
-    def update_weight(self, name, dtype, shape):
-        weight = torch.empty(shape, dtype=dtype, device="cuda")
-        self.model_update_group.broadcast(weight,
-                                          src=0,
-                                          stream=torch.cuda.current_stream())
-
-        self.model_runner.model.load_weights(weights=[(name, weight)])
-
-        del weight
-
-    def check_weights_changed(self):
-        """
-        Check if the weights are updated to 0.
-        """
-        weights_updated = True
-        for name, p in self.model_runner.model.named_parameters():
-            weights_updated = weights_updated and torch.allclose(
-                p, torch.zeros_like(p))
-        return weights_updated


 class MyLLM(LLM):
@@ -129,7 +68,7 @@ llm = ray.remote(
 )(MyLLM).remote(
    model="facebook/opt-125m",
    enforce_eager=True,
-    worker_cls=MyWorker,
+    worker_extension_cls="rlhf_utils.WorkerExtension",
    tensor_parallel_size=2,
    distributed_executor_backend="ray",
 )
@@ -159,6 +98,7 @@ master_port = get_open_port()

 handle = llm.collective_rpc.remote("init_weight_update_group",
                                   args=(master_address, master_port, 1, 3))
+
 model_update_group = stateless_init_process_group(master_address, master_port,
                                                  0, 3, torch.device("cuda:0"))
 ray.get(handle)

--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -17,40 +17,6 @@ from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

 from vllm import LLM
-from vllm.worker.worker import Worker
-
-
-class MyWorker(Worker):
-
-    def report_device_id(self) -> str:
-        from vllm.platforms import current_platform
-        self.device_uuid = current_platform.get_device_uuid(self.device.index)
-        return self.device_uuid
-
-    def update_weights_from_ipc_handles(self, ipc_handles):
-        handles = ipc_handles[self.device_uuid]
-        device_id = self.device.index
-        weights = []
-        for name, handle in handles.items():
-            func, args = handle
-            list_args = list(args)
-            # the key is to change device id to the current device id
-            # in case two processes have different CUDA_VISIBLE_DEVICES
-            list_args[6] = device_id
-            tensor = func(*list_args)
-            weights.append((name, tensor))
-        self.model_runner.model.load_weights(weights=weights)
-        torch.cuda.synchronize()
-
-    def check_weights_changed(self):
-        """
-        Check if the weights are updated to 0.
-        """
-        weights_updated = True
-        for name, p in self.model_runner.model.named_parameters():
-            weights_updated = weights_updated and torch.allclose(
-                p, torch.zeros_like(p))
-        return weights_updated


 class MyLLM(LLM):
@@ -150,7 +116,7 @@ for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
    )(MyLLM).remote(
        model="facebook/opt-125m",
        enforce_eager=True,
-        worker_cls=MyWorker,
+        worker_extension_cls="rlhf_utils.ColocateWorkerExtension",
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        gpu_memory_utilization=0.4,

--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def stateless_init_process_group(master_address, master_port, rank, world_size,
+                                 device):
+    """
+    vLLM provides `StatelessProcessGroup` to create a process group
+    without considering the global process group in torch.distributed.
+    It is recommended to create `StatelessProcessGroup`, and then initialize
+    the data-plane communication (NCCL) between external (train processes) 
+    and vLLM workers.
+    """
+    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+    from vllm.distributed.utils import StatelessProcessGroup
+    pg = StatelessProcessGroup.create(host=master_address,
+                                      port=master_port,
+                                      rank=rank,
+                                      world_size=world_size)
+    pynccl = PyNcclCommunicator(pg, device=device)
+    return pynccl
+
+
+class WorkerExtension:
+    """
+    The class for vLLM's worker to inherit from.
+    By defining an extension class, the code can work no matter what is
+    the underlying worker class. This way, the code can be compatible
+    with both vLLM V0 and V1.
+    NOTE: we define this class in a separate module, and the main module
+    should pass the full qualified name as `worker_extension_cls` argument.
+    """
+
+    def init_weight_update_group(self, master_address, master_port,
+                                 rank_offset, world_size):
+        from vllm.distributed.parallel_state import get_world_group
+        rank = get_world_group().rank + rank_offset
+        self.model_update_group = stateless_init_process_group(
+            master_address,
+            master_port,
+            rank,
+            world_size,
+            self.device,
+        )
+
+    def update_weight(self, name, dtype, shape):
+        weight = torch.empty(shape, dtype=dtype, device="cuda")
+        self.model_update_group.broadcast(weight,
+                                          src=0,
+                                          stream=torch.cuda.current_stream())
+
+        self.model_runner.model.load_weights(weights=[(name, weight)])
+
+        del weight
+
+    def check_weights_changed(self):
+        """
+        Check if the weights are updated to 0.
+        """
+        weights_updated = True
+        for name, p in self.model_runner.model.named_parameters():
+            weights_updated = weights_updated and torch.allclose(
+                p, torch.zeros_like(p))
+        return weights_updated
+
+
+class ColocateWorkerExtension:
+    """
+    The class for vLLM's worker to inherit from, in the colocate setting.
+    By defining an extension class, the code can work no matter what is
+    the underlying worker class. This way, the code can be compatible
+    with both vLLM V0 and V1.
+    NOTE: we define this class in a separate module, and the main module
+    should pass the full qualified name as `worker_extension_cls` argument.
+    """
+
+    def report_device_id(self) -> str:
+        from vllm.platforms import current_platform
+        self.device_uuid = current_platform.get_device_uuid(self.device.index)
+        return self.device_uuid
+
+    def update_weights_from_ipc_handles(self, ipc_handles):
+        handles = ipc_handles[self.device_uuid]
+        device_id = self.device.index
+        weights = []
+        for name, handle in handles.items():
+            func, args = handle
+            list_args = list(args)
+            # the key is to change device id to the current device id
+            # in case two processes have different CUDA_VISIBLE_DEVICES
+            list_args[6] = device_id
+            tensor = func(*list_args)
+            weights.append((name, tensor))
+        self.model_runner.model.load_weights(weights=weights)
+        torch.cuda.synchronize()
+
+    def check_weights_changed(self):
+        """
+        Check if the weights are updated to 0.
+        """
+        weights_updated = True
+        for name, p in self.model_runner.model.named_parameters():
+            weights_updated = weights_updated and torch.allclose(
+                p, torch.zeros_like(p))
+        return weights_updated
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -21,7 +21,9 @@ sampling_params = SamplingParams(temperature=0.7,

 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
 # In real workloads, `enforace_eager` should be `False`.
-llm = LLM(model="google/gemma-2b", enforce_eager=True)
+llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
+          max_num_batched_tokens=64,
+          max_num_seqs=4)
 outputs = llm.generate(prompts, sampling_params)
 for output, answer in zip(outputs, answers):
    prompt = output.prompt

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -6,122 +6,219 @@ the correct prompt format on vision language models for text generation.
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+import os
 import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional

+from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
+from vllm.lora.request import LoRARequest
 from vllm.utils import FlexibleArgumentParser

+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: list[str]
+    stop_token_ids: Optional[list[int]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
+
+
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
 # lower-end GPUs.
 # Unless specified, these settings have been tested to work on a single L4.


 # Aria
-def run_aria(question: str, modality: str):
+def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=2,
-              dtype="bfloat16",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype="bfloat16",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

-    prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
-              "<|im_end|>\n<|im_start|>assistant\n")
+    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
+                "<|im_end|>\n<|im_start|>assistant\n")
+               for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # BLIP-2
-def run_blip2(question: str, modality: str):
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
-    prompt = f"Question: {question} Answer:"
-    llm = LLM(model="Salesforce/blip2-opt-2.7b",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [f"Question: {question} Answer:" for question in questions]
+    engine_args = EngineArgs(
+        model="Salesforce/blip2-opt-2.7b",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Chameleon
-def run_chameleon(question: str, modality: str):
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    prompt = f"{question}<image>"
-    llm = LLM(model="facebook/chameleon-7b",
-              max_model_len=4096,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [f"{question}<image>" for question in questions]
+    engine_args = EngineArgs(
+        model="facebook/chameleon-7b",
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Deepseek-VL2
-def run_deepseek_vl2(question: str, modality: str):
+def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
-              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+    )
+
+    prompts = [
+        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Florence2
+def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    engine_args = EngineArgs(
+        model="microsoft/Florence-2-large",
+        tokenizer="facebook/bart-large",
+        max_num_seqs=8,
+        trust_remote_code=True,
+        dtype="bfloat16",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

-    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Fuyu
-def run_fuyu(question: str, modality: str):
+def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    prompts = [f"{question}\n" for question in questions]
+    engine_args = EngineArgs(
+        model="adept/fuyu-8b",
+        max_model_len=2048,
+        max_num_seqs=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Gemma 3
+def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
+    model_name = "google/gemma-3-4b-it"

-    prompt = f"{question}\n"
-    llm = LLM(model="adept/fuyu-8b",
-              max_model_len=2048,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=2048,
+        max_num_seqs=2,
+        mm_processor_kwargs={"do_pan_and_scan": True},
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    prompts = [("<bos><start_of_turn>user\n"
+                f"<start_of_image>{question}<end_of_turn>\n"
+                "<start_of_turn>model\n") for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # GLM-4v
-def run_glm4v(question: str, modality: str):
+def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

-    llm = LLM(model=model_name,
-              max_model_len=2048,
-              max_num_seqs=2,
-              trust_remote_code=True,
-              enforce_eager=True,
-              hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=2048,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        enforce_eager=True,
+        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

-    prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
-        {question}<|assistant|>"
+    prompts = [
+        f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
+        {question}<|assistant|>" for question in questions
+    ]

    stop_token_ids = [151329, 151336, 151338]
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # H2OVL-Mississippi
-def run_h2ovl(question: str, modality: str):
+def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
@@ -130,23 +227,31 @@ def run_h2ovl(question: str, modality: str):

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
-    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    messages = [[{
+        'role': 'user',
+        'content': f"<image>\n{question}"
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            tokenize=False,
+                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # Idefics3-8B-Llama3
-def run_idefics3(question: str, modality: str):
+def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
@@ -160,20 +265,23 @@ def run_idefics3(question: str, modality: str):
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
-    prompt = (
+    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
+    ) for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
    )
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids


 # InternVL
-def run_internvl(question: str, modality: str):
+def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
@@ -182,10 +290,13 @@ def run_internvl(question: str, modality: str):

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
-    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    messages = [[{
+        'role': 'user',
+        'content': f"<image>\n{question}"
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            tokenize=False,
+                                            add_generation_prompt=True)

    # Stop tokens for InternVL
    # models variants may have different stop tokens
@@ -193,84 +304,127 @@ def run_internvl(question: str, modality: str):
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # LLaVA-1.5
-def run_llava(question: str, modality: str):
+def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+    prompts = [
+        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
+    ]
+
+    engine_args = EngineArgs(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_model_len=4096,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

-    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
-              max_model_len=4096,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question: str, modality: str):
+def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
-              max_model_len=8192,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
+    engine_args = EngineArgs(
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        max_model_len=8192,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question: str, modality: str):
+def run_llava_next_video(questions: list[str],
+                         modality: str) -> ModelRequestData:
    assert modality == "video"

-    prompt = f"USER: <video>\n{question} ASSISTANT:"
-    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
-              max_model_len=8192,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [
+        f"USER: <video>\n{question} ASSISTANT:" for question in questions
+    ]
+    engine_args = EngineArgs(
+        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
+        max_model_len=8192,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # LLaVA-OneVision
-def run_llava_onevision(question: str, modality: str):
+def run_llava_onevision(questions: list[str],
+                        modality: str) -> ModelRequestData:

    if modality == "video":
-        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
-        <|im_start|>assistant\n"
+        prompts = [
+            f"<|im_start|>user <video>\n{question}<|im_end|> \
+        <|im_start|>assistant\n" for question in questions
+        ]

    elif modality == "image":
-        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
-        <|im_start|>assistant\n"
+        prompts = [
+            f"<|im_start|>user <image>\n{question}<|im_end|> \
+        <|im_start|>assistant\n" for question in questions
+        ]
+
+    engine_args = EngineArgs(
+        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+        max_model_len=16384,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

-    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
-              max_model_len=16384,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Mantis
-def run_mantis(question: str, modality: str):
+def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
-    prompt = llama3_template.format(f"{question}\n<image>")
+    prompts = [
+        llama3_template.format(f"{question}\n<image>")
+        for question in questions
+    ]

-    llm = LLM(
+    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    stop_token_ids = [128009]
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


 # MiniCPM-V
-def run_minicpmv_base(question: str, modality: str, model_name):
+def run_minicpmv_base(questions: list[str], modality: str, model_name):
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

@@ -294,7 +448,7 @@ def run_minicpmv_base(question: str, modality: str, model_name):
    # model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
@@ -317,26 +471,33 @@ def run_minicpmv_base(question: str, modality: str, model_name):
        "video": "(<video>./</video>)",
    }

-    messages = [{
-        'role': 'user',
-        'content': f'{modality_placeholder[modality]}\n{question}'
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
-    return llm, prompt, stop_token_ids
+    prompts = [
+        tokenizer.apply_chat_template(
+            [{
+                'role': 'user',
+                'content': f"{modality_placeholder[modality]}\n{question}"
+            }],
+            tokenize=False,
+            add_generation_prompt=True) for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )


-def run_minicpmo(question: str, modality: str):
-    return run_minicpmv_base(question, modality, "openbmb/MiniCPM-o-2_6")
+def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
+    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")


-def run_minicpmv(question: str, modality: str):
-    return run_minicpmv_base(question, modality, "openbmb/MiniCPM-V-2_6")
+def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
+    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")


 # LLama 3.2
-def run_mllama(question: str, modality: str):
+def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@@ -346,7 +507,7 @@ def run_mllama(question: str, modality: str):
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
@@ -354,49 +515,58 @@ def run_mllama(question: str, modality: str):
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [{
+    messages = [[{
        "role":
        "user",
        "content": [{
            "type": "image"
        }, {
            "type": "text",
-            "text": f"{question}"
+            "text": question
        }]
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           add_generation_prompt=True,
-                                           tokenize=False)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            add_generation_prompt=True,
+                                            tokenize=False)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Molmo
-def run_molmo(question, modality):
+def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

-    prompt = question
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [
+        f"<|im_start|>user <image>\n{question}<|im_end|> \
+        <|im_start|>assistant\n" for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # NVLM-D
-def run_nvlm_d(question: str, modality: str):
+def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
@@ -406,43 +576,60 @@ def run_nvlm_d(question: str, modality: str):

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
-    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    messages = [[{
+        'role': 'user',
+        'content': f"<image>\n{question}"
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            tokenize=False,
+                                            add_generation_prompt=True)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # PaliGemma
-def run_paligemma(question: str, modality: str):
+def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
-    prompt = "caption en"
-    llm = LLM(model="google/paligemma-3b-mix-224",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = ["caption en" for _ in questions]
+    engine_args = EngineArgs(
+        model="google/paligemma-3b-mix-224",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # PaliGemma 2
-def run_paligemma2(question: str, modality: str):
+def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
-    prompt = "caption en"
-    llm = LLM(model="google/paligemma2-3b-ft-docci-448",
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = ["caption en" for _ in questions]
+    engine_args = EngineArgs(
+        model="google/paligemma2-3b-ft-docci-448",
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Phi-3-Vision
-def run_phi3v(question: str, modality: str):
+def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
+    prompts = [
+        f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
+        for question in questions
+    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
@@ -456,7 +643,7 @@ def run_phi3v(question: str, modality: str):
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
-    llm = LLM(
+    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
@@ -465,34 +652,71 @@ def run_phi3v(question: str, modality: str):
        mm_processor_kwargs={"num_crops": 16},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Phi-4-multimodal-instruct
+def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process image inputs.
+    """
+    assert modality == "image"
+    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    vision_lora_path = os.path.join(model_path, "vision-lora")
+    prompts = [
+        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
+        for question in questions
+    ]
+    engine_args = EngineArgs(
+        model=model_path,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_lora_rank=320,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
+    )


 # Pixtral HF-format
-def run_pixtral_hf(question: str, modality: str):
+def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "mistral-community/pixtral-12b"

    # NOTE: Need L40 (or equivalent) to avoid OOM
-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

-    prompt = f"<s>[INST]{question}\n[IMG][/INST]"
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Qwen
-def run_qwen_vl(question: str, modality: str):
+def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    llm = LLM(
+    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
@@ -501,17 +725,20 @@ def run_qwen_vl(question: str, modality: str):
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

-    prompt = f"{question}Picture 1: <img></img>\n"
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Qwen2-VL
-def run_qwen2_vl(question: str, modality: str):
+def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
@@ -528,20 +755,25 @@ def run_qwen2_vl(question: str, modality: str):
    elif modality == "video":
        placeholder = "<|video_pad|>"

-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-              f"{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [
+        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+         f"{question}<|im_end|>\n"
+         "<|im_start|>assistant\n") for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 # Qwen2.5-VL
-def run_qwen2_5_vl(question: str, modality: str):
+def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

-    llm = LLM(
+    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
@@ -558,12 +790,17 @@ def run_qwen2_5_vl(question: str, modality: str):
    elif modality == "video":
        placeholder = "<|video_pad|>"

-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-              f"{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    prompts = [
+        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+         f"{question}<|im_end|>\n"
+         "<|im_start|>assistant\n") for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )


 model_example_map = {
@@ -571,7 +808,9 @@ model_example_map = {
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
+    "florence2": run_florence2,
    "fuyu": run_fuyu,
+    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
@@ -589,6 +828,7 @@ model_example_map = {
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
+    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
@@ -607,29 +847,35 @@ def get_multi_modal_input(args):
        # Input image and question
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
-        img_question = "What is the content of this image?"
+        img_questions = [
+            "What is the content of this image?",
+            "Describe the content of this image in detail.",
+            "What's in the image?",
+            "Where is this image taken?",
+        ]

        return {
            "data": image,
-            "question": img_question,
+            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
-        vid_question = "Why is this video funny?"
+        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
-            "question": vid_question,
+            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


-def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
+def apply_image_repeat(image_repeat_prob, num_prompts, data,
+                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob". 
    Used to simulate hit/miss for the MM preprocessor cache.
    """
@@ -649,7 +895,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
                cur_image.putpixel((0, 0), new_val)

        inputs.append({
-            "prompt": prompt,
+            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
@@ -666,41 +912,55 @@ def main(args):
    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
-    question = mm_input["question"]
+    questions = mm_input["questions"]

-    llm, prompt, stop_token_ids = model_example_map[model](question, modality)
+    req_data = model_example_map[model](questions, modality)
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)
+
+    # Don't want to check the flag multiple times, so just hijack `prompts`.
+    prompts = req_data.prompts if args.use_different_prompt_per_request else [
+        req_data.prompts[0]
+    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
-                                     stop_token_ids=stop_token_ids)
+                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
-            "prompt": prompt,
+            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
-
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with specified probability of "image_repeat_prob"
            inputs = apply_image_repeat(args.image_repeat_prob,
-                                        args.num_prompts, data, prompt,
+                                        args.num_prompts, data, prompts,
                                        modality)
        else:
            # Use the same image for all prompts
            inputs = [{
-                "prompt": prompt,
+                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {
                    modality: data
                },
-            } for _ in range(args.num_prompts)]
+            } for i in range(args.num_prompts)]

    if args.time_generate:
        import time
@@ -740,6 +1000,10 @@ if __name__ == "__main__":
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")

    parser.add_argument(
        '--image-repeat-prob',
@@ -758,5 +1022,11 @@ if __name__ == "__main__":
        action='store_true',
        help='If True, then print the total generate() call time')

+    parser.add_argument(
+        '--use-different-prompt-per-request',
+        action='store_true',
+        help='If True, then use different prompt (with the same multi-modal '
+        'data) for each request.')
+
    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 from argparse import Namespace
+from dataclasses import asdict
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args

 from PIL.Image import Image

-from vllm import LLM
+from vllm import LLM, EngineArgs
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser

@@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]


 class ModelRequestData(NamedTuple):
-    llm: LLM
+    engine_args: EngineArgs
    prompt: str
    image: Optional[Image]


-def run_e5_v(query: Query):
+def run_e5_v(query: Query) -> ModelRequestData:
    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

    if query["modality"] == "text":
@@ -58,20 +59,20 @@ def run_e5_v(query: Query):
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

-    llm = LLM(
+    engine_args = EngineArgs(
        model="royokong/e5-v",
        task="embed",
        max_model_len=4096,
    )

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )


-def run_vlm2vec(query: Query):
+def run_vlm2vec(query: Query) -> ModelRequestData:
    if query["modality"] == "text":
        text = query["text"]
        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
@@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

-    llm = LLM(
+    engine_args = EngineArgs(
        model="TIGER-Lab/VLM2Vec-Full",
        task="embed",
        trust_remote_code=True,
@@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
    )

    return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
    raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality):
+def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
    query = get_query(modality)
    req_data = model_example_map[model](query)

+    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    llm = LLM(**engine_args)
+
    mm_data = {}
    if req_data.image is not None:
        mm_data["image"] = req_data.image

-    outputs = req_data.llm.embed({
+    outputs = llm.embed({
        "prompt": req_data.prompt,
        "multi_modal_data": mm_data,
    })
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):


 def main(args: Namespace):
-    run_encode(args.model_name, args.modality)
+    run_encode(args.model_name, args.modality, args.seed)


 model_example_map = {
@@ -167,5 +171,10 @@ if __name__ == "__main__":
                        default="image",
                        choices=get_args(QueryModality),
                        help='Modality of the input.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+
    args = parser.parse_args()
    main(args)