Merge tag 'v0.6.5' into v0.6.5-dev

4d3a2c28 · zhuwenwen · 92ec5d8e · 2d1b9baa · 4d3a2c28 · 4d3a2c28
Commit 4d3a2c28 authored Dec 30, 2024 by zhuwenwen
20 changed files
--- a/examples/logging_configuration.md
+++ b/examples/logging_configuration.md
@@ -118,7 +118,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
 {
  "formatters": {
    "vllm": {
-      "class": "vllm.logging.NewLineFormatter",
+      "class": "vllm.logging_utils.NewLineFormatter",
      "datefmt": "%m-%d %H:%M:%S",
      "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
    }

--- a/examples/multilora_inference.py
+++ b/examples/multilora_inference.py
@@ -43,15 +43,6 @@ def create_test_prompts(
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora", 1, lora_path)),
-        (
-            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
-            SamplingParams(n=3,
-                           best_of=3,
-                           use_beam_search=True,
-                           temperature=0,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora", 1, lora_path)),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
            SamplingParams(temperature=0.0,
@@ -60,15 +51,6 @@ def create_test_prompts(
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora2", 2, lora_path)),
-        (
-            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
-            SamplingParams(n=3,
-                           best_of=3,
-                           use_beam_search=True,
-                           temperature=0,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora", 1, lora_path)),
    ]

--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -12,7 +12,7 @@ if __name__ == '__main__':
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)
    # Create an LLM.
-    llm = LLM(model="facebook/opt-125m",tensor_parallel_size=1, distributed_executor_backend="ray", dtype="float16",trust_remote_code=True, enforce_eager=True)
+    llm = LLM(model="facebook/opt-125m",tensor_parallel_size=1, dtype="float16",trust_remote_code=True, enforce_eager=True)
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
@@ -20,4 +20,4 @@ if __name__ == '__main__':
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
\ No newline at end of file
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -25,19 +25,15 @@ def run_ultravox(question: str, audio_count: int):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [{
-        'role':
+        'role': 'user',
-        'user',
+        'content': "<|audio|>\n" * audio_count + question
-        'content':
-        "<|reserved_special_token_0|>\n" * audio_count + question
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    llm = LLM(model=model_name,
-              enforce_eager=True,
+              trust_remote_code=True,
-              enable_chunked_prefill=False,
-              max_model_len=8192,
              limit_mm_per_prompt={"audio": audio_count})
    stop_token_ids = None
    return llm, prompt, stop_token_ids
@@ -91,10 +87,9 @@ def main(args):
                for asset in audio_assets[:audio_count]
            ]
        }
    assert args.num_prompts > 0
    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts

--- a/examples/offline_inference_classification.py
+++ b/examples/offline_inference_classification.py
+from vllm import LLM
+# Prompts to run through the classifier.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Build the engine. Classification models are loaded with task="classify".
+model = LLM(
+    model="jason9693/Qwen2.5-1.5B-apeach",
+    task="classify",
+    enforce_eager=True,
+)
+# Classify all prompts in a single call.
+# The output is a list of ClassificationRequestOutputs.
+outputs = model.classify(prompts)
+# Report the class probabilities per prompt, showing at most 16 values.
+for prompt, output in zip(prompts, outputs):
+    probs = output.outputs.probs
+    if len(probs) > 16:
+        # Cut the closing bracket off the truncated repr and add an ellipsis.
+        probs_trimmed = str(probs[:16])[:-1] + ", ...]"
+    else:
+        probs_trimmed = probs
+    print(f"Prompt: {prompt!r} | "
+          f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
--- a/examples/offline_inference_cli.py
+++ b/examples/offline_inference_cli.py
+from dataclasses import asdict
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+def get_prompts(num_prompts: int):
+    """Return exactly ``num_prompts`` sample prompts.
+
+    The four built-in prompts are repeated in order and then cut to length,
+    so a request for more than four wraps around to the first prompt again.
+    """
+    base_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    if num_prompts == len(base_prompts):
+        return base_prompts
+    # One repetition more than strictly needed guarantees the slice is full.
+    repeats = num_prompts // len(base_prompts) + 1
+    return (base_prompts * repeats)[:num_prompts]
+def main(args):
+    """Generate completions for the sample prompts and print them.
+
+    Args:
+        args: Parsed command-line namespace. ``num_prompts``, ``n``,
+            ``temperature``, ``top_p``, ``top_k`` and ``max_tokens`` are read
+            here; the namespace is also handed to
+            ``EngineArgs.from_cli_args`` to configure the engine.
+    """
+    prompts = get_prompts(args.num_prompts)
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        top_k=args.top_k,
+        max_tokens=args.max_tokens,
+    )
+    # Every EngineArgs field is forwarded to the LLM constructor as a keyword.
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**asdict(engine_args))
+    # Only the first generated candidate of each result is printed.
+    for output in llm.generate(prompts, sampling_params):
+        print(f"Prompt: {output.prompt!r}, "
+              f"Generated text: {output.outputs[0].text!r}")
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    group = parser.add_argument_group("SamplingParams options")
+    # (flag, type, default, help) for each option, registered in this order.
+    sampling_options = [
+        ("--num-prompts", int, 4, "Number of prompts used for inference"),
+        ("--max-tokens", int, 16, "Generated output length for sampling"),
+        ("--n", int, 1, "Number of generated sequences per prompt"),
+        ("--temperature", float, 0.8, "Temperature for text generation"),
+        ("--top-p", float, 0.95, "top_p for text generation"),
+        ("--top-k", int, -1, "top_k for text generation"),
+    ]
+    for flag, value_type, default_value, help_text in sampling_options:
+        group.add_argument(flag,
+                           type=value_type,
+                           default=default_value,
+                           help=help_text)
+    args = parser.parse_args()
+    main(args)
--- a/examples/offline_inference_embedding.py
+++ b/examples/offline_inference_embedding.py
@@ -9,9 +9,20 @@ prompts = [
 ]
 # Create an LLM.
-model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)
+# You should pass task="embed" for embedding models
+model = LLM(
+    model="intfloat/e5-mistral-7b-instruct",
+    task="embed",
+    enforce_eager=True,
+)
 # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.encode(prompts)
+outputs = model.embed(prompts)
 # Print the outputs.
-for output in outputs:
+for prompt, output in zip(prompts, outputs):
-    print(output.outputs.embedding)  # list of 4096 floats
+    embeds = output.outputs.embedding
+    embeds_trimmed = ((str(embeds[:16])[:-1] +
+                       ", ...]") if len(embeds) > 16 else embeds)
+    print(f"Prompt: {prompt!r} | "
+          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
--- a/examples/offline_inference_mlpspeculator.py
+++ b/examples/offline_inference_mlpspeculator.py
@@ -50,8 +50,6 @@ if __name__ == "__main__":
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
        speculative_model="ibm-fms/llama-13b-accelerator",
-        # These are currently required for MLPSpeculator decoding
-        use_v2_block_manager=True,
    )
    print("With speculation")

--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference_openai.md
 # Offline Inference with the OpenAI Batch file format
- **NOTE:** This is a guide to performing batch inference using the OpenAI batch file format, **NOT** the complete Batch (REST) API.
+```{important}
+This is a guide to performing batch inference using the OpenAI batch file format, **not** the complete Batch (REST) API.
- ## File Format
+```
- The OpenAI batch file format consists of a series of json objects on new lines.
+## File Format
- [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
+The OpenAI batch file format consists of a series of json objects on new lines.
- Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
- **NOTE:** We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
+Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
- ## Pre-requisites
+```{note}
+We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
+```
-* Ensure you are using `vllm >= 0.4.3`. You can check by running `python -c "import vllm; print(vllm.__version__)"`.
+## Pre-requisites
 * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
  - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
  - Install the token on your machine (Run `huggingface-cli login`).
  - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
- ## Example 1: Running with a local file
+## Example 1: Running with a local file
- ### Step 1: Create your batch file
+### Step 1: Create your batch file
- To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
+To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
- ```
+```
- wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
- ```
+```
- Once you've created your batch file it should look like this
+Once you've created your batch file it should look like this
- ```
+```
- $ cat openai_example_batch.jsonl
+$ cat openai_example_batch.jsonl
-{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
- ```
+```
- ### Step 2: Run the batch
+### Step 2: Run the batch
 The batch running tool is designed to be used from the command line.
@@ -85,18 +88,18 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 ### Step 1: Upload your input script
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
- ```
+```
- wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
- ```
+```
- Once you've created your batch file it should look like this
+Once you've created your batch file it should look like this
- ```
+```
- $ cat openai_example_batch.jsonl
+$ cat openai_example_batch.jsonl
-{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
- ```
+```
 Now upload your batch file to your S3 bucket.
@@ -104,7 +107,6 @@ Now upload your batch file to your S3 bucket.
 aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 ### Step 2: Generate your presigned URLs
 Presigned URLs can only be generated via the SDK. You can run the following Python script to generate your presigned URLs. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
@@ -179,21 +181,19 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
 ### Step 1: Create your batch file
- Add embedding requests to your batch file. The following is an example:
+Add embedding requests to your batch file. The following is an example:
- ```
+```
- {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
+{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
 ```
- You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model).
+You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model).
- ### Step 2: Run the batch
+### Step 2: Run the batch
 You can run the batch using the same command as in earlier examples.
 ### Step 3: Check your results
 You can check your results by running `cat results.jsonl`
@@ -201,5 +201,5 @@ You can check your results by running `cat results.jsonl`
 ```
 $ cat results.jsonl
 {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
-...```
+...
 ```
--- a/examples/offline_inference_scoring.py
+++ b/examples/offline_inference_scoring.py
+from vllm import LLM
+# One text and the list of texts it is paired with for scoring.
+text_1 = "What is the capital of France?"
+texts_2 = [
+    "The capital of Brazil is Brasilia.",
+    "The capital of France is Paris.",
+]
+# Build the engine. Cross-encoder models are loaded with task="score".
+model = LLM(
+    model="BAAI/bge-reranker-v2-m3",
+    task="score",
+    enforce_eager=True,
+)
+# Score text_1 against every entry of texts_2.
+# The output is a list of ScoringRequestOutputs.
+outputs = model.score(text_1, texts_2)
+# Print each pair next to its score.
+for text_2, output in zip(texts_2, outputs):
+    pair = [text_1, text_2]
+    score = output.outputs.score
+    print(f"Pair: {pair!r} | Score: {score}")
--- a/examples/offline_inference_structured_outputs.py
+++ b/examples/offline_inference_structured_outputs.py
+from enum import Enum
+from pydantic import BaseModel
+from vllm import LLM, SamplingParams
+from vllm.sampling_params import GuidedDecodingParams
+# One engine instance is shared by all four guided-decoding examples below.
+llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
+# Guided decoding by Choice (list of possible options)
+guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
+sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
+outputs = llm.generate(
+    prompts="Classify this sentiment: vLLM is wonderful!",
+    sampling_params=sampling_params,
+)
+print(outputs[0].outputs[0].text)
+# Guided decoding by Regex
+# Raw string: "\w" and "\." are not valid Python string escapes, so a plain
+# literal triggers an invalid-escape warning. The pattern text is passed on
+# verbatim and "\n" is left for the regex syntax to read as a newline.
+guided_decoding_params = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
+sampling_params = SamplingParams(guided_decoding=guided_decoding_params,
+                                 stop=["\n"])
+# Adjacent literals are concatenated as-is, so each fragment carries its own
+# trailing space to keep the sentences from running together.
+prompt = ("Generate an email address for Alan Turing, who works in Enigma. "
+          "End in .com and new line. Example result: "
+          "alan.turing@enigma.com\n")
+outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
+print(outputs[0].outputs[0].text)
+# Guided decoding by JSON using Pydantic schema
+class CarType(str, Enum):
+    """Closed set of car body types allowed in the generated JSON."""
+    sedan = "sedan"
+    suv = "SUV"
+    truck = "Truck"
+    coupe = "Coupe"
+class CarDescription(BaseModel):
+    """Schema the generated JSON object must follow."""
+    brand: str
+    model: str
+    car_type: CarType
+json_schema = CarDescription.model_json_schema()
+guided_decoding_params = GuidedDecodingParams(json=json_schema)
+sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
+prompt = ("Generate a JSON with the brand, model and car_type of "
+          "the most iconic car from the 90's")
+outputs = llm.generate(
+    prompts=prompt,
+    sampling_params=sampling_params,
+)
+print(outputs[0].outputs[0].text)
+# Guided decoding by Grammar
+simplified_sql_grammar = """
+    ?start: select_statement
+    ?select_statement: "SELECT " column_list " FROM " table_name
+    ?column_list: column_name ("," column_name)*
+    ?table_name: identifier
+    ?column_name: identifier
+    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+"""
+guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar)
+sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
+prompt = ("Generate an SQL query to show the 'username' and 'email' "
+          "from the 'users' table.")
+outputs = llm.generate(
+    prompts=prompt,
+    sampling_params=sampling_params,
+)
+print(outputs[0].outputs[0].text)
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference with
-with the correct prompt format on vision language models.
+the correct prompt format on vision language models for text generation.
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+import random
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
@@ -12,123 +14,239 @@ from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
-# LLaVA-1.5
-def run_llava(question, modality):
+# Aria
+def run_aria(question: str, modality: str):
    assert modality == "image"
+    model_name = "rhymes-ai/Aria"
-    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+    llm = LLM(model=model_name,
+              tokenizer_mode="slow",
+              trust_remote_code=True,
+              dtype="bfloat16",
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
+    prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
+              "<|im_end|>\n<|im_start|>assistant\n")
+    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
+    return llm, prompt, stop_token_ids
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+# BLIP-2
+def run_blip2(question: str, modality: str):
+    assert modality == "image"
+    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
+    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
+    prompt = f"Question: {question} Answer:"
+    llm = LLM(model="Salesforce/blip2-opt-2.7b",
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# LLaVA-1.6/LLaVA-NeXT
+# Chameleon
-def run_llava_next(question, modality):
+def run_chameleon(question: str, modality: str):
    assert modality == "image"
-    prompt = f"[INST] <image>\n{question} [/INST]"
+    prompt = f"{question}<image>"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
+    llm = LLM(model="facebook/chameleon-7b",
+              max_model_len=4096,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# LlaVA-NeXT-Video
+# Fuyu
-# Currently only support for video input
+def run_fuyu(question: str, modality: str):
-def run_llava_next_video(question, modality):
+    assert modality == "image"
-    assert modality == "video"
-    prompt = f"USER: <video>\n{question} ASSISTANT:"
+    prompt = f"{question}\n"
-    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
+    llm = LLM(model="adept/fuyu-8b",
+              max_model_len=2048,
+              max_num_seqs=2,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# LLaVA-OneVision
+# GLM-4v
-def run_llava_onevision(question, modality):
+def run_glm4v(question: str, modality: str):
+    assert modality == "image"
+    model_name = "THUDM/glm-4v-9b"
-    if modality == "video":
+    llm = LLM(model=model_name,
-        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
+              max_model_len=2048,
-        <|im_start|>assistant\n"
+              max_num_seqs=2,
+              trust_remote_code=True,
+              enforce_eager=True,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
+    prompt = question
+    stop_token_ids = [151329, 151336, 151338]
+    return llm, prompt, stop_token_ids
-    elif modality == "image":
-        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
-        <|im_start|>assistant\n"
-    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+# H2OVL-Mississippi
-              max_model_len=32768)
+def run_h2ovl(question: str, modality: str):
-    stop_token_ids = None
+    assert modality == "image"
+    model_name = "h2oai/h2ovl-mississippi-2b"
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    # Stop tokens for H2OVL-Mississippi
+    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
+    stop_token_ids = [tokenizer.eos_token_id]
    return llm, prompt, stop_token_ids
-# Fuyu
+# Idefics3-8B-Llama3
-def run_fuyu(question, modality):
+def run_idefics3(question: str, modality: str):
    assert modality == "image"
+    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
-    prompt = f"{question}\n"
+    llm = LLM(
-    llm = LLM(model="adept/fuyu-8b")
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        # if you are running out of memory, you can reduce the "longest_edge".
+        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
+        mm_processor_kwargs={
+            "size": {
+                "longest_edge": 3 * 364
+            },
+        },
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
+    )
+    prompt = (
+        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
+    )
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# Phi-3-Vision
+# InternVL
-def run_phi3v(question, modality):
+def run_internvl(question: str, modality: str):
    assert modality == "image"
-    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
+    model_name = "OpenGVLab/InternVL2-2B"
-    # Note: The default setting of max_num_seqs (256) and
-    # max_model_len (128k) for this model may cause OOM.
-    # You may lower either to run this example on lower-end GPUs.
-    # In this example, we override max_num_seqs to 5 while
-    # keeping the original context length of 128k.
-    # num_crops is an override kwarg to the multimodal image processor;
-    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
-    # to use 16 for single frame scenarios, and 4 for multi-frame.
-    #
-    # Generally speaking, a larger value for num_crops results in more
-    # tokens per image instance, because it may scale the image more in
-    # the image preprocessing. Some references in the model docs and the
-    # formula for image tokens after the preprocessing
-    # transform can be found below.
-    #
-    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
-    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
-        model="microsoft/Phi-3-vision-128k-instruct",
+        model=model_name,
        trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=4096,
-        mm_processor_kwargs={"num_crops": 16},
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
    )
-    stop_token_ids = None
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    # Stop tokens for InternVL
+    # models variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids
-# PaliGemma
+# LLaVA-1.5
-def run_paligemma(question, modality):
+def run_llava(question: str, modality: str):
    assert modality == "image"
-    # PaliGemma has special prompt format for VQA
+    prompt = f"USER: <image>\n{question}\nASSISTANT:"
-    prompt = "caption en"
-    llm = LLM(model="google/paligemma-3b-mix-224")
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
+              max_model_len=4096,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# Chameleon
+# LLaVA-1.6/LLaVA-NeXT
-def run_chameleon(question, modality):
+def run_llava_next(question: str, modality: str):
    assert modality == "image"
-    prompt = f"{question}<image>"
+    prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="facebook/chameleon-7b")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
+              max_model_len=8192,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+# LlaVA-NeXT-Video
+# Currently only support for video input
+def run_llava_next_video(question: str, modality: str):
+    assert modality == "video"
+    prompt = f"USER: <video>\n{question} ASSISTANT:"
+    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
+              max_model_len=8192,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
+# LLaVA-OneVision
+def run_llava_onevision(question: str, modality: str):
+    """Build the engine, prompt and stop tokens for LLaVA-OneVision.
+
+    Args:
+        question: Text placed after the media placeholder in the user turn.
+        modality: ``"image"`` or ``"video"``; selects the ``<image>`` or
+            ``<video>`` placeholder.
+
+    Returns:
+        A ``(llm, prompt, stop_token_ids)`` tuple; ``stop_token_ids`` is
+        always ``None`` here.
+
+    Raises:
+        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
+            Without this check ``prompt`` would be unbound and the failure
+            would only surface after the model had been loaded.
+    """
+    if modality not in ("image", "video"):
+        raise ValueError(
+            f"Unsupported modality for LLaVA-OneVision: {modality!r}")
+    # The placeholder tag is the modality name, so one template serves both.
+    # NOTE(review): the backslash continuation keeps the next line's leading
+    # spaces inside the prompt; preserved as-is, confirm it is intended.
+    prompt = f"<|im_start|>user <{modality}>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+    # `args` is a module-level name defined outside this function.
+    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+              max_model_len=16384,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+# Mantis
+def run_mantis(question: str, modality: str):
+    """Build the engine, prompt and stop tokens for Mantis (image only).
+
+    Returns a ``(llm, prompt, stop_token_ids)`` tuple.
+    """
+    assert modality == "image"
+    # User turn (question, then the image placeholder) wrapped in the
+    # Llama-3 style header tokens, ending on an open assistant header.
+    user_turn = f"{question}\n<image>"
+    prompt = ("<|start_header_id|>user<|end_header_id|>\n\n"
+              f"{user_turn}"
+              "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")
+    engine_kwargs = {
+        "model": "TIGER-Lab/Mantis-8B-siglip-llama3",
+        "max_model_len": 4096,
+        "hf_overrides": {"architectures": ["MantisForConditionalGeneration"]},
+        # `args` is a module-level name defined outside this function.
+        "mm_cache_preprocessor": args.mm_cache_preprocessor,
+    }
+    llm = LLM(**engine_kwargs)
+    return llm, prompt, [128009]
 # MiniCPM-V
-def run_minicpmv(question, modality):
+def run_minicpmv(question: str, modality: str):
    assert modality == "image"
    # 2.0
@@ -145,7 +263,10 @@ def run_minicpmv(question, modality):
                                              trust_remote_code=True)
    llm = LLM(
        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
        trust_remote_code=True,
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
    )
    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
@@ -168,16 +289,61 @@ def run_minicpmv(question, modality):
    return llm, prompt, stop_token_ids
-# InternVL
+# LLama 3.2
-def run_internvl(question, modality):
+def run_mllama(question: str, modality: str):
    assert modality == "image"
-    model_name = "OpenGVLab/InternVL2-2B"
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    # Note: The default setting of max_num_seqs (256) and
+    # max_model_len (131072) for this model may cause OOM.
+    # You may lower either to run this example on lower-end GPUs.
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    llm = LLM(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=16,
+        enforce_eager=True,
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
+    )
+    prompt = f"<|image|><|begin_of_text|>{question}"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+# Molmo
+def run_molmo(question, modality):
+    assert modality == "image"
+    model_name = "allenai/Molmo-7B-D-0924"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
-        max_num_seqs=5,
+        dtype="bfloat16",
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
+    )
+    prompt = question
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+# NVLM-D
+def run_nvlm_d(question: str, modality: str):
+    assert modality == "image"
+    model_name = "nvidia/NVLM-D-72B"
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        tensor_parallel_size=4,
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -186,81 +352,121 @@ def run_internvl(question, modality):
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
-    # Stop tokens for InternVL
-    # models variants may have different stop tokens
+# PaliGemma
-    # please refer to the model card for the correct "stop words":
+def run_paligemma(question: str, modality: str):
-    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    assert modality == "image"
-    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    # PaliGemma has special prompt format for VQA
+    prompt = "caption en"
+    llm = LLM(model="google/paligemma-3b-mix-224",
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
+    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# BLIP-2
+# PaliGemma 2
-def run_blip2(question, modality):
+def run_paligemma2(question: str, modality: str):
    assert modality == "image"
-    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
+    # PaliGemma 2 has special prompt format for VQA
-    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
+    prompt = "caption en"
-    prompt = f"Question: {question} Answer:"
+    llm = LLM(model="google/paligemma2-3b-ft-docci-448",
-    llm = LLM(model="Salesforce/blip2-opt-2.7b")
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# Qwen
+# Phi-3-Vision
-def run_qwen_vl(question, modality):
+def run_phi3v(question: str, modality: str):
    assert modality == "image"
+    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance, because it may scale the image more in
+    # the image preprocessing. Some references in the model docs and the
+    # formula for image tokens after the preprocessing
+    # transform can be found below.
+    #
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
-        model="Qwen/Qwen-VL",
+        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=4096,
+        max_num_seqs=2,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"num_crops": 16},
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
    )
-    prompt = f"{question}Picture 1: <img></img>\n"
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# Qwen2-VL
+# Pixtral HF-format
-def run_qwen2_vl(question, modality):
+def run_pixtral_hf(question: str, modality: str):
    assert modality == "image"
-    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+    model_name = "mistral-community/pixtral-12b"
    llm = LLM(
        model=model_name,
-        max_num_seqs=5,
+        max_model_len=8192,
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
    )
-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+    prompt = f"<s>[INST]{question}\n[IMG][/INST]"
-              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
-              f"{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
    stop_token_ids = None
    return llm, prompt, stop_token_ids
-# LLama
+# Qwen
-def run_mllama(question, modality):
+def run_qwen_vl(question: str, modality: str):
    assert modality == "image"
-    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    llm = LLM(
+        model="Qwen/Qwen-VL",
+        trust_remote_code=True,
+        max_model_len=1024,
+        max_num_seqs=2,
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
+    )
+    prompt = f"{question}Picture 1: <img></img>\n"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
-    # Note: The default setting of max_num_seqs (256) and
-    # max_model_len (131072) for this model may cause OOM.
-    # You may lower either to run this example on lower-end GPUs.
-    # The configuration below has been confirmed to launch on a
+# Qwen2-VL
-    # single H100 GPU.
+def run_qwen2_vl(question: str, modality: str):
+    assert modality == "image"
+    model_name = "Qwen/Qwen2-VL-7B-Instruct"
    llm = LLM(
        model=model_name,
-        max_num_seqs=16,
+        max_model_len=4096,
-        enforce_eager=True,
+        max_num_seqs=5,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+        },
+        mm_cache_preprocessor=args.mm_cache_preprocessor,
    )
-    prompt = f"<|image|><|begin_of_text|>{question}"
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
    stop_token_ids = None
    return llm, prompt, stop_token_ids
@@ -281,21 +487,29 @@ def run_glm4v(question: str, modality: str):
 model_example_map = {
+    "aria": run_aria,
+    "blip-2": run_blip2,
+    "chameleon": run_chameleon,
+    "fuyu": run_fuyu,
+    "glm4v": run_glm4v,
+    "h2ovl_chat": run_h2ovl,
+    "idefics3": run_idefics3,
+    "internvl_chat": run_internvl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
-    "fuyu": run_fuyu,
+    "mantis": run_mantis,
-    "phi3_v": run_phi3v,
-    "paligemma": run_paligemma,
-    "chameleon": run_chameleon,
    "minicpmv": run_minicpmv,
-    "blip-2": run_blip2,
+    "mllama": run_mllama,
-    "internvl_chat": run_internvl,
+    "molmo": run_molmo,
+    "NVLM_D": run_nvlm_d,
+    "paligemma": run_paligemma,
+    "paligemma2": run_paligemma2,
+    "phi3_v": run_phi3v,
+    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
-    "mllama": run_mllama,
-    "glm4v": run_glm4v,
 }
@@ -332,6 +546,35 @@ def get_multi_modal_input(args):
    raise ValueError(msg)
+def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
+    """Build ``num_prompts`` inputs, repeating the previous image at random.
+
+    Used to simulate hit/miss for the MM preprocessor cache: with probability
+    ``image_repeat_prob`` an input reuses the previous image object; otherwise
+    it gets a copy whose pixel (0, 0) encodes the prompt index.
+
+    Args:
+        image_repeat_prob: Probability in [0, 1] of repeating the image.
+        num_prompts: Number of inputs to generate.
+        data: Source image; must provide ``copy()`` and ``putpixel()``.
+        prompt: Prompt text shared by all inputs.
+        modality: Key under which the image is stored in "multi_modal_data".
+
+    Returns:
+        A list of ``{"prompt": ..., "multi_modal_data": {modality: image}}``.
+
+    Raises:
+        ValueError: If ``image_repeat_prob`` is ``None`` or outside [0, 1].
+    """
+    if image_repeat_prob is None or not 0 <= image_repeat_prob <= 1.0:
+        raise ValueError(
+            f"image_repeat_prob must be in [0, 1], got {image_repeat_prob!r}")
+    no_yes = [0, 1]
+    probs = [1.0 - image_repeat_prob, image_repeat_prob]
+    inputs = []
+    cur_image = data
+    for i in range(num_prompts):
+        res = random.choices(no_yes, probs)[0]
+        if res == 0:
+            # No repeat => Modify one pixel
+            cur_image = cur_image.copy()
+            # Spread the index over three channels, keeping each one within
+            # the 8-bit range so distinct indices give distinct pixels.
+            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
+            cur_image.putpixel((0, 0), new_val)
+        inputs.append({
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: cur_image
+            }
+        })
+    return inputs
 def main(args):
    model = args.model_type
    if model not in model_example_map:
@@ -362,14 +605,29 @@ def main(args):
    else:
        # Batch inference
-        inputs = [{
+        if args.image_repeat_prob is not None:
-            "prompt": prompt,
+            # Repeat images with specified probability of "image_repeat_prob"
-            "multi_modal_data": {
+            inputs = apply_image_repeat(args.image_repeat_prob,
-                modality: data
+                                        args.num_prompts, data, prompt,
-            },
+                                        modality)
-        } for _ in range(args.num_prompts)]
+        else:
+            # Use the same image for all prompts
+            inputs = [{
+                "prompt": prompt,
+                "multi_modal_data": {
+                    modality: data
+                },
+            } for _ in range(args.num_prompts)]
+    if args.time_generate:
+        import time
+        start_time = time.time()
+        outputs = llm.generate(inputs, sampling_params=sampling_params)
+        elapsed_time = time.time() - start_time
+        print("-- generate time = {}".format(elapsed_time))
-    outputs = llm.generate(inputs, sampling_params=sampling_params)
+    else:
+        outputs = llm.generate(inputs, sampling_params=sampling_params)
    for o in outputs:
        generated_text = o.outputs[0].text
@@ -379,7 +637,7 @@ def main(args):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
-        'vision language models')
+        'vision language models for text generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
@@ -399,5 +657,23 @@ if __name__ == "__main__":
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
+    parser.add_argument(
+        '--image-repeat-prob',
+        type=float,
+        default=None,
+        help='Simulates the hit-ratio for multi-modal preprocessor cache'
+        ' (if enabled)')
+    parser.add_argument(
+        '--mm-cache-preprocessor',
+        action='store_true',
+        help='If True, enable caching of multi-modal preprocessor/mapper.')
+    parser.add_argument(
+        '--time-generate',
+        action='store_true',
+        help='If True, then print the total generate() call time')
    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference_vision_language_embedding.py
+++ b/examples/offline_inference_vision_language_embedding.py
+"""
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for multimodal embedding.
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+from argparse import Namespace
+from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from PIL.Image import Image
+from vllm import LLM
+from vllm.multimodal.utils import fetch_image
+from vllm.utils import FlexibleArgumentParser
+class TextQuery(TypedDict):
+    """Query that carries only a text string."""
+    modality: Literal["text"]
+    text: str
+class ImageQuery(TypedDict):
+    """Query that carries only an image."""
+    modality: Literal["image"]
+    image: Image
+class TextImageQuery(TypedDict):
+    """Query that carries both a text string and an image."""
+    modality: Literal["text+image"]
+    text: str
+    image: Image
+QueryModality = Literal["text", "image", "text+image"]
+Query = Union[TextQuery, ImageQuery, TextImageQuery]
+class ModelRequestData(NamedTuple):
+    """Engine, prompt and optional image produced by a model runner."""
+    llm: LLM
+    prompt: str
+    # None when the query has no image (text-only queries).
+    image: Optional[Image]
+def run_e5_v(query: Query):
+    """Build the e5-v embedding request for a text or image query.
+
+    Raises ``ValueError`` for any other modality.
+    """
+    kind = query["modality"]
+    if kind not in ("text", "image"):
+        raise ValueError(f"Unsupported query modality: '{kind}'")
+    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+    if kind == "text":
+        body = f"{query['text']}\nSummary above sentence in one word: "
+        image = None
+    else:
+        body = "<image>\nSummary above image in one word: "
+        image = query["image"]
+    prompt = llama3_template.format(body)
+    llm = LLM(model="royokong/e5-v", task="embed", max_model_len=4096)
+    return ModelRequestData(llm=llm, prompt=prompt, image=image)
+def run_vlm2vec(query: Query):
+    """Build the VLM2Vec embedding request for a text/image/text+image query.
+
+    Raises ``ValueError`` for any other modality.
+    """
+    kind = query["modality"]
+    if kind not in ("text", "image", "text+image"):
+        raise ValueError(f"Unsupported query modality: '{kind}'")
+    if kind == "text":
+        prompt = f"Find me an everyday image that matches the given caption: {query['text']}"  # noqa: E501
+        image = None
+    elif kind == "image":
+        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        image = query["image"]
+    else:
+        prompt = f"<|image_1|> Represent the given image with the following question: {query['text']}"  # noqa: E501
+        image = query["image"]
+    engine_kwargs = dict(
+        model="TIGER-Lab/VLM2Vec-Full",
+        task="embed",
+        trust_remote_code=True,
+        mm_processor_kwargs={"num_crops": 4},
+    )
+    llm = LLM(**engine_kwargs)
+    return ModelRequestData(llm=llm, prompt=prompt, image=image)
+def get_query(modality: QueryModality):
+    """Return the sample query for ``modality``.
+
+    Image-bearing queries fetch their image from a fixed URL. Raises
+    ``ValueError`` for an unsupported modality.
+    """
+    if modality == "text":
+        query = TextQuery(modality="text", text="A dog sitting in the grass")
+    elif modality == "image":
+        dog_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
+        query = ImageQuery(modality="image", image=fetch_image(dog_url))
+    elif modality == "text+image":
+        cat_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
+        query = TextImageQuery(
+            modality="text+image",
+            text="A cat standing in the snow.",
+            image=fetch_image(cat_url),
+        )
+    else:
+        msg = f"Modality {modality} is not supported."
+        raise ValueError(msg)
+    return query
+def run_encode(model: str, modality: QueryModality):
+    """Embed the sample query for ``modality`` with ``model`` and print it."""
+    query = get_query(modality)
+    build_request = model_example_map[model]
+    request = build_request(query)
+    # Only attach image data when the runner produced an image.
+    if request.image is None:
+        mm_data = {}
+    else:
+        mm_data = {"image": request.image}
+    payload = {"prompt": request.prompt, "multi_modal_data": mm_data}
+    for result in request.llm.embed(payload):
+        print(result.outputs.embedding)
+def main(args: Namespace):
+    """Run the embedding demo with the parsed command-line arguments."""
+    run_encode(args.model_name, args.modality)
+# Maps each `--model-name` choice to the function that builds its request.
+model_example_map = {
+    "e5_v": run_e5_v,
+    "vlm2vec": run_vlm2vec,
+}
+if __name__ == "__main__":
+    # Command-line entry point: choose the embedding model and the input
+    # modality, then run a single encode request.
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for multimodal embedding')
+    parser.add_argument('--model-name',
+                        '-m',
+                        type=str,
+                        default="vlm2vec",
+                        choices=model_example_map.keys(),
+                        help='The name of the embedding model.')
+    # Valid modalities are taken from the QueryModality literal type.
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        choices=get_args(QueryModality),
+                        help='Modality of the input.')
+    args = parser.parse_args()
+    main(args)
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
 """
 This example shows how to use vLLM for running offline inference with
-multi-image input on vision language models, using the chat template defined
+multi-image input on vision language models for text generation,
-by the model.
+using the chat template defined by the model.
 """
 from argparse import Namespace
 from typing import List, NamedTuple, Optional
@@ -28,41 +28,184 @@ class ModelRequestData(NamedTuple):
    chat_template: Optional[str]
-def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
-    model_name = "Qwen/Qwen-VL-Chat"
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+def load_aria(question, image_urls: List[str]) -> ModelRequestData:
+    """Build the multi-image request for Aria.
+
+    One image placeholder is emitted per URL, followed by the question.
+    """
+    num_images = len(image_urls)
+    llm = LLM(
+        model="rhymes-ai/Aria",
+        tokenizer_mode="slow",
+        trust_remote_code=True,
+        dtype="bfloat16",
+        limit_mm_per_prompt={"image": num_images},
+    )
+    image_tags = "".join("<fim_prefix><|img|><fim_suffix>\n"
+                         for _ in range(num_images))
+    prompt = "".join([
+        f"<|im_start|>user\n{image_tags}{question}<|im_end|>\n",
+        "<|im_start|>assistant\n",
+    ])
+    images = [fetch_image(image_url) for image_url in image_urls]
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
+        image_data=images,
+        chat_template=None)
+def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
+    model_name = "h2oai/h2ovl-mississippi-2b"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
    )
-    placeholders = "".join(f"Picture {i}: <img></img>\n"
-                           for i, _ in enumerate(image_urls, start=1))
-    # This model does not have a chat_template attribute on its tokenizer,
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
-    # so we need to explicitly pass it. We use ChatML since it's used in the
+                             for i, _ in enumerate(image_urls, start=1))
-    # generation utils of the model:
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
-    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
-    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
+    # Stop tokens for H2OVL-Mississippi
-    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501
+    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
+    stop_token_ids = [tokenizer.eos_token_id]
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
+    """Build the multi-image request for Idefics3.
+
+    Each image gets a numbered "Image-N" placeholder ahead of the question.
+    """
+    num_images = len(image_urls)
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    # If you are running out of memory, you can reduce the "longest_edge".
+    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
+    processor_kwargs = {"size": {"longest_edge": 2 * 364}}
+    llm = LLM(
+        model="HuggingFaceM4/Idefics3-8B-Llama3",
+        max_model_len=8192,
+        max_num_seqs=16,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": num_images},
+        mm_processor_kwargs=processor_kwargs,
+    )
+    image_tags = [f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)]
+    placeholders = "\n".join(image_tags)
+    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    images = [fetch_image(image_url) for image_url in image_urls]
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=images,
+        chat_template=None,
+    )
+def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
+    model_name = "OpenGVLab/InternVL2-2B"
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
+    )
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
-                                           add_generation_prompt=True,
+                                           add_generation_prompt=True)
-                                           chat_template=chat_template)
-    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
+    # Stop tokens for InternVL
+    # models variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=chat_template,
+        chat_template=None,
+    )
+def load_mllama(question: str, image_urls: List[str]) -> ModelRequestData:
+    """Build the multi-image request for Llama 3.2 Vision.
+
+    Args:
+        question: The user question appended after the image placeholders.
+        image_urls: URLs of the images to fetch and attach to the prompt.
+
+    Returns:
+        The engine, prompt and fetched images; no stop tokens or chat
+        template are set.
+    """
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    llm = LLM(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=16,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+    # Emit one placeholder per image so the prompt always matches the number
+    # of images in `image_data` (previously hard-coded to two).
+    placeholders = "<|image|>" * len(image_urls)
+    prompt = f"{placeholders}<|begin_of_text|>{question}"
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+def load_nvlm_d(question: str, image_urls: List[str]):
+    """Build the multi-image request for NVLM-D.
+
+    The prompt is rendered with the chat template of the model's own
+    tokenizer; each image gets a numbered "Image-N" placeholder ahead of the
+    question. No stop tokens are set.
+    """
+    model_name = "nvidia/NVLM-D-72B"
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        tensor_parallel_size=4,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
+    )
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    # tokenize=False returns the rendered prompt as text rather than token ids.
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    stop_token_ids = None
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
    )
@@ -83,6 +226,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
+        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )
@@ -100,40 +244,42 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
    )
-def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
-    model_name = "OpenGVLab/InternVL2-2B"
+    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=1024,
-        max_model_len=4096,
+        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
+    placeholders = "".join(f"Picture {i}: <img></img>\n"
+                           for i, _ in enumerate(image_urls, start=1))
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
+    # This model does not have a chat_template attribute on its tokenizer,
-                             for i, _ in enumerate(image_urls, start=1))
+    # so we need to explicitly pass it. We use ChatML since it's used in the
-    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    # generation utils of the model:
+    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
+    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
+    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
-                                           add_generation_prompt=True)
+                                           add_generation_prompt=True,
+                                           chat_template=chat_template)
-    # Stop tokens for InternVL
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
-    # models variants may have different stop tokens
-    # please refer to the model card for the correct "stop words":
-    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
-    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None,
+        chat_template=chat_template,
    )
@@ -148,10 +294,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+    # Tested on L40
    llm = LLM(
        model=model_name,
-        max_num_seqs=5,
        max_model_len=32768 if process_vision_info is None else 4096,
+        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
@@ -194,10 +341,15 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
 model_example_map = {
-    "phi3_v": load_phi3v,
+    "aria": load_aria,
+    "h2ovl_chat": load_h2onvl,
+    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
-    "qwen2_vl": load_qwen2_vl,
+    "mllama": load_mllama,
+    "NVLM_D": load_nvlm_d,
+    "phi3_v": load_phi3v,
    "qwen_vl_chat": load_qwenvl_chat,
+    "qwen2_vl": load_qwen2_vl,
 }
@@ -269,7 +421,8 @@ def main(args: Namespace):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
-        'vision language models that support multi-image input')
+        'vision language models that support multi-image input for text '
+        'generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,

--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
-from time import time
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+# NOTE: This is just a running example. For benchmarking purpose,
+# please see benchmarks/benchmark_prefix_caching.py
 # Common prefix.
 prefix = (
@@ -27,19 +29,14 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)
-# Create an LLM.
+# Create an LLM without prefix caching as a baseline.
 regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
 print("Results without `enable_prefix_caching`")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-start_time_regular = time()
 outputs = regular_llm.generate(generating_prompts, sampling_params)
-duration_regular = time() - start_time_regular
 regular_generated_texts = []
 # Print the outputs.
@@ -51,13 +48,20 @@ for output in outputs:
 print("-" * 80)
+# Destroy the LLM object and free up the GPU memory.
+del regular_llm
+cleanup_dist_env_and_memory()
+# Create an LLM with prefix caching enabled.
+prefix_cached_llm = LLM(model="facebook/opt-125m",
+                        enable_prefix_caching=True,
+                        gpu_memory_utilization=0.4)
 # Warmup so that the shared prompt's KV cache is computed.
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 # Generate with prefix caching.
-start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-duration_cached = time() - start_time_cached
 print("Results with `enable_prefix_caching`")
@@ -77,6 +81,3 @@ generated_same = all([
    for i in range(len(prompts))
 ])
 print(f"Generated answers are the same: {generated_same}")
-speedup = round(duration_regular / duration_cached, 2)
-print(f"Speed up of cached generation compared to the regular is: {speedup}")
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
 import os
+import time
 from vllm import LLM, SamplingParams
@@ -15,19 +16,25 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-# Create an LLM.
+if __name__ == "__main__":
-llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
-llm.start_profile()
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    llm.start_profile()
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-llm.stop_profile()
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
+    llm.stop_profile()
-for output in outputs:
-    prompt = output.prompt
+    # Print the outputs.
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Add a buffer to wait for profiler in the background process
+    # (in case MP is on) to finish writing profiling output.
+    time.sleep(10)
--- a/examples/offline_profile.py
+++ b/examples/offline_profile.py
+import inspect
+import json
+import os
+import sys
+from argparse import RawTextHelpFormatter
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, Generator, List, Optional, TypeAlias
+import torch
+import tqdm
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.profiler import layerwise_profile
+from vllm.utils import FlexibleArgumentParser
+BATCH_SIZE_DEFAULT = 1
+PROMPT_LEN_DEFAULT = 256
+@dataclass
+class ProfileContext:
+    """Settings for a single profiling run, consumed by ``run_profile``.
+
+    ``determine_requests_per_step`` consults ``num_steps`` first and
+    otherwise requires ``complete_num_requests_per_step`` to be positive.
+    """
+    # Engine arguments; expanded with asdict() to construct the LLM.
+    engine_args: EngineArgs
+    # Number of random token ids generated for each prompt.
+    prompt_len: int
+    # Number of requests submitted together for the profiled batch.
+    batch_size: int
+    # The profiler can run in 2 modes,
+    # 1. Run profiler for user specified num_steps
+    num_steps: Optional[int] = None
+    # 2. Run profiler until all requests complete
+    complete_num_requests_per_step: Optional[int] = None
+    # Folder that receives chrome traces; no traces are saved when None.
+    save_chrome_traces_folder: Optional[str] = None
+def get_dtype(dtype: str):
+    """Translate the string "torch.float" into ``torch.float``.
+
+    Any other value is handed back unchanged.
+    """
+    return torch.float if dtype == "torch.float" else dtype
+# Maps an output length to the number of requests given that length.
+OutputLen_NumReqs_Map: TypeAlias = Dict[int, int]
+def compute_request_output_lengths(
+        batch_size: int,
+        step_requests: List[int]) -> OutputLen_NumReqs_Map:
+    """
+    Derive per-request output lengths from a per-step request schedule.
+
+    A request with output length L takes part in engine steps 1..L, so the
+    schedule is satisfied by walking it backwards and giving every request
+    that is still unassigned at a step the output length of that step.
+
+    Example:
+    batch_size = 128 and step_requests = [128, 128, 96, 64, 32, 1] yields
+    {6: 1, 5: 31, 4: 32, 3: 32, 2: 32}, i.e. one request of output length 6,
+    31 requests of output length 5, and 32 requests each of output lengths
+    4, 3 and 2.
+
+    Args:
+        batch_size (int): Number of requests submitted for profile.
+        step_requests (List[int]): step_requests[i] is the number of requests
+            that the ith engine step should process.
+
+    Returns:
+        OutputLen_NumReqs_Map: output-length -> number of requests that
+            must have that output-length.
+    """
+    total_steps: int = len(step_requests)
+    # The prefill step (first entry) has to cover the whole batch.
+    assert step_requests[0] == batch_size
+    ol_nr: OutputLen_NumReqs_Map = {}
+    assigned: int = 0
+    # Visit steps last-to-first; the 0-based step index k corresponds to
+    # an output length of k + 1.
+    for step_idx in range(total_steps - 1, -1, -1):
+        if assigned == batch_size:
+            break
+        assert assigned < batch_size
+        # Requests already assigned run through this step as well, so only
+        # the remainder ends exactly here.
+        newly_assigned = step_requests[step_idx] - assigned
+        assert newly_assigned >= 0
+        if newly_assigned > 0:
+            ol_nr[step_idx + 1] = newly_assigned
+            assigned += newly_assigned
+    # Every request must have received an output length.
+    assert sum(ol_nr.values()) == batch_size, \
+            ("Number of requests in output-length assignment does not match "
+             f"batch-size.\n batch size {batch_size} - "
+             f"step requests {step_requests} - assignments {ol_nr}")
+    # All requests take part in prefill, hence lengths lie in [1, num-steps].
+    assert all(1 <= length <= total_steps for length in ol_nr), \
+            ("Output lengths of requests should be in range "
+             f"[1, num-engine-steps].\n batch size {batch_size} - "
+             f"step requests {step_requests} - assignments {ol_nr}")
+    return ol_nr
+def determine_requests_per_step(context: ProfileContext) -> List[int]:
+    """
+    Determine number of requests each engine step should process.
+
+    If context.num_steps is set, then all engine steps process the
+    same number of requests and the output list is of length
+    context.num_steps.
+    If context.complete_num_requests_per_step is set, then each decode step
+    processes fewer and fewer requests until there are no requests to process.
+    In this case, the output list is as big as the number of steps
+    required to process all requests.
+
+    Args:
+        context: ProfileContext object.
+
+    Returns:
+        List[int]: Number of requests to process for all engine-steps.
+         output[i], contains the number of requests that the ith step
+         should process.
+
+    Raises:
+        AssertionError: if num_steps is unset (or 0) and
+            complete_num_requests_per_step is not a positive number.
+    """
+    if context.num_steps:
+        # All requests must run until num_engine_steps. This implies
+        # that their output lengths must be equal to num_engine_steps.
+        return [context.batch_size] * context.num_steps
+    # The two message fragments need a separating space; without it the
+    # text reads "argument.Instead got".
+    assert context.complete_num_requests_per_step and \
+                context.complete_num_requests_per_step > 0, \
+        ("Expected a positive complete_num_requests_per_step argument. "
+         f"Instead got {context.complete_num_requests_per_step}")
+    # We start dropping after the first decode step.
+    step_requests = [
+        context.batch_size,  # prefill
+        context.batch_size,  # decode
+    ]
+    num_running_requests = context.batch_size
+    num_running_requests -= context.complete_num_requests_per_step
+    while num_running_requests > 0:
+        step_requests.append(num_running_requests)
+        num_running_requests -= context.complete_num_requests_per_step
+    if step_requests[-1] != 1:
+        # have 1 request running at the last step. This is often
+        # useful
+        step_requests.append(1)
+    return step_requests
+def run_profile(context: ProfileContext, csv_output: Optional[str],
+                json_output: Optional[str]):
+    """Profile the prefill step and the following decode steps of a model.
+
+    Builds an LLM from ``context.engine_args``, submits
+    ``context.batch_size`` requests with random prompts of
+    ``context.prompt_len`` tokens, does a warm-up (one prefill step, one
+    decode step, then abort), and finally runs every engine step under
+    ``layerwise_profile``. Result tables are printed and optionally
+    exported as csv files, a json file and chrome traces.
+
+    Args:
+        context: Profiling settings.
+        csv_output: Root filename for the csv exports; skipped when falsy.
+        json_output: Filename for the json export; skipped when falsy.
+
+    The process exits with status -1 when the batch exceeds
+    max_num_batched_tokens or max_num_seqs, or when prompt plus output
+    length exceeds max_model_len.
+    """
+    print("Run profile with:")
+    for key, value in asdict(context).items():
+        print(f"  {key} = {value}")
+    requests_per_step: List[int] = determine_requests_per_step(context)
+    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
+        context.batch_size, requests_per_step)
+    num_steps_to_profile: int = len(requests_per_step)
+    max_output_len: int = max(ol_nr.keys())
+    assert max_output_len >= 1
+    # Create sampling params
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        # max_tokens is set on a per-request basis.
+        max_tokens=None,
+        ignore_eos=True)
+    # Create LLM
+    llm = LLM(**asdict(context.engine_args))
+    batch_size = context.batch_size
+    prompt_len = context.prompt_len
+    scheduler_config = llm.llm_engine.scheduler_config
+    max_model_len = llm.llm_engine.model_config.max_model_len
+    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
+    max_num_seqs = scheduler_config.max_num_seqs
+    if batch_size * prompt_len > max_num_batched_tokens:
+        print(f"ERROR: chosen batch_size * prompt_len "
+              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
+              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
+              f"and therefore cannot be run in a single profile step, please "
+              f"choose a smaller batch size or prompt length, or increase "
+              f"--max-num-batched-tokens")
+        sys.exit(-1)
+    if batch_size > max_num_seqs:
+        print(
+            f"ERROR: chosen batch_size ({batch_size}) is larger than "
+            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
+            f"single profile step, please choose a smaller batch size")
+        sys.exit(-1)
+    print("llm.llm_engine.model_config.max_model_len: ", max_model_len)
+    if prompt_len + max_output_len > max_model_len:
+        print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
+              f"{max_output_len} = {prompt_len + max_output_len}) is larger "
+              f"than the model's max_model_len ({max_model_len}), please "
+              f"choose a smaller prompt_len or max_output_len, or increase "
+              f"--max-model-len")
+        sys.exit(-1)
+    def add_requests():
+        # Submit batch_size requests named seq0..seqN-1, each with a random
+        # prompt and the output length dictated by ol_nr.
+        def get_output_len_generator() -> Generator[int, Any, Any]:
+            for output_len, num_reqs in ol_nr.items():
+                for _ in range(num_reqs):
+                    yield output_len
+        output_len_generator = get_output_len_generator()
+        for i in range(batch_size):
+            sampling_params.max_tokens = next(output_len_generator)
+            assert isinstance(sampling_params.max_tokens, int)
+            prompt_token_ids = torch.randint(
+                llm.llm_engine.model_config.get_vocab_size(),
+                size=(prompt_len, )).tolist()
+            llm.llm_engine.add_request(
+                request_id=f"seq{i}",
+                prompt={'prompt_token_ids': prompt_token_ids},
+                params=sampling_params)
+    def abort_requests():
+        # Abort every request submitted by add_requests().
+        for i in range(batch_size):
+            llm.llm_engine.abort_request(f"seq{i}")
+    # Warm up run
+    print("Warm up run ...")
+    add_requests()
+    llm.llm_engine.step()  # Prefill
+    llm.llm_engine.step()  # Decode
+    abort_requests()
+    print("Profile run ...")
+    add_requests()
+    with layerwise_profile() as prefill_prof:
+        llm.llm_engine.step()  # First step is prefill
+    decode_profs = []
+    for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
+        num_running_seqs = llm.llm_engine.scheduler[
+            0].get_num_unfinished_seq_groups()
+        with layerwise_profile(
+                num_running_seqs=num_running_seqs) as decode_prof:
+            llm.llm_engine.step()
+        decode_profs.append(decode_prof)
+    decode_results_list = [prof.results for prof in decode_profs]
+    prefill_results = prefill_prof.results
+    has_decode = len(decode_results_list) > 0
+    LINE_WIDTH = 80
+    print("=" * LINE_WIDTH)
+    print(f"= Prefill Model Table "
+          f"(prompt_len={prompt_len}, batch_size={batch_size})")
+    print("=" * LINE_WIDTH)
+    print()
+    prefill_results.print_model_table()
+    if has_decode:
+        print()
+        print("=" * LINE_WIDTH)
+        print(f"= First Decode Step Model Table "
+              f"(prompt_len={prompt_len}, batch_size={batch_size})")
+        print("=" * LINE_WIDTH)
+        print()
+        decode_results_list[0].print_model_table()
+    print()
+    print("=" * LINE_WIDTH)
+    print(f"= Prefill Summary Table "
+          f"(prompt_len={prompt_len}, batch_size={batch_size})")
+    print("=" * LINE_WIDTH)
+    print()
+    prefill_results.print_summary_table()
+    if has_decode:
+        print()
+        print("=" * LINE_WIDTH)
+        print(f"= First Decode Step Summary Table "
+              f"(prompt_len={prompt_len}, batch_size={batch_size})")
+        print("=" * LINE_WIDTH)
+        print()
+        decode_results_list[0].print_summary_table()
+    if csv_output:
+        # Accept the root filename with or without a trailing ".csv".
+        csv_filename_base = csv_output.removesuffix('.csv')
+        prefill_results.export_model_stats_table_csv(
+            csv_filename_base + "_prefill_model_table.csv")
+        prefill_results.export_summary_stats_table_csv(
+            csv_filename_base + "_prefill_summary_table.csv")
+        if has_decode:
+            # Only the first decode step is exported to csv.
+            decode_results_list[0].export_model_stats_table_csv(
+                csv_filename_base + "_decode_model_table.csv")
+            decode_results_list[0].export_summary_stats_table_csv(
+                csv_filename_base + "_decode_summary_table.csv")
+    if json_output:
+        cuda_devices = [
+            torch.cuda.get_device_properties(dev_idx)
+            for dev_idx in range(torch.cuda.device_count())
+        ]
+        json_dict = {
+            "context": {
+                "python_version": f"{sys.version}",
+                "torch_version": f"{torch.__version__}",
+                "torch_cuda_version": f"{torch.version.cuda}",
+                "cuda_devices": f"{cuda_devices}",
+                **asdict(context)
+            },
+            "prefill": prefill_results.convert_stats_to_dict(),
+        }
+        if has_decode:
+            for idx, dr in enumerate(decode_results_list):
+                json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
+        # Add .json to json_output filename if it doesn't exist already.
+        json_output_file = json_output if json_output.endswith(
+            '.json') else json_output + '.json'
+        with open(json_output_file, "w+") as f:
+            json.dump(json_dict, f, indent=2)
+    if context.save_chrome_traces_folder is not None:
+        os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
+        prefill_prof.profiler.export_chrome_trace(
+            context.save_chrome_traces_folder + "/prefill.json")
+        for idx, decode_prof in enumerate(decode_profs):
+            decode_prof.profiler.export_chrome_trace(
+                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json")
+        print("Traces saved as prefill.json and decode_1.json, etc."
+              f" in folder {context.save_chrome_traces_folder}")
+# Command-line entry point: parse profiling and engine options, build a
+# ProfileContext from the matching arguments and run the profile.
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="""
+Profile a model
+    example:
+    ```
+    python examples/offline_profile.py \\
+        --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
+        --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
+        --enforce-eager run_num_steps -n 2
+    ```
+    then you can use various tools to analyze the json output
+    terminal ascii tables:
+        ```
+        python tools/profiler/print_layerwise_table.py \\
+            --json-trace Llama31-8b-FP8.json --phase prefill --table summary
+        ```
+    or create matplotlib stacked bar charts:
+        ```
+        python tools/profiler/visualize_layerwise_profile.py \\
+            --json-trace Llama31-8b-FP8.json \\
+            --output-directory profile_breakdown --plot-metric pct_cuda_time
+        ```
+""",
+                                    formatter_class=RawTextHelpFormatter)
+    parser.add_argument(
+        "--csv",
+        type=str,
+        default=None,
+        help="Export the results as multiple csv file. This should be the root "
+        "filename, will create <filename>_prefill_model_table.csv, "
+        "<filename>_prefill_summary_table.csv, "
+        "<filename>_decode_model_table.csv, and "
+        "<filename>_decode_summary_table.csv")
+    parser.add_argument(
+        "--json",
+        type=str,
+        default=None,
+        help="Export the results as a json file. This should be the filename")
+    parser.add_argument("--save-chrome-traces-folder",
+                        type=str,
+                        help="Save chrome traces for the prefill and decode "
+                        "will save traces as prefill.json and decode_1.json, "
+                        "etc. inside this folder")
+    parser.add_argument(
+        "--prompt-len",
+        type=int,
+        default=PROMPT_LEN_DEFAULT,
+        help=f"Length of the random prompt to use when profiling, all batched "
+        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
+    parser.add_argument("--batch-size",
+                        type=int,
+                        default=BATCH_SIZE_DEFAULT,
+                        help=f"Number of requests to run as a single batch, "
+                        f"default={BATCH_SIZE_DEFAULT}")
+    # The sub-command selects the profiling mode; each mode has its own -n.
+    subparsers = parser.add_subparsers(dest="cmd")
+    run_num_steps_parser = subparsers.add_parser(
+        "run_num_steps",
+        help="This variation profiles n engine.step() invocations.")
+    run_num_steps_parser.add_argument(
+        '-n',
+        '--num-steps',
+        type=int,
+        help="Number of engine steps to profile.\n"
+        "Setting it to 1, profiles only the prefill step.\n"
+        "Setting it to 2, profiles the prefill and first decode step\n"
+        "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
+        "and so on ...")
+    # Adjacent string literals are joined verbatim, so each fragment below
+    # ends with an explicit space.
+    run_to_completion_parser = subparsers.add_parser(
+        "run_to_completion",
+        help="This variation profiles all the engine.step() invocations "
+        "until the engine exhausts all submitted requests.")
+    run_to_completion_parser.add_argument(
+        '-n',
+        '--complete-num-requests-per-step',
+        type=int,
+        help=
+        "Complete complete_num_requests_per_step requests every decode step. "
+        "For e.g., with batch_size 128 and complete_num_requests_per_step 32, "
+        "the profiler is run for 6 engine steps, with the steps processing, "
+        "128, 128, 96, 64, 32, 1 requests respectively.\n"
+        "Note that we tack-on a one-request step at the end as it is often "
+        "useful.")
+    EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    # Forward only those parsed arguments that ProfileContext accepts.
+    context = ProfileContext(
+        engine_args=EngineArgs.from_cli_args(args),
+        **{
+            k: v
+            for k, v in vars(args).items()
+            if k in inspect.signature(ProfileContext).parameters
+        })
+    run_profile(context, csv_output=args.csv, json_output=args.json)
--- a/examples/openai_audio_api_client.py
+++ b/examples/openai_audio_api_client.py
-"""An example showing how to use vLLM to serve VLMs.
-Launch the vLLM server with the following command:
-vllm serve fixie-ai/ultravox-v0_3
-"""
-import base64
-import requests
-from openai import OpenAI
-from vllm.assets.audio import AudioAsset
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-models = client.models.list()
-model = models.data[0].id
-# Any format supported by librosa is supported
-audio_url = AudioAsset("winning_call").url
-# Use audio url in the payload
-chat_completion_from_url = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this audio?"
-            },
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-result = chat_completion_from_url.choices[0].message.content
-print(f"Chat completion output:{result}")
-# Use base64 encoded audio in the payload
-def encode_audio_base64_from_url(audio_url: str) -> str:
-    """Encode an audio retrieved from a remote url to base64 format."""
-    with requests.get(audio_url) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode('utf-8')
-    return result
-audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
-chat_completion_from_base64 = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this audio?"
-            },
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    # Any format supported by librosa is supported
-                    "url": f"data:audio/ogg;base64,{audio_base64}"
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-result = chat_completion_from_base64.choices[0].message.content
-print(f"Chat completion output:{result}")
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
+"""An example showing how to use vLLM to serve multimodal models 
+and run online inference with OpenAI client.
+Launch the vLLM server with the following command:
+(single image inference with Llava)
+vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
+(multi-image inference with Phi-3.5-vision-instruct)
+vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
+(audio inference with Ultravox)
+vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
+"""
+import base64
+import requests
+from openai import OpenAI
+from vllm.assets.audio import AudioAsset
+from vllm.utils import FlexibleArgumentParser
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+# NOTE: the model list is requested at module import time, and the first
+# listed model is used by every example function below.
+models = client.models.list()
+model = models.data[0].id
+def encode_base64_content_from_url(content_url: str) -> str:
+    """Download ``content_url`` and return its body as a base64 string.
+
+    Raises whatever ``response.raise_for_status()`` raises for an
+    unsuccessful HTTP response.
+    """
+    with requests.get(content_url) as response:
+        response.raise_for_status()
+        raw_bytes = response.content
+        return base64.b64encode(raw_bytes).decode('utf-8')
+# Text-only inference
+def run_text_only() -> None:
+    """Send one plain-text question to the served model and print the reply."""
+    question = {
+        "role": "user",
+        "content": "What's the capital of France?"
+    }
+    response = client.chat.completions.create(
+        messages=[question],
+        model=model,
+        max_completion_tokens=64,
+    )
+    print("Chat completion output:", response.choices[0].message.content)
+# Single-image input inference
+def run_single_image() -> None:
+    """Ask about one image, first by URL and then as inline base64 data."""
+    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    def ask_about(url: str) -> str:
+        # One user turn: the text question followed by a single image part.
+        completion = client.chat.completions.create(
+            messages=[{
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": url
+                        },
+                    },
+                ],
+            }],
+            model=model,
+            max_completion_tokens=64,
+        )
+        return completion.choices[0].message.content
+    ## Use image url in the payload
+    print("Chat completion output from image url:", ask_about(image_url))
+    ## Use base64 encoded image in the payload
+    image_base64 = encode_base64_content_from_url(image_url)
+    print("Chat completion output from base64 encoded image:",
+          ask_about(f"data:image/jpeg;base64,{image_base64}"))
+# Multi-image input inference
+def run_multi_image() -> None:
+    """Ask one question about two images passed by URL in a single turn."""
+    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
+    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+    # The text part comes first, then one image part per URL (duck, lion).
+    content_parts = [{
+        "type": "text",
+        "text": "What are the animals in these images?"
+    }]
+    for url in (image_url_duck, image_url_lion):
+        content_parts.append({
+            "type": "image_url",
+            "image_url": {
+                "url": url
+            },
+        })
+    response = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": content_parts,
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+    print("Chat completion output:", response.choices[0].message.content)
+# Audio input inference
+def run_audio() -> None:
+    """Ask about an audio clip in three ways and print each reply.
+
+    The clip is sent as an ``input_audio`` part with base64 data, as an
+    ``audio_url`` part with the HTTP URL, and as an ``audio_url`` part
+    with a base64 data URL.
+    """
+    audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
+    def ask_about(audio_part: dict) -> str:
+        # One user turn: the text question followed by the audio part.
+        completion = client.chat.completions.create(
+            messages=[{
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this audio?"
+                    },
+                    audio_part,
+                ],
+            }],
+            model=model,
+            max_completion_tokens=64,
+        )
+        return completion.choices[0].message.content
+    # OpenAI-compatible schema (`input_audio`)
+    # Any format supported by librosa is supported
+    from_input_audio = ask_about({
+        "type": "input_audio",
+        "input_audio": {
+            "data": audio_base64,
+            "format": "wav"
+        },
+    })
+    print("Chat completion output from input audio:", from_input_audio)
+    # HTTP URL
+    from_http_url = ask_about({
+        "type": "audio_url",
+        "audio_url": {
+            "url": audio_url
+        },
+    })
+    print("Chat completion output from audio url:", from_http_url)
+    # base64 URL
+    from_data_url = ask_about({
+        "type": "audio_url",
+        "audio_url": {
+            "url": f"data:audio/ogg;base64,{audio_base64}"
+        },
+    })
+    print("Chat completion output from base64 encoded audio:", from_data_url)
+# Dispatch table: value of --chat-type -> example function to run.
+example_function_map = {
+    "text-only": run_text_only,
+    "single-image": run_single_image,
+    "multi-image": run_multi_image,
+    "audio": run_audio,
+}
+def main(args) -> None:
+    """Run the example function selected by ``args.chat_type``."""
+    run_example = example_function_map[args.chat_type]
+    run_example()
+# Command-line entry point: choose which example to run via --chat-type.
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using OpenAI client for online inference with '
+        'multimodal language models served with vLLM.')
+    parser.add_argument(
+        '--chat-type',
+        '-c',
+        type=str,
+        default="single-image",
+        # Take the choices from the dispatch table so the accepted values
+        # and the runnable examples cannot drift apart.
+        choices=list(example_function_map),
+        help='Conversation type with multimodal data.')
+    args = parser.parse_args()
+    main(args)
--- a/examples/openai_chat_completion_structured_outputs.py
+++ b/examples/openai_chat_completion_structured_outputs.py
+from enum import Enum
+from openai import OpenAI
+from pydantic import BaseModel
+# Client for the OpenAI-compatible server expected at localhost:8000.
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="-",
+)
+# Guided decoding by Choice (list of possible options)
+# NOTE(review): `guided_choice` is sent through `extra_body`, presumably
+# because it is a server-side extension rather than an OpenAI parameter.
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": "Classify this sentiment: vLLM is wonderful!"
+    }],
+    extra_body={"guided_choice": ["positive", "negative"]},
+)
+print(completion.choices[0].message.content)
+# Guided decoding by Regex
+# Each fragment ends with a space: adjacent string literals are joined
+# verbatim, which previously produced "Enigma.End" and "result:alan".
+prompt = ("Generate an email address for Alan Turing, who works in "
+          "Enigma. End in .com and new line. Example result: "
+          "alan.turing@enigma.com\n")
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": prompt,
+    }],
+    extra_body={
+        # Raw string: "\w" and "\." are invalid escapes in a normal string
+        # literal; the trailing \n is left for the regex engine to read as
+        # a newline.
+        "guided_regex": r"\w+@\w+\.com\n",
+        "stop": ["\n"]
+    },
+)
+print(completion.choices[0].message.content)
+# Guided decoding by JSON using Pydantic schema
+class CarType(str, Enum):
+    """Closed set of car body types allowed in the generated JSON."""
+    sedan = "sedan"
+    suv = "SUV"
+    truck = "Truck"
+    coupe = "Coupe"
+class CarDescription(BaseModel):
+    """Shape of the JSON object the model is asked to produce."""
+    brand: str
+    model: str
+    car_type: CarType
+json_schema = CarDescription.model_json_schema()
+# The first fragment ends with a space; without it the prompt read
+# "car_type ofthe most iconic car".
+prompt = ("Generate a JSON with the brand, model and car_type of "
+          "the most iconic car from the 90's")
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": prompt,
+    }],
+    extra_body={"guided_json": json_schema},
+)
+print(completion.choices[0].message.content)
+# Guided decoding by Grammar
+# Grammar for a single "SELECT <columns> FROM <table>" statement.
+simplified_sql_grammar = """
+    ?start: select_statement
+    ?select_statement: "SELECT " column_list " FROM " table_name
+    ?column_list: column_name ("," column_name)*
+    ?table_name: identifier
+    ?column_name: identifier
+    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+"""
+# The first fragment ends with a space; without it the prompt read
+# "'email'from the 'users' table."
+prompt = ("Generate an SQL query to show the 'username' and 'email' "
+          "from the 'users' table.")
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": prompt,
+    }],
+    extra_body={"guided_grammar": simplified_sql_grammar},
+)
+print(completion.choices[0].message.content)