Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · 47bd229c · 47bd229c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/cli.py
+++ b/examples/offline_inference/cli.py
-# SPDX-License-Identifier: Apache-2.0
-from dataclasses import asdict
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
-def get_prompts(num_prompts: int):
-    # The default sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    if num_prompts != len(prompts):
-        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
-    return prompts
-def main(args):
-    # Create prompts
-    prompts = get_prompts(args.num_prompts)
-    # Create a sampling params object.
-    sampling_params = SamplingParams(n=args.n,
-                                     temperature=args.temperature,
-                                     top_p=args.top_p,
-                                     top_k=args.top_k,
-                                     max_tokens=args.max_tokens)
-    # Create an LLM.
-    # The default model is 'facebook/opt-125m'
-    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**asdict(engine_args))
-    # Generate texts from the prompts.
-    # The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == '__main__':
-    parser = FlexibleArgumentParser()
-    parser = EngineArgs.add_cli_args(parser)
-    group = parser.add_argument_group("SamplingParams options")
-    group.add_argument("--num-prompts",
-                       type=int,
-                       default=4,
-                       help="Number of prompts used for inference")
-    group.add_argument("--max-tokens",
-                       type=int,
-                       default=16,
-                       help="Generated output length for sampling")
-    group.add_argument('--n',
-                       type=int,
-                       default=1,
-                       help='Number of generated sequences per prompt')
-    group.add_argument('--temperature',
-                       type=float,
-                       default=0.8,
-                       help='Temperature for text generation')
-    group.add_argument('--top-p',
-                       type=float,
-                       default=0.95,
-                       help='top_p for text generation')
-    group.add_argument('--top-k',
-                       type=int,
-                       default=-1,
-                       help='top_k for text generation')
-    args = parser.parse_args()
-    main(args)
--- a/examples/offline_inference/cpu_offload.py
+++ b/examples/offline_inference/cpu_offload.py
-# SPDX-License-Identifier: Apache-2.0
-from vllm import LLM, SamplingParams
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-# Create an LLM.
-llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This file demonstrates the example usage of disaggregated prefilling.
+We will launch 2 vLLM instances (GPU 0 for prefill and GPU 1 for decode),
+and then transfer the KV cache between them.
+"""
+import os
+import time
+from multiprocessing import Event, Process
+from vllm import LLM, SamplingParams
+from vllm.config import KVTransferConfig
+def run_prefill(prefill_done):
+    # We use GPU 0 for prefill node.
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    # The prefill node receives two requests, while the decode node receives
+    # three requests. So the decode node will only receive the KV Cache for
+    # requests 1 and 3. The decode node will use the KV Cache of requests 1
+    # and 3 and do prefilling on request 2.
+    prompts = [
+        "Hello, my name is",
+        "Hi, your name is",
+        # The decode node will actually "prefill" this request.
+        "Tell me a very long story",
+    ]
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # This instance is the prefill node (kv_producer, rank 0).
+    # The number of parallel instances for KV cache transfer is set to 2,
+    # as required for PyNcclConnector.
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+    )
+    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
+    # memory. You may need to adjust the value to fit your GPU.
+    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+              kv_transfer_config=ktc,
+              max_model_len=2000,
+              gpu_memory_utilization=0.8)
+    llm.generate(prompts, sampling_params)
+    print("Prefill node is finished.")
+    prefill_done.set()
+    # To keep the prefill node running in case the decode node is not done;
+    # otherwise, the script might exit prematurely, causing incomplete decoding.
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("Script stopped by user.")
+def run_decode(prefill_done):
+    # We use GPU 1 for decode node.
+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+    prompts = [
+        "Hello, my name is",
+        "Hi, your name is",
+        "Tell me a very long story",
+    ]
+    sampling_params = SamplingParams(temperature=0, top_p=0.95)
+    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # This instance is the decode node (kv_consumer, rank 1).
+    # The number of parallel instances for KV cache transfer is set to 2,
+    # as required for PyNcclConnector.
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+    )
+    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
+    # memory. You may need to adjust the value to fit your GPU.
+    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+              kv_transfer_config=ktc,
+              max_model_len=2000,
+              gpu_memory_utilization=0.8)
+    # Wait until the prefill node signals (via prefill_done) that it is finished
+    print("Waiting for prefill node to finish...")
+    prefill_done.wait()
+    # At this point when the prefill_done is set, the kv-cache should have been
+    # transferred to this decode node, so we can start decoding.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+if __name__ == "__main__":
+    prefill_done = Event()
+    prefill_process = Process(target=run_prefill, args=(prefill_done, ))
+    decode_process = Process(target=run_decode, args=(prefill_done, ))
+    # Start prefill node
+    prefill_process.start()
+    # Start decode node
+    decode_process.start()
+    # Terminate the prefill node when decode is finished
+    decode_process.join()
+    prefill_process.terminate()
--- a/examples/offline_inference/embedding.py
+++ b/examples/offline_inference/embedding.py
-# SPDX-License-Identifier: Apache-2.0
-from vllm import LLM
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create an LLM.
-# You should pass task="embed" for embedding models
-model = LLM(
-    model="intfloat/e5-mistral-7b-instruct",
-    task="embed",
-    enforce_eager=True,
-)
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts)
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    embeds = output.outputs.embedding
-    embeds_trimmed = ((str(embeds[:16])[:-1] +
-                       ", ...]") if len(embeds) > 16 else embeds)
-    print(f"Prompt: {prompt!r} | "
-          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
-# SPDX-License-Identifier: Apache-2.0
-from huggingface_hub import hf_hub_download
-from vllm import LLM, SamplingParams
-def run_gguf_inference(model_path, tokenizer):
-    # Sample prompts.
-    prompts = [
-        "How many helicopters can a human eat in one sitting?",
-        "What's the future of AI?",
-    ]
-    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
-    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0, max_tokens=128)
-    # Create an LLM.
-    llm = LLM(model=model_path, tokenizer=tokenizer)
-    outputs = llm.chat(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == "__main__":
-    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
-    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
-    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
-    model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model, tokenizer)
--- a/examples/offline_inference/openai/openai_batch.md
+++ b/examples/offline_inference/openai/openai_batch.md
@@ -5,50 +5,49 @@ This is a guide to performing batch inference using the OpenAI batch file format
 ```
 ## File Format
 The OpenAI batch file format consists of a series of json objects on new lines.
 [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
 ```{note}
 We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
 ```
 ## Pre-requisites
 * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
  - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
  - Install the token on your machine (Run `huggingface-cli login`).
  - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
 ## Example 1: Running with a local file
 ### Step 1: Create your batch file
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
-```
+```console
 wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 Once you've created your batch file it should look like this
-```
+```console
 $ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
 ### Step 2: Run the batch
 The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
-```
+```console
 python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -56,7 +55,7 @@ python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_e
 You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
-```
+```console
 $ cat results.jsonl
 {"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
 {"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
@@ -68,7 +67,7 @@ The batch runner supports remote input and output urls that are accessible via h
 For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
-```
+```console
 python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
@@ -80,7 +79,7 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 ### Additional prerequisites
-* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). 
+* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html).
 * The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3.
  - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html).
 * The `boto3` python package (Run `pip install boto3`) to generate presigned urls.
@@ -89,13 +88,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
-```
+```console
 wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 Once you've created your batch file it should look like this
-```
+```console
 $ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
@@ -103,7 +102,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
-```
+```console
 aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
@@ -111,9 +110,9 @@ aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_
 Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
-(The script is adapted from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py)
+(The script is adapted from <https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py>)
-```
+```python
 import boto3
 from botocore.exceptions import ClientError
@@ -149,7 +148,7 @@ print(f"{output_url=}")
 This script should output
-```
+```text
 input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
 output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
 ```
@@ -158,7 +157,7 @@ output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AW
 You can now run the batch runner, using the urls generated in the previous section.
-```
+```console
 python -m vllm.entrypoints.openai.run_batch \
    -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
    -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
@@ -169,7 +168,7 @@ python -m vllm.entrypoints.openai.run_batch \
 Your results are now on S3. You can view them in your terminal by running
-```
+```console
 aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
 ```
@@ -180,10 +179,10 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
 * Ensure you are using `vllm >= 0.5.5`.
 ### Step 1: Create your batch file
 Add embedding requests to your batch file. The following is an example:
-```
+```text
 {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
 ```
@@ -198,7 +197,7 @@ You can run the batch using the same command as in earlier examples.
 You can check your results by running `cat results.jsonl`
-```
+```console
 $ cat results.jsonl
 {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
 ...
@@ -211,10 +210,10 @@ $ cat results.jsonl
 * Ensure you are using `vllm >= 0.7.0`.
 ### Step 1: Create your batch file
 Add score requests to your batch file. The following is an example:
-```
+```text
 {"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 ```
@@ -229,7 +228,7 @@ You can run the batch using the same command as in earlier examples.
 You can check your results by running `cat results.jsonl`
-```
+```console
 $ cat results.jsonl
 {"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
 {"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}

--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This is a demo script showing how to use the
+PrithviGeospatialMAE model with vLLM.
+This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
+Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
+The requirements for running this script are:
+- Install [terratorch, albumentations, rasterio] in your Python environment.
+- Download the model weights into a 'model' folder local to the script
+  (temporary measure until the proper config.json file is uploaded to HF).
+- Download an input example image (India_900498_S2Hand.tif) and place it in
+  the same folder as the script (or specify it with the --data_file argument).
+Run the example:
+python prithvi_geospatial_mae.py
+""" # noqa: E501
+import argparse
+import datetime
+import os
+import re
+from typing import List, Union
+import albumentations
+import numpy as np
+import rasterio
+import torch
+from einops import rearrange
+from terratorch.datamodules import Sen1Floods11NonGeoDataModule
+from vllm import LLM
+NO_DATA = -9999
+NO_DATA_FLOAT = 0.0001
+OFFSET = 0
+PERCENTILE = 99
+model_config = """{
+  "architectures": ["PrithviGeoSpatialMAE"],
+  "num_classes": 0,
+  "pretrained_cfg": {
+    "task_args": {
+      "task": "SemanticSegmentationTask",
+      "model_factory": "EncoderDecoderFactory",
+      "loss": "ce",
+      "ignore_index": -1,
+      "lr": 0.001,
+      "freeze_backbone": false,
+      "freeze_decoder": false,
+      "plot_on_val": 10,
+      "optimizer": "AdamW",
+      "scheduler": "CosineAnnealingLR"
+    },
+    "model_args": {
+      "backbone_pretrained": false,
+      "backbone": "prithvi_eo_v2_300_tl",
+      "decoder": "UperNetDecoder",
+      "decoder_channels": 256,
+      "decoder_scale_modules": true,
+      "num_classes": 2,
+      "rescale": true,
+      "backbone_bands": [
+        "BLUE",
+        "GREEN",
+        "RED",
+        "NIR_NARROW",
+        "SWIR_1",
+        "SWIR_2"
+      ],
+      "head_dropout": 0.1,
+      "necks": [
+        {
+          "name": "SelectIndices",
+          "indices": [
+            5,
+            11,
+            17,
+            23
+          ]
+        },
+        {
+          "name": "ReshapeTokensToImage"
+        }
+      ]
+    },
+    "optimizer_params" : {
+      "lr": 5.0e-05,
+      "betas": [0.9, 0.999],
+      "eps": [1.0e-08],
+      "weight_decay": 0.05,
+      "amsgrad": false,
+      "maximize": false,
+      "capturable": false,
+      "differentiable": false
+    },
+    "scheduler_params" : {
+        "T_max": 50,
+        "eta_min": 0,
+        "last_epoch": -1,
+        "verbose": "deprecated"
+    }
+  },
+  "torch_dtype": "float32"
+}
+"""
+# Temporarily creating the "config.json" for the model.
+# This is going to disappear once the correct config.json is available on HF
+with open(os.path.join(os.path.dirname(__file__), "./model/config.json"),
+          'w') as config_file:
+    config_file.write(model_config)
+datamodule_config = {
+    'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'],
+    'batch_size':
+    16,
+    'constant_scale':
+    0.0001,
+    'data_root':
+    '/dccstor/geofm-finetuning/datasets/sen1floods11',
+    'drop_last':
+    True,
+    'no_data_replace':
+    0.0,
+    'no_label_replace':
+    -1,
+    'num_workers':
+    8,
+    'test_transform': [
+        albumentations.Resize(always_apply=False,
+                              height=448,
+                              interpolation=1,
+                              p=1,
+                              width=448),
+        albumentations.pytorch.ToTensorV2(transpose_mask=False,
+                                          always_apply=True,
+                                          p=1.0)
+    ],
+}
+class PrithviMAE:
+    def __init__(self):
+        print("Initializing PrithviMAE model")
+        self.model = LLM(model=os.path.join(os.path.dirname(__file__),
+                                            "./model"),
+                         skip_tokenizer_init=True,
+                         dtype="float32")
+    def run(self, input_data, location_coords):
+        print("################ Running inference on vLLM ##############")
+        # merge the inputs into one data structure
+        mm_data = {
+            "pixel_values":
+            torch.empty(0) if input_data is None else input_data,
+            "location_coords":
+            torch.empty(0) if location_coords is None else location_coords
+        }
+        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
+        outputs = self.model.encode(prompt, use_tqdm=False)
+        print(
+            "################ Inference done (it took seconds)  ##############"
+        )
+        return outputs[0].outputs.data
+def generate_datamodule():
+    datamodule = Sen1Floods11NonGeoDataModule(
+        data_root=datamodule_config['data_root'],
+        batch_size=datamodule_config["batch_size"],
+        num_workers=datamodule_config["num_workers"],
+        bands=datamodule_config["bands"],
+        drop_last=datamodule_config["drop_last"],
+        test_transform=datamodule_config["test_transform"
+                                         ""])
+    return datamodule
+def process_channel_group(orig_img, channels):
+    """
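+    Select *channels* from the image and rescale them to the range [0, 1].
+    Values are shifted by OFFSET, divided by max(3000, PERCENTILE-th
+    percentile of the valid pixels) minus OFFSET, and clamped to [0, 1];
+    pixels equal to NO_DATA_FLOAT are set to 0.
+    Args:
+        orig_img: torch.Tensor representing original image (reference)
+                  with shape = (bands, H, W).
+        channels: list of indices representing RGB channels.
+    Returns:
+        torch.Tensor with shape (num_channels, height, width) for original image
+    """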
+    orig_img = orig_img[channels, ...]
+    valid_mask = torch.ones_like(orig_img, dtype=torch.bool)
+    valid_mask[orig_img == NO_DATA_FLOAT] = False
+    # Rescale (enhancing contrast)
+    max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
+    min_value = OFFSET
+    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0,
+                           1)
+    # No data as zeros
+    orig_img[~valid_mask] = 0
+    return orig_img
+def read_geotiff(file_path: str):
+    """Read all bands from *file_path* and return image + meta info.
+    Args:
+        file_path: path to image file.
+    Returns:
+        np.ndarray with shape (bands, height, width)
+        meta info dict
+        coordinates from ``src.lnglat()``, or None if they cannot be read
+    """
+    with rasterio.open(file_path) as src:
+        img = src.read()
+        meta = src.meta
+        try:
+            coords = src.lnglat()
+        except Exception:
+            # Cannot read coords
+            coords = None
+    return img, meta, coords
+def save_geotiff(image, output_path: str, meta: dict):
+    """Save multi-band image in Geotiff file.
+    Args:
+        image: np.ndarray with shape (bands, height, width)
+        output_path: path where to save the image
+        meta: dict with meta info.
+    """
+    with rasterio.open(output_path, "w", **meta) as dest:
+        for i in range(image.shape[0]):
+            dest.write(image[i, :, :], i + 1)
+    return
+def _convert_np_uint8(float_image: torch.Tensor):
+    image = float_image.numpy() * 255.0
+    image = image.astype(dtype=np.uint8)
+    return image
+def load_example(
+    file_paths: List[str],
+    mean: List[float] = None,
+    std: List[float] = None,
+    indices: Union[list[int], None] = None,
+):
+    """Build an input example by loading images in *file_paths*.
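+    Args:
+        file_paths: list of file paths.
+        mean: list containing mean values for each band in the images
+              in *file_paths*.
+        std: list containing std values for each band in the images
+             in *file_paths*.
+        indices: optional list of band indices to select from each image;
+                 all bands are kept when None.
+    Returns:
+        np.array containing created example
+        list of [year, day-of-year] pairs for the paths that contain a
+            timestamp
+        list of location coordinates for the images that provide them
+        list of meta info for each image in *file_paths*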
+    """
+    imgs = []
+    metas = []
+    temporal_coords = []
+    location_coords = []
+    for file in file_paths:
+        img, meta, coords = read_geotiff(file)
+        # Rescaling (don't normalize on nodata)
+        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
+        if indices is not None:
+            img = img[..., indices]
+        if mean is not None and std is not None:
+            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)
+        imgs.append(img)
+        metas.append(meta)
+        if coords is not None:
+            location_coords.append(coords)
+        try:
+            match = re.search(r'(\d{7,8}T\d{6})', file)
+            if match:
+                year = int(match.group(1)[:4])
+                julian_day = match.group(1).split('T')[0][4:]
+                if len(julian_day) == 3:
+                    julian_day = int(julian_day)
+                else:
+                    julian_day = datetime.datetime.strptime(
+                        julian_day, '%m%d').timetuple().tm_yday
+                temporal_coords.append([year, julian_day])
+        except Exception as e:
+            print(f'Could not extract timestamp for {file} ({e})')
+    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
+    imgs = np.moveaxis(imgs, -1, 0).astype("float32")
+    imgs = np.expand_dims(imgs, axis=0)  # add batch dim
+    return imgs, temporal_coords, location_coords, metas
+def run_model(input_data,
+              temporal_coords,
+              location_coords,
+              model,
+              datamodule,
+              img_size,
+              lightning_model=None):
+    # Run tiled inference over ``input_data`` and return the stitched
+    # prediction map.
+    #
+    # The input is reflect-padded so that its last two axes are multiples of
+    # ``img_size``, cut into non-overlapping ``img_size`` x ``img_size``
+    # windows, each window is passed through ``model.run``, and the per-window
+    # argmax maps are reassembled and cropped back to the original size.
+    #
+    # ``temporal_coords`` and ``location_coords`` are turned into tensors when
+    # non-empty and replaced by ``None`` otherwise. Only ``location_coords``
+    # is forwarded to ``model.run``; ``temporal_coords`` is used solely by the
+    # optional ``lightning_model`` cross-check.
+    #
+    # NOTE(review): the pad spec below has five entries, so ``input_data`` is
+    # assumed to be 5-D, laid out as (b, c, t, H, W) per the rearrange
+    # pattern -- confirm against the caller.
+    # Reflect pad if not divisible by img_size
+    original_h, original_w = input_data.shape[-2:]
+    pad_h = (img_size - (original_h % img_size)) % img_size
+    pad_w = (img_size - (original_w % img_size)) % img_size
+    input_data = np.pad(input_data,
+                        ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
+                        mode="reflect")
+    # Build sliding window
+    # (window size == step size, so the windows do not overlap)
+    batch_size = 1
+    batch = torch.tensor(input_data, device="cpu")
+    windows = (batch.unfold(3, img_size,
+                            img_size).unfold(4, img_size, img_size))
+    # Number of windows along each spatial axis; needed later to stitch the
+    # per-window predictions back together.
+    h1, w1 = windows.shape[3:5]
+    windows = rearrange(windows,
+                        "b c t h1 w1 h w -> (b h1 w1) c t h w",
+                        h=img_size,
+                        w=img_size)
+    # Split into batches if number of windows > batch_size
+    num_batches = windows.shape[0] // batch_size if windows.shape[
+        0] > batch_size else 1
+    windows = torch.tensor_split(windows, num_batches, dim=0)
+    # The windows stay on the CPU; only the transformed sample and the
+    # coordinate tensors are moved to the selected device.
+    if torch.cuda.is_available():
+        device = torch.device('cuda')
+    else:
+        device = torch.device('cpu')
+    if temporal_coords:
+        temporal_coords = torch.tensor(temporal_coords,
+                                       device=device).unsqueeze(0)
+    else:
+        temporal_coords = None
+    if location_coords:
+        # Only the first entry of ``location_coords`` is used.
+        location_coords = torch.tensor(location_coords[0],
+                                       device=device).unsqueeze(0)
+    else:
+        location_coords = None
+    # Run model
+    pred_imgs = []
+    for x in windows:
+        # Apply standardization
+        # NOTE(review): squeeze() drops every singleton axis, so this
+        # presumably turns a (1, c, 1, h, w) window into (c, h, w) before the
+        # transpose makes it channels-last -- confirm it holds for t > 1.
+        x = datamodule.test_transform(
+            image=x.squeeze().numpy().transpose(1, 2, 0))
+        x = datamodule.aug(x)['image']
+        with torch.no_grad():
+            x = x.to(device)
+            pred = model.run(x, location_coords=location_coords)
+            # Optional consistency check against a reference model: a
+            # mismatch is only printed, it does not abort inference.
+            if lightning_model:
+                pred_lightning = lightning_model(
+                    x,
+                    temporal_coords=temporal_coords,
+                    location_coords=location_coords)
+                pred_lightning = pred_lightning.output.detach().cpu()
+                if not torch.equal(pred, pred_lightning):
+                    print("Inference output is not equal")
+        # Argmax over dim 1 (presumably the class axis), then resize the
+        # result to the window size with nearest-neighbour interpolation.
+        y_hat = pred.argmax(dim=1)
+        y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(),
+                                                size=img_size,
+                                                mode="nearest")
+        pred_imgs.append(y_hat)
+    pred_imgs = torch.concat(pred_imgs, dim=0)
+    # Build images from patches
+    pred_imgs = rearrange(
+        pred_imgs,
+        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
+        h=img_size,
+        w=img_size,
+        b=1,
+        c=1,
+        h1=h1,
+        w1=w1,
+    )
+    # Cut padded area back to original size
+    pred_imgs = pred_imgs[..., :original_h, :original_w]
+    # Squeeze (batch size 1)
+    pred_imgs = pred_imgs[0]
+    return pred_imgs
+def main(
+    data_file: str,
+    output_dir: str,
+    rgb_outputs: bool,
+    input_indices: list[int] = None,
+):
+    # Run inference on a single GeoTIFF and write the results to
+    # ``output_dir`` (created if missing):
+    #   * pred_<name>.tiff      -- the prediction map,
+    #   * rgb_pred_<name>.tiff  -- the prediction blended over the RGB input,
+    #   * original_rgb_<name>.tiff -- the RGB input, only if ``rgb_outputs``.
+    # ``<name>`` is the base name of ``data_file`` without its extension.
+    # ``input_indices`` is forwarded to ``load_example`` as ``indices``.
+    os.makedirs(output_dir, exist_ok=True)
+    # Load model ---------------------------------------------------------------
+    model_obj = PrithviMAE()
+    datamodule = generate_datamodule()
+    img_size = 256  # Size of Sen1Floods11
+    # Loading data -------------------------------------------------------------
+    input_data, temporal_coords, location_coords, meta_data = load_example(
+        file_paths=[data_file],
+        indices=input_indices,
+    )
+    meta_data = meta_data[0]  # only one image
+    # A mean above 1 is taken to mean the data is still on the 0-10000 scale.
+    if input_data.mean() > 1:
+        input_data = input_data / 10000  # Convert to range 0-1
+    # Running model ------------------------------------------------------------
+    # Positions of the RED, GREEN and BLUE bands in the configured band list.
+    channels = [
+        datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"]
+    ]  # BGR -> RGB
+    # No ``lightning_model`` is passed, so run_model skips its cross-check.
+    pred = run_model(input_data, temporal_coords, location_coords, model_obj,
+                     datamodule, img_size)
+    # Save pred
+    # ``meta_data`` is updated in place and reused for every file written
+    # below, so the order of these updates matters.
+    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
+    pred_file = os.path.join(
+        output_dir,
+        f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)
+    # Save image + pred
+    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)
+    if input_data.mean() < 1:
+        input_data = input_data * 10000  # Scale to 0-10000
+    # First batch element, index 0 along axis 2 (the time axis in run_model's
+    # layout), all channels.
+    rgb_orig = process_channel_group(
+        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
+        channels=channels,
+    )
+    # Zero predictions become NaN so the blend is NaN there; those pixels are
+    # then replaced by the untouched RGB values, i.e. the overlay is applied
+    # only where the prediction is non-zero.
+    pred[pred == 0.] = np.nan
+    img_pred = rgb_orig * 0.7 + pred * 0.3
+    img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()]
+    img_pred_file = os.path.join(
+        output_dir,
+        f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+    save_geotiff(
+        image=_convert_np_uint8(img_pred),
+        output_path=img_pred_file,
+        meta=meta_data,
+    )
+    # Save image rgb
+    if rgb_outputs:
+        rgb_file = os.path.join(
+            output_dir, "original_rgb_"
+            f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+        save_geotiff(
+            image=_convert_np_uint8(rgb_orig),
+            output_path=rgb_file,
+            meta=meta_data,
+        )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("MAE run inference", add_help=False)
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        default="./India_900498_S2Hand.tif",
+        help="Path to the file.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output",
+        help="Path to the directory where to save outputs.",
+    )
+    parser.add_argument(
+        "--input_indices",
+        default=[1, 2, 3, 8, 11, 12],
+        type=int,
+        nargs="+",
+        help=
+        "0-based indices of the six Prithvi channels to be selected from the  "
+        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
+    )
+    parser.add_argument(
+        "--rgb_outputs",
+        action="store_true",
+        help="If present, output files will only contain RGB channels. "
+        "Otherwise, all bands will be saved.",
+    )
+    args = parser.parse_args()
+    main(**vars(args))
--- a/examples/offline_inference/profiling_tpu/README.md
+++ b/examples/offline_inference/profiling_tpu/README.md
@@ -29,7 +29,6 @@ python3 profiling.py \
    --profile-result-dir profiles
 ```
 ### Generate Decode Trace
 This example runs Llama 3.1 70B with a batch of 32 requests, each with 1 input token and 128 output tokens. The setup is an attempt to profile just the 32 decodes running in parallel: the prefill is kept extremely small (1 token) and `VLLM_TPU_PROFILE_DELAY_MS=1000` is set to skip the first second of inference (which hopefully covers the prefill).
@@ -51,17 +50,18 @@ python3 profiling.py \
    --max-model-len 2048 --tensor-parallel-size 8
 ```
 ## Visualizing the profiles
 Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).
 These are the dependencies you will most likely need to install:
 ```bash
 pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
 ```
 Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
 ```bash
 tensorboard --logdir profiles/ --port 6006
 ```
\ No newline at end of file
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -24,7 +24,7 @@ def main(args: argparse.Namespace):
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
-    _ = xp.start_server(9012)
+    server = xp.start_server(9012)  # noqa: F841
    sampling_params = SamplingParams(
        temperature=0.0,

--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -92,7 +92,7 @@ class MyLLM(LLM):
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        super().__init__(*args, **kwargs)

--- a/examples/offline_inference/ray_placement.py
+++ b/examples/offline_inference/ray_placement.py
 # SPDX-License-Identifier: Apache-2.0
 """
-a simple demonstration to show how to control
+a simple demonstration to show how to co-locate
-the placement of the vLLM workers with Ray.
+vLLM worker with training actors on the same GPUs,
-The key is to set VLLM_RAY_PER_WORKER_GPUS and
+for RLHF-like applications.
-VLLM_RAY_BUNDLE_INDICES properly.
+The key points:
+- Control the placement of the vLLM workers with Ray, by setting
+    VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly.
+- Use cuda-ipc to pass tensors, since NCCL does not work when we have
+    multiple processes on the same GPU.
 """
 import os
 import ray
+import torch
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -19,7 +24,33 @@ class MyWorker(Worker):
    def report_device_id(self) -> str:
        from vllm.platforms import current_platform
-        return current_platform.get_device_uuid(self.device.index)
+        self.device_uuid = current_platform.get_device_uuid(self.device.index)
+        return self.device_uuid
+    def update_weights_from_ipc_handles(self, ipc_handles):
+        # Rebuild tensors from the IPC handles registered for this worker's
+        # device and load them into the model as its new weights.
+        # ``ipc_handles`` maps a device UUID to a ``{name: (func, args)}``
+        # dict. ``self.device_uuid`` is only assigned in report_device_id(),
+        # so that method must have been called on this worker beforehand.
+        # NOTE(review): the (func, args) pairs are presumably the output of
+        # torch.multiprocessing.reductions.reduce_tensor -- confirm against
+        # the sender.
+        handles = ipc_handles[self.device_uuid]
+        device_id = self.device.index
+        weights = []
+        for name, handle in handles.items():
+            func, args = handle
+            list_args = list(args)
+            # the key is to change device id to the current device id
+            # in case two processes have different CUDA_VISIBLE_DEVICES
+            list_args[6] = device_id
+            tensor = func(*list_args)
+            weights.append((name, tensor))
+        self.model_runner.model.load_weights(weights=weights)
+        # Wait for outstanding CUDA work before returning to the caller.
+        torch.cuda.synchronize()
+    def check_weights_changed(self):
+        """
+        Check if the weights are updated to 0.
+        """
+        weights_updated = True
+        for name, p in self.model_runner.model.named_parameters():
+            weights_updated = weights_updated and torch.allclose(
+                p, torch.zeros_like(p))
+        return weights_updated
 class MyLLM(LLM):
@@ -28,7 +59,7 @@ class MyLLM(LLM):
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # every worker will use 0.4 GPU, so that we can schedule
        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
@@ -40,12 +71,32 @@ class MyLLM(LLM):
 class RayTrainingActor:
-    def report_device_id(self) -> str:
+    def __init__(self):
+        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
+        from transformers import AutoModelForCausalLM
+        self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+        self.model.to("cuda:0")
+        for name, p in self.model.named_parameters():
+            p.data.zero_()
+        torch.cuda.synchronize()
        # the argument for get_device_uuid is the index
        # of the GPU in the visible devices.
-        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
        from vllm.platforms import current_platform
-        return current_platform.get_device_uuid(0)
+        self.device_uuid = current_platform.get_device_uuid(0)
+    def report_device_id(self) -> str:
+        return self.device_uuid
+    def get_weight_ipc_handles(self):
+        from torch.multiprocessing.reductions import reduce_tensor
+        data = {}
+        for name, p in self.model.named_parameters():
+            # the training actor might only have a subset of the weights
+            # and need to all-gather the weights from all the actors.
+            # for demonstration, here we assume all training actors have
+            # the full weights.
+            data[name] = reduce_tensor(p.detach())
+        return {self.device_uuid: data}
 # ray manages 4 GPUs
@@ -78,6 +129,8 @@ for bundle_index in [0, 1, 2, 3]:
        ),
    )(RayTrainingActor).remote()
    training_actors.append(training_actor)
+for bundle_index, training_actor in enumerate(training_actors):
    device_id = ray.get(training_actor.report_device_id.remote())
    print(f"training actor {bundle_index} is on {device_id}")
    training_actor_device_ids.append(device_id)
@@ -119,3 +172,18 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
 # the last two training actors should be
 # on the same GPUs as the second inference engine
 assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
+print("gather all the IPC handles from the training actors")
+ipc_handles = {}
+for actor in training_actors:
+    ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
+print("update the weights of the inference engines")
+for llm in inference_engines:
+    ray.get(
+        llm.collective_rpc.remote("update_weights_from_ipc_handles",
+                                  args=(ipc_handles, )))
+print("check if the weights are updated")
+for llm in inference_engines:
+    assert ray.get(
+        llm.collective_rpc.remote("check_weights_changed", args=tuple()))
--- a/examples/offline_inference/scoring.py
+++ b/examples/offline_inference/scoring.py
-# SPDX-License-Identifier: Apache-2.0
-from vllm import LLM
-# Sample prompts.
-text_1 = "What is the capital of France?"
-texts_2 = [
-    "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-]
-# Create an LLM.
-# You should pass task="score" for cross-encoder models
-model = LLM(
-    model="BAAI/bge-reranker-v2-m3",
-    task="score",
-    enforce_eager=True,
-)
-# Generate scores. The output is a list of ScoringRequestOutputs.
-outputs = model.score(text_1, texts_2)
-# Print the outputs.
-for text_2, output in zip(texts_2, outputs):
-    score = output.outputs.score
-    print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -105,8 +105,12 @@ def run_glm4v(question: str, modality: str):
              max_num_seqs=2,
              trust_remote_code=True,
              enforce_eager=True,
+              hf_overrides={"architectures": ["GLM4VForCausalLM"]},
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    prompt = question
+    prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
+        {question}<|assistant|>"
    stop_token_ids = [151329, 151336, 151338]
    return llm, prompt, stop_token_ids
@@ -115,7 +119,7 @@ def run_glm4v(question: str, modality: str):
 def run_h2ovl(question: str, modality: str):
    assert modality == "image"
-    model_name = "h2oai/h2ovl-mississippi-2b"
+    model_name = "h2oai/h2ovl-mississippi-800m"
    llm = LLM(
        model=model_name,
@@ -132,7 +136,7 @@ def run_h2ovl(question: str, modality: str):
                                           add_generation_prompt=True)
    # Stop tokens for H2OVL-Mississippi
-    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
+    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]
    return llm, prompt, stop_token_ids
@@ -493,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
+        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -77,8 +77,8 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
    )
-def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
-    model_name = "h2oai/h2ovl-mississippi-2b"
+    model_name = "h2oai/h2ovl-mississippi-800m"
    llm = LLM(
        model=model_name,
@@ -99,7 +99,7 @@ def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
                                           add_generation_prompt=True)
    # Stop tokens for H2OVL-Mississippi
-    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
+    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]
    return ModelRequestData(
@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
+        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "".join(f"Picture {i}: <img></img>\n"
@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
 model_example_map = {
    "aria": load_aria,
    "deepseek_vl_v2": load_deepseek_vl2,
-    "h2ovl_chat": load_h2onvl,
+    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "mllama": load_mllama,

--- a/examples/online_serving/chart-helm/README.md
+++ b/examples/online_serving/chart-helm/README.md
@@ -18,4 +18,4 @@ This directory contains a Helm chart for deploying the vllm application. The cha
 - templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
 - templates/pvc.yaml: Template for Persistent Volume Claims.
 - templates/secrets.yaml: Template for Kubernetes Secrets.
 - templates/service.yaml: Template for creating Services.
\ No newline at end of file
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -12,7 +12,7 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 (audio inference with Ultravox)
-vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
+vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
 """
 import base64

--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -36,8 +36,8 @@ response = client.chat.completions.create(model=model, messages=messages)
 reasoning_content = response.choices[0].message.reasoning_content
 content = response.choices[0].message.content
-print("reasoning_content:", reasoning_content)
+print("reasoning_content for Round 1:", reasoning_content)
-print("content:", content)
+print("content for Round 1:", content)
 # Round 2
 messages.append({"role": "assistant", "content": content})
@@ -50,5 +50,5 @@ response = client.chat.completions.create(model=model, messages=messages)
 reasoning_content = response.choices[0].message.reasoning_content
 content = response.choices[0].message.content
-print("reasoning_content:", reasoning_content)
+print("reasoning_content for Round 2:", reasoning_content)
-print("content:", content)
+print("content for Round 2:", content)
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -44,7 +44,7 @@ def vlm2vec():
 def dse_qwen2_vl(inp: dict):
    # Embedding an Image
-    if inp["dtype"] == "image":
+    if inp["type"] == "image":
        messages = [{
            "role":
            "user",
@@ -113,10 +113,10 @@ if __name__ == '__main__':
        vlm2vec()
    elif args.model == "dse_qwen2_vl":
        dse_qwen2_vl({
-            "dtye": "image",
+            "type": "image",
            "image_url": image_url,
        })
        dse_qwen2_vl({
-            "dtype": "text",
+            "type": "text",
            "content": "What is the weather like today?",
        })
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
+# SPDX-License-Identifier: Apache-2.0
+from openai import OpenAI
+from vllm.assets.audio import AudioAsset
+# Resolve local paths for the bundled sample audio clips.
+# NOTE: only ``mary_had_lamb`` is transcribed below; ``winning_call`` is
+# resolved but not used further in this script.
+mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path()
+winning_call = AudioAsset('winning_call').get_local_path()
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+# Upload the clip to the transcription endpoint and print the result.
+# ``response_format="text"`` requests plain text rather than JSON, and
+# ``temperature=0.0`` requests deterministic decoding.
+with open(str(mary_had_lamb), "rb") as f:
+    transcription = client.audio.transcriptions.create(
+        file=f,
+        model="openai/whisper-large-v3",
+        language="en",
+        response_format="text",
+        temperature=0.0)
+    print("transcription result:", transcription)
--- a/examples/online_serving/opentelemetry/Otel.md
+++ b/examples/online_serving/opentelemetry/Otel.md
 # Setup OpenTelemetry POC
 1. Install OpenTelemetry packages:
-    ```
+    ```console
    pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
@@ -10,7 +11,8 @@
    ```
 1. Start Jaeger in a docker container:
-    ```
+    ```console
    # From: https://www.jaegertracing.io/docs/1.57/getting-started/
    docker run --rm --name jaeger \
        -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
@@ -28,19 +30,23 @@
    ```
 1. In a new shell, export Jaeger IP:
-    ```
+    ```console
    export JAEGER_IP=$(docker inspect   --format '{{ .NetworkSettings.IPAddress }}' jaeger)
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
    ```
    Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
-    ```
+    ```console
    export OTEL_SERVICE_NAME="vllm-server"
    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
    vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
    ```
 1. In a new shell, send requests with trace context from a dummy client:
-    ```
+    ```console
    export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
@@ -48,7 +54,7 @@
    python dummy_client.py
    ```
-1. Open Jaeger webui: http://localhost:16686/
+1. Open Jaeger webui: <http://localhost:16686/>
    In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
    ![Traces](https://i.imgur.com/GYHhFjo.png)
@@ -57,26 +63,32 @@
 ![Spans details](https://i.imgur.com/OPf6CBL.png)
 ## Exporter Protocol
 OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
 By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
-```
+```console
 export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
 vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
 ```
 ## Instrumentation of FastAPI
 OpenTelemetry allows automatic instrumentation of FastAPI.
 1. Install the instrumentation library
-    ```
+    ```console
    pip install opentelemetry-instrumentation-fastapi
    ```
 1. Run vLLM with `opentelemetry-instrument`
-    ```
+    ```console
    opentelemetry-instrument vllm serve facebook/opt-125m
    ```
 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
 ![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
\ No newline at end of file