Merge tag 'v0.11.2' into v0.11.2-ori

006693ed · zhuwenwen · 4b51e6f1 · 275de341 · 006693ed · 006693ed
Commit 006693ed authored Dec 01, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -13,7 +13,7 @@ from tqdm import tqdm
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
 DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))

--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \

 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
    -q use_audio_in_video

 # Multiple audios
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
    -q multi_audios
 ```

--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -7,13 +7,12 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).

 from typing import NamedTuple

-import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.multimodal.image import convert_image_mode
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 class QueryResult(NamedTuple):
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
    )
    asset = VideoAsset(name="baby_reading", num_frames=16)
    audio = asset.get_audio(sampling_rate=16000)
-    assert not envs.VLLM_USE_V1, (
-        "V1 does not support use_audio_in_video. "
-        "Please launch this example with "
-        "`VLLM_USE_V1=0`."
-    )
+
    return QueryResult(
        inputs={
            "prompt": prompt,

--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -38,7 +38,7 @@ from rlhf_utils import stateless_init_process_group
 from transformers import AutoModelForCausalLM

 from vllm import LLM, SamplingParams
-from vllm.utils import get_ip, get_open_port
+from vllm.utils.network_utils import get_ip, get_open_port


 class MyLLM(LLM):

--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict

 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:


 def rebuild_ipc(
-    handle: tuple[Callable, tuple], device_id: Optional[int] = None
+    handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
    func, args = handle
    list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
            self._zmq_ctx = zmq.Context()
        socket = self._zmq_ctx.socket(zmq.REP)
        socket.connect(zmq_handles[self.report_device_id()])
-        buffer: Optional[torch.Tensor] = None
+        buffer: torch.Tensor | None = None
        while True:
            payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
                socket.recv_pyobj()

--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -30,7 +30,7 @@ from pathlib import Path

 from vllm import LLM, EngineArgs
 from vllm.model_executor.model_loader import ShardedStateLoader
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def parse_args():

--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -9,25 +9,25 @@ from vllm.inputs import TokensPrompt
 from vllm.v1.metrics.reader import Counter, Vector

 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser


 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
 ]



--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -4,17 +4,88 @@
 experimental support for data-parallel inference with torchrun
 Note the data load balancing and distribution is done out of the vllm engine,
 no internal lb supported in external_launcher mode.
+
+To run this example:
+```bash
+$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
+```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """

+import argparse
+
 from vllm import LLM, SamplingParams

+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
 # Create prompts, the same across all ranks
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
-] * 50
+]

 # Create sampling parameters, the same across all ranks
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
@@ -25,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
    distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )

 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
@@ -45,14 +116,13 @@ prompts = [

 outputs = llm.generate(prompts, sampling_params)

-
-# all ranks will have the same outputs
-print("-" * 50)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
-    print("-" * 50)
+    print(
+        f"DP Rank: {dp_rank} Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n"
+    )
+
 """
 Further tips:


--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -22,14 +22,15 @@ from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import convert_image_mode
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
-    stop_token_ids: Optional[list[int]] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    stop_token_ids: list[int] | None = None
+    lora_requests: list[LoRARequest] | None = None
+    sampling_params: list[SamplingParams] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -90,16 +91,25 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    )


-# BLIP-2
-def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
+# Bee-8B
+def run_bee(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
+    model_name = "Open-Bee/Bee-8B-RL"
+
+    prompts = [
+        (
+            f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<image>\n{question}<|im_end|>"
+            f"<|im_start|>assistant\n<think>\n"
+        )
+        for question in questions
+    ]

-    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
-    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
-    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
-        model="Salesforce/blip2-opt-2.7b",
+        model=model_name,
+        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
    )

    return ModelRequestData(
@@ -108,15 +118,15 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    )


-# Chameleon
-def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
+# BLIP-2
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    prompts = [f"{question}<image>" for question in questions]
+    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
+    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
+    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
-        model="facebook/chameleon-7b",
-        max_model_len=4096,
-        max_num_seqs=2,
+        model="Salesforce/blip2-opt-2.7b",
        limit_mm_per_prompt={modality: 1},
    )

@@ -126,15 +136,16 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    )


-# Dots-OCR
-def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
+# Chameleon
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

-    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
+    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
-        model="rednote-hilab/dots.ocr",
+        model="facebook/chameleon-7b",
+        max_model_len=4096,
+        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
-        trust_remote_code=True,
    )

    return ModelRequestData(
@@ -190,6 +201,66 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    )


+def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
+    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
+
+    assert modality == "image"
+
+    model_name = "deepseek-ai/DeepSeek-OCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        limit_mm_per_prompt={modality: 1},
+        logits_processors=[NGramPerReqLogitsProcessor],
+    )
+
+    # deepseek-ocr use plain prompt template
+    prompts = [f"<image>\n{question}" for question in questions]
+
+    # The following sampling params config is taken from
+    # the official Deepseek-OCR inference example.
+    # (IMPORTANT) Use the custom logits processor and avoid skipping
+    # special tokens for this model for the optimal OCR performance.
+    sampling_params = [
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=8192,
+            # ngram logit processor args
+            extra_args=dict(
+                ngram_size=30,
+                window_size=90,
+                # whitelist: <td>, </td>
+                whitelist_token_ids={128821, 128822},
+            ),
+            skip_special_tokens=False,
+        )
+        for _ in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+# Dots-OCR
+def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
+    engine_args = EngineArgs(
+        model="rednote-hilab/dots.ocr",
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Ernie4.5-VL
 def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
@@ -576,7 +647,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:

 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "internlm/Intern-S1"
+    model_name = "internlm/Intern-S1-mini"

    engine_args = EngineArgs(
        model=model_name,
@@ -733,6 +804,26 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    )


+# LightOnOCR
+def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    prompts = [
+        "<|im_start|>system<|im_end|>\n<|im_start|>user\n<|image_pad|><|im_end|>\n<|im_start|>assistant\n"
+        for _ in questions
+    ]
+
+    engine_args = EngineArgs(
+        model="lightonai/LightOnOCR-1B",
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

@@ -1140,15 +1231,37 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
    elif modality == "video":
        placeholder = "<video>"

-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+    prompts = [
+        f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
        for question in questions
    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
    )

+
+# PaddleOCR-VL
+def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+    prompts = [
+        (f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: ")
+        for question in questions
+    ]
+
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
@@ -1423,7 +1536,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
-            "fps": [1],
+            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )
@@ -1691,11 +1804,13 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
+    "bee": run_bee,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
-    "dots_ocr": run_dots_ocr,
    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
+    "deepseek_ocr": run_deepseek_ocr,
+    "dots_ocr": run_dots_ocr,
    "ernie45_vl": run_ernie45_vl,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
@@ -1712,6 +1827,7 @@ model_example_map = {
    "keye_vl": run_keye_vl,
    "keye_vl1_5": run_keye_vl1_5,
    "kimi_vl": run_kimi_vl,
+    "lightonocr": run_lightonocr,
    "llama4": run_llama4,
    "llava": run_llava,
    "llava-next": run_llava_next,
@@ -1727,6 +1843,7 @@ model_example_map = {
    "NVLM_D": run_nvlm_d,
    "ovis": run_ovis,
    "ovis2_5": run_ovis2_5,
+    "paddleocr_vl": run_paddleocr_vl,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
@@ -1957,8 +2074,12 @@ def main(args):

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(
-        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+    sampling_params = (
+        SamplingParams(
+            temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+        )
+        if req_data.sampling_params is None
+        else req_data.sampling_params
    )

    assert args.num_prompts > 0

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -18,22 +18,22 @@ from transformers import AutoProcessor, AutoTokenizer
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.utils import fetch_image
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
 ]


@@ -41,9 +41,10 @@ class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: str
    image_data: list[Image]
-    stop_token_ids: Optional[list[int]] = None
-    chat_template: Optional[str] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    stop_token_ids: list[int] | None = None
+    chat_template: str | None = None
+    lora_requests: list[LoRARequest] | None = None
+    sampling_params: SamplingParams | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -107,6 +108,41 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_bee(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Open-Bee/Bee-8B-RL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        trust_remote_code=True,
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "CohereLabs/command-a-vision-07-2025"

@@ -166,6 +202,46 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_deepseek_ocr(question: str, image_urls: list[str]) -> ModelRequestData:
+    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
+
+    model_name = "deepseek-ai/DeepSeek-OCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        logits_processors=[NGramPerReqLogitsProcessor],
+    )
+
+    placeholder = "<image>\n" * len(image_urls)
+    prompt = placeholder + question
+
+    # The following sampling params config is taken from
+    # the official Deepseek-OCR inference example.
+    # (IMPORTANT) Use the custom logits processor and avoid skipping
+    # special tokens for this model for the optimal OCR performance.
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=8192,
+        # ngram logit processor args
+        extra_args=dict(
+            ngram_size=30,
+            window_size=90,
+            # whitelist: <td>, </td>
+            whitelist_token_ids={128821, 128822},
+        ),
+        skip_special_tokens=False,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+        sampling_params=sampling_params,
+    )
+
+
 def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "google/gemma-3-4b-it"

@@ -309,7 +385,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:


 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "internlm/Intern-S1"
+    model_name = "internlm/Intern-S1-mini"

    engine_args = EngineArgs(
        model=model_name,
@@ -371,13 +447,14 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


-def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=131072,
-        tensor_parallel_size=8,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -389,29 +466,32 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
                *placeholders,
                {"type": "text", "text": question},
            ],
-        }
+        },
    ]

-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

+    image_data = [fetch_image(url) for url in image_urls]
+
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=image_data,
    )


-def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
-    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
-    # it will generate poor response for multi-image inputs!
-    model_name = "llava-hf/llava-1.5-7b-hf"
+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
    engine_args = EngineArgs(
        model=model_name,
-        max_num_seqs=16,
+        trust_remote_code=True,
+        max_model_len=32768,
+        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -423,28 +503,32 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
                *placeholders,
                {"type": "text", "text": question},
            ],
-        }
+        },
    ]

-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

+    image_data = [fetch_image(url) for url in image_urls]
+
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=image_data,
    )


-def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
+
    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=16,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -459,7 +543,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
        }
    ]

-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
@@ -472,12 +556,13 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    )


-def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=16384,
-        max_num_seqs=16,
+        max_model_len=131072,
+        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -505,14 +590,13 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
    )


-def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
-
+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
+    # it will generate poor response for multi-image inputs!
+    model_name = "llava-hf/llava-1.5-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
+        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -524,32 +608,28 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
                *placeholders,
                {"type": "text", "text": question},
            ],
-        },
+        }
    ]

-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

-    image_data = [fetch_image(url) for url in image_urls]
-
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
-        image_data=image_data,
+        image_data=[fetch_image(url) for url in image_urls],
    )


-def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
-
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
-        trust_remote_code=True,
        max_model_len=8192,
-        max_num_seqs=5,
+        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -561,32 +641,28 @@ def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
                *placeholders,
                {"type": "text", "text": question},
            ],
-        },
+        }
    ]

-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

-    image_data = [fetch_image(url) for url in image_urls]
-
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
-        image_data=image_data,
+        image_data=[fetch_image(url) for url in image_urls],
    )


-def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
-
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
    engine_args = EngineArgs(
        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        max_num_seqs=4,
+        max_model_len=16384,
+        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -601,7 +677,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
        }
    ]

-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
@@ -713,13 +789,32 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
-    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+    prompt = (
+        f"<|im_start|>user\n\n{placeholders}\n{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )

-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
    )

+    placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
+    prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "
+
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
@@ -1217,8 +1312,10 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
 model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
+    "bee": load_bee,
    "command_a_vision": load_command_a_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
+    "deepseek_ocr": load_deepseek_ocr,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
@@ -1236,6 +1333,7 @@ model_example_map = {
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
+    "paddleocr_vl": load_paddleocr_vl,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "phi4_multimodal": load_phi4_multimodal,
@@ -1253,7 +1351,7 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
    req_data = model_example_map[model](question, image_urls)

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1279,7 +1377,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
        print("-" * 50)


-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
@@ -1291,8 +1389,12 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[in
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

-    sampling_params = SamplingParams(
-        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
+    sampling_params = (
+        SamplingParams(
+            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
+        )
+        if req_data.sampling_params is None
+        else req_data.sampling_params
    )
    outputs = llm.chat(
        [

--- a/examples/offline_inference/vision_language_pooling.py
+++ b/examples/offline_inference/vision_language_pooling.py
@@ -10,14 +10,18 @@ on HuggingFace model repository.

 from argparse import Namespace
 from dataclasses import asdict
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from pathlib import Path
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args

 from PIL.Image import Image

 from vllm import LLM, EngineArgs
 from vllm.entrypoints.score_utils import ScoreMultiModalParam
 from vllm.multimodal.utils import fetch_image
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+ROOT_DIR = Path(__file__).parent.parent.parent
+EXAMPLES_DIR = ROOT_DIR / "examples"


 class TextQuery(TypedDict):
@@ -43,15 +47,39 @@ class TextImagesQuery(TypedDict):


 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery


 class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
-    prompt: Optional[str] = None
-    image: Optional[Image] = None
-    query: Optional[str] = None
-    documents: Optional[ScoreMultiModalParam] = None
+    prompt: str | None = None
+    image: Image | None = None
+    query: str | None = None
+    documents: ScoreMultiModalParam | None = None
+
+
+def run_clip(query: Query) -> ModelRequestData:
+    if query["modality"] == "text":
+        prompt = query["text"]
+        image = None
+    elif query["modality"] == "image":
+        prompt = ""  # For image input, make sure that the prompt text is empty
+        image = query["image"]
+    else:
+        modality = query["modality"]
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="openai/clip-vit-base-patch32",
+        runner="pooling",
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )


 def run_e5_v(query: Query) -> ModelRequestData:
@@ -82,23 +110,74 @@ def run_e5_v(query: Query) -> ModelRequestData:
    )


-def run_vlm2vec(query: Query) -> ModelRequestData:
+def run_jinavl_reranker(query: Query) -> ModelRequestData:
+    if query["modality"] != "text+images":
+        raise ValueError(f"Unsupported query modality: '{query['modality']}'")
+
+    engine_args = EngineArgs(
+        model="jinaai/jina-reranker-m0",
+        runner="pooling",
+        max_model_len=32768,
+        trust_remote_code=True,
+        mm_processor_kwargs={
+            "min_pixels": 3136,
+            "max_pixels": 602112,
+        },
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        query=query["text"],
+        documents=query["image"],
+    )
+
+
+def run_siglip(query: Query) -> ModelRequestData:
+    if query["modality"] == "text":
+        prompt = query["text"]
+        image = None
+    elif query["modality"] == "image":
+        prompt = ""  # For image input, make sure that the prompt text is empty
+        image = query["image"]
+    else:
+        modality = query["modality"]
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="google/siglip-base-patch16-224",
+        runner="pooling",
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
+
+
+def _get_vlm2vec_prompt_image(query: Query, image_token: str):
    if query["modality"] == "text":
        text = query["text"]
-        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        prompt = f"Find me an everyday image that matches the given caption: {text}"
        image = None
    elif query["modality"] == "image":
-        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
        image = query["image"]
    elif query["modality"] == "text+image":
        text = query["text"]
-        prompt = (
-            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
-        )
+        prompt = f"{image_token} Represent the given image with the following question: {text}"  # noqa: E501
        image = query["image"]
    else:
        modality = query["modality"]
-        raise ValueError(f"Unsupported query modality: '{modality}'")
+        raise ValueError(f"Unsupported query modality: {modality!r}")
+
+    return prompt, image
+
+
+def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")

    engine_args = EngineArgs(
        model="TIGER-Lab/VLM2Vec-Full",
@@ -116,26 +195,66 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
    )


-def run_jinavl_reranker(query: Query) -> ModelRequestData:
-    if query["modality"] != "text+images":
-        raise ValueError(f"Unsupported query modality: '{query['modality']}'")
+def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
+    # vLLM does not support LoRA adapters on multi-modal encoder,
+    # so we merge the weights first
+    from huggingface_hub.constants import HF_HUB_CACHE
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    from vllm.entrypoints.chat_utils import load_chat_template
+
+    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
+
+    base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        model_id,
+        config=PeftConfig.from_pretrained(model_id),
+    )
+    model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+    model._hf_peft_config_loaded = False  # Needed to save the merged model
+
+    processor = AutoProcessor.from_pretrained(
+        model_id,
+        # `min_pixels` and `max_pixels` are deprecated for
+        # transformers `preprocessor_config.json`
+        size={"shortest_edge": 3136, "longest_edge": 12845056},
+    )
+    processor.chat_template = load_chat_template(
+        # The original chat template is not correct
+        EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
+    )
+
+    merged_path = str(
+        Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
+    )
+    print(f"Saving merged model to {merged_path}...")
+    print(
+        "NOTE: This directory is not tracked by `huggingface_hub` "
+        "so you have to delete this manually if you don't want it anymore."
+    )
+    model.save_pretrained(merged_path)
+    processor.save_pretrained(merged_path)
+    print("Done!")
+
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")

    engine_args = EngineArgs(
-        model="jinaai/jina-reranker-m0",
+        model=merged_path,
        runner="pooling",
-        max_model_len=32768,
-        trust_remote_code=True,
+        max_model_len=4096,
        mm_processor_kwargs={
            "min_pixels": 3136,
-            "max_pixels": 602112,
+            "max_pixels": 12845056,
        },
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
-        query=query["text"],
-        documents=query["image"],
+        prompt=prompt,
+        image=image,
    )


@@ -186,7 +305,7 @@ def get_query(modality: QueryModality):
    raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
    query = get_query(modality)
    req_data = model_example_map[model](query)

@@ -216,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
        print("-" * 50)


-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
    query = get_query(modality)
    req_data = model_example_map[model](query)

@@ -231,9 +350,12 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):


 model_example_map = {
+    "clip": run_clip,
    "e5_v": run_e5_v,
-    "vlm2vec": run_vlm2vec,
    "jinavl_reranker": run_jinavl_reranker,
+    "siglip": run_siglip,
+    "vlm2vec_phi3v": run_vlm2vec_phi3v,
+    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
 }


@@ -246,7 +368,7 @@ def parse_args():
        "--model-name",
        "-m",
        type=str,
-        default="vlm2vec",
+        default="vlm2vec_phi3v",
        choices=model_example_map.keys(),
        help="The name of the embedding model.",
    )

--- a/examples/online_serving/chart-helm/README.md
+++ b/examples/online_serving/chart-helm/README.md
@@ -19,3 +19,15 @@ This directory contains a Helm chart for deploying the vllm application. The cha
 - templates/pvc.yaml: Template for Persistent Volume Claims.
 - templates/secrets.yaml: Template for Kubernetes Secrets.
 - templates/service.yaml: Template for creating Services.
+
+## Running Tests
+
+This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
+
+```bash
+# Install plugin
+helm plugin install https://github.com/helm-unittest/helm-unittest
+
+# Run tests
+helm unittest .
+```
--- a/examples/online_serving/chart-helm/templates/_helpers.tpl
+++ b/examples/online_serving/chart-helm/templates/_helpers.tpl
@@ -123,9 +123,6 @@ runAsUser:
 {{-   end }}
 {{- end }}

-{{- define "chart.extraInitImage" -}}
-"amazon/aws-cli:2.6.4"
-{{- end }}

 {{- define "chart.extraInitEnv" -}}
 - name: S3_ENDPOINT_URL
@@ -148,11 +145,15 @@ runAsUser:
    secretKeyRef:
      name: {{ .Release.Name }}-secrets
      key: s3accesskey
+{{- if .Values.extraInit.s3modelpath }}
 - name: S3_PATH
  value: "{{ .Values.extraInit.s3modelpath }}"
+{{- end }}
+{{- if hasKey .Values.extraInit "awsEc2MetadataDisabled" }}
 - name: AWS_EC2_METADATA_DISABLED
  value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
 {{- end }}
+{{- end }}

 {{/*
  Define chart labels

--- a/examples/online_serving/chart-helm/templates/deployment.yaml
+++ b/examples/online_serving/chart-helm/templates/deployment.yaml
@@ -72,16 +72,21 @@ spec:
        {{ toYaml . | nindent 8 }}
        {{- end }}

-      {{-   if .Values.extraInit  }}
+      {{- if and .Values.extraInit (or .Values.extraInit.modelDownload.enabled .Values.extraInit.initContainers) }}
      initContainers:
+      {{- if .Values.extraInit.modelDownload.enabled }}
      - name: wait-download-model
-        image: {{ include "chart.extraInitImage" . }}
-        command: 
-          - /bin/bash
+        image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
+        imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
+        command: {{ .Values.extraInit.modelDownload.waitContainer.command | toJson }}
        args:
-          - -eucx
-          - while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done
-        env: {{- include "chart.extraInitEnv" . | nindent 10 }}
+        {{- toYaml .Values.extraInit.modelDownload.waitContainer.args | nindent 10 }}
+        env:
+        {{- if .Values.extraInit.modelDownload.waitContainer.env }}
+        {{- toYaml .Values.extraInit.modelDownload.waitContainer.env | nindent 10 }}
+        {{- else }}
+        {{- include "chart.extraInitEnv" . | nindent 10 }}
+        {{- end }}
        resources:
          requests:
            cpu: 200m
@@ -93,6 +98,10 @@ spec:
        - name: {{ .Release.Name }}-storage
          mountPath: /data
      {{- end }}
+      {{- with .Values.extraInit.initContainers }}
+      {{- toYaml . | nindent 6 }}
+      {{- end }}
+      {{- end }}
      volumes:
        - name: {{ .Release.Name }}-storage
          persistentVolumeClaim:

--- a/examples/online_serving/chart-helm/templates/job.yaml
+++ b/examples/online_serving/chart-helm/templates/job.yaml
-{{-   if .Values.extraInit  }}
+{{- if and .Values.extraInit .Values.extraInit.modelDownload.enabled }}
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -12,13 +12,17 @@ spec:
   spec:
    containers:
    - name: job-download-model
-      image: {{ include "chart.extraInitImage" . }}
-      command: 
-        - /bin/bash
+      image: {{ .Values.extraInit.modelDownload.image.repository }}:{{ .Values.extraInit.modelDownload.image.tag }}
+      imagePullPolicy: {{ .Values.extraInit.modelDownload.image.pullPolicy }}
+      command: {{ .Values.extraInit.modelDownload.downloadJob.command | toJson }}
      args:
-        - -eucx
-        - aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data
-      env: {{- include "chart.extraInitEnv" . | nindent 8 }}
+      {{- toYaml .Values.extraInit.modelDownload.downloadJob.args | nindent 8 }}
+      env:
+      {{- if .Values.extraInit.modelDownload.downloadJob.env }}
+      {{- toYaml .Values.extraInit.modelDownload.downloadJob.env | nindent 8 }}
+      {{- else }}
+      {{- include "chart.extraInitEnv" . | nindent 8 }}
+      {{- end }}
      volumeMounts:
        - name: {{ .Release.Name }}-storage
          mountPath: /data

--- a/examples/online_serving/chart-helm/tests/deployment_test.yaml
+++ b/examples/online_serving/chart-helm/tests/deployment_test.yaml
+suite: test deployment
+templates:
+  - deployment.yaml
+tests:
+  - it: should create wait-download-model init container when modelDownload is enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        initContainers: [ ]
+        pvcStorage: "1Gi"
+        s3modelpath: "relative_s3_model_path/opt-125m"
+        awsEc2MetadataDisabled: true
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - isNotEmpty:
+          path: spec.template.spec.initContainers
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: wait-download-model
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.initContainers[0].imagePullPolicy
+          value: IfNotPresent
+
+  - it: should only create custom init containers when modelDownload is disabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: false
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "echo test" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "echo test" ]
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+            imagePullPolicy: IfNotPresent
+            ports:
+              - containerPort: 8080
+                name: proxy
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - lengthEqual:
+          path: spec.template.spec.initContainers
+          count: 1
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: llm-d-routing-proxy
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      - equal:
+          path: spec.template.spec.initContainers[0].ports[0].containerPort
+          value: 8080
+
+  - it: should create both wait-download-model and custom init containers when both are enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+            imagePullPolicy: IfNotPresent
+            ports:
+              - containerPort: 8080
+                name: proxy
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+      - lengthEqual:
+          path: spec.template.spec.initContainers
+          count: 2
+      - equal:
+          path: spec.template.spec.initContainers[0].name
+          value: wait-download-model
+      - equal:
+          path: spec.template.spec.initContainers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.initContainers[1].name
+          value: llm-d-routing-proxy
+      - equal:
+          path: spec.template.spec.initContainers[1].image
+          value: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+      - equal:
+          path: spec.template.spec.initContainers[1].ports[0].containerPort
+          value: 8080
\ No newline at end of file
--- a/examples/online_serving/chart-helm/tests/job_test.yaml
+++ b/examples/online_serving/chart-helm/tests/job_test.yaml
+suite: test job
+templates:
+  - job.yaml
+tests:
+  - it: should create job when modelDownload is enabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "wait" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args:
+              - "-eucx"
+              - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+        pvcStorage: "1Gi"
+        s3modelpath: "relative_s3_model_path/opt-125m"
+        awsEc2MetadataDisabled: true
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Job
+      - equal:
+          path: spec.template.spec.containers[0].name
+          value: job-download-model
+      - equal:
+          path: spec.template.spec.containers[0].image
+          value: amazon/aws-cli:2.6.4
+      - equal:
+          path: spec.template.spec.restartPolicy
+          value: OnFailure
+
+  - it: should not create job when modelDownload is disabled
+    set:
+      extraInit:
+        modelDownload:
+          enabled: false
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "wait" ]
+          downloadJob:
+            command: [ "/bin/bash" ]
+            args: [ "-c", "download" ]
+        initContainers:
+          - name: llm-d-routing-proxy
+            image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 0
--- a/examples/online_serving/chart-helm/tests/pvc_test.yaml
+++ b/examples/online_serving/chart-helm/tests/pvc_test.yaml
+suite: test pvc
+templates:
+  - pvc.yaml
+tests:
+  # Test Case: PVC Created When extraInit Defined
+  - it: should create pvc when extraInit is defined
+    set:
+      extraInit:
+        modelDownload:
+          enabled: true
+          image:
+            repository: "amazon/aws-cli"
+            tag: "2.6.4"
+            pullPolicy: "IfNotPresent"
+          waitContainer:
+            command: ["/bin/bash"]
+            args: ["-c", "wait"]
+          downloadJob:
+            command: ["/bin/bash"]
+            args: ["-c", "download"]
+        pvcStorage: "10Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: spec.accessModes[0]
+          value: ReadWriteOnce
+      - equal:
+          path: spec.resources.requests.storage
+          value: 10Gi
\ No newline at end of file
--- a/examples/online_serving/chart-helm/values.schema.json
+++ b/examples/online_serving/chart-helm/values.schema.json
@@ -136,6 +136,70 @@
        "extraInit": {
            "type": "object",
            "properties": {
+                "modelDownload": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        },
+                        "image": {
+                            "type": "object",
+                            "properties": {
+                                "repository": {
+                                    "type": "string"
+                                },
+                                "tag": {
+                                    "type": "string"
+                                },
+                                "pullPolicy": {
+                                    "type": "string"
+                                }
+                            },
+                            "required": ["repository", "tag", "pullPolicy"]
+                        },
+                        "waitContainer": {
+                            "type": "object",
+                            "properties": {
+                                "command": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "args": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "env": {
+                                    "type": "array",
+                                    "items": {"type": "object"}
+                                }
+                            },
+                            "required": ["command", "args"]
+                        },
+                        "downloadJob": {
+                            "type": "object",
+                            "properties": {
+                                "command": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "args": {
+                                    "type": "array",
+                                    "items": {"type": "string"}
+                                },
+                                "env": {
+                                    "type": "array",
+                                    "items": {"type": "object"}
+                                }
+                            },
+                            "required": ["command", "args"]
+                        }
+                    },
+                    "required": ["enabled", "image", "waitContainer", "downloadJob"]
+                },
+                "initContainers": {
+                    "type": "array",
+                    "items": {"type": "object"}
+                },
                "s3modelpath": {
                    "type": "string"
                },
@@ -147,9 +211,9 @@
                }
            },
            "required": [
-                "pvcStorage",
-                "s3modelpath",
-                "awsEc2MetadataDisabled"
+                "modelDownload",
+                "initContainers",
+                "pvcStorage"
            ]
        },
        "extraContainers": {

--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -75,10 +75,65 @@ maxUnavailablePodDisruptionBudget: ""

 # -- Additional configuration for the init container
 extraInit:
-   # -- Path of the model on the s3 which hosts model weights and config files
+  # -- Model download functionality (optional)
+  modelDownload:
+    # -- Enable model download job and wait container
+    enabled: true
+    # -- Image configuration for model download operations
+    image:
+      # -- Image repository
+      repository: "amazon/aws-cli"
+      # -- Image tag
+      tag: "2.6.4"
+      # -- Image pull policy
+      pullPolicy: "IfNotPresent"
+    # -- Wait container configuration (init container that waits for model to be ready)
+    waitContainer:
+      # -- Command to execute
+      command: ["/bin/bash"]
+      # -- Arguments for the wait container
+      args:
+        - "-eucx"
+        - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
+      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+      # env:
+      #   - name: HUGGING_FACE_HUB_TOKEN
+      #     value: "your-token"
+      #   - name: MODEL_ID
+      #     value: "meta-llama/Llama-2-7b"
+    # -- Download job configuration (job that actually downloads the model)
+    downloadJob:
+      # -- Command to execute
+      command: ["/bin/bash"]
+      # -- Arguments for the download job
+      args:
+        - "-eucx"
+        - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
+      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
+      # env:
+      #   - name: HUGGING_FACE_HUB_TOKEN
+      #     value: "your-token"
+      #   - name: MODEL_ID
+      #     value: "meta-llama/Llama-2-7b"
+
+  # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
+  initContainers: []
+  # Example for llm-d sidecar:
+  # initContainers:
+  #   - name: llm-d-routing-proxy
+  #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+  #     imagePullPolicy: IfNotPresent
+  #     ports:
+  #       - containerPort: 8080
+  #         name: proxy
+  #     securityContext:
+  #       runAsUser: 1000
+
+  # -- Path of the model on the s3 which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
-   # -- Storage size of the s3
+  # -- Storage size for the PVC
  pvcStorage: "1Gi"
+  # -- Disable AWS EC2 metadata service
  awsEc2MetadataDisabled: true

 # -- Additional containers configuration