Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@@ -35,10 +35,13 @@ def test_models(
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        def print_model(model):
-              model_runner.model)
+            print(model)
+        vllm_model.apply_model(print_model)
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]

--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -53,10 +53,13 @@ def test_models(
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        def print_model(model):
-              model_runner.model)
+            print(model)
+        vllm_model.apply_model(print_model)
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -50,6 +50,10 @@ from ....utils import models_path_prefix
        ),
        pytest.param(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t")),  # stablelm
        pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),  # starcoder2
+        pytest.param(
+            os.path.join(models_path_prefix, "ehristoforu/Falcon3-MoE-2x7B-Insruct"),  # mixtral
+            marks=[pytest.mark.cpu_model],
+        )
    ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
@@ -71,10 +75,13 @@ def test_models(
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        def print_model(model):
-              model_runner.model)
+            print(model)
+        vllm_model.apply_model(print_model)
    check_logprobs_close(
        outputs_0_lst=hf_outputs,

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -10,7 +10,7 @@ from typing import Type
 import os
 import pytest
 from transformers import AutoModelForVision2Seq
-from transformers.utils import is_flash_attn_2_available
+from transformers import __version__ as TRANSFORMERS_VERSION
 from vllm.platforms import current_platform
 from vllm.utils import identity
@@ -141,12 +141,7 @@ VLM_TEST_SETTINGS = {
    #### Extended model tests
    "aria": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
-        tokenizer_mode="slow",
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-        ),
-        dtype="bfloat16",
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
        max_model_len=4096,
@@ -162,8 +157,8 @@ VLM_TEST_SETTINGS = {
        max_tokens=64,
        marks=[
            pytest.mark.skipif(
-                not is_flash_attn_2_available(),
+                TRANSFORMERS_VERSION < "4.48.0",
-                reason="Model needs flash-attn for numeric convergence.",
+                reason="HF model requires transformers>=4.48.0",
            ),
            large_gpu_mark(min_gb=64),
        ],
@@ -181,6 +176,7 @@ VLM_TEST_SETTINGS = {
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
+        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        postprocess_inputs=model_utils.cast_dtype_post_processor(
            "pixel_values"
@@ -192,6 +188,30 @@ VLM_TEST_SETTINGS = {
        max_tokens=8,
        dtype="bfloat16",
    ),
+    "deepseek_vl_v2": VLMTestInfo(
+        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
+            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
+        }),
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
+        vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
+        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
+        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
+        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
+        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
+        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
+        marks=[
+            pytest.mark.skipif(
+                TRANSFORMERS_VERSION >= "4.48.0",
+                reason="HF model is not compatible with transformers>=4.48.0",
+            )
+        ],
+    ),
    "fuyu": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "adept/fuyu-8b")],
        test_type=VLMTestType.IMAGE,
@@ -214,7 +234,7 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16",
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        patch_hf_runner=model_utils.glm_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=48)],
+        marks=[large_gpu_mark(min_gb=32)],
    ),
    "h2ovl": VLMTestInfo(
        models = [
@@ -263,6 +283,7 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=32)],
    ),
    "llava_next": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
@@ -277,10 +298,8 @@ VLM_TEST_SETTINGS = {
            ),
            limit_mm_per_prompt={"image": 4},
        )],
-        # Llava-next tests fixed sizes & the default size factors
-        image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
    ),
-    "llava_one_vision": VLMTestInfo(
+    "llava_onevision": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
@@ -291,8 +310,6 @@ VLM_TEST_SETTINGS = {
        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
-        # Llava-one-vision tests fixed sizes & the default size factors
-        image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
@@ -309,7 +326,6 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
-        image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
    ),
    "mantis": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3")],
@@ -336,6 +352,20 @@ VLM_TEST_SETTINGS = {
        postprocess_inputs=model_utils.wrap_inputs_post_processor,
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
    ),
+    "minicpmo_26": VLMTestInfo(
+        models=["openbmb/MiniCPM-o-2_6"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
+        max_model_len=4096,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
+        postprocess_inputs=model_utils.ignore_inputs_post_processor(
+            "image_sizes"
+        ),
+        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmo_patch_hf_runner
+    ),
    "minicpmv_26": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -349,6 +379,16 @@ VLM_TEST_SETTINGS = {
        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
    ),
+    "molmo": VLMTestInfo(
+        models=["allenai/Molmo-7B-D-0924"],
+        test_type=(VLMTestType.IMAGE),
+        prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        image_size_factors=[(),(1.0, 1.0, 1.0)],
+        patch_hf_runner=model_utils.mlomo_patch_hf_runner,
+        postprocess_inputs=model_utils.molmo_post_processor,
+    ),
    # Tests for phi3v currently live in another file because of a bug in
    # transformers. Once this issue is fixed, we can enable them here instead.
    # https://github.com/huggingface/transformers/issues/34307
@@ -434,7 +474,7 @@ VLM_TEST_SETTINGS = {
            ) for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
-    "llava_one_vision-multiple-images": VLMTestInfo(
+    "llava_onevision-multiple-images": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
@@ -497,12 +537,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
 # - image embeddings
 # - video
 # - custom inputs
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.IMAGE,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=False,
+        test_type=VLMTestType.IMAGE,
-                         ))
+        fork_new_process_for_each_test=False,
+    ))
 def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: Type[HfRunner],
@@ -519,12 +560,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.MULTI_IMAGE,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=False,
+        test_type=VLMTestType.MULTI_IMAGE,
-                         ))
+        fork_new_process_for_each_test=False,
+    ))
 def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: Type[HfRunner],
@@ -541,12 +583,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.EMBEDDING,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=False,
+        test_type=VLMTestType.EMBEDDING,
-                         ))
+        fork_new_process_for_each_test=False,
+    ))
 def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: Type[HfRunner],
@@ -562,12 +605,13 @@ def test_image_embedding_models(model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.VIDEO,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=False,
+        test_type=VLMTestType.VIDEO,
-                         ))
+        fork_new_process_for_each_test=False,
+    ))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
                      video_assets: _VideoAssets):
@@ -581,12 +625,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.CUSTOM_INPUTS,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=False,
+        test_type=VLMTestType.CUSTOM_INPUTS,
-                         ))
+        fork_new_process_for_each_test=False,
+    ))
 def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -603,12 +648,13 @@ def test_custom_inputs_models(
 #### Tests filtering for things running each test as a new process
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.IMAGE,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=True,
+        test_type=VLMTestType.IMAGE,
-                         ))
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
@@ -626,12 +672,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.MULTI_IMAGE,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=True,
+        test_type=VLMTestType.MULTI_IMAGE,
-                         ))
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
@@ -649,12 +696,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.EMBEDDING,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=True,
+        test_type=VLMTestType.EMBEDDING,
-                         ))
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
@@ -671,12 +719,13 @@ def test_image_embedding_models_heavy(model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.VIDEO,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=True,
+        test_type=VLMTestType.VIDEO,
-                         ))
+        fork_new_process_for_each_test=True,
+    ))
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: Type[HfRunner],
                            vllm_runner: Type[VllmRunner],
@@ -691,12 +740,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
-                         get_parametrized_options(
+    "model_type,test_case",
-                             VLM_TEST_SETTINGS,
+    get_parametrized_options(
-                             test_type=VLMTestType.CUSTOM_INPUTS,
+        VLM_TEST_SETTINGS,
-                             fork_new_process_for_each_test=True,
+        test_type=VLMTestType.CUSTOM_INPUTS,
-                         ))
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_custom_inputs_models_heavy(
    model_type: str,

--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -138,10 +138,10 @@ def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
 ) -> None:
-    json_data = [(tokens, text,
+    json_data = [(tokens, text, [{
-                  [{k: asdict(v)
+        k: asdict(v)
-                    for k, v in token_logprobs.items()}
+        for k, v in token_logprobs.items()
-                   for token_logprobs in (logprobs or [])])
+    } for token_logprobs in (logprobs or [])])
                 for tokens, text, logprobs in outputs]
    with open(filename, "w") as f:
@@ -152,11 +152,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    with open(filename, "rb") as f:
        json_data = json.load(f)
-    return [(tokens, text,
+    return [(tokens, text, [{
-             [{int(k): Logprob(**v)
+        int(k): Logprob(**v)
-               for k, v in token_logprobs.items()}
+        for k, v in token_logprobs.items()
-              for token_logprobs in logprobs])
+    } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
-            for tokens, text, logprobs in json_data]
 @large_gpu_test(min_gb=80)

--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -6,7 +6,6 @@ import pytest
 import torch
 from PIL import Image
-from vllm.entrypoints.llm import LLM
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
@@ -71,7 +70,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
 def batch_make_image_embeddings(
        image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
-        llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]:
+        llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]:
    """batched image embeddings for Qwen2-VL
    This will infer all images' embeddings in a single batch, 
@@ -107,17 +106,19 @@ def batch_make_image_embeddings(
    pixel_values = preprocess_result["pixel_values"]
    image_grid_thw = preprocess_result["image_grid_thw"]
-    # pixel values to embeddinds & grid_thws
+    # pixel values to embeddings & grid_thws
-    with torch.no_grad():
+    def get_image_embeds(model):
-        visual = llm.llm_engine.model_executor.driver_worker. \
+        with torch.no_grad():
-            model_runner.model.visual
+            visual = model.visual
-        pixel_values_on_device = pixel_values.to(visual.device,
+            pixel_values_on_device = pixel_values.to(visual.device,
-                                                 dtype=visual.dtype)
+                                                     dtype=visual.dtype)
-        image_grid_thw_on_device = image_grid_thw.to(visual.device,
+            image_grid_thw_on_device = image_grid_thw.to(visual.device,
-                                                     dtype=torch.int64)
+                                                         dtype=torch.int64)
-        image_embeds = visual(pixel_values_on_device,
+            return visual(pixel_values_on_device,
-                              grid_thw=image_grid_thw_on_device)
+                          grid_thw=image_grid_thw_on_device)
+    image_embeds = torch.concat(llm.apply_model(get_image_embeds))
    # split into original batches
    result: List[Qwen2VLPromptImageEmbeddingInput] = []
@@ -126,11 +127,10 @@ def batch_make_image_embeddings(
    for image_batch in image_batches_:
        cur_batch_image_count = len(image_batch)
        merge_size = image_processor.merge_size
-        cur_batch_embed_len = sum([
+        cur_batch_embed_len = sum(
-            grid_thw.prod() // merge_size // merge_size
+            grid_thw.prod(-1) // merge_size // merge_size
            for grid_thw in image_grid_thw[image_counter:image_counter +
-                                           cur_batch_image_count]
+                                           cur_batch_image_count])
-        ])
        result.append({
            "image_embeds":
@@ -153,7 +153,7 @@ def batch_make_image_embeddings(
 def batch_make_video_embeddings(
        video_batches: PromptVideoInput, processor,
-        llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]:
+        llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]:
    """batched video embeddings for Qwen2-VL
    A NDArray represents a single video's all frames.
@@ -189,17 +189,19 @@ def batch_make_video_embeddings(
    pixel_values = preprocess_result["pixel_values_videos"]
    video_grid_thw = preprocess_result["video_grid_thw"]
-    # pixel values to embeddinds & grid_thws
+    # pixel values to embeddings & grid_thws
-    with torch.no_grad():
+    def get_image_embeds(model):
-        visual = llm.llm_engine.model_executor.driver_worker.\
+        with torch.no_grad():
-            model_runner.model.visual
+            visual = model.visual
+            pixel_values_on_device = pixel_values.to(visual.device,
+                                                     dtype=visual.dtype)
+            video_grid_thw_on_device = video_grid_thw.to(visual.device,
+                                                         dtype=torch.int64)
+            return visual(pixel_values_on_device,
+                          grid_thw=video_grid_thw_on_device)
-        pixel_values_on_device = pixel_values.to(visual.device,
+    video_embeds = torch.concat(llm.apply_model(get_image_embeds))
-                                                 dtype=visual.dtype)
-        video_grid_thw_on_device = video_grid_thw.to(visual.device,
-                                                     dtype=torch.int64)
-        video_embeds = visual(pixel_values_on_device,
-                              grid_thw=video_grid_thw_on_device)
    # split into original batches
    result: List[Qwen2VLPromptVideoEmbeddingInput] = []
@@ -208,11 +210,10 @@ def batch_make_video_embeddings(
    for video_batch in video_batches_:
        cur_batch_video_count = len(video_batch)
        merge_size = image_processor.merge_size
-        cur_batch_embed_len = sum([
+        cur_batch_embed_len = sum(
-            grid_thw.prod() // merge_size // merge_size
+            grid_thw.prod(-1) // merge_size // merge_size
            for grid_thw in video_grid_thw[video_counter:video_counter +
-                                           cur_batch_video_count]
+                                           cur_batch_video_count])
-        ])
        result.append({
            "video_embeds":
@@ -282,9 +283,9 @@ def run_embedding_input_test(
                max_tokens,
                num_logprobs=num_logprobs,
                images=batch_make_image_embeddings(
-                    images, processor, vllm_model.model) if images else None,
+                    images, processor, vllm_model) if images else None,
                videos=batch_make_video_embeddings(
-                    videos, processor, vllm_model.model) if videos else None)
+                    videos, processor, vllm_model) if videos else None)
            for prompts, images, videos in inputs
        ]
@@ -429,130 +430,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
        mm_limit=1,
        tensor_parallel_size=1,
    )
-def run_chunked_prefill_test(
-    vllm_runner: Type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    mm_limit: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Compare inference result between
-    chunked prefill disabled and chunked prefill enabled
-    """
-    # NOTE:
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     task="generate",
-                     max_model_len=4000,
-                     max_num_seqs=4,
-                     dtype=dtype,
-                     limit_mm_per_prompt={
-                         "image": mm_limit,
-                         "video": mm_limit
-                     },
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images or None,
-                                                videos=videos or None)
-            for prompts, images, videos in inputs
-        ]
-    with vllm_runner(
-            model,
-            task="generate",
-            max_model_len=4000,
-            max_num_seqs=4,
-            dtype=dtype,
-            limit_mm_per_prompt={
-                "image": mm_limit,
-                "video": mm_limit
-            },
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_chunked_prefill=True,
-            # should be small enough to ensure prefilling is chunked
-            max_num_batched_tokens=32,
-            mm_processor_kwargs={
-                "max_pixels": 16 * 28 * 28,
-            }) as vllm_model_chunked:
-        outputs_per_case_chunked = [
-            vllm_model_chunked.generate_greedy_logprobs(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images or None,
-                videos=videos or None) for prompts, images, videos in inputs
-        ]
-    for outputs, \
-        outputs_chunked \
-        in zip(outputs_per_case,
-            outputs_per_case_chunked):
-        check_logprobs_close(
-            outputs_0_lst=outputs,
-            outputs_1_lst=outputs_chunked,
-            name_0="non_chunked",
-            name_1="chunked",
-        )
-@pytest.mark.core_model
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [1])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts,
-                                        model: str, dtype: str,
-                                        max_tokens: int,
-                                        num_logprobs: int) -> None:
-    """
-    Test Qwen2-VL's chunked prefill with M-RoPE
-    """
-    prompts = [
-        qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt)
-        for prompt in example_prompts[:1]
-    ]
-    # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs,
-    #    so an image is included in the inputs
-    # 2. however, Qwen2-VL currently won't work properly
-    #    when chunked prefill is enabled and there are some multi-modal inputs,
-    #    here use a hacky way: provide a **zero-length** image to make it happy
-    #
-    # and finally we achieved:
-    # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests
-    zero_len_image = {
-        "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)),
-        "image_grid_thw": torch.tensor([[0, 0, 0]])
-    }
-    images = [zero_len_image] * len(prompts)
-    inputs_per_case: List[Tuple[List[str], PromptImageInput,
-                                PromptVideoInput]] = [
-                                    (prompts, images, []),
-                                ]
-    run_chunked_prefill_test(
-        vllm_runner,
-        inputs_per_case,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -5,17 +5,20 @@ typically specific to a small subset of models.
 import re
 import types
 from pathlib import PosixPath
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from PIL.Image import Image
-from transformers import AutoConfig, AutoTokenizer, BatchEncoding
+from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
+                          GenerationConfig)
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from .....conftest import HfRunner, ImageAsset, _ImageAssets
+from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
+                           PromptImageInput, PromptVideoInput, _ImageAssets)
+from ....utils import TokensTextLogprobs
 from .types import RunnerOutput
@@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
 ####### Post-processors for HF outputs
+def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
+                                model: str) -> RunnerOutput:
+    output_ids, output_str, out_logprobs = hf_output
+    if output_str.endswith("<｜end▁of▁sentence｜>"):
+        output_str = output_str.split("<｜end▁of▁sentence｜>")[0]
+    return output_ids, output_str, out_logprobs
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
@@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
    return {"model_inputs": hf_inputs}
+def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
+    hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
+    return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
        tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
@@ -253,6 +269,34 @@ def qwen_prompt_path_encoder(
 ####### Model-specific HuggingFace runner patchers
+def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for GLM4."""
+    hf_processor = hf_model.processor
+    def processor(*args, text="", images=None, **kwargs):
+        if isinstance(images, Image):
+            images = [images]
+        # inputs is a custom class instead of dict or BatchFeature
+        inputs = hf_processor(
+            *args,
+            prompt=text,
+            images=images,
+            **kwargs,
+        )
+        inputs = {
+            k: inputs[k]
+            for k in inputs.keys()  # noqa
+            if k not in ("seq_lens", "sft_format")
+        }
+        inputs = BatchEncoding(data=inputs, tensor_type="pt")
+        return inputs
+    hf_model.processor = processor
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language.model.embed_tokens
+    return hf_model
 def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for GLM4."""
    hf_processor = hf_model.processor
@@ -451,3 +495,99 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
    return hf_model
+def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    orig_generate = hf_model.model.generate
+    def _generate(self, *args, **kwargs):
+        return orig_generate(*args, decode_text=False, **kwargs)
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+    return hf_model
+def _generate_greedy_logprobs_limit(
+    self,
+    prompts: List[str],
+    max_tokens: int,
+    num_logprobs: int,
+    images: Optional[PromptImageInput] = None,
+    audios: Optional[PromptAudioInput] = None,
+    videos: Optional[PromptVideoInput] = None,
+    **kwargs: Any,
+) -> List[TokensTextLogprobs]:
+    all_inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+    # Process in batches for inference.
+    if len(all_inputs):
+        input_ids_lst = []
+        images_lst = []
+        images_input_idx_lst = []
+        imges_masks_lst = []
+        for inputs in all_inputs:
+            input_ids_lst.append(inputs["input_ids"])
+            images_lst.append(inputs["images"])
+            images_input_idx_lst.append(inputs["image_input_idx"])
+            imges_masks_lst.append(inputs["image_masks"])
+        batch_inputs = {}
+        batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
+        batch_inputs['images'] = torch.cat(images_lst, dim=0)
+        batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
+                                                    dim=0)
+        batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
+        outputs = self.model.generate_from_batch(
+            batch=self.wrap_device(batch_inputs,
+                                   device=self.model.device.type),
+            generation_config=GenerationConfig(
+                max_new_tokens=max_tokens,
+                stop_strings="<|endoftext|>",
+                do_sample=False,
+            ),
+            tokenizer=self.tokenizer,
+            output_hidden_states=True,
+            return_dict_in_generate=True,
+        )
+    all_logprobs: List[List[Dict[int, float]]] = []
+    all_output_ids: List[List[int]] = []
+    all_output_strs: List[str] = []
+    for index in range(len(all_inputs)):
+        (
+            seq_logprobs_lst,
+            output_len,
+        ) = self._hidden_states_to_logprobs(outputs.hidden_states,
+                                            num_logprobs)
+        all_logprobs.append(seq_logprobs_lst)
+        seq_ids = outputs.sequences[index]
+        output_ids = seq_ids[-output_len:]
+        all_output_ids.append(output_ids.tolist())
+        all_output_strs.append(self.tokenizer.decode(output_ids))
+    outputs = zip(all_output_ids, all_output_strs, all_logprobs)
+    return [(output_ids, output_str, output_logprobs)
+            for output_ids, output_str, output_logprobs in outputs]
+####### Molmo-specific HuggingFace runner patchers
+def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for Molmo."""
+    hf_processor = hf_model.processor
+    def _processor(*args, **kwargs):
+        return hf_processor.process(*args, **kwargs)
+    hf_model.processor = _processor
+    setattr(  # noqa: B010
+        hf_model,
+        "generate_greedy_logprobs_limit",
+        types.MethodType(_generate_greedy_logprobs_limit, hf_model),
+    )
+    return hf_model
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
@@ -26,10 +26,13 @@ def test_classification_models(
 ) -> None:
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.classify(example_prompts)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        def print_model(model):
-              model_runner.model)
+            print(model)
+        vllm_model.apply_model(print_model)
    with hf_runner(model,
                   dtype=dtype,

--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -18,15 +18,18 @@ from vllm.platforms import current_platform
        # [Encoder-only]
        pytest.param(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")),
        pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")),
-        # [Encoder-decoder]
+        # [Decoder-only]
-        pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
        pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
                     marks=[pytest.mark.core_model]),
-        pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")),
+        pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
        pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-1.5B-instruct")),
        pytest.param(os.path.join(models_path_prefix, "Alibaba-NLP/gte-Qwen2-7B-instruct")),
+        pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")),
+        # [Encoder-decoder]
+        pytest.param(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),
    ],
 )
@@ -66,10 +69,13 @@ def test_models(
                     max_model_len=None,
                     **vllm_extra_kwargs) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        def print_model(model):
-              model_runner.model)
+            print(model)
+        vllm_model.apply_model(print_model)
    check_embeddings_close(
        embeddings_0_lst=hf_outputs,

--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -6,6 +6,8 @@ import math
 import os
 import pytest
+import torch
+import torch.nn.functional as F
 from ....utils import models_path_prefix
 MODELS = [
@@ -13,6 +15,10 @@ MODELS = [
    os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),  # Roberta
 ]
+EMBEDDING_MODELS = [
+    "sentence-transformers/all-MiniLM-L12-v2",
+]
 TEXTS_1 = [
    "What is the capital of France?",
    "What is the capital of Germany?",
@@ -89,3 +95,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
+def emb_model_name(request):
+    yield request.param
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = hf_model.encode(text_pair)
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
+        ]
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[0], TEXTS_2[1]],
+    ]
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[1], TEXTS_2[1]],
+    ]
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
--- a/tests/models/encoder_decoder/audio_language/__init__.py
+++ b/tests/models/encoder_decoder/audio_language/__init__.py
--- a/tests/models/encoder_decoder/audio_language/test_whisper.py
+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py
+"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
+Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
+"""
+from typing import Optional
+import pytest
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from ....utils import fork_new_process_for_each_test, multi_gpu_test
+PROMPTS = [
+    {
+        "prompt":
+        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+        "multi_modal_data": {
+            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+        },
+    },
+    {  # Test explicit encoder/decoder prompt
+        "encoder_prompt": {
+            "prompt": "",
+            "multi_modal_data": {
+                "audio": AudioAsset("winning_call").audio_and_sample_rate,
+            },
+        },
+        "decoder_prompt":
+        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+    }
+]
+EXPECTED = {
+    "openai/whisper-tiny": [
+        " He has birth words I spoke in the original corner of that. And a"
+        " little piece of black coat poetry. Mary had a little sandwich,"
+        " sweet, with white and snow. And everyone had it very went the last"
+        " would sure to go.",
+        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
+        " to line down the field line for our base camp. Here comes joy. Here"
+        " is June and the third base. They're going to wave him in. The throw"
+        " to the plate will be late. The Mariners are going to play for the"
+        " American League Championship. I don't believe it. It just continues"
+        " by all five."
+    ],
+    "openai/whisper-small": [
+        " The first words I spoke in the original pornograph. A little piece"
+        " of practical poetry. Mary had a little lamb, its fleece was quite a"
+        " slow, and everywhere that Mary went the lamb was sure to go.",
+        " And the old one pitch on the way to Edgar Martinez one month. Here"
+        " comes joy. Here is Junior to third base. They're gonna wave him"
+        " in. The throw to the plate will be late. The Mariners are going to"
+        " play for the American League Championship. I don't believe it. It"
+        " just continues. My, oh my."
+    ],
+    "openai/whisper-medium": [
+        " The first words I spoke in the original phonograph, a little piece"
+        " of practical poetry. Mary had a little lamb, its fleece was quite as"
+        " slow, and everywhere that Mary went the lamb was sure to go.",
+        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
+        " down the left field line for Obeyshev. Here comes Joy. Here is"
+        " Jorgen at third base. They're going to wave him in. The throw to the"
+        " plate will be late. The Mariners are going to play for the American"
+        " League Championship. I don't believe it. It just continues. My, oh"
+        " my."
+    ],
+    "openai/whisper-large-v3": [
+        " The first words I spoke in the original phonograph, a little piece"
+        " of practical poetry. Mary had a little lamb, its feet were quite as"
+        " slow, and everywhere that Mary went, the lamb was sure to go.",
+        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
+        " Now the left field line for a base hit. Here comes Joy. Here is"
+        " Junior to third base. They're going to wave him in. The throw to the"
+        " plate will be late. The Mariners are going to play for the American"
+        " League Championship. I don't believe it. It just continues. My, oh,"
+        " my."
+    ],
+    "openai/whisper-large-v3-turbo": [
+        " The first words I spoke in the original phonograph, a little piece"
+        " of practical poetry. Mary had a little lamb, its streets were quite"
+        " as slow, and everywhere that Mary went the lamb was sure to go.",
+        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
+        " down the left field line for a base hit. Here comes Joy. Here is"
+        " Junior to third base. They're going to wave him in. The throw to the"
+        " plate will be late. The Mariners are going to play for the American"
+        " League Championship. I don't believe it. It just continues. My, oh,"
+        " my."
+    ]
+}
+def run_test(
+    model: str,
+    *,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+) -> None:
+    prompt_list = PROMPTS * 10
+    expected_list = EXPECTED[model] * 10
+    llm = LLM(
+        model=model,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )
+    sampling_params = SamplingParams(
+        temperature=0,
+        top_p=1.0,
+        max_tokens=200,
+    )
+    outputs = llm.generate(prompt_list, sampling_params)
+    for output, expected in zip(outputs, expected_list):
+        print(output.outputs[0].text)
+        assert output.outputs[0].text == expected
+@fork_new_process_for_each_test
+@pytest.mark.core_model
+@pytest.mark.parametrize(
+    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
+def test_models(model) -> None:
+    run_test(model, tensor_parallel_size=1)
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+def test_models_distributed(model, distributed_executor_backend) -> None:
+    run_test(model,
+             tensor_parallel_size=2,
+             distributed_executor_backend=distributed_executor_backend)
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -2,11 +2,15 @@ from typing import List, Optional, Tuple, Type, overload
 import os
 import pytest
+import torch
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                          BatchEncoding)
+from vllm.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                     global_force_attn_backend_context_manager)
+from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID,
+                                               MllamaForConditionalGeneration)
 from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -35,6 +39,29 @@ models = [
    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct"),
 ]
+# Indices for inputs
+TEXT_ONLY = '0'
+IMAGE_AT_BEG = '1'
+IMAGE_AT_MIDDLE = '2'
+TWO_IMAGES = '3'
+# Input tokenized
+prompt_data = {
+    # Tell me a story
+    TEXT_ONLY: [41551, 757, 264, 3446],
+    # <|image|> What's the content of this image
+    IMAGE_AT_BEG:
+    [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220],
+    # Hello <|image|>What' the content of this image
+    IMAGE_AT_MIDDLE:
+    [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217],
+    #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
+    TWO_IMAGES: [
+        MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
+        MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
+    ]
+}
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
@@ -367,3 +394,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )
+@large_gpu_test(min_gb=48)
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
+def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
+                    num_logprobs, attn_backend: _Backend) -> None:
+    stop_sign = image_assets[0].pil_image
+    with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
+            model,
+            dtype=dtype,
+            max_model_len=4096,
+            max_num_seqs=2,
+            tensor_parallel_size=1,
+            enforce_eager=True,
+            limit_mm_per_prompt={"image":
+                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
+        # Regression tests for https://github.com/vllm-project/vllm/issues/10648
+        # Number of image tags is greater than the number of images provided
+        prompt = "<|begin_of_text|><|image|><|image|> Compare the two images"  # noqa: E501
+        image = stop_sign
+        with pytest.raises(ValueError):
+            vllm_model.generate_greedy_logprobs([prompt],
+                                                max_tokens,
+                                                num_logprobs,
+                                                images=[image])
+        # Batch of a text-only and image request that requires cross-attention
+        prompts = [
+            "What is the capital of spain?",
+            "Text before the image...<|image|>What is in the image?",  # noqa: E501
+        ]
+        images = [
+            None,
+            [stop_sign],
+        ]
+        vllm_model.generate_greedy_logprobs(prompts,
+                                            max_tokens,
+                                            num_logprobs,
+                                            images=images)
+        # Test the reverse order too for good measure
+        prompts = [
+            "<|begin_of_text|>Text before the image...<|image|>What is in the image?",  # noqa: E501
+            "<|begin_of_text|>Hello!",
+        ]
+        images = [
+            [stop_sign],
+            None,
+        ]
+        vllm_model.generate_greedy_logprobs(prompts,
+                                            max_tokens,
+                                            num_logprobs,
+                                            images=images)
+@pytest.mark.core_model
+@pytest.mark.parametrize(
+    "input_indices_and_output",
+    # inputs, (cross_attention_mask, kv_range_for_decode)
+    [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
+     ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
+     ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
+     ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
+     ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
+      ((23, 24), [[0, 6], [6, 12]])),
+     ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
+     ([TWO_IMAGES], ((18, 12), [[6, 12]])),
+     ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
+def test_get_cross_attention_mask(input_indices_and_output) -> None:
+    input_indices, expected_output = input_indices_and_output
+    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
+    num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices
+                 if i != TEXT_ONLY]
+    input = torch.cat(sequences)
+    seq_lens = [len(s) for s in sequences]
+    attn_data = FlashAttentionMetadata(
+        seq_lens=seq_lens,
+        # Dummy values
+        enable_kv_scales_calculation=False,
+        num_prefills=0,
+        num_prefill_tokens=0,
+        num_decode_tokens=0,
+        slot_mapping=0,
+        multi_modal_placeholder_index_maps=None,
+        seq_lens_tensor=0,
+        max_prefill_seq_len=0,
+        max_decode_seq_len=0,
+        context_lens_tensor=None,
+        block_tables=None,
+        use_cuda_graph=False,
+    )
+    dummy: dict[str, str] = {}
+    cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
+        .get_cross_attention_mask(dummy,
+                                  input,
+                                  attn_data,
+                                  num_tiles=num_tiles,
+                                  num_tokens_per_tile=3,
+                                  dtype=torch.bfloat16)
+    expected_cross_attention_mask, expected_kv_range_for_decode = \
+        expected_output
+    assert kv_range_for_decode == expected_kv_range_for_decode
+    if expected_cross_attention_mask is not None:
+        assert cross_attention_mask is not None
+        assert cross_attention_mask.shape == expected_cross_attention_mask
+    else:
+        assert cross_attention_mask is None
+@pytest.mark.core_model
+@pytest.mark.parametrize(
+    "input_indices",
+    [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
+     [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
+     [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
+def test_get_full_text_row_masked_out_mask(input_indices) -> None:
+    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
+    seq_lens = [len(s) for s in sequences]
+    num_prefill_tokens = sum(seq_lens)
+    # TEXT_ONLY is zero, so it will be masked out,
+    # other instances should not be.
+    encoder_seq_lens = [int(i) for i in input_indices]
+    attn_data = FlashAttentionMetadata(
+        seq_lens=seq_lens,
+        encoder_seq_lens=encoder_seq_lens,
+        num_prefill_tokens=num_prefill_tokens,
+        # Dummy values
+        enable_kv_scales_calculation=False,
+        num_prefills=0,
+        num_decode_tokens=0,
+        slot_mapping=0,
+        multi_modal_placeholder_index_maps=None,
+        seq_lens_tensor=0,
+        max_prefill_seq_len=0,
+        max_decode_seq_len=0,
+        context_lens_tensor=None,
+        block_tables=None,
+        use_cuda_graph=False,
+    )
+    dummy: dict[str, str] = {}
+    full_text_row_masked_out_mask = MllamaForConditionalGeneration\
+        .get_full_text_row_masked_out_mask(dummy,
+                                  attn_data,
+                                  torch.get_default_device())
+    full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze()
+    full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist()
+    idx = 0
+    assert len(full_text_row_masked_out_mask) == num_prefill_tokens
+    for i, seq_len in enumerate(seq_lens):
+        must_be_masked = input_indices[i] != TEXT_ONLY
+        for _ in range(seq_len):
+            assert full_text_row_masked_out_mask[idx] == must_be_masked, \
+                f"full_text_row_masked_out_mask[{idx}] must be " \
+                f"'{must_be_masked}' "
+            idx += 1
--- a/tests/models/multimodal/__init__.py
+++ b/tests/models/multimodal/__init__.py
--- a/tests/models/multimodal/processing/__init__.py
+++ b/tests/models/multimodal/processing/__init__.py
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
+from functools import partial
+import numpy as np
+import pytest
+from PIL import Image
+from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.processing import ProcessingCache
+from vllm.multimodal.utils import cached_get_tokenizer
+from ....multimodal.utils import random_audio, random_image, random_video
+from ...registry import HF_EXAMPLE_MODELS
+def _test_processing_correctness(
+    model_id: str,
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=model_info.trust_remote_code,
+        seed=0,
+        dtype="float16",
+        revision=None,
+        hf_overrides=model_info.hf_overrides,
+    )
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(
+            model_config.tokenizer,
+            trust_remote_code=model_info.trust_remote_code,
+        ),
+    )
+    # Ensure that it can fit all of the data
+    cache = ProcessingCache(capacity=1 << 30)
+    processing_info = factories.info(ctx)
+    supported_mm_limits = processing_info.get_supported_mm_limits()
+    limit_mm_per_prompt = {
+        modality: 3 if limit is None else limit
+        for modality, limit in supported_mm_limits.items()
+    }
+    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+    baseline_processor = factories.build_processor(ctx, cache=None)
+    cached_processor = factories.build_processor(ctx, cache=cache)
+    dummy_inputs = baseline_processor.dummy_inputs
+    tokenizer = baseline_processor.info.get_tokenizer()
+    rng = np.random.RandomState(0)
+    input_to_hit = {
+        "image": Image.new("RGB", size=(128, 128)),
+        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
+        "audio": (np.zeros((512, )), 16000),
+    }
+    input_factory = {
+        "image":
+        partial(random_image, rng, min_wh=128, max_wh=256),
+        "video":
+        partial(random_video,
+                rng,
+                min_frames=2,
+                max_frames=8,
+                min_wh=128,
+                max_wh=256),
+        "audio":
+        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
+    }
+    for batch_idx in range(num_batches):
+        mm_data = {
+            k:
+            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+             for _ in range(rng.randint(limit))]
+            for k, limit in limit_mm_per_prompt.items()
+        }
+        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
+        prompt = dummy_inputs.get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        ).prompt_text
+        # Drop unnecessary keys and test single -> multi conversion
+        if rng.rand() < simplify_rate:
+            for k in list(mm_data.keys()):
+                if not mm_data[k]:
+                    del mm_data[k]
+                elif len(mm_data[k]) == 1:
+                    mm_data[k] = mm_data[k][0]
+        baseline_result = baseline_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        cached_result = cached_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        assert baseline_result == cached_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+        baseline_tokenized_result = baseline_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        assert baseline_result == baseline_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+        cached_tokenized_result = cached_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        assert cached_result == cached_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+# yapf: disable
+# True if the model supports multiple data items of the modality per request
+@pytest.mark.parametrize("model_id", [
+    "rhymes-ai/Aria",
+    "Salesforce/blip2-opt-2.7b",
+    "facebook/chameleon-7b",
+    "deepseek-ai/deepseek-vl2-tiny",
+    "adept/fuyu-8b",
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "mistral-community/pixtral-12b",
+    "openbmb/MiniCPM-o-2_6",
+    "openbmb/MiniCPM-V-2_6",
+    "Qwen/Qwen-VL-Chat",
+    "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2-Audio-7B-Instruct",
+    "fixie-ai/ultravox-v0_3",
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_correctness(
+    model_id: str,
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    _test_processing_correctness(
+        model_id,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
+# yapf: disable
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_correctness_phi3v(
+    model_id: str,
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    # HACK - this is an attempted workaround for the following bug
+    # https://github.com/huggingface/transformers/issues/34307
+    from transformers import AutoImageProcessor  # noqa: F401
+    from transformers import AutoProcessor  # noqa: F401
+    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+    _test_processing_correctness(
+        model_id,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py
@@ -9,9 +9,9 @@ from transformers import AutoImageProcessor, AutoTokenizer
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
-from .....conftest import _ImageAssets
+from ....conftest import _ImageAssets
-from ....utils import build_model_context
+from ...utils import build_model_context
-from .....utils import models_path_prefix
+from ....utils import models_path_prefix
 models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]

--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py
@@ -8,9 +8,9 @@ from transformers import AutoTokenizer
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
-from .....conftest import _ImageAssets
+from ....conftest import _ImageAssets
-from ....utils import build_model_context
+from ...utils import build_model_context
-from .....utils import models_path_prefix
+from ....utils import models_path_prefix
 models = [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")]

--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
+import itertools
+from functools import partial
+import pytest
+from PIL import Image
+from pqdm.threads import pqdm
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.multimodal.utils import cached_get_tokenizer
+from ...utils import build_model_context
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+def _validate_image_prompt_replacements_one(
+    processor: BaseMultiModalProcessor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+        first_placeholder = image_placeholders[0]
+        # NOTE: There is a BOS token
+        assert first_placeholder["offset"] == 1
+        assert first_placeholder["length"] == (
+            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaNextMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(model_id, num_imgs):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
+import itertools
+from functools import partial
+import pytest
+from PIL import Image
+from pqdm.threads import pqdm
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.multimodal.utils import cached_get_tokenizer
+from ...utils import build_model_context
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+def _validate_image_prompt_replacements_one(
+    processor: BaseMultiModalProcessor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+        first_placeholder = image_placeholders[0]
+        assert first_placeholder["offset"] == 0
+        assert first_placeholder["length"] == len(
+            processed_inputs["prompt_token_ids"]) // num_imgs
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaOnevisionMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(model_id, num_imgs):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )