Merge tag 'v0.8.1' into v0.8.1-ori

ca796e19 · zhuwenwen · e983c804 · 61c7a1b8 · ca796e19 · ca796e19
Commit ca796e19 authored Mar 21, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -34,7 +34,7 @@ def phi3v_model_config():
                       tokenizer=PHI3V_MODEL_ID,
                       tokenizer_mode="auto",
                       trust_remote_code=True,
-                       dtype="bfloat16",
+                       dtype="auto",
                       seed=0,
                       limit_mm_per_prompt={
                           "image": 2,
@@ -58,7 +58,7 @@ def mllama_model_config():
                       tokenizer=MLLAMA_MODEL_ID,
                       tokenizer_mode="auto",
                       trust_remote_code=True,
-                       dtype="bfloat16",
+                       dtype="auto",
                       seed=0,
                       limit_mm_per_prompt={
                           "image": 2,
@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
                               tokenizer=MLLAMA_MODEL_ID,
                               tokenizer_mode="auto",
                               trust_remote_code=True,
-                               dtype="bfloat16",
+                               dtype="auto",
                               seed=0,
                               limit_mm_per_prompt={
                                   "image": 2,

--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,11 +5,10 @@ from typing import Optional
 import numpy as np
 import pytest
 import pytest_asyncio
-from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from transformers import AutoModel, AutoTokenizer
 from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
@@ -107,8 +106,6 @@ def run_test(
    **kwargs,
 ):
    """Inference result should be the same between hf and vllm."""
-    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
@@ -124,15 +121,7 @@ def run_test(
            for vllm_prompt, _, audio in prompts_and_audios
        ]
-    def process(hf_inputs: BatchEncoding, **kwargs):
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
-        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
-            .to(torch_dtype)  # type: ignore
-        return hf_inputs
-    with hf_runner(model,
-                   dtype=dtype,
-                   postprocess_inputs=process,
-                   auto_cls=AutoModel) as hf_model:
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,7 +9,7 @@ from pathlib import PosixPath
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
    #         "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
    #     }),
    #     multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
-    #     postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
    #     stop_str=["<|im_end|>"],
    #     image_size_factors=[(0.10, 0.15)],
    #     max_tokens=64,
@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
    ),
    "chameleon": VLMTestInfo(
@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
-        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
+        auto_cls=AutoModelForImageTextToText,
-        auto_cls=AutoModelForPreTraining,
-        dtype="bfloat16",
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
    ),
@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
        }),
        max_model_len=2048,
        max_num_seqs=2,
-        dtype="bfloat16",
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
-        dtype="bfloat16",
        use_tokenizer_eos=True,
        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
-        # NOTE: Mono-InternVL-2B doesn't work with fp16,
-        # it will result NaN during inference.
-        # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
-        dtype="bfloat16",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values_videos"
-        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
        marks=[
@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
-        postprocess_inputs=model_utils.wrap_inputs_post_processor,
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
    ),
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
-        postprocess_inputs=model_utils.ignore_inputs_post_processor(
-            "image_sizes"
-        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
-        patch_hf_runner=model_utils.minicpmo_patch_hf_runner
+        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
    ),
    "minicpmv_26": VLMTestInfo(
        models=["openbmb/MiniCPM-V-2_6"],
@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
-        postprocess_inputs=model_utils.ignore_inputs_post_processor(
-            "image_sizes"
-        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
-        postprocess_inputs=model_utils.molmo_post_processor,
    ),
    # Tests for phi3v currently live in another file because of a bug in
    # transformers. Once this issue is fixed, we can enable them here instead.
@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=48)],
    ),
    "qwen_vl": VLMTestInfo(
@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
        models=["facebook/chameleon-7b"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
        models=["llava-hf/llava-1.5-7b-hf"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(

--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -4,7 +4,6 @@
 Run `pytest tests/models/test_mistral.py`.
 """
 import json
-import uuid
 from dataclasses import asdict
 from typing import TYPE_CHECKING, Any, Optional
@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
 from transformers import AutoProcessor
-from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams,
+from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
-                  TextPrompt, TokensPrompt)
 from vllm.multimodal import MultiModalDataBuiltins
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.sequence import Logprob, SampleLogprobs
@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
 if TYPE_CHECKING:
    from _typeshed import StrPath
-MODELS = ["mistralai/Pixtral-12B-2409"]
+PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
+MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
 IMG_URLS = [
    "https://picsum.photos/id/237/400/300",
    "https://picsum.photos/id/231/200/300",
@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
 FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
 assert FIXTURES_PATH.exists()
-FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
+FIXTURE_LOGPROBS_CHAT = {
-FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
+    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
+    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
+}
 OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
@@ -166,12 +170,12 @@ def test_chat(
    model: str,
    dtype: str,
 ) -> None:
-    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
+    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
+        FIXTURE_LOGPROBS_CHAT[model])
    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
-            enable_chunked_prefill=False,
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
@@ -183,70 +187,40 @@ def test_chat(
            outputs.extend(output)
    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
+    # Remove last `None` prompt_logprobs to compare with fixture
+    for i in range(len(logprobs)):
+        assert logprobs[i][-1] is None
+        logprobs[i] = logprobs[i][:-1]
    check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
                         outputs_1_lst=logprobs,
                         name_0="h100_ref",
                         name_1="output")
-@large_gpu_test(min_gb=80)
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
-    EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
-    args = EngineArgs(
-        model=model,
-        tokenizer_mode="mistral",
-        enable_chunked_prefill=False,
-        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
-        dtype=dtype,
-    )
-    engine = LLMEngine.from_engine_args(args)
-    engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
-    engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
-    outputs = []
-    count = 0
-    while True:
-        out = engine.step()
-        count += 1
-        for request_output in out:
-            if request_output.finished:
-                outputs.append(request_output)
-        if count == 2:
-            engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
-                               SAMPLING_PARAMS)
-        if not engine.has_unfinished_requests():
-            break
-    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
-    check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
-                         outputs_1_lst=logprobs,
-                         name_0="h100_ref",
-                         name_1="output")
 @large_gpu_test(min_gb=48)
 @pytest.mark.parametrize(
    "prompt,expected_ranges",
    [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
-        "offset": 10,
+        "offset": 11,
        "length": 494
    }]),
     (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
-         "offset": 10,
+         "offset": 11,
         "length": 266
     }, {
-         "offset": 276,
+         "offset": 277,
         "length": 1056
     }, {
-         "offset": 1332,
+         "offset": 1333,
         "length": 418
     }])])
-def test_multi_modal_placeholders(
+def test_multi_modal_placeholders(vllm_runner, prompt,
-        vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None:
+                                  expected_ranges: list[PlaceholderRange],
+                                  monkeypatch) -> None:
+    # This placeholder checking test only works with V0 engine
+    # where `multi_modal_placeholders` is returned with `RequestOutput`
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(
            "mistral-community/pixtral-12b",
            max_model_len=8192,

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
 import torch
 from PIL.Image import Image
-from transformers import BatchEncoding
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm.config import TaskOption
@@ -31,7 +30,6 @@ def run_test(
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
-    postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
    comparator: Callable[..., None],
    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
    stop_str: Optional[list[str]],
@@ -101,7 +99,6 @@ def run_test(
    hf_model = hf_runner(model,
                         dtype=dtype,
                         auto_cls=auto_cls,
-                         postprocess_inputs=postprocess_inputs,
                         model_kwargs=hf_model_kwargs)
    # Some models need to patch things like the model processor, e.g., internvl

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -6,16 +6,15 @@ typically specific to a small subset of models.
 import re
 import types
 from pathlib import PosixPath
-from typing import Callable, Optional, Union
+from typing import Optional, Union
 import torch
 from PIL.Image import Image
-from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
+from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
                          GenerationConfig)
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from .....conftest import HfRunner, ImageAsset, _ImageAssets
 from .types import RunnerOutput
@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
    return [asset.image_embeds for asset in image_assets]
-####### postprocessors to run on HF BatchEncoding
-def cast_dtype_post_processor(
-        hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
-    """Gets a handle to a post processor which converts a given key into a
-    target data type."""
-    def process(hf_inputs: BatchEncoding, dtype: str):
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-        hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
-        return hf_inputs
-    return process
-def ignore_inputs_post_processor(
-        hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
-    """Gets a handle to a post processor which ignores a given key."""
-    def process(hf_inputs: BatchEncoding, dtype: str):
-        del hf_inputs[hf_inp_key]
-        return hf_inputs
-    return process
-def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
-    return {"model_inputs": hf_inputs}
-def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
-    hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
-    return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
        tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            for k in inputs.keys()  # noqa
            if k not in ("seq_lens", "sft_format")
        }
-        inputs = BatchEncoding(data=inputs, tensor_type="pt")
+        return BatchFeature(data=inputs, tensor_type="pt")
-        return inputs
    hf_model.processor = processor
    hf_model.model.get_output_embeddings = lambda: \
@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model
-def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    orig_generate = hf_model.model.generate
-    def _generate(self, *args, **kwargs):
+    def _generate(
+        self,
+        *args,
+        input_ids=None,
+        pixel_values=None,
+        image_sizes=None,
+        image_bound=None,
+        tgt_sizes=None,
+        **kwargs,
+    ):
+        model_inputs = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+            "image_sizes": image_sizes,
+            "image_bound": image_bound,
+            "tgt_sizes": tgt_sizes,
+        }
+        for k in list(model_inputs.keys()):
+            if model_inputs[k] is None:
+                model_inputs.pop(k)
+        return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+    return hf_model
+def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    orig_generate = hf_model.model.generate
+    def _generate(self, *args, image_sizes=None, **kwargs):
+        return orig_generate(*args, decode_text=False, **kwargs)
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+    return hf_model
+def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    orig_generate = hf_model.model.generate
+    def _generate(self, *args, image_sizes=None, **kwargs):
        return orig_generate(*args, decode_text=False, **kwargs)
    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
        batch = {
-            k: kwargs.pop(k)
+            k: kwargs.pop(k).unsqueeze(0)
            for k in ("input_ids", "images", "image_input_idx", "image_masks")
            if k in kwargs
        }
+        batch = BatchFeature(batch).to(dtype=self.dtype)
        return self.generate_from_batch(
            batch,

--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
 import torch
 from PIL.Image import Image
 from pytest import MarkDecorator
-from transformers import AutoModelForCausalLM, BatchEncoding
+from transformers import AutoModelForCausalLM
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm.config import TaskOption
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import identity
 from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
 from ....utils import check_logprobs_close
@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
    # Indicates we should explicitly pass the EOS from the tokenizer
    use_tokenizer_eos: bool = False
    auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
-    # Callable to pass to the HF runner to run on inputs; for now, we also pass
-    # the data type to input post processing, because almost all of the uses of
-    # postprocess_inputs are to fix the data types of BatchEncoding values.
-    postprocess_inputs: Callable[[BatchEncoding, str],
-                                 BatchEncoding] = identity
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
    # Post processors that if defined, will run oun the outputs of the
@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
    # is all combinations of .models + all fields below
    max_tokens: Union[int, tuple[int]] = 128
    num_logprobs: Union[int, tuple[int]] = 5
-    dtype: Union[str, Iterable[str]] = "half"
+    dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
    distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
    # Only expanded in video tests
    num_video_frames: Union[int, tuple[int]] = 16
@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
            "vllm_output_post_proc": self.vllm_output_post_proc,
            "auto_cls": self.auto_cls,
            "use_tokenizer_eos": self.use_tokenizer_eos,
-            "postprocess_inputs": self.postprocess_inputs,
            "comparator": self.comparator,
            "get_stop_token_ids": self.get_stop_token_ids,
            "hf_model_kwargs": self.hf_model_kwargs,

--- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py
 # SPDX-License-Identifier: Apache-2.0
-from functools import partial
 from typing import Callable
 import pytest
 import torch
+import torch.nn.functional as F
 from PIL import Image
-from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
+from transformers import Qwen2VLForConditionalGeneration
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
    return prompt
-def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
-    return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
 def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
@@ -118,14 +114,8 @@ def _run_test(
    with hf_runner(model,
                   dtype=dtype,
                   auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
-        hf_model.postprocess_inputs = partial(
-            postprocess_inputs,
+        prompts = []
-            hf_model,
-            cache_position=torch.arange(
-                0,
-                1,  # 1 for batch size
-                requires_grad=False),
-            use_cache=False)
        for text, image, embed_text in zip(input_texts, input_images,
                                           embed_texts):
            # dse requires non-standard input processing
@@ -133,20 +123,34 @@ def _run_test(
            messages = get_messages(image, text, embed_text)
            prompt = apply_chat_template_and_add_eos(
                messages, hf_model.processor.apply_chat_template)
-            inputs = hf_model.get_inputs(
-                prompts=[[prompt]],
+            prompts.append(prompt)
-                images=[[image]],
+        all_inputs = hf_model.get_inputs(
+            prompts=prompts,
+            images=input_images,
        )
        with torch.no_grad():
+            all_outputs = []
+            for inputs in all_inputs:
+                inputs = hf_model.model.prepare_inputs_for_generation(
+                    **inputs,
+                    cache_position=torch.arange(1),  # 1 for batch size
+                    use_cache=False,
+                )
                outputs = hf_model.model(
-                    **hf_model.wrap_device(inputs[0],
+                    **hf_model.wrap_device(inputs),
-                                           device=hf_model.model.device.type),
                    return_dict=True,
                    output_hidden_states=True,
                )
-                pooled_output = torch.nn.functional.normalize(
+                pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
-                    outputs.hidden_states[-1][0, -1], p=2, dim=-1)
+                                            p=2,
-            hf_outputs.append(pooled_output.tolist())
+                                            dim=-1)
+                all_outputs.append(pooled_output.tolist())
+            hf_outputs = all_outputs
    check_embeddings_close(
        embeddings_0_lst=hf_outputs,

--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -2,7 +2,7 @@
 import pytest
 import torch.nn.functional as F
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText
 from vllm.platforms import current_platform
@@ -70,7 +70,7 @@ def _run_test(
        vllm_outputs = vllm_model.encode(input_texts, images=input_images)
    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
        # Patch the issue where generation_config.json is missing
        hf_model.processor.patch_size = \
            hf_model.model.config.vision_config.patch_size
@@ -86,8 +86,7 @@ def _run_test(
        for inputs in all_inputs:
            # Based on: https://huggingface.co/royokong/e5-v
            outputs = hf_model.model(
-                **hf_model.wrap_device(inputs,
+                **hf_model.wrap_device(inputs),
-                                       device=hf_model.model.device.type),
                return_dict=True,
                output_hidden_states=True,
            )

--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
@@ -53,8 +53,7 @@ def _run_test(
        for inputs in all_inputs:
            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
            outputs = hf_model.model(
-                **hf_model.wrap_device(inputs,
+                **hf_model.wrap_device(inputs),
-                                       device=hf_model.model.device.type),
                return_dict=True,
                output_hidden_states=True,
            )

--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -4,8 +4,7 @@ from typing import Optional, overload
 import pytest
 import torch
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
+from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
-                          BatchEncoding)
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@@ -227,14 +226,10 @@ def _run_test(
            for prompts, images in inputs
        ]
-    def process(hf_inputs: BatchEncoding, **kwargs):
-        return hf_inputs
    with hf_runner(model,
                   dtype=dtype,
                   model_kwargs={"device_map": "auto"},
-                   postprocess_inputs=process,
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,

--- a/tests/models/fixtures/mistral_small_3_chat.json
+++ b/tests/models/fixtures/mistral_small_3_chat.json
--- a/tests/models/fixtures/pixtral_chat_engine.json
+++ b/tests/models/fixtures/pixtral_chat_engine.json
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -2,7 +2,7 @@
 import warnings
 from collections.abc import Sequence
-from typing import Optional, Union
+from typing import Any, Optional, Union
 import torch
@@ -254,9 +254,9 @@ def check_logprobs_close(
 def build_model_context(
    model_id: str,
    task: TaskOption = "auto",
-    dtype: Optional[Union[str, torch.dtype]] = None,
+    dtype: Union[str, torch.dtype] = "auto",
-    mm_processor_kwargs: Optional[dict] = None,
+    mm_processor_kwargs: Optional[dict[str, Any]] = None,
-    limit_mm_per_prompt: Optional[dict] = None,
+    limit_mm_per_prompt: Optional[dict[str, int]] = None,
    disable_mm_preprocessor_cache: bool = True,
 ):
    """Creates an InputContext for a given model.
@@ -274,9 +274,6 @@ def build_model_context(
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
-    if dtype is None:
-        dtype = "half"
    model_config = ModelConfig(
        model_id,
        task=task,

--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -7,19 +7,25 @@ from unittest.mock import MagicMock
 import numpy as np
 import pytest
+import torch
 from transformers import ProcessorMixin
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
+                                    MultiModalKwargsItem,
+                                    MultiModalSharedField)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
-                                        PromptIndexTargets, PromptInsertion,
+                                        ProcessingCache, PromptIndexTargets,
-                                        PromptReplacement, apply_text_matches,
+                                        PromptInsertion, PromptReplacement,
+                                        apply_text_matches,
                                        apply_token_matches,
                                        find_mm_placeholders,
                                        find_text_matches, find_token_matches,
-                                        iter_token_matches)
+                                        iter_token_matches,
+                                        replace_token_matches)
 # yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.transformers_utils.tokenizer import (AnyTokenizer,
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
    assert all(match_len == len(match_ids) for match_len in match_lens)
+# yapf: disable
+@pytest.mark.parametrize(
+    ("token_ids", "match_ids", "new_ids", "expected"),
+    [
+        ([], [], [-1], []),
+        ([], [32000], [-1], []),
+        (
+            [32000, 32000, 32000],
+            [32000],
+            [-1],
+            [-1, -1, -1],
+        ),
+        (
+            [32000, 32000, 32000],
+            [32000, 32000],
+            [-1],
+            [-1, 32000],
+        ),
+        (
+            [32000, 32000, 32000],
+            [32000, 32000, 32000],
+            [-1],
+            [-1],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 32000],
+            [-1],
+            [9833, -1, 32000, 32000, 9833, -1, 32000, 918],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 32000, 32000, 32000],
+            [-1],
+            [9833, -1, 9833, 28747, 32000, 32000, 918],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 0, 32000],
+            [-1],
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+        ),
+    ],
+)
+# yapf: enable
+def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
+    result = replace_token_matches(token_ids, match_ids, new_ids)
+    # Manually constructed results
+    assert result == expected
 # yapf: disable
 @pytest.mark.parametrize(
    ("prompt", "target_by_key", "expected_by_key"),
@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
    assert result == expected
+def _dummy_elem(modality: str, key: str, size: int):
+    return MultiModalFieldElem(
+        modality=modality,
+        key=key,
+        data=torch.empty((size, ), dtype=torch.int8),
+        field=MultiModalSharedField(1),
+    )
+def _dummy_item(modality: str, size_by_key: dict[str, int]):
+    return MultiModalKwargsItem.from_elems([
+        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
+    ])
+def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
+    return MultiModalKwargs.from_items([
+        _dummy_item(modality, size_by_key)
+        for modality, size_by_key in size_by_key_modality.items()
+    ])
+# yapf: disable
+@pytest.mark.parametrize(
+    ("item", "expected_size"),
+    [
+        (_dummy_item("a", {"a1": 100}), 100),
+        (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
+        (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+    ],
+)
+# yapf: enable
+def test_cache_item_size(item, expected_size):
+    cache = ProcessingCache.get_lru_cache(2048, type(item))
+    cache[""] = item
+    assert cache.currsize == expected_size
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
    ("limit", "num_supported", "is_valid"),
@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
-        dtype="half",
+        dtype="auto",
        revision=None,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
-        dtype="half",
+        dtype="auto",
        revision=None,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
-        dtype="half",
+        dtype="auto",
        revision=None,
    )

--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
        # Test edge cases
        (1, 128, 16, 1024, 4, 2, 16, False),  # large decode batch
-        (16, 4, 8, 8192, 48, 1, 128, True),  # large prefill batch
+        (16, 4, 8, 1024, 4, 2, 128, True),  # large prefill batch
        (4, 12, 32, 2048, 16, 1, 32, True),  # multi-head attention (MHA)
        (4, 12, 32, 2048, 16, 16, 32, True),  # multi-query attention (MQA)
    ])

--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
 models_4bit_to_test = [
    ("facebook/opt-125m", "quantize opt model inflight"),
+    ("mistralai/Mistral-7B-Instruct-v0.3",
+     "quantize inflight model with both HF and Mistral format weights")
 ]
 models_pre_qaunt_4bit_to_test = [

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    test_prompts = multilora_inference.create_test_prompts(lora_path)
    # Serialize model before deserializing and binding LoRA adapters
-    with vllm_runner(model_ref, ) as vllm_model:
+    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        vllm_model.apply_model(
@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    ## Serialize model
-    with vllm_runner(model_ref, ) as vllm_model:
+    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        vllm_model.apply_model(

--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
    # disable custom dispatcher, let Dynamo take over
    # all the control
-    llm = LLM(model="google/gemma-2b",
+    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
+              max_model_len=512,
+              max_num_seqs=64,
              enforce_eager=True,
              compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
    outputs = llm.generate(prompts, sampling_params)
@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        assert generated_text.startswith(answer)
-compiled_code = sorted(
+compiled_codes = sorted(
    glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
-# we should only trigger Dynamo compilation three times:
+for i, compiled_code in enumerate(compiled_codes):
-# one for the profiling phase without kv cache
+    print("{} file: {}".format(i + 1, compiled_code))
-# one for the prefill phase with symbolic shapes
-# one for the decode phase with symbolic shapes
+# We should only trigger Dynamo compilation 4 times:
+# 1. forward pass (symbolic)
+# 2. compute_logits (symbolic)
+# 3. forward pass (shape 16)
+# 4. forward pass (shape 32)
 # and later calls should not trigger Dynamo compilation again.
-# NOTE: it might still trigger XLA compilation.
+# NOTE: It might still trigger XLA compilation.
+# Check we have 4 compiled codes
+assert len(compiled_codes) == 4
-# check we have three compiled code
+kv_cache_prefix = "kv_cache"
-# this is the assumption when we use the custom dispatcher
+attn_prefix = "ragged_paged_attention"
-assert len(compiled_code) == 3
-# check all the compilations are as expected
+# Check all the compilations are as expected
-compiled_fn = sorted(
+compiled_fns = sorted(
    glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
-# the first compilation is the profiling phase,
+for i, compiled_fn in enumerate(compiled_fns):
-# it should not have any kv cache
+    print("{} file: {}".format(i + 1, compiled_fn))
-with open(compiled_fn[0]) as f:
+# The first compilation is symbolic, so it should not have any kv_caches
+with open(compiled_fns[0]) as f:
+    content = f.read()
+    assert kv_cache_prefix not in content
+# The second compilation is symbolic, so it should not have any kv_caches
+with open(compiled_fns[1]) as f:
    content = f.read()
-    assert "kv_caches" not in content
+    assert kv_cache_prefix not in content
-# the second compilation is the prefill phase,
+# The third compilation is shape 16, so it should have kv_caches and the
-# it should have kv cache and the flash_attention op
+# ragged_paged_attention
-with open(compiled_fn[1]) as f:
+with open(compiled_fns[2]) as f:
    content = f.read()
-    assert "kv_caches" in content and "torch.ops.xla.flash_attention" in content
+    assert (kv_cache_prefix in content and attn_prefix in content)
-# the third compilation is the decode phase,
+# The fourth compilation is shape 32, so it should have kv_caches and the
-# it should have kv cache and the paged_attention op
+# ragged_paged_attention
-with open(compiled_fn[2]) as f:
+with open(compiled_fns[3]) as f:
    content = f.read()
-    assert "kv_caches" in content and "torch.ops.xla.paged_attention" in content
+    assert (kv_cache_prefix in content and attn_prefix in content)
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -14,12 +14,17 @@ from ..utils import compare_two_settings
 def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_RPC_TIMEOUT", "30000")
-        compare_two_settings(
+        compare_two_settings("Qwen/Qwen2.5-1.5B-Instruct",
-            "google/gemma-2b",
                             arg1=[
+                                 "--max-model-len=256",
+                                 "--max-num-seqs=32",
                                 "--enforce-eager",
                                 f"-O{CompilationLevel.DYNAMO_ONCE}",
                             ],
-            arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
+                             arg2=[
+                                 "--max-model-len=256", "--max-num-seqs=32",
+                                 "--enforce-eager",
+                                 f"-O{CompilationLevel.DYNAMO_AS_IS}"
+                             ],
                             env1={},
                             env2={})