Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -15,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", os.path.join(models_path_prefix, "BAAI
 REVISION = os.environ.get("REVISION", "main")
 MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
-                                    os.path.join(models_path_prefix, "intfloat/multilingual-e5-large"))
+                                    os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"))
 REVISION_ROBERTA = os.environ.get("REVISION", "main")
@@ -84,7 +84,7 @@ def test_roberta_model_loading_with_params(vllm_runner):
        assert model_config.pooler_config.pooling_norm
        # asserts on the tokenizer loaded
-        assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")
+        assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
        assert not model_tokenizer.tokenizer_config["do_lower_case"]
        def check_model(model):

--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Tuple, Type
+from typing import Optional
 import numpy as np
 import pytest
 import os
 import pytest_asyncio
-from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from transformers import AutoModel, AutoTokenizer
 from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from ....conftest import HfRunner, VllmRunner
@@ -20,7 +19,7 @@ from ...utils import check_logprobs_close
 MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
-AudioTuple = Tuple[np.ndarray, int]
+AudioTuple = tuple[np.ndarray, int]
 VLLM_PLACEHOLDER = "<|audio|>"
 HF_PLACEHOLDER = "<|audio|>"
@@ -81,7 +80,7 @@ def _get_prompt(audio_count, question, placeholder):
                                         add_generation_prompt=True)
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
@@ -99,9 +98,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 def run_test(
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    prompts_and_audios: List[Tuple[str, str, AudioTuple]],
+    prompts_and_audios: list[tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
@@ -110,8 +109,6 @@ def run_test(
    **kwargs,
 ):
    """Inference result should be the same between hf and vllm."""
-    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
@@ -127,15 +124,7 @@ def run_test(
            for vllm_prompt, _, audio in prompts_and_audios
        ]
-    def process(hf_inputs: BatchEncoding, **kwargs):
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
-        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
-            .to(torch_dtype)  # type: ignore
-        return hf_inputs
-    with hf_runner(model,
-                   dtype=dtype,
-                   postprocess_inputs=process,
-                   auto_cls=AutoModel) as hf_model:
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
@@ -161,8 +150,8 @@ def run_test(
 def run_multi_audio_test(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
+    prompts_and_audios: list[tuple[str, list[AudioTuple]]],
    model: str,
    *,
    dtype: str,
@@ -190,7 +179,7 @@ def run_multi_audio_test(
 @pytest.mark.core_model
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("vllm_kwargs", [

--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -11,12 +11,12 @@ import pytest
 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
 @pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -55,42 +55,107 @@ def test_models(
    backend: str,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """
+    Only checks log probs match to cover the discrepancy in
+    numerical sensitive kernels.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
+        MAX_MODEL_LEN = 1024
+        NUM_LOG_PROBS = 8
+        with vllm_runner(
+                base_model,
+                max_model_len=MAX_MODEL_LEN,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                kv_cache_dtype="auto",
+                disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
+            baseline_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+        with vllm_runner(
+                test_model,
+                max_model_len=MAX_MODEL_LEN,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                kv_cache_dtype=kv_cache_dtype,
+                disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
+            test_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+        check_logprobs_close(
+            outputs_0_lst=baseline_outputs,
+            outputs_1_lst=test_outputs,
+            name_0="fp16_kv_cache",
+            name_1="fp8_kv_cache",
+        )
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(),
+                    reason="test for the CPU backend.")
+@pytest.mark.parametrize(
+    "kv_cache_dtype,base_model,test_model",
+    [
+        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct"),
+    ])
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+# Due to low-precision numerical divergence, this test is too sensitive for
+# the async postprocessor
+@pytest.mark.parametrize("disable_async_output_proc", [True])
+def test_cpu_models(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    max_tokens: int,
+    disable_async_output_proc: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Only checks log probs match to cover the discrepancy in
    numerical sensitive kernels.
    """
-    override_backend_env_variable(monkeypatch, backend)
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
-    MAX_MODEL_LEN = 1024
-    NUM_LOG_PROBS = 8
+        MAX_MODEL_LEN = 1024
+        NUM_LOG_PROBS = 8
-    with vllm_runner(
-            base_model,
+        with vllm_runner(
-            max_model_len=MAX_MODEL_LEN,
+                base_model,
-            tensor_parallel_size=tensor_parallel_size,
+                max_model_len=MAX_MODEL_LEN,
-            enforce_eager=enforce_eager,
+                dtype="bfloat16",
-            kv_cache_dtype="auto",
+                kv_cache_dtype="auto",
-            disable_async_output_proc=disable_async_output_proc,
+                disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
+        ) as vllm_model:
-        baseline_outputs = vllm_model.generate_greedy_logprobs(
+            baseline_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS)
-    with vllm_runner(
+        with vllm_runner(
-            test_model,
+                test_model,
-            max_model_len=MAX_MODEL_LEN,
+                max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=tensor_parallel_size,
+                dtype="bfloat16",
-            enforce_eager=enforce_eager,
+                kv_cache_dtype=kv_cache_dtype,
-            kv_cache_dtype=kv_cache_dtype,
+                disable_async_output_proc=disable_async_output_proc,
-            disable_async_output_proc=disable_async_output_proc,
+        ) as vllm_model:
-    ) as vllm_model:
+            test_outputs = vllm_model.generate_greedy_logprobs(
-        test_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
-            example_prompts, max_tokens, NUM_LOG_PROBS)
+        check_logprobs_close(
-    check_logprobs_close(
+            outputs_0_lst=baseline_outputs,
-        outputs_0_lst=baseline_outputs,
+            outputs_1_lst=test_outputs,
-        outputs_1_lst=test_outputs,
+            name_0="bf16_kv_cache",
-        name_0="fp16_kv_cache",
+            name_1="fp8_kv_cache",
-        name_1="fp8_kv_cache",
+        )
-    )
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used
 """
 import os
-from typing import List, NamedTuple, Type
+from typing import NamedTuple
 import pytest
 from huggingface_hub import hf_hub_download
@@ -91,8 +91,8 @@ MODELS = [
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
    num_gpus_available: int,
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    example_prompts: List[str],
+    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
@@ -111,16 +111,6 @@ def test_models(
        example_prompts = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)
-    # Run unquantized model.
-    with vllm_runner(
-            model_name=model.original_model,
-            enforce_eager=True,  # faster tests
-            dtype=dtype,
-            max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=tp_size) as original_model:
-        original_outputs = original_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
    # Run gguf model.
    with vllm_runner(model_name=model.gguf_model,
                     enforce_eager=True,
@@ -131,6 +121,16 @@ def test_models(
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
+    # Run unquantized model.
+    with vllm_runner(
+            model_name=model.original_model,
+            enforce_eager=True,  # faster tests
+            dtype=dtype,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tp_size) as original_model:
+        original_outputs = original_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,

--- a/tests/models/decoder_only/language/test_hybrid.py
+++ b/tests/models/decoder_only/language/test_hybrid.py
@@ -11,7 +11,9 @@ from ...utils import check_outputs_equal
 from ....utils import models_path_prefix
 # This test is for the hybrid models
-MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B")]
+MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "Zyphra/Zamba2-1.2B-instruct")]
+# Bamba at Fp32 is too big for the CI (L4 GPU).
+# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
 @pytest.mark.parametrize("model", MODELS)
@@ -27,29 +29,24 @@ def test_models(
 ) -> None:
    # numeric error produces different generation
-    if 'Bamba' in model:
+    if "Bamba" in model:
        example_prompts.pop(3)
-    with hf_runner(
+    model_kwargs = {
-            model,
+        "use_mamba_kernels": False,  # mamba kernels are not installed so HF 
-            dtype=dtype,
+        # doesn't use them
-            model_kwargs={
+    }
-                "use_mamba_kernels":
+    if "Zamba2" in model:
-                False,  # mamba kernels are not installed so HF 
+        # Zamba2 HF implementation automatically checks if mamba kernels are
-                # don't use them
+        # installed
-            }) as hf_model:
+        model_kwargs = {}
+    with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        def print_model(model):
-            print(model)
-        vllm_model.apply_model(print_model)
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
@@ -119,26 +116,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
 def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                model: str, dtype: str,
                                max_tokens: int) -> None:
-    # numeric error during prefill chucking produces different generation
+    # numeric error during prefill chunking produces different generation
    # compared to w/o prefill chunking for those examples, removed them for now
-    if 'Jamba' in model:
+    if "Jamba" in model:
        example_prompts.pop(7)
        example_prompts.pop(2)
        example_prompts.pop(1)
-    elif 'Bamba' in model:
+    elif "Bamba" in model:
        example_prompts.pop(6)
        example_prompts.pop(3)
        example_prompts.pop(2)
        dtype = "half"  # use a different dtype for Bamba
+    elif "Zamba2" in model:
-    with hf_runner(
+        example_prompts.pop(7)
-            model,
+        dtype = "half"
-            dtype=dtype,
-            model_kwargs={
+    model_kwargs = {
-                "use_mamba_kernels":
+        "use_mamba_kernels": False,  # mamba kernels are not installed so HF 
-                False,  # mamba kernels are not installed so HF 
+        # doesn't use them
-                # don't use them
+    }
-            }) as hf_model:
+    if "Zamba2" in model:
+        # Zamba2 HF implementation automatically checks if mamba kernels are
+        # installed
+        model_kwargs = {}
+    with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
        non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
    with vllm_runner(model,
@@ -194,6 +196,7 @@ def test_parallel_sampling(
    )
+@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [20])
@@ -295,6 +298,7 @@ def test_state_cleanup(
                    "could be related to finished_requests_ids")
+@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 def test_multistep(
@@ -310,6 +314,7 @@ def test_multistep(
        vllm_model.generate_greedy([example_prompts[0]] * 10, 1)
+@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [64])

--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -70,13 +70,6 @@ def test_models(
    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        def print_model(model):
-            print(model)
-        vllm_model.apply_model(print_model)
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]

--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -203,6 +203,7 @@ def test_models(
    )
+@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
 @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
@@ -215,16 +216,6 @@ def test_mistral_format(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tokenizer_mode="auto",
-            load_format="safetensors",
-            config_format="hf",
-    ) as hf_format_model:
-        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
    with vllm_runner(
            model,
            dtype=dtype,
@@ -235,6 +226,16 @@ def test_mistral_format(
        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tokenizer_mode="auto",
+            load_format="safetensors",
+            config_format="hf",
+    ) as hf_format_model:
+        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
    check_logprobs_close(
        outputs_0_lst=hf_format_outputs,
        outputs_1_lst=mistral_format_outputs,
@@ -263,6 +264,7 @@ def test_mistral_symbolic_languages(
            assert "�" not in outputs[0].outputs[0].text.strip()
+@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("model",
                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling

--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -5,7 +5,6 @@
 Note: these tests will only pass on H100
 """
 import os
-from typing import List
 import pytest
 from transformers import AutoTokenizer
@@ -66,7 +65,7 @@ def test_models(example_prompts, model_name) -> None:
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
-    generations: List[str] = []
+    generations: list[str] = []
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -3,13 +3,37 @@
 Run `pytest tests/models/test_models.py`.
 """
 import pytest
 import os
+import torch
+from vllm.platforms import current_platform
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
+# These have unsupported head_dim for FA. We do
+# not have a clean way to fall back, so we fail with
+# a clear msg when it happens.
+# https://github.com/vllm-project/vllm/issues/14524
+REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
+# This list contains the models that use AITER kernels.
+# Models not in this list are skipped when the test runs with AITER enabled.
+# When more AITER kernels are added, this list will not be
+# needed as all the models will be calling AITER kernels
+# in parts of the operators
+AITER_MODEL_LIST = [
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "openbmb/MiniCPM3-4B",
+    "Qwen/Qwen-7B",
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    "ehristoforu/Falcon3-MoE-2x7B-Insruct",
+]
+# @maybe_test_rocm_aiter
 @pytest.mark.parametrize(
    "model",
    [
@@ -65,15 +89,23 @@ from ....utils import models_path_prefix
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
+@pytest.mark.parametrize(
-    hf_runner,
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-    vllm_runner,
+def test_models(hf_runner, vllm_runner, example_prompts, model: str,
-    example_prompts,
+                dtype: str, max_tokens: int, num_logprobs: int,
-    model: str,
+                use_rocm_aiter: bool, monkeypatch) -> None:
-    dtype: str,
-    max_tokens: int,
+    if model in REQUIRES_V0:
-    num_logprobs: int,
+        monkeypatch.setenv("VLLM_USE_V1", "0")
-) -> None:
+    if use_rocm_aiter and (model in AITER_MODEL_LIST):
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
+    elif use_rocm_aiter and model not in AITER_MODEL_LIST:
+        # Skip models that are not in AITER_MODEL_LIST.
+        # When more AITER kernels are added, this list will not be
+        # needed as all the models will be calling AITER kernels
+        # in parts of the operators
+        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
    with hf_runner(model, dtype=dtype) as hf_model:
        if model.startswith("THUDM/chatglm3"):
@@ -87,16 +119,16 @@ def test_models(
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        def print_model(model):
-            print(model)
-        vllm_model.apply_model(print_model)
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
+    if use_rocm_aiter:
+        # this is to ensure that vllm engine
+        # has deallocated the memory before running the next
+        # unit tests. On ROCm, when using AITER
+        # the memory might not be deallocated completely
+        # before running the next test case
+        torch.cuda.synchronize()
--- a/tests/models/decoder_only/language/test_nvfp4.py
+++ b/tests/models/decoder_only/language/test_nvfp4.py
+# SPDX-License-Identifier: Apache-2.0
+# flake8: noqa
+"""Tests Model Optimizer nvfp4 models against ground truth generation
+Note: these tests will only pass on B200
+"""
+import os
+from typing import List
+import pytest
+from transformers import AutoTokenizer
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+MAX_MODEL_LEN = 1024
+MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
+EXPECTED_STRS_MAP = {
+    "nvidia/Llama-3.3-70B-Instruct-FP4": [
+        'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
+        'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
+        'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
+    ]
+}
+# This test compares against golden strings for exact match since
+# there is no baseline implementation to compare against
+# and is unstable w.r.t specifics of the fp4 implementation or
+# the hardware being run on.
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason=
+    "Prevent unstable test based on golden strings from breaking the build "
+    " and test input model being too large and hanging the system.")
+@pytest.mark.quant_model
+@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
+                    reason="nvfp4 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    model = LLM(
+        model=model_name,
+        max_model_len=MAX_MODEL_LEN,
+        trust_remote_code=True,
+        enforce_eager=True,
+        quantization="nvfp4",
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations: List[str] = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+    print(model_name, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Type
+from typing import Optional
 import os
 import pytest
@@ -21,12 +21,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 def run_awq_test(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
    image_assets: _ImageAssets,
    source_model: str,
    quant_model: str,
    *,
-    size_factors: List[float],
+    size_factors: list[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
@@ -110,7 +110,12 @@ def run_awq_test(
 @pytest.mark.parametrize("num_logprobs", [5])
 @torch.inference_mode()
 def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
-                    size_factors, dtype, max_tokens, num_logprobs) -> None:
+                    size_factors, dtype, max_tokens, num_logprobs,
+                    monkeypatch) -> None:
+    # Test V1: this test hangs during setup on single-scale input.
+    # TODO: figure out why and re-enable this on V1.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    run_awq_test(
        vllm_runner,
        image_assets,

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -6,12 +6,11 @@ import math
 import os
 from collections import defaultdict
 from pathlib import PosixPath
-from typing import Type
 import os
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 from vllm.platforms import current_platform
@@ -19,7 +18,7 @@ from vllm.utils import identity
 from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
                          _VideoAssets)
-from ....utils import (fork_new_process_for_each_test, large_gpu_mark,
+from ....utils import (create_new_process_for_each_test, large_gpu_mark,
                       multi_gpu_marks)
 from ...utils import check_outputs_equal
 from .vlm_utils import custom_inputs, model_utils, runners
@@ -35,6 +34,16 @@ from ....utils import models_path_prefix
 if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
+REQUIRES_V0_MODELS = [
+    # V1 Test: no way to fall back for head_dim = 80
+    # https://github.com/vllm-project/vllm/issues/14524
+    "qwen_vl",
+    "h2ovl",
+    "blip2",
+    # V1 Test: not enough KV cache space in C1.
+    "fuyu",
+]
 # yapf: disable
 COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
@@ -94,7 +103,7 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -114,14 +123,10 @@ VLM_TEST_SETTINGS = {
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
+        dtype="bfloat16",
-               else ("half", "float")),
+        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
-        marks=[pytest.mark.core_model],
    ),
    # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
    # once we upgraded to transformers>=4.49.0.
@@ -160,30 +165,30 @@ VLM_TEST_SETTINGS = {
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Extended model tests
-    "aria": VLMTestInfo(
+    # "aria": VLMTestInfo(
-        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
+    #     models=["rhymes-ai/Aria"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
+    #     prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
+    #     img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
-        max_model_len=4096,
+    #     max_model_len=4096,
-        max_num_seqs=2,
+    #     max_num_seqs=2,
-        single_image_prompts=IMAGE_ASSETS.prompts({
+    #     auto_cls=AutoModelForImageTextToText,
-            "stop_sign": "<vlm_image>Please describe the image shortly.",
+    #     single_image_prompts=IMAGE_ASSETS.prompts({
-            "cherry_blossom": "<vlm_image>Please infer the season with reason.",
+    #         "stop_sign": "<vlm_image>Please describe the image shortly.",
-        }),
+    #         "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
-        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
+    #     }),
-        postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"),
+    #     multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
-        stop_str=["<|im_end|>"],
+    #     stop_str=["<|im_end|>"],
-        image_size_factors=[(0.10, 0.15)],
+    #     image_size_factors=[(0.10, 0.15)],
-        max_tokens=64,
+    #     max_tokens=64,
-        marks=[large_gpu_mark(min_gb=64)],
+    #     marks=[large_gpu_mark(min_gb=64)],
-    ),
+    # ),
    "blip2": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
    ),
    "chameleon": VLMTestInfo(
@@ -192,10 +197,7 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
@@ -215,7 +217,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
-        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
@@ -233,21 +234,44 @@ VLM_TEST_SETTINGS = {
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
-    "glm4": VLMTestInfo(
+    "gemma3": VLMTestInfo(
+        models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
+        }),
+        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
+        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
+    ),
+    "glm4v": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "THUDM/glm-4v-9b")],
        test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "",
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
+        }),
        max_model_len=2048,
        max_num_seqs=2,
-        dtype="bfloat16",
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        patch_hf_runner=model_utils.glm_patch_hf_runner,
+        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
+        # The image embeddings match with HF but the outputs of the language
+        # decoder are only consistent up to 2 decimal places.
+        # So, we need to reduce the number of tokens for the test to pass.
+        max_tokens=8,
+        num_logprobs=10,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "h2ovl": VLMTestInfo(
@@ -263,7 +287,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
-        dtype="bfloat16",
        use_tokenizer_eos=True,
        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
@@ -275,7 +298,7 @@ VLM_TEST_SETTINGS = {
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
@@ -292,10 +315,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
-        # NOTE: Mono-InternVL-2B doesn't work with fp16,
-        # it will result NaN during inference.
-        # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
-        dtype="bfloat16",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
@@ -304,7 +323,7 @@ VLM_TEST_SETTINGS = {
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -319,9 +338,6 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values_videos"
-        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
@@ -346,11 +362,8 @@ VLM_TEST_SETTINGS = {
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
        marks=[
@@ -368,8 +381,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
-        postprocess_inputs=model_utils.wrap_inputs_post_processor,
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
    ),
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
@@ -379,11 +392,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
-        postprocess_inputs=model_utils.ignore_inputs_post_processor(
-            "image_sizes"
-        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
-        patch_hf_runner=model_utils.minicpmo_patch_hf_runner
+        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
    ),
    "minicpmv_26": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
@@ -393,10 +403,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
-        postprocess_inputs=model_utils.ignore_inputs_post_processor(
-            "image_sizes"
-        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
@@ -405,7 +413,6 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
-        postprocess_inputs=model_utils.molmo_post_processor,
    ),
    # Tests for phi3v currently live in another file because of a bug in
    # transformers. Once this issue is fixed, we can enable them here instead.
@@ -431,7 +438,7 @@ VLM_TEST_SETTINGS = {
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=48)],
    ),
    "qwen_vl": VLMTestInfo(
@@ -449,10 +456,7 @@ VLM_TEST_SETTINGS = {
        models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
@@ -463,7 +467,7 @@ VLM_TEST_SETTINGS = {
        models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -472,7 +476,7 @@ VLM_TEST_SETTINGS = {
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -497,9 +501,6 @@ VLM_TEST_SETTINGS = {
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
@@ -509,6 +510,19 @@ VLM_TEST_SETTINGS = {
            limit_mm_per_prompt={"image": 4},
        )],
    ),
+    # regression test for https://github.com/vllm-project/vllm/issues/15122
+    "qwen2_5_vl-windows-attention": VLMTestInfo(
+        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        custom_test_opts=[CustomTestOptions(
+            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
+            limit_mm_per_prompt={"image": 1},
+        )],
+    ),
 }
 # yapf: enable
@@ -560,13 +574,15 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
-        fork_new_process_for_each_test=False,
+        create_new_process_for_each_test=False,
    ))
 def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
-                             hf_runner: Type[HfRunner],
+                             hf_runner: type[HfRunner],
-                             vllm_runner: Type[VllmRunner],
+                             vllm_runner: type[VllmRunner],
-                             image_assets: _ImageAssets):
+                             image_assets: _ImageAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
@@ -583,13 +599,15 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
-        fork_new_process_for_each_test=False,
+        create_new_process_for_each_test=False,
    ))
 def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
-                            hf_runner: Type[HfRunner],
+                            hf_runner: type[HfRunner],
-                            vllm_runner: Type[VllmRunner],
+                            vllm_runner: type[VllmRunner],
-                            image_assets: _ImageAssets):
+                            image_assets: _ImageAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
@@ -606,13 +624,15 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
-        fork_new_process_for_each_test=False,
+        create_new_process_for_each_test=False,
    ))
 def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
-                                hf_runner: Type[HfRunner],
+                                hf_runner: type[HfRunner],
-                                vllm_runner: Type[VllmRunner],
+                                vllm_runner: type[VllmRunner],
-                                image_assets: _ImageAssets):
+                                image_assets: _ImageAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
@@ -628,11 +648,13 @@ def test_image_embedding_models(model_type: str,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
-        fork_new_process_for_each_test=False,
+        create_new_process_for_each_test=False,
    ))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
-                      hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
+                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
-                      video_assets: _VideoAssets):
+                      video_assets: _VideoAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
@@ -648,14 +670,17 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
-        fork_new_process_for_each_test=False,
+        create_new_process_for_each_test=False,
    ))
 def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
+    monkeypatch,
 ):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
@@ -671,14 +696,16 @@ def test_custom_inputs_models(
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
-        fork_new_process_for_each_test=True,
+        create_new_process_for_each_test=True,
    ))
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
-                                   hf_runner: Type[HfRunner],
+                                   hf_runner: type[HfRunner],
-                                   vllm_runner: Type[VllmRunner],
+                                   vllm_runner: type[VllmRunner],
-                                   image_assets: _ImageAssets):
+                                   image_assets: _ImageAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
@@ -695,14 +722,16 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
-        fork_new_process_for_each_test=True,
+        create_new_process_for_each_test=True,
    ))
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
-                                  hf_runner: Type[HfRunner],
+                                  hf_runner: type[HfRunner],
-                                  vllm_runner: Type[VllmRunner],
+                                  vllm_runner: type[VllmRunner],
-                                  image_assets: _ImageAssets):
+                                  image_assets: _ImageAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
@@ -719,14 +748,16 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
-        fork_new_process_for_each_test=True,
+        create_new_process_for_each_test=True,
    ))
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
-                                      hf_runner: Type[HfRunner],
+                                      hf_runner: type[HfRunner],
-                                      vllm_runner: Type[VllmRunner],
+                                      vllm_runner: type[VllmRunner],
-                                      image_assets: _ImageAssets):
+                                      image_assets: _ImageAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
@@ -742,12 +773,14 @@ def test_image_embedding_models_heavy(model_type: str,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
-        fork_new_process_for_each_test=True,
+        create_new_process_for_each_test=True,
    ))
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
-                            hf_runner: Type[HfRunner],
+                            hf_runner: type[HfRunner],
-                            vllm_runner: Type[VllmRunner],
+                            vllm_runner: type[VllmRunner],
-                            video_assets: _VideoAssets):
+                            video_assets: _VideoAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
@@ -763,15 +796,18 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
-        fork_new_process_for_each_test=True,
+        create_new_process_for_each_test=True,
    ))
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
+    monkeypatch,
 ):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -2,7 +2,7 @@
 import os
 import re
-from typing import List, Optional, Tuple, Type
+from typing import Optional
 import pytest
 from transformers import AutoTokenizer
@@ -26,7 +26,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these
 models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")]
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
@@ -56,9 +56,9 @@ if current_platform.is_rocm():
 def run_test(
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput]],
+    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,

--- a/tests/models/decoder_only/vision_language/test_phi4mm.py
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+import re
+from typing import Optional
+import pytest
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.image import rescale_image_size
+from vllm.platforms import current_platform
+from vllm.sequence import SampleLogprobs
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ...utils import check_logprobs_close
+# Single-image prompts keyed by image asset name; each wraps one
+# <|image_1|> placeholder in <|user|> ... <|end|> / <|assistant|> markers.
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+    "cherry_blossom":
+    "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+})
+# Prompt with two image placeholders, used by the multi-image test.
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+# NOTE: this call runs at import time, i.e. as soon as the module is
+# collected (presumably fetching the checkpoint when it is not cached --
+# see the huggingface_hub docs).
+model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
+# Since the vision-lora and speech-lora co-exist with the base model,
+# we have to manually specify the path of the lora weights.
+vision_lora_path = os.path.join(model_path, "vision-lora")
+# Parametrization source for the `model` argument of the tests below.
+models = [model_path]
+def vllm_to_hf_output(vllm_output: tuple[list[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
+    """Rewrite a vLLM output tuple so it can be compared with HF output.
+    The token ids reported by vLLM are discarded; ids are recomputed by
+    tokenizing the cleaned text with the tokenizer loaded from ``model``.
+    Returns ``(token_ids, text, logprobs)`` where ``text`` has the HF end
+    markers appended and ``logprobs`` is passed through unchanged.
+    """
+    _, generated_text, logprobs = vllm_output
+    # Drop every run of <|image_N|> placeholders from the generated text.
+    cleaned = re.sub(r"(<\|image_\d+\|>)+", "", generated_text)
+    # The text is expected to start with exactly one space to strip.
+    assert cleaned[0] == " "
+    cleaned = cleaned[1:]
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    token_ids = tokenizer.encode(cleaned)
+    # The first encoded id is expected to be 1 and is removed.
+    assert token_ids[0] == 1
+    return token_ids[1:], cleaned + "<|end|><|endoftext|>", logprobs
+# dtype used for the `dtype` parametrization of the tests in this file.
+target_dtype = "half"
+# ROCm Triton FA can run into shared memory issues with these models,
+# use other backends in the meantime
+# FIXME (mattwong, gshtrasb, hongxiayan)
+if current_platform.is_rocm():
+    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
+def run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    inputs: list[tuple[list[str], PromptImageInput]],
+    model: str,
+    *,
+    max_model_len: int,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    mm_limit: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Check that vLLM and HuggingFace agree on greedy logprobs.
+    Every ``(prompts, images)`` pair in ``inputs`` is generated once with
+    the vLLM runner and once with the HF runner; the per-case results are
+    then compared with ``check_logprobs_close``.
+    All the image fixtures for the test are from IMAGE_ASSETS.
+    For the HF runner the PIL images are passed as input; for the vLLM
+    runner MultiModalDataDict objects and the corresponding
+    MultiModalConfig are used. The text input is adjusted to abide by the
+    vLLM contract, and the text output is sanitized so it can be compared
+    with HF.
+    ``mm_limit`` is the per-prompt image limit given to vLLM;
+    ``max_model_len`` should be greater than the image feature size.
+    """
+    # NOTE: the order is deliberate -- vLLM first, HF second. vLLM needs a
+    # fresh process without cuda initialization; running HF first would
+    # initialize cuda and hurt the multiprocessing backend with the fork
+    # method (the default method).
+    engine_kwargs = dict(
+        task="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=320,
+        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
+        enforce_eager=True,
+    )
+    with vllm_runner(model, **engine_kwargs) as vllm_model:
+        # Register the vision LoRA with the engine before generating.
+        vision_lora = LoRARequest("vision", 1, vision_lora_path)
+        vllm_model.model.llm_engine.add_lora(lora_request=vision_lora)
+        vllm_outputs_per_case = []
+        for prompts, images in inputs:
+            case_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+            )
+            vllm_outputs_per_case.append(case_outputs)
+    # The HF reference run uses eager attention.
+    # NOTE(review): the original note says phi3_v didn't work with
+    # flash_attn -- confirm the same restriction applies to this model.
+    with hf_runner(model,
+                   dtype=dtype,
+                   model_kwargs={"_attn_implementation": "eager"
+                                 }) as hf_model:
+        eos_token_id = hf_model.processor.tokenizer.eos_token_id
+        hf_outputs_per_case = []
+        for prompts, images in inputs:
+            case_outputs = hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                eos_token_id=eos_token_id,
+                num_logits_to_keep=0,
+            )
+            hf_outputs_per_case.append(case_outputs)
+    # Compare case by case, HF as side 0 and vLLM as side 1.
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+                                        vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+# Since we use _attn_implementation="eager" for hf_runner, there is more
+# significant numerical difference. The basic `logprobs=5` fails to pass.
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.7, 0.75, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [4096])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_model_len: int, max_tokens: int,
+                num_logprobs: int) -> None:
+    """Single-image comparison of vLLM against HF at several image scales.
+    For every image asset one test case is built that repeats the asset's
+    prompt once per entry of ``size_factors`` and pairs it with the image
+    rescaled by that factor.
+    """
+    images = [asset.pil_image for asset in image_assets]
+    inputs_per_image = []
+    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
+        repeated_prompts = [prompt] * len(size_factors)
+        scaled_images = [
+            rescale_image_size(image, factor) for factor in size_factors
+        ]
+        inputs_per_image.append((repeated_prompts, scaled_images))
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_image,
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        # [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+@pytest.mark.xfail(
+    reason="Phi-4-MM multi-image inference is divergent with hf model.")
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
+                             size_factors, dtype: str, max_model_len: int,
+                             max_tokens: int, num_logprobs: int) -> None:
+    """Multi-image comparison of vLLM against HF (currently xfail).
+    A single test case is built: the multi-image prompt repeated once per
+    entry of ``size_factors``, each paired with the full list of image
+    assets rescaled by that factor.
+    """
+    images = [asset.pil_image for asset in image_assets]
+    repeated_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
+    image_groups = []
+    for factor in size_factors:
+        image_groups.append(
+            [rescale_image_size(image, factor) for image in images])
+    inputs_per_case = [(repeated_prompts, image_groups)]
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_case,
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=2,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -4,9 +4,8 @@
 Run `pytest tests/models/decoder_only/vision_language/test_pixtral.py`.
 """
 import json
-import uuid
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Optional
 import os
 import pytest
@@ -17,8 +16,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
 from transformers import AutoProcessor
-from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams,
+from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
-                  TextPrompt, TokensPrompt)
 from vllm.multimodal import MultiModalDataBuiltins
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.sequence import Logprob, SampleLogprobs
@@ -30,8 +28,11 @@ from ....utils import models_path_prefix
 if TYPE_CHECKING:
    from _typeshed import StrPath
-MODELS = [os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")]
+PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
-#todo
+MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+MODELS = [os.path.join(models_path_prefix, PIXTRAL_ID), os.path.join(models_path_prefix, MISTRAL_SMALL_3_1_ID)]
 IMG_URLS = [
    "https://picsum.photos/id/237/400/300",
    "https://picsum.photos/id/231/200/300",
@@ -41,7 +42,7 @@ IMG_URLS = [
 PROMPT = "Describe each image in one short sentence."
-def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
+def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
    return [{
        "role":
        "user",
@@ -57,7 +58,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
    }]
-def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
+def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
    return [{
        "role":
        "user",
@@ -71,7 +72,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
    }]
-def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
+def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
    msg = _create_msg_format(urls)
    tokenizer = MistralTokenizer.from_model("pixtral")
@@ -92,7 +93,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
    return engine_inputs
-def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt:
+def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
    msg = _create_msg_format_hf(urls)
    tokenizer = AutoProcessor.from_pretrained(os.path.join(models_path_prefix, "mistral-community/pixtral-12b"))
@@ -128,10 +129,12 @@ MAX_MODEL_LEN = [8192, 65536]
 FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
 assert FIXTURES_PATH.exists()
-FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
+FIXTURE_LOGPROBS_CHAT = {
-FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
+    PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
+    MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
+}
-OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
+OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
 # For the test author to store golden output in JSON
@@ -169,12 +172,12 @@ def test_chat(
    model: str,
    dtype: str,
 ) -> None:
-    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
+    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
+        FIXTURE_LOGPROBS_CHAT[model])
    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
-            enable_chunked_prefill=False,
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
@@ -186,70 +189,40 @@ def test_chat(
            outputs.extend(output)
    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
+    # Remove last `None` prompt_logprobs to compare with fixture
+    for i in range(len(logprobs)):
+        assert logprobs[i][-1] is None
+        logprobs[i] = logprobs[i][:-1]
    check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
                         outputs_1_lst=logprobs,
                         name_0="h100_ref",
                         name_1="output")
-@large_gpu_test(min_gb=80)
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
-    EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
-    args = EngineArgs(
-        model=model,
-        tokenizer_mode="mistral",
-        enable_chunked_prefill=False,
-        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
-        dtype=dtype,
-    )
-    engine = LLMEngine.from_engine_args(args)
-    engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
-    engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
-    outputs = []
-    count = 0
-    while True:
-        out = engine.step()
-        count += 1
-        for request_output in out:
-            if request_output.finished:
-                outputs.append(request_output)
-        if count == 2:
-            engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
-                               SAMPLING_PARAMS)
-        if not engine.has_unfinished_requests():
-            break
-    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
-    check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
-                         outputs_1_lst=logprobs,
-                         name_0="h100_ref",
-                         name_1="output")
 @large_gpu_test(min_gb=48)
 @pytest.mark.parametrize(
    "prompt,expected_ranges",
    [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
-        "offset": 10,
+        "offset": 11,
        "length": 494
    }]),
     (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
-         "offset": 10,
+         "offset": 11,
         "length": 266
     }, {
-         "offset": 276,
+         "offset": 277,
         "length": 1056
     }, {
-         "offset": 1332,
+         "offset": 1333,
         "length": 418
     }])])
-def test_multi_modal_placeholders(
+def test_multi_modal_placeholders(vllm_runner, prompt,
-        vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None:
+                                  expected_ranges: list[PlaceholderRange],
+                                  monkeypatch) -> None:
+    # This placeholder checking test only works with V0 engine
+    # where `multi_modal_placeholders` is returned with `RequestOutput`
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(
            os.path.join(models_path_prefix, "mistral-community/pixtral-12b"),
            max_model_len=8192,

--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
+from typing import Any, Optional, TypedDict, Union
 import os
 import numpy.typing as npt
@@ -16,6 +16,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Set `VLLM_USE_V1=0` in the environment for every test in this module.
+
+    V1 Test: batch_make_xxxxx_embeddings calls a V0 internal, so these
+    tests are pinned to the V0 engine. The fixture is autouse and
+    function-scoped, so it is applied to each test without being requested.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 models = [os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")]
 target_dtype = "half"
@@ -71,21 +80,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
 def batch_make_image_embeddings(
-        image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
+        image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
-        llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]:
+        llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
    """batched image embeddings for Qwen2-VL
    This will infer all images' embeddings in a single batch, 
      and split the result according to input batches.
    image_batches:
-      - Single-image batches: `List[Image.Image]`
+      - Single-image batches: `list[Image.Image]`
-      - Multiple-image batches: `List[List[Image.Image]]]`
+      - Multiple-image batches: `list[list[Image.Image]]`
-    returns: `List[Qwen2VLPromptImageEmbeddingInput]`
+    returns: `list[Qwen2VLPromptImageEmbeddingInput]`
    """
-    image_batches_: List[Any] = image_batches[:]
+    image_batches_: list[Any] = image_batches[:]
    # convert single-image batches to multiple-image batches
    for idx in range(len(image_batches_)):
@@ -95,7 +104,7 @@ def batch_make_image_embeddings(
        assert isinstance(image_batches_[idx], list)
    # append all images into a list (as a batch)
-    images: List[Image.Image] = []
+    images: list[Image.Image] = []
    for image_batch in image_batches_:
        images += image_batch
@@ -120,10 +129,11 @@ def batch_make_image_embeddings(
            return visual(pixel_values_on_device,
                          grid_thw=image_grid_thw_on_device)
+    # V1 Test: this calls a V0 internal.
    image_embeds = torch.concat(llm.apply_model(get_image_embeds))
    # split into original batches
-    result: List[Qwen2VLPromptImageEmbeddingInput] = []
+    result: list[Qwen2VLPromptImageEmbeddingInput] = []
    image_counter = 0
    embed_counter = 0
    for image_batch in image_batches_:
@@ -155,7 +165,7 @@ def batch_make_image_embeddings(
 def batch_make_video_embeddings(
        video_batches: PromptVideoInput, processor,
-        llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]:
+        llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
    """batched video embeddings for Qwen2-VL
    A NDArray represents a single video's all frames.
@@ -164,21 +174,21 @@ def batch_make_video_embeddings(
      and split the result according to input batches.
    video_batches:
-      - Single-video batches: `List[NDArray]`
+      - Single-video batches: `list[NDArray]`
-      - Multiple-video batches: `List[List[NDArray]]`
+      - Multiple-video batches: `list[list[NDArray]]`
    """
-    video_batches_: List[Any] = video_batches[:]
+    video_batches_: list[Any] = video_batches[:]
    for idx in range(len(video_batches_)):
        if not isinstance(video_batches_[idx], list):
-            single_video_batch: List[npt.NDArray] = [video_batches_[idx]]
+            single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
            video_batches_[idx] = single_video_batch
        assert isinstance(video_batches_[idx], list)
    # append all videos into a list (as a batch)
-    videos: List[npt.NDArray] = []
+    videos: list[npt.NDArray] = []
    for video_batch in video_batches_:
        videos += video_batch
@@ -203,10 +213,11 @@ def batch_make_video_embeddings(
            return visual(pixel_values_on_device,
                          grid_thw=video_grid_thw_on_device)
+    # V1 Test: this calls a V0 internal.
    video_embeds = torch.concat(llm.apply_model(get_image_embeds))
    # split into original batches
-    result: List[Qwen2VLPromptVideoEmbeddingInput] = []
+    result: list[Qwen2VLPromptVideoEmbeddingInput] = []
    video_counter = 0
    embed_counter = 0
    for video_batch in video_batches_:
@@ -237,8 +248,8 @@ def batch_make_video_embeddings(
 def run_embedding_input_test(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
+    inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
    model: str,
    *,
    dtype: str,
@@ -255,7 +266,6 @@ def run_embedding_input_test(
    processor = AutoProcessor.from_pretrained(model)
-    # NOTE:
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     task="generate",
@@ -325,8 +335,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
                                         num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]
-    inputs_per_case: List[Tuple[
+    inputs_per_case: list[tuple[
-        List[str], PromptImageInput, PromptVideoInput]] = [(
+        list[str], PromptImageInput, PromptVideoInput]] = [(
            [prompt for _ in size_factors],
            [rescale_image_size(image, factor) for factor in size_factors],
            [],
@@ -367,7 +377,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
                                                  num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]
-    inputs_per_case: List[Tuple[List[str], PromptImageInput,
+    inputs_per_case: list[tuple[list[str], PromptImageInput,
                                PromptVideoInput]] = [(
                                    [MULTIIMAGE_PROMPT for _ in size_factors],
                                    [[
@@ -415,8 +425,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
        for asset in video_assets
    ]
-    inputs_per_case: List[Tuple[
+    inputs_per_case: list[tuple[
-        List[str], PromptImageInput, PromptVideoInput]] = [(
+        list[str], PromptImageInput, PromptVideoInput]] = [(
            [prompt for _ in size_factors],
            [],
            [rescale_video_size(video, factor) for factor in size_factors],

--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
 # SPDX-License-Identifier: Apache-2.0
 """Helpers for building inputs that can be leveraged for different test types.
 """
+from collections.abc import Iterable
 from pathlib import PosixPath
-from typing import Callable, Iterable, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 import torch
@@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
 def get_model_prompts(base_prompts: Iterable[str],
                      img_idx_to_prompt: Optional[Callable[[int], str]],
                      video_idx_to_prompt: Optional[Callable[[int], str]],
-                      prompt_formatter: Callable[[str], str]) -> List[str]:
+                      prompt_formatter: Callable[[str], str]) -> list[str]:
    """Given a model-agnostic base prompt and test configuration for a model(s)
    to be tested, update the media placeholders and apply the prompt formatting
    to get the test prompt string for this model.
@@ -218,7 +219,7 @@ def build_video_inputs_from_test_info(
    ) for video, prompt in zip(sampled_vids, model_prompts)]
-def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
+def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                             size_type: SizeType):
    """Applies a size scaler to one image; this can be an image size factor,
    which scales the image while maintaining the aspect ratio"""

--- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
@@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on.
 """
 import itertools
 from collections import OrderedDict
-from typing import Dict, Iterable, Tuple
+from collections.abc import Iterable
 import pytest
@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
                    ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
-def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
+def get_filtered_test_settings(
-                               test_type: VLMTestType,
+        test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
-                               fork_per_test: bool) -> Dict[str, VLMTestInfo]:
+        new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
    """Given the dict of potential test settings to run, return a subdict
    of tests who have the current test type enabled with the matching val for
    new_proc_per_test.
@@ -43,22 +43,22 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
            # Everything looks okay; keep if this has correct proc handling
            if (test_info.distributed_executor_backend
-                    is not None) == fork_per_test:
+                    is not None) == new_proc_per_test:
                matching_tests[test_name] = test_info
    return matching_tests
-def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
+def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
                             test_type: VLMTestType,
-                             fork_new_process_for_each_test: bool):
+                             create_new_process_for_each_test: bool):
    """Converts all of our VLMTestInfo into an expanded list of parameters.
    This is similar to nesting pytest parametrize calls, but done directly
    through an itertools product so that each test can set things like
    size factors etc, while still running in isolated test cases.
    """
    matching_tests = get_filtered_test_settings(
-        test_settings, test_type, fork_new_process_for_each_test)
+        test_settings, test_type, create_new_process_for_each_test)
    # Ensure that something is wrapped as an iterable if it's not already
    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
@@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
 def get_wrapped_test_sizes(
        test_info: VLMTestInfo,
-        test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
+        test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
    """Given a test info which may have size factors or fixed sizes, wrap them
    and combine them into an iterable, each of which will be used in parameter
    expansion.

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
 # SPDX-License-Identifier: Apache-2.0
 """Core test implementation to be shared across modalities."""
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, Optional, Union
 import torch
 from PIL.Image import Image
-from transformers import BatchEncoding
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm.config import TaskOption
@@ -17,9 +16,9 @@ from .types import RunnerOutput
 def run_test(
    *,
-    hf_runner: Type[HfRunner],
+    hf_runner: type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
-    inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
+    inputs: list[tuple[list[str], list[Union[list[Image], Image]]]],
    model: str,
    dtype: str,
    max_tokens: int,
@@ -29,15 +28,14 @@ def run_test(
    max_num_seqs: int,
    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
-    auto_cls: Type[_BaseAutoModelClass],
+    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
-    postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
    comparator: Callable[..., None],
    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
-    stop_str: Optional[List[str]],
+    stop_str: Optional[list[str]],
-    limit_mm_per_prompt: Dict[str, int],
+    limit_mm_per_prompt: dict[str, int],
-    vllm_runner_kwargs: Optional[Dict[str, Any]],
+    vllm_runner_kwargs: Optional[dict[str, Any]],
-    hf_model_kwargs: Optional[Dict[str, Any]],
+    hf_model_kwargs: Optional[dict[str, Any]],
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
    task: TaskOption = "auto",
    runner_mm_key: str = "images",
@@ -61,7 +59,9 @@ def run_test(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
-    vllm_runner_kwargs_: Dict[str, Any] = {}
+    vllm_runner_kwargs_: dict[str, Any] = {
+        "disable_mm_preprocessor_cache": True,
+    }
    if model_info.tokenizer:
        vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
    if model_info.tokenizer_mode:
@@ -84,7 +84,7 @@ def run_test(
                     **vllm_runner_kwargs_) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
-        vllm_kwargs: Dict[str, Any] = {}
+        vllm_kwargs: dict[str, Any] = {}
        if get_stop_token_ids is not None:
            vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
        if stop_str:
@@ -99,7 +99,6 @@ def run_test(
    hf_model = hf_runner(model,
                         dtype=dtype,
                         auto_cls=auto_cls,
-                         postprocess_inputs=postprocess_inputs,
                         model_kwargs=hf_model_kwargs)
    # Some models need to patch things like the model processor, e.g., internvl

--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
 # SPDX-License-Identifier: Apache-2.0
 """Custom input builders for edge-cases in different models."""
+from io import BytesIO
 from typing import Callable
+import requests
+from PIL import Image
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                   sample_frames_from_video)
@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
        build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
        build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
    ]
+def windows_attention_image_qwen2_5_vl():
+    """Build single-image inputs from the image in vLLM issue #15122.
+
+    Downloads the image, wraps it in a chat prompt that uses the
+    `<|im_start|>` / `<|vision_start|>` markers and asks for a description,
+    and returns the result of `build_single_image_inputs` for that one
+    image/prompt pair with a single 0.5 size factor.
+
+    Raises:
+        requests.RequestException: if the download times out or the server
+            answers with an HTTP error status.
+    """
+    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
+    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
+    # Bound the request so an unreachable host cannot hang the test run, and
+    # surface HTTP errors here rather than as an opaque PIL decode failure.
+    response = requests.get(image_url, timeout=30)
+    response.raise_for_status()
+    image = Image.open(BytesIO(response.content))
+    question = "Describe the image."
+    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
+    return build_single_image_inputs([image], [prompt], wrapped_sf)