Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

a3f8d5dd · zhuwenwen · 8d75f22e · f34eca5f · a3f8d5dd · a3f8d5dd
Commit a3f8d5dd authored Dec 17, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/moe/test_modular_oai_triton_moe.py
+++ b/tests/kernels/moe/test_modular_oai_triton_moe.py
@@ -209,6 +209,7 @@ def test_oai_triton_moe(
    num_experts: int,
    topk: int,
    unfused: bool,
+    workspace_init,
 ):
    current_platform.seed_everything(0)
    (

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -231,6 +231,7 @@ def test_fused_moe(
    padding: bool,
    chunk_size: int,
    monkeypatch,
+    workspace_init,
 ):
    current_platform.seed_everything(7)


--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -40,7 +40,7 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @torch.inference_mode()
 def test_cutlass_fp4_moe_no_graph(
-    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init
 ):
    current_platform.seed_everything(7)
    with set_current_vllm_config(

--- a/tests/kernels/moe/test_ocp_mx_moe.py
+++ b/tests/kernels/moe/test_ocp_mx_moe.py
@@ -17,7 +17,7 @@ QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
 ) >= version.parse("0.8.99")

 TRTLLM_GEN_MXFP4_AVAILABLE = (
-    current_platform.is_cuda() and current_platform.is_device_capability(100)
+    current_platform.is_cuda() and current_platform.is_device_capability_family(100)
 )

 HOPPER_MXFP4_BF16_AVAILABLE = (
@@ -70,12 +70,12 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
            f"{torch.cuda.device_count()}"
        )

-    # `cuda_graph_sizes=[16]` to reduce load time.
+    # `cudagraph_capture_sizes=[16]` to reduce load time.
    with vllm_runner(
        model_case.model_id,
        tensor_parallel_size=model_case.tp,
        load_format="dummy",
-        cuda_graph_sizes=[16],
+        cudagraph_capture_sizes=[16],
    ) as llm:
        # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562
        # def check_model(model):
@@ -799,7 +799,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
 @pytest.mark.skipif(
    not (
        current_platform.is_cuda()
-        and current_platform.is_device_capability(100)
+        and current_platform.is_device_capability_family(100)
        and has_flashinfer()
    ),
    reason="NVIDIA GPU sm100 and flashinfer are required for this test",

--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -46,6 +46,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.v1.worker.workspace import init_workspace_manager

 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -181,6 +182,7 @@ def test_fused_moe_batched_experts(
    e: int,
    topk: int,
    dtype: torch.dtype,
+    workspace_init,
 ):
    current_platform.seed_everything(7)

@@ -863,6 +865,9 @@ def _pplx_test_loop(
    make_weights: bool,
    test_fn: Callable,
 ):
+    device = torch.device(f"cuda:{pgi.local_rank}")
+    init_workspace_manager(device)
+
    def format_result(msg, ex=None):
        if ex is not None:
            x = str(ex)

--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -30,16 +30,11 @@ def ref_dynamic_per_token_quant(
        if quant_dtype == torch.int8
        else torch.finfo(quant_dtype)
    )
-    qtype_traits_max = (
-        ROCM_FP8FNUZ_MAX
-        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
-        else qtype_traits.max
-    )
-    qtype_traits_min = (
-        -ROCM_FP8FNUZ_MAX
-        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
-        else qtype_traits.min
+    use_fp8fnuz = (
+        current_platform.is_fp8_fnuz() and quant_dtype == current_platform.fp8_dtype()
    )
+    qtype_traits_max = ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.max
+    qtype_traits_min = -ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.min
    qtype_max = as_float32_tensor(qtype_traits_max)
    s_1 = as_float32_tensor(1.0)
    s_512 = as_float32_tensor(512.0)

--- a/tests/kernels/quantization/test_awq.py
+++ b/tests/kernels/quantization/test_awq.py
@@ -41,9 +41,9 @@ def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
        qweight = torch.randint(
            -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
        )
-        scales = torch.randint(
+        scales = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
+        qzeros = torch.randint(
            -2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
        )
-        qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
        split_k_iters = 8
-        opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters))
+        opcheck(torch.ops._C.awq_gemm, (input, qweight, scales, qzeros, split_k_iters))
--- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -18,7 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types

-IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
+IS_SUPPORTED_BY_GPU = (
+    current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
+)


 def to_fp8(tensor: torch.Tensor) -> torch.Tensor:

--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@@ -62,7 +62,7 @@ def test_quantfp8_group_functionality(
    assert scales_col.stride(1) == batch_size

    # Test column-major scales consistency
-    assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
+    torch.testing.assert_close(scales_col, scales_native, rtol=1e-9, atol=1e-8)

    # 3. Test CUDA implementation (only for divisible dimensions)
    if is_divisible:
@@ -71,7 +71,7 @@ def test_quantfp8_group_functionality(
        assert scales_cuda.shape == (batch_size, expected_num_groups)

        # Verify CUDA/native consistency
-        assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8)
+        torch.testing.assert_close(scales_cuda, scales_native, rtol=2e-7, atol=2e-8)

        # Quantized values should mostly match
        diff_count = (x_quant_cuda != x_quant_native).sum().item()

--- a/tests/kernels/quantization/test_scaled_mm_kernel_selection.py
+++ b/tests/kernels/quantization/test_scaled_mm_kernel_selection.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for ScaledMM kernel selection logic (CPU-only)
+
+Run `pytest tests/kernels/quantization/test_scaled_mm_kernel_selection.py`.
+"""
+
+import inspect
+from abc import ABC
+
+import pytest
+
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+    ScaledMMLinearLayerConfig,
+)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
+    AiterScaledMMLinearKernel,
+)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import (
+    CPUScaledMMLinearKernel,
+)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearKernel,
+)
+
+pytestmark = pytest.mark.cpu_test
+
+
+def test_is_supported_is_abstract():
+    """Check that ScaledMMLinearKernel is an ABC that exposes is_supported().
+
+    NOTE(review): despite the test name, only the presence of the attribute
+    is asserted; whether it is actually declared abstract is not inspected.
+    """
+    assert issubclass(ScaledMMLinearKernel, ABC)
+    assert hasattr(ScaledMMLinearKernel, "is_supported")
+
+
+def test_cpu_kernel_implements_is_supported():
+    """CPUScaledMMLinearKernel must expose a class-callable is_supported()."""
+    kernel_cls = CPUScaledMMLinearKernel
+    assert hasattr(kernel_cls, "is_supported"), (
+        "CPUScaledMMLinearKernel missing is_supported() method"
+    )
+    # Looked up on the class, a classmethod is reported as a bound method by
+    # inspect.ismethod(); a plain function is accepted by this check as well.
+    is_method = inspect.ismethod(kernel_cls.is_supported)
+    assert is_method or inspect.isfunction(kernel_cls.is_supported), (
+        "CPUScaledMMLinearKernel.is_supported() should be a classmethod"
+    )
+    # Invoke it without an instance and validate the (flag, reason) pair.
+    supported, why_not = kernel_cls.is_supported()
+    assert isinstance(supported, bool), "is_supported() should return a bool"
+    assert why_not is None or isinstance(why_not, str), (
+        "reason should be str or None"
+    )
+
+
+def test_aiter_kernel_implements_is_supported():
+    """AiterScaledMMLinearKernel must expose a class-callable is_supported()."""
+    kernel_cls = AiterScaledMMLinearKernel
+    assert hasattr(kernel_cls, "is_supported"), (
+        "AiterScaledMMLinearKernel missing is_supported() method"
+    )
+    # Looked up on the class, a classmethod is reported as a bound method by
+    # inspect.ismethod(); a plain function is accepted by this check as well.
+    is_method = inspect.ismethod(kernel_cls.is_supported)
+    assert is_method or inspect.isfunction(kernel_cls.is_supported), (
+        "AiterScaledMMLinearKernel.is_supported() should be a classmethod"
+    )
+    # Invoke it without an instance. Only the shape of the answer is checked:
+    # on a CPU-only host the kernel is expected to report "unsupported" with a
+    # reason, but the actual value is deliberately not asserted here, so the
+    # test stays valid on non-ROCm platforms.
+    supported, why_not = kernel_cls.is_supported()
+    assert isinstance(supported, bool), "is_supported() should return a bool"
+    assert why_not is None or isinstance(why_not, str), (
+        "reason should be str or None"
+    )
+
+
+def test_cpu_kernel_accepts_all_configs():
+    """CPUScaledMMLinearKernel.can_implement() must accept each sample config."""
+    # (is_channelwise, is_static_input_scheme, input_symmetric)
+    flag_sets = [
+        (False, True, True),
+        (True, False, False),
+    ]
+    configs = [
+        ScaledMMLinearLayerConfig(
+            is_channelwise=channelwise,
+            is_static_input_scheme=static_input,
+            input_symmetric=symmetric,
+        )
+        for channelwise, static_input, symmetric in flag_sets
+    ]
+
+    for config in configs:
+        can_impl, reason = CPUScaledMMLinearKernel.can_implement(config)
+        assert can_impl, (
+            f"CPUScaledMMLinearKernel should accept config {config}: {reason}"
+        )
--- a/tests/models/fixtures/audioflamingo3/expected_results_batched.json
+++ b/tests/models/fixtures/audioflamingo3/expected_results_batched.json
+{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other.", "(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world"], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645], [5349, 8, 2014, 13216, 429, 4128, 4157, 3158, 9355, 11, 7578, 404, 4849, 279, 46488, 315, 3691, 323, 4158, 304, 279, 1879, 151645, 151671]]}
\ No newline at end of file
--- a/tests/models/fixtures/audioflamingo3/expected_results_single.json
+++ b/tests/models/fixtures/audioflamingo3/expected_results_single.json
+{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}
\ No newline at end of file
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -5,12 +5,12 @@ import json

 import pytest

-from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
+from vllm.sampling_params import SamplingParams
+from vllm.tokenizers.mistral import MistralTokenizer
+from vllm.tool_parsers.mistral_tool_parser import (
    MistralToolCall,
    MistralToolParser,
 )
-from vllm.sampling_params import SamplingParams
-from vllm.tokenizers import MistralTokenizer

 from ...utils import check_logprobs_close


--- a/tests/models/language/pooling/test_mm_classifier_conversion.py
+++ b/tests/models/language/pooling/test_mm_classifier_conversion.py
@@ -17,7 +17,6 @@ def test_idefics_multimodal(
    with vllm_runner(
        model_name="HuggingFaceM4/Idefics3-8B-Llama3",
        runner="pooling",
-        task="classify",
        convert="classify",
        load_format="dummy",
        max_model_len=512,
@@ -86,7 +85,6 @@ def test_gemma_multimodal(
    with vllm_runner(
        model_name="google/gemma-3-4b-it",
        runner="pooling",
-        task="classify",
        convert="classify",
        load_format="auto",
        hf_overrides=update_config,

--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -68,3 +68,34 @@ def test_modernbert_models(
        hf_output = torch.tensor(hf_output).cpu().float()
        vllm_output = torch.tensor(vllm_output).cpu().float()
        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
+
+
+@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
+@pytest.mark.parametrize("dtype", ["float"])
+@torch.inference_mode
+def test_auto_conversion(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Token-classification outputs from vLLM should match HF within 1e-2."""
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.token_classify(example_prompts)
+
+    hf_outputs = []
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+    ) as hf_model:
+        for prompt in example_prompts:
+            encoded = hf_model.tokenizer([prompt], return_tensors="pt")
+            encoded = hf_model.wrap_device(encoded)
+            logits = hf_model.model(**encoded).logits
+            hf_outputs.append(softmax(logits[0]))
+
+    # check logits difference
+    for hf_probs, vllm_probs in zip(hf_outputs, vllm_outputs):
+        hf_probs = torch.tensor(hf_probs).cpu().float()
+        vllm_probs = torch.tensor(vllm_probs).cpu().float()
+        assert torch.allclose(hf_probs, vllm_probs, atol=1e-2)
--- a/tests/models/multimodal/generation/test_audioflamingo3.py
+++ b/tests/models/multimodal/generation/test_audioflamingo3.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The vLLM team.
+# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
+# reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import pytest
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "nvidia/audio-flamingo-3-hf"
+
+
+def get_fixture_path(filename):
+    """Return the path of *filename* inside the audioflamingo3 fixtures dir.
+
+    The path is built relative to this test module's directory and is not
+    normalised (it keeps the ``../..`` components).
+    """
+    here = os.path.dirname(__file__)
+    fixtures_dir = os.path.join(here, "../../fixtures/audioflamingo3")
+    return os.path.join(fixtures_dir, filename)
+
+
+@pytest.fixture(scope="module")
+def llm():
+    """Module-scoped LLM instance for MODEL_NAME.
+
+    The dependent tests are skipped when the installed transformers version
+    does not support the model, or when constructing the engine fails for
+    any reason.
+    """
+    # Check if the model is supported by the current transformers version
+    hf_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
+    hf_info.check_transformers_version(on_fail="skip")
+
+    try:
+        engine = LLM(
+            model=MODEL_NAME,
+            trust_remote_code=True,
+            dtype="bfloat16",
+            enforce_eager=True,
+            limit_mm_per_prompt={"audio": 1},
+        )
+    except Exception as e:
+        # Best effort: any load failure turns into a skip, not an error.
+        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
+    else:
+        return engine
+
+
+def test_single_generation(llm):
+    """Transcribe one audio clip and compare it with the stored fixture.
+
+    Skips when the fixture file is missing. The comparison is a lenient
+    two-way substring match, so an empty generation is rejected explicitly:
+    ``"" in expected_text`` is always true and would pass vacuously.
+    """
+    fixture_path = get_fixture_path("expected_results_single.json")
+    if not os.path.exists(fixture_path):
+        pytest.skip(f"Fixture not found: {fixture_path}")
+
+    with open(fixture_path, encoding="utf-8") as f:
+        expected = json.load(f)
+
+    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio_url", "audio_url": {"url": audio_url}},
+                {"type": "text", "text": "Transcribe the input speech."},
+            ],
+        }
+    ]
+
+    # temperature=0.0 is passed so the output is comparable with the fixture.
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
+
+    outputs = llm.chat(
+        messages=messages,
+        sampling_params=sampling_params,
+    )
+    generated_text = outputs[0].outputs[0].text.strip()
+
+    expected_text = expected["transcriptions"][0]
+
+    # Guard against the vacuous pass: the empty string is a substring of
+    # every string.
+    assert generated_text, "Model produced an empty transcription"
+    assert expected_text in generated_text or generated_text in expected_text
+
+
+def test_batched_generation(llm):
+    """Run two audio conversations in one batch and compare with the fixture.
+
+    Skips when the fixture file is missing. Each item's ``expected_idx``
+    selects its reference text in the fixture's ``transcriptions`` list. The
+    comparison is a lenient two-way substring match, so empty generations
+    and a mismatched number of outputs are rejected explicitly.
+    """
+    fixture_path = get_fixture_path("expected_results_batched.json")
+    if not os.path.exists(fixture_path):
+        pytest.skip(f"Fixture not found: {fixture_path}")
+
+    with open(fixture_path, encoding="utf-8") as f:
+        expected = json.load(f)
+
+    items = [
+        {
+            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
+            "question": "What is surprising about the relationship "
+            "between the barking and the music?",
+            "expected_idx": 0,
+        },
+        {
+            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
+            "question": (
+                "Why is the philosopher's name mentioned in the lyrics? "
+                "(A) To express a sense of nostalgia "
+                "(B) To indicate that language cannot express clearly, "
+                "satirizing the inversion of black and white in the world "
+                "(C) To add depth and complexity to the lyrics "
+                "(D) To showcase the wisdom and influence of the philosopher"
+            ),
+            "expected_idx": 1,
+        },
+    ]
+
+    # One single-turn conversation (audio + question) per item.
+    conversations = [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
+                    {"type": "text", "text": item["question"]},
+                ],
+            }
+        ]
+        for item in items
+    ]
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
+
+    outputs = llm.chat(
+        messages=conversations,
+        sampling_params=sampling_params,
+    )
+
+    # zip() silently drops unmatched entries, so pin the count first.
+    assert len(outputs) == len(items), (
+        f"Expected {len(items)} outputs, got {len(outputs)}"
+    )
+
+    for item, output in zip(items, outputs):
+        generated_text = output.outputs[0].text.strip()
+        expected_text = expected["transcriptions"][item["expected_idx"]]
+
+        # Guard against the vacuous pass: the empty string is a substring of
+        # every string.
+        assert generated_text, (
+            f"Model produced an empty answer for item {item['expected_idx']}"
+        )
+        assert expected_text in generated_text or generated_text in expected_text
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Consolidated test for ViT attention backend functionality across multiple models.
+
+This test validates that each multimodal model can successfully generate outputs
+using different ViT attention backends. Tests are parametrized by model and backend.
+"""
+
+from dataclasses import asdict
+from typing import Any
+
+import pytest
+from transformers import AutoProcessor
+
+from vllm import LLM, EngineArgs, SamplingParams
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.video import sample_frames_from_video
+from vllm.platforms import current_platform
+
+from ....utils import create_new_process_for_each_test
+from ...utils import dummy_hf_overrides
+
+# Dots.OCR prompt from official repository
+# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
+# ruff: noqa: E501
+DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+
+1. Bbox format: [x1, y1, x2, y2]
+
+2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
+
+3. Text Extraction & Formatting Rules:
+    - Picture: For the 'Picture' category, the text field should be omitted.
+    - Formula: Format its text as LaTeX.
+    - Table: Format its text as HTML.
+    - All Others (Text, Title, etc.): Format their text as Markdown.
+
+4. Constraints:
+    - The output text must be the original text from the image, with no translation.
+    - All layout elements must be sorted according to human reading order.
+
+5. Final Output: The entire output must be a single JSON object.
+"""
+
+VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
+
+
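+# Model configurations, keyed by a short model id.
+# Every entry provides "model_name", "max_model_len", "max_num_seqs" and
+# "sampling_params". The test routes on "media_type" == "video" first, then on
+# "interface" ("llm_chat" or "llm_generate"). Depending on the handler, the
+# following keys are also read: "limit_mm_per_prompt", "use_processor",
+# "prompt_builder", "question", "output_validator", "supported_backends",
+# "runner_kwargs", "video_params" and (optionally) "gpu_marks".
+# NOTE(review): "use_specific_image" is not read by any handler in this file.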
+MODEL_CONFIGS: dict[str, dict[str, Any]] = {
+    "dots_ocr": {
+        "model_name": "rednote-hilab/dots.ocr",
+        "interface": "llm_chat",
+        "max_model_len": 32768,
+        "max_num_seqs": 1,
+        "limit_mm_per_prompt": {"image": 1},
+        "sampling_params": {
+            "temperature": 0.1,
+            "max_tokens": 16384,
+            "top_p": 0.9,
+            "stop_token_ids": None,
+        },
+        "use_specific_image": "stop_sign",
+        "prompt_builder": "build_dots_ocr_prompt",
+        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
+    },
+    "ernie45_vl": {
+        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
+        "interface": "llm_generate",
+        "max_model_len": 16384,
+        "max_num_seqs": 2,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "use_processor": True,
+        "question": "What is the content of each image?",
+    },
+    "glm4_1v": {
+        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
+        "interface": "llm_generate",
+        "max_model_len": 32768,
+        "max_num_seqs": 2,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "use_processor": True,
+        "question": "What is the content of each image?",
+    },
+    "keye_vl": {
+        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
+        "interface": "llm_generate",
+        "max_model_len": 8192,
+        "max_num_seqs": 5,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "supported_backends": {
+            AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.ROCM_AITER_FA,
+        },
+        "use_processor": True,
+        "question": "What is the content of each image?",
+    },
+    "ovis2_5": {
+        "model_name": "AIDC-AI/Ovis2.5-2B",
+        "interface": "llm_generate",
+        "max_model_len": 8192,
+        "max_num_seqs": 2,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "prompt_builder": "build_ovis_prompt",
+        "question": "What is the content of each image?",
+    },
+    "qwen2_5_vl": {
+        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
+        "interface": "vllm_runner",
+        "media_type": "video",
+        "max_model_len": 4000,
+        "max_num_seqs": 1,
+        "limit_mm_per_prompt": {"video": 1},
+        "sampling_params": {
+            "max_tokens": 128,
+        },
+        "runner_kwargs": {
+            "runner": "generate",
+            "dtype": "bfloat16",
+        },
+        "video_params": {
+            "num_frames": 16,
+            "pruning_rates": [0.0, 0.75],
+        },
+    },
+    "qwen2_5_omni": {
+        "model_name": "Qwen/Qwen2.5-Omni-3B",
+        "interface": "llm_generate",
+        "max_model_len": 32768,
+        "max_num_seqs": 2,
+        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
+        "sampling_params": {
+            "temperature": 0.6,
+            "top_p": 0.95,
+            "top_k": 20,
+            "max_tokens": 16384,
+        },
+        "use_processor": True,
+        "question": "What is the content of each image?",
+    },
+    "qwen3_omni": {
+        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+        "interface": "llm_generate",
+        "max_model_len": 32768,
+        "max_num_seqs": 2,
+        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
+        "sampling_params": {
+            "temperature": 0.6,
+            "top_p": 0.95,
+            "top_k": 20,
+            "max_tokens": 16384,
+        },
+        "use_processor": True,
+        "question": "What is the content of each image?",
+    },
+}
+
+
+# Prompt builder functions
+def build_dots_ocr_prompt(images, config):
+    """Build the Dots.OCR chat messages from the first image in *images*.
+
+    ``config`` is accepted for signature parity with the other prompt
+    builders but is not read here. Returns a one-element list holding a user
+    message: the image as a ``data:image/jpeg;base64,`` URL, followed by the
+    image tags and the OCR instructions.
+    """
+    # Callers are expected to pass only the stop_sign image.
+    encoded = encode_image_base64(images[0])
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
+    }
+    text_part = {
+        "type": "text",
+        "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
+    }
+    return [{"role": "user", "content": [image_part, text_part]}]
+
+
+def build_processor_prompt(images, config):
+    """Render the prompt string with the model's AutoProcessor chat template.
+
+    Loads the processor for ``config["model_name"]``, builds one user message
+    containing every image (as a ``data:image/jpeg;base64,`` URL) followed by
+    ``config["question"]``, and returns the untokenized template output with
+    the generation prompt appended.
+    """
+    processor = AutoProcessor.from_pretrained(
+        config["model_name"], trust_remote_code=True
+    )
+
+    # Image parts first, then the question text.
+    content = []
+    for img in images:
+        data_url = f"data:image/jpeg;base64,{encode_image_base64(img)}"
+        content.append({"type": "image", "image": data_url})
+    content.append({"type": "text", "text": config["question"]})
+
+    return processor.apply_chat_template(
+        [{"role": "user", "content": content}],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+
+def build_ovis_prompt(images, config):
+    """Build the Ovis2.5 text prompt with one numbered ``<image>`` tag per image.
+
+    Args:
+        images: Iterable of images; only the number of items is used.
+        config: Model config dict; ``config["question"]`` is the user question.
+
+    Returns:
+        The prompt string: a user turn listing ``Image-1``, ``Image-2``, ...
+        placeholders followed by the question, then the opening of the
+        assistant turn.
+    """
+    # The text prompt only needs the image count, so the images are not
+    # base64-encoded here; the pixel data never appears in the prompt.
+    # Each entry already ends with "\n" and entries are joined with "\n",
+    # which yields a blank line between placeholders.
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(images, start=1)
+    )
+
+    return (
+        f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+
+
+def build_qwen2_5_video_prompt():
+    """Return the fixed Qwen2.5-VL chat prompt asking for a short video caption.
+
+    The user turn starts with VIDEO_PLACEHOLDER, followed by the instruction.
+    """
+    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+    instruction = "Describe this video with a short sentence (no more than 20 words)"
+    user_turn = f"<|im_start|>user\n{VIDEO_PLACEHOLDER}{instruction}<|im_end|>"
+    return f"{system_turn}{user_turn}<|im_start|>assistant\n"
+
+
+# Handler functions
+def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
+    """Handler for models driven through LLM.generate().
+
+    Builds a prompt for all images in *image_assets*, creates an LLM with the
+    requested ViT attention backend (dummy weights, seed 42), generates once
+    and asserts that every output passes the config's ``output_validator``
+    (default: more than 10 characters).
+    """
+    pil_images = [asset.pil_image for asset in image_assets]
+
+    # Pick the prompt builder: the processor chat template when requested,
+    # otherwise a builder function looked up by name in this module.
+    if config.get("use_processor"):
+        build_prompt = build_processor_prompt
+    else:
+        build_prompt = globals()[config.get("prompt_builder", "build_ovis_prompt")]
+    prompt = build_prompt(pil_images, config)
+
+    # Default to allowing exactly as many images as are passed in.
+    mm_limits = config.get("limit_mm_per_prompt", {"image": len(pil_images)})
+
+    # Create engine
+    engine_args = EngineArgs(
+        model=config["model_name"],
+        trust_remote_code=True,
+        max_model_len=config["max_model_len"],
+        max_num_seqs=config["max_num_seqs"],
+        limit_mm_per_prompt=mm_limits,
+        mm_encoder_attn_backend=mm_encoder_attn_backend,
+        hf_overrides=dummy_hf_overrides,
+        load_format="dummy",
+    )
+    llm = LLM(**{**asdict(engine_args), "seed": 42})
+
+    # Generate
+    request = {
+        "prompt": prompt,
+        "multi_modal_data": {"image": pil_images},
+    }
+    outputs = llm.generate(
+        request,
+        sampling_params=SamplingParams(**config["sampling_params"]),
+    )
+
+    # Validate
+    is_valid = config.get("output_validator", lambda x: len(x) > 10)
+    for result in outputs:
+        generated_text = result.outputs[0].text
+        assert is_valid(generated_text), (
+            f"Validation failed for {config['model_name']}: {generated_text}"
+        )
+
+
+def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
+    """Handler for Dots.OCR, driven through LLM.chat().
+
+    Uses only the asset named "stop_sign", creates an LLM with the requested
+    ViT attention backend (dummy weights, seed 42), runs one chat request and
+    asserts that every output passes the config's ``output_validator``
+    (default: more than 10 characters).
+    """
+    # Keep only the stop_sign image; an IndexError here means it is missing.
+    matching = [
+        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
+    ]
+    stop_sign_image = matching[0]
+
+    # Build messages
+    messages = build_dots_ocr_prompt([stop_sign_image], config)
+
+    # Create engine
+    engine_args = EngineArgs(
+        model=config["model_name"],
+        trust_remote_code=True,
+        max_model_len=config["max_model_len"],
+        max_num_seqs=config["max_num_seqs"],
+        limit_mm_per_prompt=config["limit_mm_per_prompt"],
+        mm_encoder_attn_backend=mm_encoder_attn_backend,
+        hf_overrides=dummy_hf_overrides,
+        load_format="dummy",
+    )
+    llm = LLM(**{**asdict(engine_args), "seed": 42})
+
+    # Generate using chat
+    outputs = llm.chat(
+        messages=messages,
+        sampling_params=SamplingParams(**config["sampling_params"]),
+    )
+
+    # Validate
+    is_valid = config.get("output_validator", lambda x: len(x) > 10)
+    for result in outputs:
+        generated_text = result.outputs[0].text
+        assert is_valid(generated_text), (
+            f"Validation failed for {config['model_name']}: {generated_text}"
+        )
+
+
+def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
+    """Video handler: run greedy generation once per configured pruning rate.
+
+    For every rate in ``config["video_params"]["pruning_rates"]`` a fresh
+    runner is created with ``video_pruning_rate`` set to that rate, and the
+    single output is checked to be a non-empty string with non-empty ids.
+    """
+    video_params = config["video_params"]
+    num_frames = video_params["num_frames"]
+
+    # Frame sampling and the prompt do not depend on the pruning rate, so
+    # they are prepared once instead of once per loop iteration.
+    # NOTE(review): assumes sample_frames_from_video is deterministic and the
+    # runner does not modify the sampled frames in place - confirm.
+    sampled_vids = [
+        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        for asset in video_assets
+    ]
+    prompts = [build_qwen2_5_video_prompt()]
+    # Only the first sampled video is fed to the model.
+    videos = [sampled_vids[0]]
+
+    for pruning_rate in video_params["pruning_rates"]:
+        # Run with vllm_runner context manager
+        with vllm_runner(
+            config["model_name"],
+            max_model_len=config["max_model_len"],
+            max_num_seqs=config["max_num_seqs"],
+            limit_mm_per_prompt=config["limit_mm_per_prompt"],
+            tensor_parallel_size=1,
+            video_pruning_rate=pruning_rate,
+            mm_encoder_attn_backend=mm_encoder_attn_backend,
+            hf_overrides=dummy_hf_overrides,
+            load_format="dummy",
+            **config["runner_kwargs"],
+        ) as vllm_model:
+            outputs = vllm_model.generate_greedy(
+                prompts,
+                config["sampling_params"]["max_tokens"],
+                videos=videos,
+            )
+
+            # Validate output
+            assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
+            output_ids, output_text = outputs[0]
+            assert len(output_ids) > 0, "Generated no output IDs"
+            assert len(output_text) > 0, "Generated empty text"
+            assert isinstance(output_text, str), (
+                f"Output is not string: {type(output_text)}"
+            )
+
+# Main test function
+@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
+@pytest.mark.parametrize(
+    "mm_encoder_attn_backend",
+    [None] + current_platform.get_supported_vit_attn_backends(),
+)
+@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
+@create_new_process_for_each_test()
+def test_vit_backend_functionality(
+    model_key: str,
+    mm_encoder_attn_backend: AttentionBackendEnum | None,
+    image_assets,
+    video_assets,
+    vllm_runner,
+    request,
+):
+    """Exercise one (model, ViT attention backend) combination end to end.
+
+    Currently skipped unconditionally via the ``skip`` mark above. When run,
+    the test:
+    1. Skips backends the model's config does not list as supported
+    2. Applies any markers listed under the config's ``gpu_marks``
+    3. Dispatches to the video, chat or generate handler
+    4. Relies on the handler to validate the generated output
+    """
+    config = MODEL_CONFIGS[model_key]
+
+    # Step 1: a backend of None is never filtered out.
+    backend_is_restricted = "supported_backends" in config
+    if (
+        backend_is_restricted
+        and mm_encoder_attn_backend is not None
+        and mm_encoder_attn_backend not in config["supported_backends"]
+    ):
+        pytest.skip(
+            f"{model_key} does not support {mm_encoder_attn_backend} backend now."
+        )
+
+    # Step 2: apply GPU marks dynamically (none when the key is absent).
+    for mark in config.get("gpu_marks", ()):
+        request.applymarker(mark)
+
+    # Step 3: video configs take precedence over the "interface" setting.
+    if config.get("media_type") == "video":
+        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
+        return
+
+    interface = config["interface"]
+    if interface == "llm_chat":
+        run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
+    elif interface == "llm_generate":
+        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
+    else:
+        raise ValueError(f"Unknown interface: {config['interface']}")
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -9,7 +9,7 @@ from mistral_common.audio import Audio
 from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
 from mistral_common.protocol.instruct.messages import UserMessage

-from vllm.tokenizers import MistralTokenizer
+from vllm.tokenizers.mistral import MistralTokenizer

 from ....conftest import AudioTestAssets
 from ....utils import RemoteOpenAIServer

--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Sequence
+from typing import Any
+
+import librosa
 import pytest
+from transformers import AutoModelForSpeechSeq2Seq

-from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
+from vllm.platforms import current_platform

-from ....conftest import VllmRunner
+from ....conftest import HfRunner, PromptAudioInput, VllmRunner
 from ....utils import create_new_process_for_each_test, multi_gpu_test
+from ...registry import HF_EXAMPLE_MODELS
+from ...utils import check_logprobs_close
+
+# Prompt text for the vLLM runner (first slot of each `input_audios` entry).
+VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
+# Prompt text for the HF runner (second slot of each entry); an empty string.
+HF_PROMPT = ""
+# Whisper expects 16kHz audio
+WHISPER_SAMPLE_RATE = 16000

-PROMPTS = [
-    {
-        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
-        "multi_modal_data": {
-            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
-        },
-    },
-    {  # Test explicit encoder/decoder prompt
-        "encoder_prompt": {
-            "prompt": "",
-            "multi_modal_data": {
-                "audio": AudioAsset("winning_call").audio_and_sample_rate,
-            },
-        },
-        "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
-    },
-]
-
-EXPECTED = {
-    "openai/whisper-tiny": [
-        " He has birth words I spoke in the original corner of that. And a"
-        " little piece of black coat poetry. Mary had a little sandwich,"
-        " sweet, with white and snow. And everyone had it very went the last"
-        " would sure to go.",
-        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
-        " to line down the field line for our base camp. Here comes joy. Here"
-        " is June and the third base. They're going to wave him in. The throw"
-        " to the plate will be late. The Mariners are going to play for the"
-        " American League Championship. I don't believe it. It just continues"
-        " by all five.",
-    ],
-    "openai/whisper-small": [
-        " The first words I spoke in the original pornograph. A little piece"
-        " of practical poetry. Mary had a little lamb, its fleece was quite a"
-        " slow, and everywhere that Mary went the lamb was sure to go.",
-        " And the old one pitch on the way to Edgar Martinez one month. Here"
-        " comes joy. Here is Junior to third base. They're gonna wave him"
-        " in. The throw to the plate will be late. The Mariners are going to"
-        " play for the American League Championship. I don't believe it. It"
-        " just continues. My, oh my.",
-    ],
-    "openai/whisper-medium": [
-        " The first words I spoke in the original phonograph, a little piece"
-        " of practical poetry. Mary had a little lamb, its fleece was quite as"
-        " slow, and everywhere that Mary went the lamb was sure to go.",
-        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
-        " down the left field line for Obeyshev. Here comes Joy. Here is"
-        " Jorgen at third base. They're going to wave him in. The throw to the"
-        " plate will be late. The Mariners are going to play for the American"
-        " League Championship. I don't believe it. It just continues. My, oh"
-        " my.",
-    ],
-    "openai/whisper-large-v3": [
-        " The first words I spoke in the original phonograph, a little piece"
-        " of practical poetry. Mary had a little lamb, its feet were quite as"
-        " slow, and everywhere that Mary went, the lamb was sure to go.",
-        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
-        " Now the left field line for a base hit. Here comes Joy. Here is"
-        " Junior to third base. They're going to wave him in. The throw to the"
-        " plate will be late. The Mariners are going to play for the American"
-        " League Championship. I don't believe it. It just continues. My, oh,"
-        " my.",
-    ],
-    "openai/whisper-large-v3-turbo": [
-        " The first words I spoke in the original phonograph, a little piece"
-        " of practical poetry. Mary had a little lamb, its streets were quite"
-        " as slow, and everywhere that Mary went the lamb was sure to go.",
-        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
-        " down the left field line for a base hit. Here comes Joy. Here is"
-        " Junior to third base. They're going to wave him in. The throw to the"
-        " plate will be late. The Mariners are going to play for the American"
-        " League Championship. I don't believe it. It just continues. My, oh,"
-        " my.",
-    ],
-}
+
+@pytest.fixture(autouse=True)
+def use_spawn_for_whisper(monkeypatch):
+    """Whisper has issues with forked workers, use spawn instead.
+
+    Autouse fixture: before each test in this module it sets the
+    ``VLLM_WORKER_MULTIPROC_METHOD`` environment variable to ``"spawn"``
+    through pytest's ``monkeypatch`` fixture.
+    """
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")


 def run_test(
+    hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
+    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
     model: str,
     *,
+    max_model_len: int,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: str | None = None,
+    enforce_eager: bool = True,
 ) -> None:
-    prompt_list = PROMPTS * 10
-    expected_list = EXPECTED[model] * 10
+    """Inference result should be the same between hf and vllm.

+    All the audio fixtures for the test are from AudioAsset.
+    For huggingface runner, we provide the audio as input.
+    For vllm runner, we provide MultiModalDataDict objects
+    and corresponding MultiModalConfig as input.
+
+    Args:
+        hf_runner: HF runner class; opened with ``model``, ``dtype`` and
+            ``auto_cls=AutoModelForSpeechSeq2Seq``.
+        vllm_runner: vLLM runner class; opened with the settings below.
+        inputs: Test cases, each a ``(vllm_prompts, hf_prompts, audios)`` tuple.
+        model: Model name passed to both runners.
+        max_model_len: Forwarded to ``vllm_runner``.
+        dtype: Forwarded to both runners.
+        max_tokens: Forwarded to both greedy-logprobs generate calls.
+        num_logprobs: Forwarded to both greedy-logprobs generate calls.
+        tensor_parallel_size: Forwarded to ``vllm_runner``.
+        distributed_executor_backend: Forwarded to ``vllm_runner``.
+        enforce_eager: Forwarded to ``vllm_runner``.
+    """
     with vllm_runner(
         model,
-        dtype="half",
-        max_model_len=448,
+        dtype=dtype,
+        max_model_len=max_model_len,
         tensor_parallel_size=tensor_parallel_size,
         distributed_executor_backend=distributed_executor_backend,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=enforce_eager,
+        disable_custom_all_reduce=True,
     ) as vllm_model:
-        llm = vllm_model.llm
+        # One generate call per case, using the case's vLLM prompts and audios.
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(
+                vllm_prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=audios,
+            )
+            for vllm_prompts, _, audios in inputs
+        ]

-        sampling_params = SamplingParams(
-            temperature=0,
-            top_p=1.0,
-            max_tokens=200,
+    # The HF model is opened only after the vLLM runner's context has exited
+    # (presumably so both models are not resident at once -- confirm).
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(
+                hf_prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=audios,
+            )
+            for _, hf_prompts, audios in inputs
+        ]
+
+    # Compare case by case: HF results are outputs_0, vLLM results are outputs_1.
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
         )

-        outputs = llm.generate(prompt_list, sampling_params)

-    for output, expected in zip(outputs, expected_list):
-        print(output.outputs[0].text)
-        assert output.outputs[0].text == expected
+@pytest.fixture
+def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
+    """Build one test case per audio asset for ``run_test``.
+
+    Returns:
+        A list with one entry per asset (``mary_had_lamb``, ``winning_call``).
+        Each entry is ``([VLLM_PROMPT], [HF_PROMPT], [(audio, 16000)])``, where
+        ``audio`` has been resampled to ``WHISPER_SAMPLE_RATE`` if the asset's
+        original sample rate differed.
+    """
+    audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+    inputs = []
+    for asset in audio_assets:
+        audio, orig_sr = asset.audio_and_sample_rate
+        # Resample to Whisper's expected sample rate (16kHz)
+        if orig_sr != WHISPER_SAMPLE_RATE:
+            audio = librosa.resample(
+                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
+            )
+        # vLLM prompts, HF prompts, audio inputs
+        inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
+    return inputs
+
+
+def check_model_available(model: str) -> None:
+    """Run the registry's availability and transformers-version checks.
+
+    Looks ``model`` up in ``HF_EXAMPLE_MODELS`` and calls both checks with
+    ``on_fail="skip"`` (presumably skipping the calling test on failure --
+    confirm against the registry helpers).
+    """
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")


 @pytest.mark.core_model
+@pytest.mark.cpu_model
 @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
-@create_new_process_for_each_test()
-def test_models(vllm_runner, model) -> None:
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@create_new_process_for_each_test("spawn")
+def test_models(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+    num_logprobs: int,
+    input_audios,
+    enforce_eager: bool,
+) -> None:
+    """Compare HF and vLLM outputs with ``tensor_parallel_size=1``.
+
+    Runs once with ``enforce_eager=True`` and once with ``False``; the
+    ``False`` case is skipped when the current platform is CPU. Uses a fixed
+    ``max_model_len=448`` and ``max_tokens=200``.
+    """
+    check_model_available(model)
+    if current_platform.is_cpu() and not enforce_eager:
+        pytest.skip("Skipping test for CPU with non-eager mode")
     run_test(
+        hf_runner,
         vllm_runner,
+        input_audios,
         model,
+        dtype=dtype,
+        max_model_len=448,
+        max_tokens=200,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=1,
+        enforce_eager=enforce_eager,
     )


@@ -133,15 +148,31 @@ def test_models(vllm_runner, model) -> None:
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@create_new_process_for_each_test()
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [200])
+@pytest.mark.parametrize("num_logprobs", [5])
+@create_new_process_for_each_test("spawn")
 def test_models_distributed(
+    hf_runner,
     vllm_runner,
-    model,
-    distributed_executor_backend,
+    model: str,
+    distributed_executor_backend: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    input_audios,
 ) -> None:
+    check_model_available(model)
     run_test(
+        hf_runner,
         vllm_runner,
+        input_audios,
         model,
+        dtype=dtype,
+        max_model_len=448,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=2,
         distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=False,  # fixed here; test_models parametrizes this flag
     )
--- a/tests/models/multimodal/processing/test_audioflamingo3.py
+++ b/tests/models/multimodal/processing/test_audioflamingo3.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The vLLM team.
+# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
+# reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+import torch
+from transformers import PretrainedConfig
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+
+
+class MockAudioFlamingo3Config(PretrainedConfig):
+    """Minimal stand-in config with ``model_type = "audioflamingo3"``.
+
+    Adds default-constructed ``audio_config`` and ``text_config`` sub-configs
+    on top of whatever ``PretrainedConfig`` sets up from ``kwargs``.
+    """
+
+    model_type = "audioflamingo3"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Empty PretrainedConfig placeholders for the two sub-configs.
+        self.audio_config = PretrainedConfig()
+        self.text_config = PretrainedConfig()
+
+
+class MockAudioFlamingo3Processor:
+    """Fake HF processor returned by ``mock_ctx.get_hf_processor()``."""
+
+    def __init__(self):
+        self.audio_token = "<sound>"
+        self.audio_token_id = 12345  # test value; presumably arbitrary
+        self.feature_extractor = MockFeatureExtractor()
+
+    def __call__(self, text=None, audios=None, **kwargs):
+        # Ignores all arguments and always returns the same fixed payload.
+        return {"input_ids": [1, 2, 3], "input_features": [np.zeros((3000, 80))]}
+
+
+class MockFeatureExtractor:
+    """Fake feature extractor with ``sampling_rate`` and ``chunk_length`` only."""
+
+    def __init__(self):
+        self.sampling_rate = 16000  # presumably Hz -- confirm
+        self.chunk_length = 30  # presumably seconds -- confirm
+
+
+@pytest.fixture
+def mock_ctx():
+    """Return a ``MagicMock`` context wired to the mock config and processor.
+
+    ``get_hf_config()`` and ``model_config.hf_config`` both yield the same
+    ``MockAudioFlamingo3Config`` instance; ``get_hf_processor()`` yields a
+    ``MockAudioFlamingo3Processor``.
+    """
+    config = MockAudioFlamingo3Config()
+
+    ctx = MagicMock()
+    ctx.get_hf_config.return_value = config
+    ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
+    ctx.model_config.hf_config = config
+    return ctx
+
+
+@pytest.fixture(autouse=True)
+def check_transformers_version():
+    """Autouse fixture running the registry's transformers-version check.
+
+    Called with ``on_fail="skip"`` for the
+    ``AudioFlamingo3ForConditionalGeneration`` entry of ``HF_EXAMPLE_MODELS``.
+    """
+    # Check if the model is supported by the current transformers version
+    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
+    model_info.check_transformers_version(on_fail="skip")
+
+
+def test_audio_chunk_counting(mock_ctx):
+    """Check the ``chunk_counts`` the processor reports for two audio clips.
+
+    Passes zero-filled arrays of ``30 * sr`` and ``45 * sr`` samples and
+    expects ``chunk_counts`` to hold exactly two entries, 1 and 2.
+    """
+    # Imported inside the test rather than at module level (presumably so the
+    # autouse version check can skip before the model module loads -- confirm).
+    from vllm.model_executor.models.audioflamingo3 import (
+        AudioFlamingo3DummyInputsBuilder,
+        AudioFlamingo3MultiModalProcessor,
+        AudioFlamingo3ProcessingInfo,
+    )
+
+    info = AudioFlamingo3ProcessingInfo(mock_ctx)
+    processor = AudioFlamingo3MultiModalProcessor(
+        info, AudioFlamingo3DummyInputsBuilder(info)
+    )
+
+    sr = 16000
+    audio_1 = np.zeros(30 * sr)
+    audio_2 = np.zeros(45 * sr)
+
+    mm_data = {"audio": [audio_1, audio_2]}
+    prompt = "<|user|>Listen.<|end|>"
+
+    from vllm.multimodal.processing import BaseMultiModalProcessor
+
+    # Canned replacement for the base-class method; ignores its inputs.
+    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
+        return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}
+
+    # The patch on the base class is active only inside this context.
+    with pytest.MonkeyPatch.context() as mp:
+        mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
+
+        processed = processor._call_hf_processor(prompt, mm_data, {}, {})
+
+        chunk_counts = processed["chunk_counts"]
+
+        # One count per input clip, in input order.
+        assert chunk_counts[0].item() == 1
+        assert chunk_counts[1].item() == 2
+        assert len(chunk_counts) == 2
+
+
+def test_dummy_data_generation(mock_ctx):
+    """Check the dummy audio data produced by the dummy-inputs builder.
+
+    Requests two audio items and expects an ``"audio"`` entry of length 2
+    whose first item has ``600 * 16000`` elements.
+    """
+    from vllm.model_executor.models.audioflamingo3 import (
+        AudioFlamingo3DummyInputsBuilder,
+        AudioFlamingo3ProcessingInfo,
+    )
+
+    info = AudioFlamingo3ProcessingInfo(mock_ctx)
+    builder = AudioFlamingo3DummyInputsBuilder(info)
+
+    mm_counts = {"audio": 2}
+    dummy_data = builder.get_dummy_mm_data(100, mm_counts, None)
+
+    assert "audio" in dummy_data
+    assert len(dummy_data["audio"]) == 2
+
+    # 16000 matches MockFeatureExtractor.sampling_rate; 600 is presumably the
+    # builder's maximum audio length in seconds -- confirm in the model code.
+    expected_len = 600 * 16000
+    assert len(dummy_data["audio"][0]) == expected_len