Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -114,7 +114,7 @@ def check_model_available(model: str) -> None:
 @pytest.mark.core_model
 @pytest.mark.cpu_model
 @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo")])
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("enforce_eager", [True, False])
 @create_new_process_for_each_test("spawn")

--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


+def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patch HF runner for Isaac:
+    1) Move processor outputs to model device
+    2) Ensure IsaacModel.forward returns hidden_states
+    for compatibility with hidden_states_to_seq_logprobs()
+    """
+
+    from perceptron.tensorstream import TextType
+    from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask
+    from transformers.modeling_outputs import BaseModelOutputWithPast
+
+    def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
+        """
+        Create 3D positional indices for token input.
+        """
+        batch_size, seq_length = input_ids.shape
+        position_ids = torch.arange(seq_length, device=input_ids.device)
+        position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+        position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3)  # Add 3D for MRoPE
+        return position_ids
+
+    model_device = next(hf_model.model.parameters()).device
+
+    # ----------------------------
+    # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
+    # ----------------------------
+    original_processor = hf_model.processor
+
+    def patched_processor(*args, **kwargs):
+        result = original_processor(*args, **kwargs)
+        for k, v in result.data.items():
+            result[k] = v.to(model_device)
+        return result
+
+    hf_model.processor = patched_processor
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        hf_model.model_name, trust_remote_code=True
+    )
+
+    original_generate = hf_model.model.generate
+
+    def patched_generate(*args, **kwargs):
+        kwargs["pad_token_id"] = tokenizer.eos_token_id
+        kwargs["eos_token_id"] = tokenizer.eos_token_id
+        return original_generate(*args, **kwargs)
+
+    hf_model.model.generate = patched_generate
+
+    # ----------------------------
+    # 2) Patch IsaacModel.forward: add hidden_states to the output
+    # ----------------------------
+    isaac_model = hf_model.model.model
+
+    def patched_forward(
+        self,
+        input_ids=None,
+        tensor_stream=None,
+        attention_mask=None,
+        position_ids=None,
+        modality_tensor=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+        **kwargs,
+    ):
+        """
+        Forward pass with MRoPE position embeddings.
+        Computes position embeddings once and passes them through all layers.
+        """
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # Get inputs
+        if tensor_stream is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both tensor_stream and inputs_embeds")
+        elif tensor_stream is not None:
+            # Embed TensorStream directly
+            inputs_embeds = self.embed_stream(tensor_stream)
+            # Create modality tensor if not provided
+            if modality_tensor is None:
+                modality_tensor = modality_mask(tensor_stream)
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            # Create text modality tensor if not provided
+            if modality_tensor is None:
+                batch_size, seq_length = input_ids.shape
+                modality_tensor = torch.full(
+                    (batch_size, seq_length),
+                    TextType.text.value,
+                    device=input_ids.device,
+                    dtype=torch.long,
+                )
+        elif inputs_embeds is None:
+            raise ValueError(
+                "You have to specify either tensor_stream, input_ids or inputs_embeds"
+            )
+
+        # Create default position_ids if not provided
+        if position_ids is None:
+            if tensor_stream is not None:
+                position_ids = compute_mrope_pos_tensor(tensor_stream)  # (B,L,3)
+            else:
+                position_ids = compute_position_ids_input_ids(input_ids)
+
+        # Compute MRoPE position embeddings (cos, sin) using the custom rotary_emb
+        cos, sin = self.rotary_emb(position_ids, modality_tensor)
+        cos = cos.to(inputs_embeds.dtype)
+        sin = sin.to(inputs_embeds.dtype)
+
+        # Prepare attention mask
+        if attention_mask is not None:
+            attention_mask = self._update_causal_mask(
+                attention_mask, inputs_embeds, cache_position, past_key_values, False
+            )
+
+        # Initialize and collect hidden states
+        hidden_states = inputs_embeds
+        hidden_states_list: list[torch.Tensor] = []
+
+        if output_hidden_states:
+            hidden_states_list.append(hidden_states)
+
+        for decoder_layer in self.layers:
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=(cos, sin),
+                **kwargs,
+            )
+
+            hidden_states = (
+                layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
+            )
+
+            if output_hidden_states:
+                hidden_states_list.append(hidden_states)
+
+        # Final layer norm
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            hidden_states_list.append(hidden_states)
+
+        # Convert to tuple or None
+        all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None
+
+        # Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+        )
+
+    isaac_model.forward = types.MethodType(patched_forward, isaac_model)
+
+    return hf_model
+
+
 def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for SkyworkR1V."""


--- a/tests/models/multimodal/pooling/conftest.py
+++ b/tests/models/multimodal/pooling/conftest.py
@@ -2,23 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""

-import os
-import warnings
+import pytest

 from vllm.platforms import current_platform


-def pytest_collection_modifyitems(config, items):
-    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
-    if not current_platform.is_rocm():
-        return
+@pytest.fixture
+def siglip_attention_config():
+    """Return attention config for SigLIP tests on ROCm.

-    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
-
-    if siglip_tests:
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
-        warnings.warn(
-            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
-            UserWarning,
-            stacklevel=1,
-        )
+    On ROCm, SigLIP tests require FLEX_ATTENTION backend.
+    """
+    if current_platform.is_rocm():
+        return {"backend": "FLEX_ATTENTION"}
+    return None
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -78,7 +78,9 @@ def run_intern_vit_test(
    ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
-def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
+def test_models(
+    default_vllm_config, dist_init, image_assets, model_id, dtype: str
+) -> None:
    run_intern_vit_test(
        image_assets,
        model_id,

--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import cast

 import pytest
 from transformers import AutoModel

-from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionContentPartImageEmbedsParam,
+    ChatCompletionContentPartImageParam,
+    ChatCompletionContentPartTextParam,
+)
 from vllm.entrypoints.score_utils import ScoreMultiModalParam

 from ....conftest import HfRunner, VllmRunner

-model_name = "jinaai/jina-reranker-m0"
+MODELS = ["jinaai/jina-reranker-m0"]

-mm_processor_kwargs = {
+MM_PROCESSOR_KWARGS = {
    "min_pixels": 3136,
    "max_pixels": 602112,
 }

-limit_mm_per_prompt = {"image": 2}
+LIMIT_MM_PER_PROMPT = {"image": 2}

+CHECKPOINT_TO_HF_MAPPER = {
+    "visual.": "model.visual.",
+    "model.": "model.language_model.",
+}
+
+# Shared long text for test data
+LONG_TEXT_DOC = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
+web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
+into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
+large language models. The models effectiveness results from two key innovations: (1) a three-stage
+data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
+refining, and critiquing web content extraction; and (2) a unified training framework combining
+continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
+ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
+benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
+lower computational requirements."""  # noqa: E501
+
+# Test data for different scenarios
+TEXT_IMAGE_TEST_DATA = {
+    "query": [{"text": "slm markdown"}],
+    "documents": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        },
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        },
+    ],
+}
+
+TEXT_TEXT_TEST_DATA = {
+    "query": [{"text": "slm markdown"}],
+    "documents": [
+        {"text": LONG_TEXT_DOC},
+        {"text": "数据提取么？为什么不用正则啊,你用正则不就全解决了么?"},
+    ],
+}
+
+IMAGE_TEXT_TEST_DATA = {
+    "query": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        }
+    ],
+    "documents": [
+        {"text": LONG_TEXT_DOC},
+        {"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
+    ],
+}
+
+IMAGE_IMAGE_TEST_DATA = {
+    "query": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        }
+    ],
+    "documents": [
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        },
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        },
+    ],
+}

-def vllm_reranker(
+TEXT_MIXED_DOCS_TEST_DATA = {
+    "query": [{"text": "slm markdown"}],
+    "documents": [
+        {"text": LONG_TEXT_DOC},
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+        },
+        {"text": "数据提取么？为什么不用正则啊,你用正则不就全解决了么?"},
+        {
+            "image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        },
+    ],
+}
+
+
+def _normalize_image(image_val: str) -> str:
+    """Normalize image value to proper format for HF model."""
+    return (
+        image_val
+        if image_val.startswith(("http://", "https://"))
+        else f"data:image/png;base64,{image_val}"
+    )
+
+
+def create_score_multimodal_param(
+    content_parts: list[dict],
+) -> ScoreMultiModalParam:
+    """
+    Create a ScoreMultiModalParam from a list of content dictionaries.
+
+    Each dict supports the following formats:
+    - Text: {'text': 'content'}
+    - Image URL: {'image': 'https://...'}
+    - Image Base64: {'image': 'base64_str'}
+    """
+    formatted_content = []
+
+    for part in content_parts:
+        if "text" in part:
+            formatted_content.append(
+                ChatCompletionContentPartTextParam(
+                    type="text",
+                    text=part["text"],
+                )
+            )
+        elif "image" in part:
+            image_val = part["image"]
+            if image_val.startswith(("http://", "https://")):
+                formatted_content.append(
+                    ChatCompletionContentPartImageParam(
+                        type="image_url",
+                        image_url={"url": image_val},
+                    )
+                )
+            else:
+                formatted_content.append(
+                    ChatCompletionContentPartImageEmbedsParam(
+                        type="image_embeds", image_embeds=image_val
+                    )
+                )
+
+    return ScoreMultiModalParam(content=formatted_content)
+
+
+def _run_vllm(
    vllm_runner: type[VllmRunner],
-    model_name: str,
+    model: str,
    dtype: str,
-    query_strs: list[str],
-    document_strs: list[str],
-    query_type: str = "text",
-    doc_type: str = "text",
-):
-    def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
-        return {"type": "image_url", "image_url": {"url": f"{url}"}}
-
-    query: list[str] | ScoreMultiModalParam
-    if query_type == "text":
-        query = query_strs
-    elif query_type == "image":
-        query = ScoreMultiModalParam(
-            content=[create_image_param(url) for url in query_strs]
-        )
-
-    documents: list[str] | ScoreMultiModalParam
-    if doc_type == "text":
-        documents = document_strs
-    elif doc_type == "image":
-        documents = ScoreMultiModalParam(
-            content=[create_image_param(url) for url in document_strs]
-        )
+    query_strs: list[dict[str, str]],
+    document_strs: list[dict[str, str]],
+) -> list[float]:
+    """Run vLLM reranker and return scores."""
+    query = create_score_multimodal_param(query_strs)
+    documents = create_score_multimodal_param(document_strs)

    with vllm_runner(
-        model_name,
+        model,
        runner="pooling",
        dtype=dtype,
        max_num_seqs=2,
        max_model_len=2048,
-        mm_processor_kwargs=mm_processor_kwargs,
-        limit_mm_per_prompt=limit_mm_per_prompt,
+        mm_processor_kwargs=MM_PROCESSOR_KWARGS,
+        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = vllm_model.llm.score(query, documents)

    return [output.outputs.score for output in outputs]


-def hf_reranker(
+def _run_hf(
    hf_runner: type[HfRunner],
-    model_name: str,
+    model: str,
    dtype: str,
-    query_strs: list[str],
-    document_strs: list[str],
-    query_type: str = "text",
-    doc_type: str = "text",
-):
-    checkpoint_to_hf_mapper = {
-        "visual.": "model.visual.",
-        "model.": "model.language_model.",
-    }
-
-    data_pairs = [[query_strs[0], d] for d in document_strs]
+    query_strs: list[dict[str, str]],
+    document_strs: list[dict[str, str]],
+) -> list[float]:
+    """Run HuggingFace reranker and return scores."""
+    query = query_strs[0]
+    if "text" in query:
+        query_type = "text"
+        query_data = query["text"]
+    elif "image" in query:
+        query_type = "image"
+        query_data = _normalize_image(query["image"])
+    else:
+        raise ValueError("Unsupported query format")
+
+    # Separate documents by type
+    text_docs: list[str] = []
+    image_docs: list[str] = []
+    text_indices: list[int] = []
+    image_indices: list[int] = []
+
+    for idx, doc in enumerate(document_strs):
+        if "text" in doc:
+            text_docs.append(doc["text"])
+            text_indices.append(idx)
+        elif "image" in doc:
+            image_docs.append(_normalize_image(doc["image"]))
+            image_indices.append(idx)
+        else:
+            raise ValueError(f"Unsupported document format at index {idx}")
+
+    scores: list[None | float] = [None] * len(document_strs)

    with hf_runner(
-        model_name,
+        model,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModel,
-        model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
+        model_kwargs={"key_mapping": CHECKPOINT_TO_HF_MAPPER},
    ) as hf_model:
-        return hf_model.model.compute_score(
-            data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
-        )
+        # Score text documents
+        if text_docs:
+            text_scores = hf_model.model.compute_score(
+                [[query_data, d] for d in text_docs],
+                max_length=2048,
+                query_type=query_type,
+                doc_type="text",
+            )
+            for i, s in zip(text_indices, text_scores):
+                scores[i] = s

+        # Score image documents
+        if image_docs:
+            image_scores = hf_model.model.compute_score(
+                [[query_data, d] for d in image_docs],
+                max_length=2048,
+                query_type=query_type,
+                doc_type="image",
+            )
+            for i, s in zip(image_indices, image_scores):
+                scores[i] = s

-# Visual Documents Reranking
-@pytest.mark.parametrize("model_name", [model_name])
-@pytest.mark.parametrize("dtype", ["half"])
-def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
-    query = ["slm markdown"]
-    documents = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
-    ]
-
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "text", "image"
-    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "text", "image"
-    )
+    assert all(s is not None for s in scores)
+    return cast(list[float], scores)

-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)

+def _run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    model: str,
+    dtype: str,
+    query_strs: list[dict[str, str]],
+    document_strs: list[dict[str, str]],
+) -> None:
+    """Run comparison test between vLLM and HuggingFace implementations."""
+    # NOTE: The order matters here: run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without CUDA initialization.
+    # If we run HF first, CUDA will already be initialized, and that
+    # will hurt the multiprocessing backend with the fork method (the default).

-# Textual Documents Reranking
-@pytest.mark.parametrize("model_name", [model_name])
-@pytest.mark.parametrize("dtype", ["half"])
-def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
-    query = ["slm markdown"]
-    documents = [
-        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient 
-        web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML 
-        into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding 
-        large language models. The models effectiveness results from two key innovations: (1) a three-stage 
-        data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, 
-        refining, and critiquing web content extraction; and (2) a unified training framework combining 
-        continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that 
-        ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated 
-        benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly 
-        lower computational requirements.""",  # noqa: E501
-        "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
-    ]
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "text", "text"
-    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "text", "text"
+    vllm_outputs = _run_vllm(vllm_runner, model, dtype, query_strs, document_strs)
+    hf_outputs = _run_hf(hf_runner, model, dtype, query_strs, document_strs)
+
+    # Compare outputs
+    assert len(hf_outputs) == len(vllm_outputs), (
+        f"Output length mismatch: HF={len(hf_outputs)}, vLLM={len(vllm_outputs)}"
    )

-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+    for i, (hf_score, vllm_score) in enumerate(zip(hf_outputs, vllm_outputs)):
+        assert hf_score == pytest.approx(vllm_score, rel=0.02), (
+            f"Score mismatch at index {i}: HF={hf_score}, vLLM={vllm_score}"
+        )


-# Image Querying for Textual Documents
-@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
-def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
-    query = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-    ]
-    documents = [
-        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
-        web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
-        into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
-        large language models. The models effectiveness results from two key innovations: (1) a three-stage
-        data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
-        refining, and critiquing web content extraction; and (2) a unified training framework combining
-        continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
-        ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
-        benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
-        lower computational requirements.""",  # noqa: E501
-        "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
-    ]
-
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "image", "text"
-    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "image", "text"
+def test_model_text_image(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Visual Documents Reranking"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        TEXT_IMAGE_TEST_DATA["query"],
+        TEXT_IMAGE_TEST_DATA["documents"],
    )

-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_text_text(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Textual Documents Reranking"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        TEXT_TEXT_TEST_DATA["query"],
+        TEXT_TEXT_TEST_DATA["documents"],
+    )


-# Image Querying for Image Documents
-@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
-def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
-    query = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-    ]
-    documents = [
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
-        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
-    ]
-
-    hf_outputs = hf_reranker(
-        hf_runner, model_name, dtype, query, documents, "image", "image"
+def test_model_image_text(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Image Querying for Textual Documents"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        IMAGE_TEXT_TEST_DATA["query"],
+        IMAGE_TEXT_TEST_DATA["documents"],
    )
-    vllm_outputs = vllm_reranker(
-        vllm_runner, model_name, dtype, query, documents, "image", "image"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_image_image(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Image Querying for Image Documents"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        IMAGE_IMAGE_TEST_DATA["query"],
+        IMAGE_IMAGE_TEST_DATA["documents"],
    )

-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_text_mixed_documents(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Text Query for Mixed Text and Image Documents"""
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        model,
+        dtype,
+        TEXT_MIXED_DOCS_TEST_DATA["query"],
+        TEXT_MIXED_DOCS_TEST_DATA["documents"],
+    )
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -40,15 +40,15 @@ def run_radio_test(
        for image in images
    ]

-    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+    hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    # RADIO model on HF does not properly handle torch_dtype argument
    # And relies on args["dtype"] which we have to patch manually:
-    config.args["dtype"] = torch_dtype
+    hf_config.args["dtype"] = torch_dtype

    hf_model = AutoModel.from_pretrained(
        model_id,
-        config=config,
+        config=hf_config,
        dtype=torch_dtype,
        trust_remote_code=True,
    ).to("cuda")
@@ -62,13 +62,14 @@ def run_radio_test(
    hf_model.make_preprocessor_external()

    hf_outputs_per_image = [
-        hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
+        hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
    ]

-    radio_config = RadioConfig(
-        model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
+    vllm_config = RadioConfig(
+        model_name=hf_config.args["model"],
+        **hf_config.args,
    )
-    vllm_model = RadioModel(radio_config)
+    vllm_model = RadioModel(vllm_config)
    vllm_model.load_weights(hf_model.state_dict())
    vllm_model = vllm_model.to("cuda", torch_dtype)

@@ -80,7 +81,8 @@ def run_radio_test(

    cos_similar = nn.CosineSimilarity(dim=-1)
    for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
-        assert cos_similar(vllm_output, hf_output).mean() > 0.99
+        assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
+        assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99


 @pytest.mark.parametrize(
@@ -90,7 +92,9 @@ def run_radio_test(
    ],
 )
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
-def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
+def test_radio(
+    default_vllm_config, dist_init, image_assets, model_id, dtype: str
+) -> None:
    run_radio_test(
        image_assets,
        model_id,

--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -38,6 +38,7 @@ def _run_test(
    *,
    dtype: str,
    tokenization_kwargs: dict[str, Any] | None = None,
+    attention_config: dict[str, Any] | None = None,
 ) -> None:
    if tokenization_kwargs is None:
        tokenization_kwargs = {}
@@ -49,6 +50,7 @@ def _run_test(
        enforce_eager=True,
        max_model_len=64,
        gpu_memory_utilization=0.7,
+        attention_config=attention_config,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(
            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@@ -90,6 +92,7 @@ def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
+    siglip_attention_config,
    model: str,
    dtype: str,
 ) -> None:
@@ -108,6 +111,7 @@ def test_models_text(
            "padding": "max_length",
            "max_length": 64,
        },  # siglip2 was trained with this padding setting.
+        attention_config=siglip_attention_config,
    )


@@ -117,6 +121,7 @@ def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
+    siglip_attention_config,
    model: str,
    dtype: str,
 ) -> None:
@@ -133,6 +138,7 @@ def test_models_image(
        input_images,
        model,
        dtype=dtype,
+        attention_config=siglip_attention_config,
    )


@@ -141,6 +147,7 @@ def test_models_image(
 def test_models_text_image_no_crash(
    vllm_runner,
    image_assets,
+    siglip_attention_config,
    model: str,
    dtype: str,
 ) -> None:
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
        enforce_eager=True,
        max_model_len=64,
        gpu_memory_utilization=0.7,
+        attention_config=siglip_attention_config,
    ) as vllm_model:
        with pytest.raises(ValueError, match="not both"):
            vllm_model.embed(texts, images=images)

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -86,11 +86,25 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    return mm_data


+def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
+    """
+    Patch the multimodal data for GLM-ASR model.
+    GLM-ASR requires text and audio to match 1:1, so we limit audio to 1.
+    """
+    if "audio" in mm_data:
+        audio = mm_data["audio"]
+        if isinstance(audio, list) and len(audio) > 1:
+            # Limit to single audio to match text requirement
+            mm_data["audio"] = [audio[0]]
+    return mm_data
+
+
 # For some multimodal models, tokenizer will always add bos_token
 # at the beginning of prompt by default, causing hf_processor outputs
 # incorrect token ids. So we need use `add_special_tokens=False` here
 # to leave bos_token to be added by the processor.
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "nemotron_parse": False,
    "ovis": False,
    "ovis2_5": False,
    "paligemma": False,
@@ -106,9 +120,11 @@ _IGNORE_MM_KEYS = {
 }

 MM_DATA_PATCHES = {
-    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
+    # Ernie4.5-VL, GLM4.1V and Qwen3-VL require video metadata
+    "ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
    "glm4v": glm4_1v_patch_mm_data,
    "glm4v_moe": glm4_1v_patch_mm_data,
+    "glmasr": glmasr_patch_mm_data,
    "qwen3_vl": qwen3_vl_patch_mm_data,
    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
@@ -212,7 +228,11 @@ def _test_processing_correctness(
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
        model_id = model_id_or_arch
    model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )

    model_config = ModelConfig(
        model_id,
@@ -386,6 +406,11 @@ def test_processing_correctness(
        pytest.skip("Fix later")
    if model_id == "jinaai/jina-reranker-m0":
        pytest.skip("Fix later")
+    if model_id in {"Qwen/Qwen-VL", "Qwen/Qwen-VL-Chat"}:
+        pytest.skip(
+            "Qwen-VL tokenizer requires downloading a font file from "
+            "servers that often refuse connections in CI"
+        )

    _test_processing_correctness(
        model_id,

--- a/tests/models/multimodal/processing/test_gemma3.py
+++ b/tests/models/multimodal/processing/test_gemma3.py
@@ -2,14 +2,154 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
+import torch

+from vllm.model_executor.models.gemma3n_audio_utils import (
+    adjust_audio_features_to_expected_length,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY

 from ....conftest import ImageTestAssets
 from ...utils import build_model_context

+# Gemma3 (image) model
+GEMMA3_MODEL_ID = "google/gemma-3-4b-it"

-@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
+# Gemma3n (multimodal with audio) model
+GEMMA3N_MODEL_ID = "google/gemma-3n-E2B-it"
+
+# Expected audio tokens for Gemma3n (audio_soft_tokens_per_image)
+GEMMA3N_EXPECTED_AUDIO_TOKENS = 188
+
+
+class TestGemma3nAudioTensorLogic:
+    """CPU-based tests for Gemma3n audio feature tensor manipulation.
+
+    These tests validate the padding/truncation logic in
+    adjust_audio_features_to_expected_length() which fixes the
+    integer overflow in _process_audio_input when audio_seq_len > 188.
+    """
+
+    def test_padding_when_audio_short(self):
+        """Test that short audio is padded to expected length."""
+        batch_size, seq_len, embed_dim = 1, 100, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+        assert tokens_truncated == 0
+        # First 100 tokens should be original, rest should be padding (zeros)
+        assert torch.allclose(result[:, :seq_len, :], audio_features)
+        assert torch.allclose(
+            result[:, seq_len:, :],
+            torch.zeros(batch_size, expected_tokens - seq_len, embed_dim),
+        )
+
+    def test_truncation_when_audio_long(self):
+        """Test that long audio is truncated to expected length.
+
+        This is the key test for the overflow fix. Previously, when
+        audio_seq_len > expected_tokens, the code would compute a negative
+        padding value causing: RuntimeError: numel: integer multiplication overflow
+        """
+        batch_size, seq_len, embed_dim = 1, 192, 256  # 192 > 188
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+        assert tokens_truncated == seq_len - expected_tokens  # 192 - 188 = 4
+        # Result should be first 188 tokens of original
+        assert torch.allclose(result, audio_features[:, :expected_tokens, :])
+
+    def test_no_change_when_exact_length(self):
+        """Test that exact-length audio passes through unchanged."""
+        batch_size, embed_dim = 1, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, expected_tokens, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == audio_features.shape
+        assert tokens_truncated == 0
+        assert torch.allclose(result, audio_features)
+
+    def test_original_bug_would_fail(self):
+        """Verify the original buggy implementation would cause overflow.
+
+        The original code always tried to pad, which fails when
+        audio_seq_len > expected_tokens because expand() gets negative size.
+        """
+        batch_size, seq_len, embed_dim = 1, 192, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        # Original buggy logic (always pads, never truncates)
+        extra_padding_tokens = expected_tokens - seq_len  # = -4 (negative!)
+
+        with pytest.raises(RuntimeError):
+            # This should fail with negative size error
+            padding_embs.expand(batch_size, extra_padding_tokens, embed_dim)
+
+    @pytest.mark.parametrize(
+        "seq_len",
+        [50, 100, 150, 187, 188, 189, 192, 200, 300],
+    )
+    def test_various_audio_lengths(self, seq_len: int):
+        """Test padding/truncation with various audio lengths."""
+        batch_size, embed_dim = 1, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        # Should not raise any errors
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        # Output should always be expected_tokens length
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+
+        # Verify truncation count is correct
+        if seq_len > expected_tokens:
+            assert tokens_truncated == seq_len - expected_tokens
+        else:
+            assert tokens_truncated == 0
+
+    def test_batch_processing(self):
+        """Test that batch processing works correctly."""
+        batch_size, seq_len, embed_dim = 4, 192, 256
+        expected_tokens = GEMMA3N_EXPECTED_AUDIO_TOKENS
+
+        audio_features = torch.randn(batch_size, seq_len, embed_dim)
+        padding_embs = torch.zeros(1, 1, embed_dim)
+
+        result, tokens_truncated = adjust_audio_features_to_expected_length(
+            audio_features, expected_tokens, padding_embs
+        )
+
+        assert result.shape == (batch_size, expected_tokens, embed_dim)
+        assert tokens_truncated == seq_len - expected_tokens
+
+
+@pytest.mark.parametrize("model_id", [GEMMA3_MODEL_ID])
 def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets, model_id: str
 ):

--- a/tests/models/multimodal/processing/test_qwen3_omni.py
+++ b/tests/models/multimodal/processing/test_qwen3_omni.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for Qwen3 Omni audio processing and sample rate handling."""
+
+from typing import Any
+
+import numpy as np
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"])
+@pytest.mark.parametrize(
+    ("audio_sample_rate", "audio_duration_sec"),
+    [
+        (16000, 1.0),  # Native Whisper sample rate, 1 second
+        (16000, 2.0),  # Native Whisper sample rate, 2 seconds
+    ],
+)
+def test_processor_with_audio_sample_rate(
+    model_id: str,
+    audio_sample_rate: int,
+    audio_duration_sec: float,
+) -> None:
+    """
+    Test that vLLM's processor generates expected outputs with audio_sample_rate.
+
+    This validates that the processor correctly handles audio_sample_rate
+    passed via hf_processor_mm_kwargs and generates audio tokens.
+    """
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = processor.info.get_tokenizer()
+
+    # Create audio data at the specified sample rate
+    audio_length = int(audio_sample_rate * audio_duration_sec)
+    rng = np.random.RandomState(42)
+    audio_data = rng.rand(audio_length).astype(np.float32)
+
+    # Build prompt with audio placeholder
+    prompt = "<|audio_start|><|audio_pad|><|audio_end|>"
+    mm_data = {"audio": [(audio_data, audio_sample_rate)]}
+
+    # Apply processor with audio_sample_rate in mm_kwargs
+    hf_processor_mm_kwargs: dict[str, Any] = {
+        "audio_sample_rate": audio_sample_rate,
+    }
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Verify audio tokens are generated
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    audio_token_id = tokenizer.convert_tokens_to_ids(hf_processor.audio_token)
+    aud_tok_count = processed_inputs["prompt_token_ids"].count(audio_token_id)
+
+    assert aud_tok_count >= 1, (
+        f"Expected at least 1 audio token but got {aud_tok_count}. "
+        f"sample_rate: {audio_sample_rate}Hz, duration: {audio_duration_sec}s"
+    )
+
+
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"])
+def test_longer_audio_generates_more_tokens(model_id: str) -> None:
+    """
+    Test that longer audio generates more tokens than shorter audio.
+
+    This validates that audio_sample_rate is being used correctly by checking
+    that audio duration affects token count as expected.
+    """
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = processor.info.get_tokenizer()
+
+    audio_sample_rate = 16000
+    rng = np.random.RandomState(42)
+
+    def get_token_count(duration: float) -> int:
+        audio_length = int(audio_sample_rate * duration)
+        audio_data = rng.rand(audio_length).astype(np.float32)
+        prompt = "<|audio_start|><|audio_pad|><|audio_end|>"
+        mm_data = {"audio": [(audio_data, audio_sample_rate)]}
+        hf_processor_mm_kwargs: dict[str, Any] = {
+            "audio_sample_rate": audio_sample_rate,
+        }
+        processed = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+        hf_proc = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+        audio_token_id = tokenizer.convert_tokens_to_ids(hf_proc.audio_token)
+        return processed["prompt_token_ids"].count(audio_token_id)
+
+    short_tokens = get_token_count(1.0)
+    long_tokens = get_token_count(2.0)
+
+    assert long_tokens > short_tokens, (
+        f"Expected longer audio (2s) to have more tokens than shorter (1s). "
+        f"Got short={short_tokens}, long={long_tokens}"
+    )
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -138,25 +138,25 @@ def create_batched_mm_kwargs(
    )


-# TODO(Isotr0py): Don't initalize model during test
+# TODO(Isotr0py): Don't initialize model during test
 @contextmanager
 def initialize_dummy_model(
    model_cls: type[nn.Module],
    model_config: ModelConfig,
 ):
    temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
-    initialize_model_parallel(tensor_model_parallel_size=1)
-
    current_device = torch.get_default_device()
    vllm_config = VllmConfig(model_config=model_config)
    with set_current_vllm_config(vllm_config=vllm_config):
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend="nccl",
+        )
+        initialize_model_parallel(tensor_model_parallel_size=1)
+
        with set_default_torch_dtype(model_config.dtype):
            torch.set_default_device(current_platform.device_type)
            model = model_cls(vllm_config=vllm_config)
@@ -172,7 +172,11 @@ def initialize_dummy_model(
 def test_model_tensor_schema(model_id: str):
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )

    model_arch = next(
        arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info

--- a/tests/models/quantization/untest_fp8.py
+++ b/tests/models/quantization/untest_fp8.py
@@ -9,7 +9,7 @@ import os
 import pytest

 from tests.quantization.utils import is_quant_method_supported
-from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_fp8
 from vllm.platforms import current_platform
 from ..utils import check_logprobs_close
 from ...utils import models_path_prefix
@@ -76,7 +76,6 @@ def test_models(

    with monkeypatch.context() as m:
        m.setenv("TOKENIZERS_PARALLELISM", "true")
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)

        MAX_MODEL_LEN = 1024
        NUM_LOG_PROBS = 8
@@ -87,6 +86,7 @@ def test_models(
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            kv_cache_dtype="auto",
+            attention_config={"backend": backend},
        ) as vllm_model:
            baseline_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS
@@ -98,6 +98,7 @@ def test_models(
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            kv_cache_dtype=kv_cache_dtype,
+            attention_config={"backend": backend},
        ) as vllm_model:
            test_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS

--- a/tests/models/quantization/untest_gptq_marlin_24.py
+++ b/tests/models/quantization/untest_gptq_marlin_24.py
@@ -65,7 +65,10 @@ def test_models(
    num_logprobs: int,
 ) -> None:
    with vllm_runner(
-        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
+        model_pair.model_marlin,
+        dtype=dtype,
+        quantization="gptq_marlin_24",
+        allow_deprecated_quantization=True,
    ) as marlin_24_model:
        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -51,9 +51,11 @@ class _HfExamplesInfo:
    The maximum version of HF Transformers that this model runs on.
    """

-    transformers_version_reason: str | None = None
+    transformers_version_reason: dict[Literal["vllm", "hf"], str] | None = None
    """
-    The reason for the minimum/maximum version requirement.
+    The type and reason to skip tests for the minimum/maximum version requirement.
+    vllm: skip all vLLM tests if the version requirement is not met.
+    hf: only skip tests that use the HF runner if the version requirement is not met.
    """

    require_embed_inputs: bool = False
@@ -113,6 +115,7 @@ class _HfExamplesInfo:
        self,
        *,
        on_fail: Literal["error", "skip", "return"],
+        check_version_reason: Literal["vllm", "hf"] = "hf",
        check_min_version: bool = True,
        check_max_version: bool = True,
    ) -> str | None:
@@ -133,23 +136,28 @@ class _HfExamplesInfo:
        msg = f"`transformers=={current_version}` installed, but `transformers"
        # Only check the base version for the min/max version, otherwise preview
        # models cannot be run because `x.yy.0.dev0`<`x.yy.0`
-        if (
-            check_min_version
-            and min_version
-            and Version(cur_base_version) < Version(min_version)
-        ):
+        if min_version and Version(cur_base_version) < Version(min_version):
+            is_version_valid = not check_min_version
            msg += f">={min_version}` is required to run this model."
-        elif (
-            check_max_version
-            and max_version
-            and Version(cur_base_version) > Version(max_version)
-        ):
+        elif max_version and Version(cur_base_version) > Version(max_version):
+            is_version_valid = not check_max_version
            msg += f"<={max_version}` is required to run this model."
        else:
-            return None
+            is_version_valid = True

-        if self.transformers_version_reason:
-            msg += f" Reason: {self.transformers_version_reason}"
+        # check if Transformers version breaks the corresponding model runner,
+        # skip the test when the model runner is not compatible
+        is_reason_valid = not (
+            check_version_reason
+            and self.transformers_version_reason
+            and check_version_reason in self.transformers_version_reason
+        )
+        is_transformers_valid = is_version_valid and is_reason_valid
+        if is_transformers_valid:
+            return None
+        elif self.transformers_version_reason:
+            for reason_type, reason in self.transformers_version_reason.items():
+                msg += f" Reason({reason_type}): {reason}"

        if on_fail == "error":
            raise RuntimeError(msg)
@@ -219,7 +227,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        trust_remote_code=True,
    ),
    "CwmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/cwm"), min_transformers_version="4.58"),
-    "DbrxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "databricks/dbrx-instruct")),
+    # FIXME: databricks/dbrx-instruct has been deleted
+    "DbrxForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "databricks/dbrx-instruct"), is_available_online=False
+    ),
    "DeciLMForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "nvidia/Llama-3_3-Nemotron-Super-49B-v1"),
        trust_remote_code=True,
@@ -243,6 +254,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), trust_remote_code=True
    ),
    "Exaone4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-4.0-32B")),
+    "ExaoneMoEForCausalLM": _HfExamplesInfo(
+        "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
+    ),
    "Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mgleize/fairseq2-dummy-Llama-3.2-1B")),
    "FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-7b")),
    "FalconH1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/Falcon-H1-0.5B-Base")),
@@ -282,6 +296,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Grok1ModelForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "hpcai-tech/grok-1"), trust_remote_code=True
    ),
+    "Grok1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "xai-org/grok-2"), trust_remote_code=True),
    "HunYuanDenseV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tencent/Hunyuan-7B-Instruct")),
    "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "tencent/Hunyuan-A13B-Instruct"), trust_remote_code=True
@@ -302,6 +317,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Jais2ForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "inceptionai/Jais-2-8B-Chat"), min_transformers_version="4.58"
    ),
+    "IQuestCoderForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "IQuestLab/IQuest-Coder-V1-40B-Instruct"), trust_remote_code=True
+    ),
+    "IQuestLoopCoderForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"), trust_remote_code=True
+    ),
+    "JAISLMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix, "inceptionai/jais-13b-chat")),
+    "Jais2ForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "inceptionai/Jais-2-8B-Chat"), min_transformers_version="4.58"
+    ),
    "JambaForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "ai21labs/AI21-Jamba-1.5-Mini"),
        extras={
@@ -348,6 +373,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "MiniCPM3ForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"), trust_remote_code=True
    ),
+    "MiniCPM4ForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "openbmb/MiniCPM4.1-8B"), trust_remote_code=True
+    ),
    "MiniMaxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01-hf")),
    "MiniMaxText01ForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
@@ -370,7 +398,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        {"tiny": os.path.join(models_path_prefix, "TitanML/tiny-mixtral")},
    ),
    "MptForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mpt"), is_available_online=False),
-    "MPTForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mosaicml/mpt-7b")),
+    # FIXME: mosaicml/mpt-7b has been deleted
+    "MPTForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mosaicml/mpt-7b"), is_available_online=False),
    "NemotronForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/Minitron-8B-Base")),
    "NemotronHForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "nvidia/Nemotron-H-8B-Base-8K"), trust_remote_code=True
@@ -394,6 +423,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "PanguEmbeddedForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "FreedomIntelligence/openPangu-Embedded-7B-V1.1"), trust_remote_code=True
    ),
+    "PanguProMoEV2ForCausalLM": _HfExamplesInfo(
+        "",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
    "PanguUltraMoEForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1"),
        trust_remote_code=True,
@@ -416,7 +450,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "QWenLMHeadModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"),
        max_transformers_version="4.53",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses remote code that is not compatible with latest Transformers"  # noqa: E501
+        },
        trust_remote_code=True,
    ),
    "Qwen2ForCausalLM": _HfExamplesInfo(
@@ -463,6 +499,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    ),
    "Zamba2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Zyphra/Zamba2-7B-instruct")),
    "MiMoForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"), trust_remote_code=True),
+    "MiMoV2FlashForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-V2-Flash"), trust_remote_code=True
+    ),
    "Dots1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "rednote-hilab/dots.llm1.inst")),
 }

@@ -484,7 +523,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"), trust_remote_code=True
    ),
    "JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")),
-    "LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama", is_available_online=False)),
+    "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
+    "LlamaBidirectionalModel": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "nvidia/llama-nemotron-embed-1b-v2"), trust_remote_code=True
+    ),
    "MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
    "ModernBertModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"), trust_remote_code=True
@@ -496,12 +538,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "Qwen2ForRewardModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"),
        max_transformers_version="4.53",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses remote code that is not compatible with latest Transformers"  # noqa: E501
+        },
    ),
    "Qwen2ForProcessRewardModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-PRM-7B"),
        max_transformers_version="4.53",
-        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses remote code that is not compatible with latest Transformers"  # noqa: E501
+        },
    ),
    "RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),
    "RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/all-roberta-large-v1")),
@@ -551,6 +597,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
        trust_remote_code=True,
        hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
    ),
+    "LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
+        "nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
+    ),
    "ModernBertForSequenceClassification": _HfExamplesInfo(
        os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base")
    ),
@@ -581,6 +630,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
        os.path.join(models_path_prefix, "tomaarsen/Qwen3-Reranker-0.6B-seq-cls")
    ),
    "Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
+    "Qwen3VLForSequenceClassification": _HfExamplesInfo(
+        "Qwen/Qwen3-VL-Reranker-2B",
+        is_available_online=False,
+        hf_overrides={
+            "architectures": ["Qwen3VLForSequenceClassification"],
+            "classifier_from_token": ["no", "yes"],
+            "is_original_qwen3_reranker": True,
+        },
+    ),
 }

 _MULTIMODAL_EXAMPLE_MODELS = {
@@ -607,7 +665,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "deepseek-ai/deepseek-vl2-tiny"),
        extras={"fork": os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")},
        max_transformers_version="4.48",
-        transformers_version_reason="HF model is not compatible.",
+        transformers_version_reason={"hf": "HF model is not compatible."},
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    ),
    "DeepseekOCRForCausalLM": _HfExamplesInfo(
@@ -624,6 +682,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "FuyuForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/fuyu-8b")),
    "Gemma3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3-4b-it")),
    "Gemma3nForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3n-E2B-it")),
+    "GlmAsrForConditionalGeneration": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "zai-org/GLM-ASR-Nano-2512"),
+        trust_remote_code=True,
+        min_transformers_version="5.0",
+    ),
    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-2b")
    ),
@@ -639,7 +702,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        trust_remote_code=True,
        extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")},
        max_transformers_version="4.48",
-        transformers_version_reason="HF model is not compatible.",
+        transformers_version_reason={"hf": "HF model is not compatible."},
    ),
    "HCXVisionForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"),
@@ -653,6 +716,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),
        extras={"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")},
    ),
+    "IsaacForConditionalGeneration": _HfExamplesInfo(
+        "PerceptronAI/Isaac-0.1",
+        trust_remote_code=True,
+        extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
+    ),
    "InternS1ForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "internlm/Intern-S1"), trust_remote_code=True
    ),
@@ -668,6 +736,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        trust_remote_code=True,
    ),
    "InternVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")),
+    "KananaVForConditionalGeneration": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "kakaocorp/kanana-1.5-v-3b-instruct"),
+        trust_remote_code=True,
+    ),
    "KeyeForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
        trust_remote_code=True,
@@ -681,13 +753,21 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")},
        trust_remote_code=True,
        max_transformers_version="4.53.3",
-        transformers_version_reason="HF model uses deprecated transformers API "
-        "(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
-        "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
+        transformers_version_reason={
+            "hf": (
+                "HF model uses deprecated transformers API "
+                "(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
+                "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
+            )
+        },
    ),
    "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "lightonai/LightOnOCR-1B-1025")
    ),
+    "Lfm2VlForConditionalGeneration": _HfExamplesInfo(
+        "LiquidAI/LFM2-VL-450M",
+        min_transformers_version="5.0.0",
+    ),
    "Llama4ForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
        max_model_len=10240,
@@ -712,7 +792,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "MantisForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "TIGER-Lab/Mantis-8B-siglip-llama3"),
        max_transformers_version="4.48",
-        transformers_version_reason="HF model is not compatible.",
+        transformers_version_reason={"hf": "HF model is not compatible."},
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
    ),
    "MiDashengLMModel": _HfExamplesInfo(
@@ -739,7 +819,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "MolmoForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
        max_transformers_version="4.48",
-        transformers_version_reason="Incorrectly-detected `tensorflow` import.",
+        transformers_version_reason={
+            "vllm": "Incorrectly-detected `tensorflow` import from processor."
+        },
        extras={"olmo": os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924")},
        trust_remote_code=True,
    ),
@@ -758,7 +840,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B"),
        trust_remote_code=True,
        max_transformers_version="4.53",
-        transformers_version_reason="HF model is not compatible",
+        transformers_version_reason={"hf": "HF model is not compatible"},
        extras={
            "1.6-llama": os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Llama3.2-3B"),
            "1.6-gemma": os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B"),
@@ -777,7 +859,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
        trust_remote_code=True,
        max_transformers_version="4.48",
-        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model use deprecated imports which have been removed."
+        },  # noqa: E501
        extras={"phi3.5": os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")},
    ),
    "Phi4MMForCausalLM": _HfExamplesInfo(
@@ -796,7 +880,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        extras={"chat": os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat")},
        trust_remote_code=True,
        max_transformers_version="4.53.3",
-        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
+        transformers_version_reason={
+            "hf": "HF model uses deprecated imports which have been removed."
+        },  # noqa: E501
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
    ),
    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo(
@@ -851,7 +937,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        # disable this temporarily until we support HF format
        is_available_online=False,
    ),
+    "VoxtralStreamingGeneration": _HfExamplesInfo(
+        "<place-holder>",
+        # disable this temporarily until we support HF format
+        is_available_online=False,
+    ),
    # [Encoder-decoder]
+    "NemotronParseForConditionalGeneration": _HfExamplesInfo(
+        "nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
+    ),
    "WhisperForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "openai/whisper-large-v3-turbo"),
        extras={"v3": os.path.join(models_path_prefix, "openai/whisper-large-v3")},
@@ -926,6 +1020,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
        trust_remote_code=True,
        speculative_model=os.path.join(models_path_prefix, "baidu/ERNIE-4.5-21B-A3B-PT"),
    ),
+    "ExaoneMoeMTP": _HfExamplesInfo(
+        "LGAI-EXAONE/K-EXAONE-236B-A23B",
+        speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
+        min_transformers_version="5.0.0",
+    ),
    "Glm4MoeMTPModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "zai-org/GLM-4.5"),
        speculative_model="zai-org/GLM-4.5",

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -66,7 +66,11 @@ def can_initialize(

    model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
    model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )

    hf_overrides_fn = partial(
        dummy_hf_overrides,
@@ -108,11 +112,12 @@ def can_initialize(
        patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
        monkeypatch.context() as m,
    ):
-        if model_arch == "GptOssForCausalLM":
-            # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
-            # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
-            # L4 supports FA3.
-            m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
+        # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
+        # L4 supports FA3.
+        attention_config = (
+            {"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None
+        )
        if model_arch == "WhisperForConditionalGeneration":
            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

@@ -143,6 +148,7 @@ def can_initialize(
            else "vllm",
            hf_overrides=hf_overrides_fn,
            max_num_seqs=model_info.max_num_seqs,
+            attention_config=attention_config,
        )



--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -34,7 +34,11 @@ models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_M
 def test_registry_imports(model_arch):
    # Skip if transformers version is incompatible
    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )
    # Ensure all model classes can be imported successfully
    model_cls = ModelRegistry._try_load_model_cls(model_arch)
    assert model_cls is not None

--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@@ -38,7 +38,7 @@ def test_inference(
        max_num_seqs=32,
        default_torch_num_threads=1,
    ) as vllm_model:
-        vllm_output = vllm_model.llm.encode(prompt)
+        vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
        assert torch.equal(
            torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
        )
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -21,6 +21,7 @@ from vllm.model_executor.models.vision import (
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed

 pytestmark = pytest.mark.cpu_test

@@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct(
    """

    # Set random seed for reproducibility
-    current_platform.seed_everything(0)
+    set_random_seed(0)

    device = f"{current_platform.device_name}:{local_rank}"
    current_platform.set_device(device)
@@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
    calling the model directly.
    """
    # Set random seed for reproducibility
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    device = f"{current_platform.device_name}:{local_rank}"
    current_platform.set_device(device)
    torch.set_default_device(device)
@@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
 ):
    """Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
    # Set up distributed environment
-    current_platform.seed_everything(123)
+    set_random_seed(123)
    device = f"{current_platform.device_name}:{local_rank}"
    current_platform.set_device(device)
    torch.set_default_device(device)

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -10,7 +10,8 @@ import torch
 import torch.nn.functional as F
 from transformers import PretrainedConfig

-from vllm.config.model import ModelConfig, ModelDType, RunnerOption
+from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
+from vllm.config.pooler import SequencePoolingType, TokenPoolingType
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.tokenizers import cached_tokenizer_from_config
@@ -292,7 +293,11 @@ def build_model_context(
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_transformers_version(
+        on_fail="skip",
+        check_max_version=False,
+        check_version_reason="vllm",
+    )

    model_config_kwargs = model_config_kwargs or {}
    limit_mm_per_prompt = limit_mm_per_prompt or {}
@@ -375,7 +380,11 @@ class ModelInfo:
    max_model_len: int | None = None
    hf_dtype: str = "float32"
    hf_overrides: dict[str, Any] | None = None
-    default_pooling_type: str = ""
+    seq_pooling_type: SequencePoolingType | None = None
+    tok_pooling_type: TokenPoolingType | None = None
+    attn_type: AttnTypeStr | None = None
+    is_prefix_caching_supported: bool | None = None
+    is_chunked_prefill_supported: bool | None = None
    enable_test: bool = True


@@ -386,29 +395,10 @@ class EmbedModelInfo(ModelInfo):
    matryoshka_dimensions: list[int] | None = None


-@dataclass
-class CLSPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "LAST"
-
-
 @dataclass
 class RerankModelInfo(ModelInfo):
    mteb_score: float | None = None
-
-
-@dataclass
-class CLSPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "LAST"
+    chat_template_name: str | None = None


 @dataclass
@@ -483,12 +473,16 @@ def dummy_hf_overrides(
        "num_kv_shared_layers": 1,
    }

+    _hf_config = hf_config
+
    class DummyConfig:
+        hf_config = _hf_config
        hf_text_config = text_config

+    model_arch_config = ModelConfig.get_model_arch_config(DummyConfig)
    # Only set MoE related config when the model has MoE layers.
    # Otherwise all models detected as MoE by _get_transformers_backend_cls.
-    if ModelConfig.get_num_experts(DummyConfig) > 0:
+    if model_arch_config.num_experts > 0:
        update_dict.update(
            {
                "num_experts": num_experts,

--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -7,10 +7,16 @@ from unittest.mock import patch

 import numpy as np
 import pytest
+import torch

 from vllm.multimodal.audio import (
+    MONO_AUDIO_SPEC,
+    PASSTHROUGH_AUDIO_SPEC,
    AudioMediaIO,
    AudioResampler,
+    AudioSpec,
+    ChannelReduction,
+    normalize_audio,
    resample_audio_librosa,
    resample_audio_scipy,
 )
@@ -137,3 +143,500 @@ def test_audio_media_io_encode_base64(dummy_audio):
        decoded = base64.b64decode(out)
        assert decoded == b"dummy_wav_data"
        mock_write.assert_called_once()
+
+
+# ============================================================
+# Tests for normalize_audio function
+# ============================================================
+
+
+class TestNormalizeAudio:
+    """Tests for normalize_audio across specs, channel reductions, and layouts."""
+
+    def test_passthrough_preserves_audio(self):
+        """Passthrough spec should not modify audio."""
+        stereo = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+        result = normalize_audio(stereo, PASSTHROUGH_AUDIO_SPEC)
+        np.testing.assert_array_equal(result, stereo)
+
+    def test_mono_spec_with_numpy_stereo(self):
+        """Mono spec should average a stereo numpy array down to 1D."""
+        stereo = np.array([[1.0, 2.0], [-1.0, 0.0]], dtype=np.float32)
+        result = normalize_audio(stereo, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        np.testing.assert_array_almost_equal(result, [0.0, 1.0])
+
+    def test_mono_spec_with_torch_stereo(self):
+        """Mono spec should average a stereo torch tensor down to 1D."""
+        stereo = torch.tensor([[1.0, 2.0], [-1.0, 0.0]])
+        result = normalize_audio(stereo, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        torch.testing.assert_close(result, torch.tensor([0.0, 1.0]))
+
+    def test_mono_passthrough_for_1d_numpy(self):
+        """1D numpy array should pass through unchanged with mono spec."""
+        mono = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        result = normalize_audio(mono, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        np.testing.assert_array_equal(result, mono)
+
+    def test_mono_passthrough_for_1d_torch(self):
+        """1D torch tensor should pass through unchanged with mono spec."""
+        mono = torch.tensor([1.0, 2.0, 3.0])
+        result = normalize_audio(mono, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        torch.testing.assert_close(result, mono)
+
+    def test_first_channel_reduction(self):
+        """FIRST reduction should take only the first channel."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.FIRST)
+        stereo = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [1.0, 2.0])
+
+    def test_max_channel_reduction(self):
+        """MAX reduction should take max across channels."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.MAX)
+        stereo = np.array([[1.0, 4.0], [3.0, 2.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [3.0, 4.0])
+
+    def test_sum_channel_reduction(self):
+        """SUM reduction should sum across channels."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.SUM)
+        stereo = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [4.0, 6.0])
+
+    def test_invalid_3d_array_raises(self):
+        """3D arrays should raise ValueError."""
+        audio_3d = np.random.randn(2, 3, 4).astype(np.float32)
+        with pytest.raises(ValueError, match="Unsupported audio"):
+            normalize_audio(audio_3d, MONO_AUDIO_SPEC)
+
+    def test_channel_expansion_raises(self):
+        """Expanding from mono to stereo should raise ValueError."""
+        mono = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        spec = AudioSpec(target_channels=2)
+        with pytest.raises(ValueError, match="Cannot expand"):
+            normalize_audio(mono, spec)
+
+    def test_time_channels_format_numpy(self):
+        """Audio in (time, channels) format should be transposed to (channels, time).
+
+        This handles the case where audio loaders like soundfile return
+        (time, channels) format instead of (channels, time) like torchaudio.
+        """
+        # Create audio in (time, channels) format: 1000 samples, 2 channels
+        audio_time_channels = np.array(
+            [[1.0, -1.0]] * 1000,  # 1000 time steps, 2 channels
+            dtype=np.float32,
+        )
+        assert audio_time_channels.shape == (1000, 2)  # (time, channels)
+
+        result = normalize_audio(audio_time_channels, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        np.testing.assert_array_almost_equal(result, np.zeros(1000))
+
+    def test_time_channels_format_torch(self):
+        """Torch tensor in (time, channels) format should be transposed."""
+        # Create audio in (time, channels) format: 1000 samples, 2 channels
+        audio_time_channels = torch.tensor(
+            [[1.0, -1.0]] * 1000,  # 1000 time steps, 2 channels
+        )
+        assert audio_time_channels.shape == (1000, 2)  # (time, channels)
+
+        result = normalize_audio(audio_time_channels, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        torch.testing.assert_close(result, torch.zeros(1000))
+
+    def test_channels_time_format_preserved(self):
+        """Audio already in (channels, time) format should work correctly."""
+        # Create audio in standard (channels, time) format: 2 channels, 1000 samples
+        audio_channels_time = np.array(
+            [[1.0] * 1000, [-1.0] * 1000],  # 2 channels, 1000 time steps
+            dtype=np.float32,
+        )
+        assert audio_channels_time.shape == (2, 1000)  # (channels, time)
+
+        result = normalize_audio(audio_channels_time, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        np.testing.assert_array_almost_equal(result, np.zeros(1000))
+
+    def test_ambiguous_square_audio_numpy(self):
+        """Square (N, N) audio should not trigger the shape[0] > shape[1] transpose.
+
+        For a square array, shape[0] == shape[1], so no transpose happens
+        and we assume (channels, time) format.
+        """
+        # Create square audio: 4 channels, 4 samples
+        audio_square = np.array(
+            [
+                [1.0, 2.0, 3.0, 4.0],
+                [5.0, 6.0, 7.0, 8.0],
+                [9.0, 10.0, 11.0, 12.0],
+                [13.0, 14.0, 15.0, 16.0],
+            ],
+            dtype=np.float32,
+        )
+        assert audio_square.shape == (4, 4)
+
+        result = normalize_audio(audio_square, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D with mean across channels (axis 0)
+        assert result.ndim == 1
+        assert result.shape == (4,)
+        # Mean across 4 channels: [1+5+9+13, 2+6+10+14, ...] / 4 = [7, 8, 9, 10]
+        expected = np.array([7.0, 8.0, 9.0, 10.0])
+        np.testing.assert_array_almost_equal(result, expected)
+
+
+# ============================================================
+# Tests for MultiModalDataParser integration with target_channels
+# ============================================================
+
+
+class TestMultiModalDataParserChannelNormalization:
+    """Tests for MultiModalDataParser.target_channels integration.
+
+    These tests verify that the target_channels parameter is properly used
+    in the _parse_audio_data method to normalize audio channels.
+    """
+
+    def test_parser_normalizes_stereo_to_mono(self):
+        """Parser should normalize stereo to mono when target_channels=1."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with mono normalization enabled
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Create stereo audio (simulating torchaudio output)
+        stereo_audio = np.array(
+            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],  # 2 channels, 3 samples
+            dtype=np.float32,
+        )
+
+        # Parse audio data
+        result = parser._parse_audio_data((stereo_audio, 16000))
+
+        # Check that result is mono (1D)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 1, f"Expected 1D mono audio, got {audio_item.ndim}D"
+        assert audio_item.shape == (3,), f"Expected shape (3,), got {audio_item.shape}"
+        # Channel average of [1, 1, 1] and [-1, -1, -1] should be [0, 0, 0]
+        np.testing.assert_array_almost_equal(audio_item, np.zeros(3))
+
+    def test_parser_preserves_stereo_when_target_channels_none(self):
+        """Parser should preserve stereo when target_channels=None."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser without channel normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=None,
+        )
+
+        # Create stereo audio
+        stereo_audio = np.array(
+            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
+            dtype=np.float32,
+        )
+
+        # Parse audio data
+        result = parser._parse_audio_data((stereo_audio, 16000))
+
+        # Check that result keeps its channel axis (input sr equals target_sr)
+        audio_item = result.get(0)
+        # When target_channels=None, stereo audio should be preserved
+        assert audio_item.ndim == 2, f"Expected 2D stereo audio, got {audio_item.ndim}D"
+
+    def test_parser_mono_passthrough_when_target_channels_1(self):
+        """Parser should pass through mono audio unchanged when target_channels=1."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with mono normalization enabled
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Create mono audio (already 1D)
+        mono_audio = np.random.randn(16000).astype(np.float32)
+
+        # Parse audio data
+        result = parser._parse_audio_data((mono_audio, 16000))
+
+        # Check that result is still mono (1D)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 1
+        assert audio_item.shape == (16000,)
+
+    def test_parser_with_target_channels_2(self):
+        """Parser should reduce 6-channel to 2-channel when target_channels=2."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with stereo target
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=2,
+        )
+
+        # Create 6-channel audio (5.1 surround)
+        surround_audio = np.random.randn(6, 1000).astype(np.float32)
+
+        # Parse audio data
+        result = parser._parse_audio_data((surround_audio, 16000))
+
+        # Check that result is stereo (2 channels)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 2
+        assert audio_item.shape[0] == 2  # 2 channels
+
+
+# ============================================================
+# End-to-End Audio Pipeline Tests
+# ============================================================
+
+
+class TestAudioPipelineE2E:
+    """End-to-end tests for audio normalization in the full pipeline.
+
+    These tests verify the complete flow from raw audio input through
+    the MultiModalDataParser, simulating different audio loader formats.
+    """
+
+    def test_stereo_audio_normalized_to_mono_e2e(self):
+        """Full pipeline: stereo audio (torchaudio format) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate torchaudio output: (channels, time) format
+        # Stereo audio with left channel = 1.0, right channel = -1.0
+        stereo_torchaudio = np.array(
+            [[1.0] * 16000, [-1.0] * 16000],  # 2 channels, 1 second at 16kHz
+            dtype=np.float32,
+        )
+        assert stereo_torchaudio.shape == (2, 16000)
+
+        # Create parser with mono normalization (like Whisper models)
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_torchaudio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
+        assert audio_output.shape == (16000,)
+
+        # Verify channel averaging: mean of [1.0, -1.0] = 0.0
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
+
+    def test_soundfile_format_normalized_to_mono_e2e(self):
+        """Full pipeline: soundfile format (time, channels) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate soundfile output: (time, channels) format
+        # 16000 samples, 2 channels
+        stereo_soundfile = np.array(
+            [[0.5, -0.5]] * 16000,  # Each row is [left, right]
+            dtype=np.float32,
+        )
+        assert stereo_soundfile.shape == (16000, 2)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_soundfile, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
+        assert audio_output.shape == (16000,)
+
+        # Verify channel averaging: mean of [0.5, -0.5] = 0.0
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
+
+    def test_librosa_mono_passthrough_e2e(self):
+        """Full pipeline: librosa mono format → preserved as mono."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate librosa output: already mono (time,) format
+        mono_librosa = np.random.randn(16000).astype(np.float32)
+        assert mono_librosa.shape == (16000,)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((mono_librosa, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is still mono 1D
+        assert audio_output.ndim == 1
+        assert audio_output.shape == (16000,)
+
+        # Verify audio content is preserved
+        np.testing.assert_array_almost_equal(audio_output, mono_librosa)
+
+    def test_multichannel_5_1_surround_to_mono_e2e(self):
+        """Full pipeline: 5.1 surround (6 channels) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate 5.1 surround audio: 6 channels
+        surround_audio = np.array(
+            [
+                [1.0] * 8000,  # Front Left
+                [2.0] * 8000,  # Front Right
+                [3.0] * 8000,  # Center
+                [4.0] * 8000,  # LFE (subwoofer)
+                [5.0] * 8000,  # Rear Left
+                [6.0] * 8000,  # Rear Right
+            ],
+            dtype=np.float32,
+        )
+        assert surround_audio.shape == (6, 8000)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((surround_audio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1
+
+        # Verify channel averaging: mean of [1,2,3,4,5,6] = 3.5
+        expected_value = (1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0) / 6
+        np.testing.assert_array_almost_equal(
+            audio_output, np.full(8000, expected_value), decimal=5
+        )
+
+    def test_torch_tensor_input_e2e(self):
+        """Full pipeline: stereo torch.Tensor (converted to numpy) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate torch tensor input (from torchaudio)
+        stereo_torch = torch.tensor(
+            [[1.0] * 8000, [-1.0] * 8000],  # 2 channels
+            dtype=torch.float32,
+        )
+        assert stereo_torch.shape == (2, 8000)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        # Note: Parser expects numpy, so we convert first (simulating real usage)
+        result = parser._parse_audio_data((stereo_torch.numpy(), 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D numpy array
+        assert audio_output.ndim == 1
+        assert isinstance(audio_output, np.ndarray)
+
+        # Verify channel averaging
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(8000), decimal=5)
+
+    def test_passthrough_preserves_stereo_e2e(self):
+        """Full pipeline: stereo with target_channels=None → stereo preserved."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Stereo audio
+        stereo_audio = np.array(
+            [[1.0] * 8000, [-1.0] * 8000],
+            dtype=np.float32,
+        )
+
+        # Create parser WITHOUT mono normalization (passthrough)
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=None,  # Passthrough - no normalization
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_audio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output preserves stereo (2D)
+        assert audio_output.ndim == 2
+        assert audio_output.shape == (2, 8000)
+
+    def test_resampling_with_channel_normalization_e2e(self):
+        """Full pipeline: resample + channel normalize in one parser call."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Stereo audio at 48kHz (common recording rate)
+        stereo_48k = np.array(
+            [[1.0] * 48000, [-1.0] * 48000],  # 1 second at 48kHz
+            dtype=np.float32,
+        )
+
+        # Create parser with both resampling and mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,  # Resample to 16kHz
+            target_channels=1,  # Normalize to mono
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_48k, 48000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D at target sample rate
+        assert audio_output.ndim == 1
+        # 1 second resampled from 48kHz to 16kHz should be exactly 16000 samples
+        assert audio_output.shape[0] == 16000
+
+    def test_very_short_audio_e2e(self):
+        """Full pipeline: very short audio (10 samples) handled correctly."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Very short stereo audio (10 samples)
+        short_stereo = np.array(
+            [[1.0] * 10, [-1.0] * 10],
+            dtype=np.float32,
+        )
+
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        result = parser._parse_audio_data((short_stereo, 16000))
+        audio_output = result.get(0)
+
+        # Should still produce mono output
+        assert audio_output.ndim == 1
+        assert audio_output.shape == (10,)
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(10))