Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
 # SPDX-License-Identifier: Apache-2.0

 import os
-import re
 from collections.abc import Sequence
 from typing import Optional

 import librosa
 import pytest
+import regex as re
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.image import convert_image_mode, rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs

@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,

    # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=None)
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")

    inputs_vision_speech = [
        (

--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -3,11 +3,13 @@
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
 """
-import re
 import types
 from pathlib import PosixPath
 from typing import Optional, Union

+import numpy as np
+import numpy.typing as npt
+import regex as re
 import torch
 from PIL.Image import Image
 from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
@@ -495,13 +497,20 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(
+            self,
+            text: str,
+            images: Union[Image, list[Image]] = None,
+            videos: Union[npt.NDArray, list[npt.NDArray]] = None,
+            **kwargs,
+        ):
            from vllm.model_executor.models.internvl import (
                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_internvl)
+                image_to_pixel_values_internvl, video_to_pixel_values_internvl)
            images = [images] if isinstance(images, Image) else images
-            pixel_values = [
+            videos = [videos] if isinstance(videos, np.ndarray) else videos
+            if images is not None:
+                pixel_values_images = [
                    image_to_pixel_values_internvl(
                        image,
                        input_size=self.image_size,
@@ -510,15 +519,52 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                        use_thumbnail=self.use_thumbnail,
                    ) for image in images
                ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                num_patches_images = [
+                    pixel_value.shape[0] for pixel_value in pixel_values_images
                ]
-            pixel_values = torch.cat(pixel_values, dim=0)
-            for num_patches in num_patches_list:
+            else:
+                pixel_values_images, num_patches_images = [], []
+
+            if videos is not None:
+                pixel_values_videos = [
+                    video_to_pixel_values_internvl(
+                        video,
+                        input_size=self.image_size,
+                        min_num=1,
+                        max_num=1,
+                        use_thumbnail=False,
+                    ) for video in videos
+                ]
+                num_patches_videos = [
+                    pixel_value.shape[0] for pixel_value in pixel_values_videos
+                ]
+            else:
+                pixel_values_videos, num_patches_videos = [], []
+
+            pixel_values = []
+            while ("<image>" in text) or ("<video>" in text):
+                image_index = text.find("<image>")
+                video_index = text.find("<video>")
+                if image_index == -1 or (video_index > -1
+                                         and video_index < image_index):
+                    num_patches = num_patches_videos.pop(0)
+                    pixel_values.append(pixel_values_videos.pop(0))
+                    context_tokens = IMG_START + \
+                        IMG_CONTEXT * self.num_image_token + IMG_END
+                    video_tokens = ''.join([
+                        f'Frame{i+1}: {context_tokens}'
+                        for i in range(num_patches)
+                    ])
+                    text = text.replace('<video>', video_tokens, 1)
+                else:
+                    num_patches = num_patches_images.pop(0)
+                    pixel_values.append(pixel_values_images.pop(0))
                    context_tokens = IMG_CONTEXT * self.num_image_token \
                        * num_patches
                    image_tokens = IMG_START + context_tokens + IMG_END
                    text = text.replace('<image>', image_tokens, 1)
+            pixel_values = torch.cat(pixel_values, dim=0)
+
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -9,15 +9,15 @@ from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
                                                       UserMessage)
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

 from vllm.config import ModelConfig
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.inputs import MultiModalInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache
-from vllm.transformers_utils.tokenizer import (MistralTokenizer,
-                                               cached_tokenizer_from_config)
+from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
+                                               cached_tokenizer_from_config,
+                                               encode_tokens)

 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
@@ -28,7 +28,6 @@ def _test_processing_correctness(
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
-    ignore_mm_keys: Optional[set[str]] = None,
 ):
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_available_online(on_fail="skip")
@@ -99,10 +98,23 @@ def _test_processing_correctness(
        }

        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
+
+        # Mistral chat outputs tokens directly, rather than text prompts
+        if isinstance(tokenizer, MistralTokenizer):
+            images = mm_data.get("image", [])
+            request = ChatCompletionRequest(messages=[
+                UserMessage(content=[
+                    TextChunk(text=""),
+                    *(ImageChunk(image=image) for image in images),
+                ]),
+            ])
+            res = tokenizer.mistral.encode_chat_completion(request)
+            prompt = res.tokens
+        else:
            prompt = dummy_inputs.get_dummy_processor_inputs(
                model_config.max_model_len,
                mm_counts,
-        ).prompt_text
+            ).prompt

        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
@@ -112,8 +124,7 @@ def _test_processing_correctness(
                elif len(mm_data[k]) == 1:
                    mm_data[k] = mm_data[k][0]

-        if isinstance(tokenizer, MistralTokenizer):
-            _test_processing_correctness_mistral(
+        _test_processing_correctness_one(
            model_config,
            tokenizer,
            prompt,
@@ -121,58 +132,51 @@ def _test_processing_correctness(
            baseline_processor,
            cached_processor,
            batch_idx,
-                ignore_mm_keys=ignore_mm_keys,
-            )
-        else:
-            _test_processing_correctness_hf(
-                model_config,
-                tokenizer,
-                prompt,
-                mm_data,
-                baseline_processor,
-                cached_processor,
-                batch_idx,
-                ignore_mm_keys=ignore_mm_keys,
        )


-def _test_processing_correctness_hf(
+# For some multimodal models, the tokenizer always adds bos_token
+# at the beginning of the prompt by default, causing hf_processor to output
+# incorrect token ids. So we need to use `add_special_tokens=False` here
+# to leave bos_token to be added by the processor.
+_ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "mllama": False,
+    "ovis": False,
+    "ultravox": False,
+    "whisper": False,
+}
+
+_IGNORE_MM_KEYS = {
+    # In Ultravox, the audio_features can be different depending on padding
+    # The slight difference should not be a problem though, since
+    # attention_mask lets us ignore the difference.
+    "ultravox": {"audio_features"},
+}
+
+
+def _test_processing_correctness_one(
    model_config: ModelConfig,
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    prompt: str,
+    tokenizer: AnyTokenizer,
+    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    baseline_processor: BaseMultiModalProcessor,
    cached_processor: BaseMultiModalProcessor,
    batch_idx: int,
-    ignore_mm_keys: Optional[set[str]] = None,
 ):
-    if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox",
-                                             "whisper"):
-        # For some multimodal models, tokenizer will always add bos_token
-        # at the beginning of prompt by default, causing hf_processor outputs
-        # incorrect token ids. So we need use `add_special_tokens=False` here
-        # to leave bos_token to be added by the processor.
-        token_prompt = tokenizer.encode(prompt, add_special_tokens=False)
-    else:
-        token_prompt = tokenizer.encode(prompt)
+    model_type = model_config.hf_config.model_type
+    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())

-    baseline_result = baseline_processor.apply(
-        prompt,
-        mm_data=mm_data,
-        hf_processor_mm_kwargs={},
-    )
-    cached_result = cached_processor.apply(
+    if isinstance(prompt, str):
+        text_prompt = prompt
+        token_prompt = encode_tokens(
+            tokenizer,
            prompt,
-        mm_data=mm_data,
-        hf_processor_mm_kwargs={},
-    )
-
-    _assert_inputs_equal(
-        baseline_result,
-        cached_result,
-        ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
        )
+    else:
+        # Mistral does not support decode_tokens with skip_special_tokens=False
+        text_prompt = None
+        token_prompt = prompt

    baseline_tokenized_result = baseline_processor.apply(
        token_prompt,
@@ -180,13 +184,6 @@ def _test_processing_correctness_hf(
        hf_processor_mm_kwargs={},
    )

-    _assert_inputs_equal(
-        baseline_result,
-        baseline_tokenized_result,
-        ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
-    )
-
    cached_tokenized_result = cached_processor.apply(
        token_prompt,
        mm_data=mm_data,
@@ -194,53 +191,45 @@ def _test_processing_correctness_hf(
    )

    _assert_inputs_equal(
-        cached_result,
+        baseline_tokenized_result,
        cached_tokenized_result,
        ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+        msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
    )

-
-def _test_processing_correctness_mistral(
-    model_config: ModelConfig,
-    tokenizer: MistralTokenizer,
-    prompt: str,
-    mm_data: MultiModalDataDict,
-    baseline_processor: BaseMultiModalProcessor,
-    cached_processor: BaseMultiModalProcessor,
-    batch_idx: int,
-    ignore_mm_keys: Optional[set[str]] = None,
-):
-    images = mm_data.get("image", [])
-    if not isinstance(images, list):
-        images = [images]
-
-    request = ChatCompletionRequest(messages=[
-        UserMessage(content=[
-            TextChunk(text=prompt),
-            *(ImageChunk(image=image) for image in images),
-        ]),
-    ])
-    res = tokenizer.mistral.encode_chat_completion(request)
-    token_prompt = res.tokens
-
-    # Mistral chat outputs tokens directly, rather than text prompts
-    baseline_tokenized_result = baseline_processor.apply(
-        token_prompt,
+    if text_prompt is not None:
+        baseline_text_result = baseline_processor.apply(
+            text_prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
-    cached_tokenized_result = cached_processor.apply(
-        token_prompt,
+        cached_text_result = cached_processor.apply(
+            text_prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        _assert_inputs_equal(
+            baseline_text_result,
+            cached_text_result,
+            ignore_mm_keys=ignore_mm_keys,
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
+        )
+
+        _assert_inputs_equal(
+            baseline_text_result,
            baseline_tokenized_result,
+            ignore_mm_keys=ignore_mm_keys,
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, "
+            f"{token_prompt=}, {mm_data=})",
+        )
+
+        _assert_inputs_equal(
+            cached_text_result,
            cached_tokenized_result,
            ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, "
+            f"{token_prompt=}, {mm_data=})",
        )


@@ -258,6 +247,7 @@ def _test_processing_correctness_mistral(
    "ibm-granite/granite-speech-3.3-8b",
    "h2oai/h2ovl-mississippi-800m",
    "OpenGVLab/InternVL2-1B",
+    "OpenGVLab/InternVL3-1B",
    "HuggingFaceM4/Idefics3-8B-Llama3",
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    "moonshotai/Kimi-VL-A3B-Instruct",
@@ -280,6 +270,7 @@ def _test_processing_correctness_mistral(
    "AIDC-AI/Ovis2-1B",
    "google/paligemma-3b-mix-224",
    "google/paligemma2-3b-ft-docci-448",
+    "microsoft/Phi-3.5-vision-instruct",
    "microsoft/Phi-4-multimodal-instruct",
    "mistralai/Pixtral-12B-2409",
    "mistral-community/pixtral-12b",
@@ -303,41 +294,6 @@ def test_processing_correctness(
    num_batches: int,
    simplify_rate: float,
 ):
-    ignore_mm_keys = None
-    if 'ultravox' in model_id:
-        # In Ultravox, the audio_features can be different depending on padding
-        # The slight difference should not be a problem though, since
-        # attention_mask lets us ignore the difference.
-        ignore_mm_keys = {"audio_features"}
-
-    _test_processing_correctness(
-        model_id,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-        ignore_mm_keys=ignore_mm_keys,
-    )
-
-
-# yapf: disable
-@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
-def test_processing_correctness_phi3v(
-    model_id: str,
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoImageProcessor  # noqa: F401
-    from transformers import AutoProcessor  # noqa: F401
-
-    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
-
    _test_processing_correctness(
        model_id,
        hit_rate=hit_rate,
@@ -356,16 +312,10 @@ def _assert_inputs_equal(
    if ignore_mm_keys is None:
        ignore_mm_keys = set()

-    if msg is None:
-        assert "mm_kwargs" in a and "mm_kwargs" in b
-    else:
    assert "mm_kwargs" in a and "mm_kwargs" in b, msg

    for key in ignore_mm_keys:
        a["mm_kwargs"].pop(key, None)
        b["mm_kwargs"].pop(key, None)

-    if msg is None:
-        assert a == b
-    else:
    assert a == b, msg
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
@@ -49,7 +49,7 @@ def test_profiling(
                        ] * max_num_seqs

    mm_kwargs = processor.apply(
-        prompt=dummy_mm_data.prompt_text,
+        prompt=dummy_mm_data.prompt,
        mm_data=dummy_mm_data.mm_data,
        hf_processor_mm_kwargs=dict(),
    )["mm_kwargs"]

--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -78,8 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig(
 )

 MODELS = [
-    LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG,
-    DOLPHIN_CONFIG
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    # STABLELM_CONFIG,  # enable this when v1 supports head_size=80
+    DOLPHIN_CONFIG,
    # STARCODER_CONFIG, # broken
 ]


--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -41,8 +41,8 @@ EXPECTED_STRS_MAP = {
    reason=
    "Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system.")
-@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
-                    reason="nvfp4 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
+                    reason="modelopt_fp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
    model = LLM(
@@ -50,7 +50,7 @@ def test_models(example_prompts, model_name) -> None:
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
-        quantization="nvfp4",
+        quantization="modelopt_fp4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -8,6 +8,8 @@ import pytest
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION

+from vllm.config import TokenizerMode
+

 @dataclass(frozen=True)
 class _HfExamplesInfo:
@@ -20,7 +22,7 @@ class _HfExamplesInfo:
    tokenizer: Optional[str] = None
    """Set the tokenizer to load for this architecture."""

-    tokenizer_mode: str = "auto"
+    tokenizer_mode: TokenizerMode = "auto"
    """Set the tokenizer type for this architecture."""

    speculative_model: Optional[str] = None
@@ -55,9 +57,18 @@ class _HfExamplesInfo:
    trust_remote_code: bool = False
    """The ``trust_remote_code`` level required to load the model."""

+    v0_only: bool = False
+    """The model is only available with the vLLM V0 engine."""
+
    hf_overrides: dict[str, Any] = field(default_factory=dict)
    """The ``hf_overrides`` required to load the model."""

+    max_model_len: Optional[int] = None
+    """
+    The maximum model length to use for this model. Some models default to a
+    length that is too large to fit into memory in CI.
+    """
+
    def check_transformers_version(
        self,
        *,
@@ -124,7 +135,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
                                         trust_remote_code=True),
    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B",
-                                        extras={"tiny": "hmellor/bamba-tiny-random"}),  # noqa: E501
+                                        extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
    "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                        {"1b": "bigscience/bloomz-1b1"}),
    "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
@@ -147,6 +158,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"),  # noqa: E501
    "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
    "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
+    "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
+                                          is_available_online=False,
+                                          min_transformers_version="4.52.2"),
    "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
    "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
    "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
@@ -212,10 +226,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
                                        trust_remote_code=True),
    "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
-    "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
+    "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
    "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
    "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
-                                            trust_remote_code=True),
+                                            trust_remote_code=True,
+                                            v0_only=True),
    "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                         trust_remote_code=True),
    "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
@@ -231,7 +246,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                     is_available_online=False),
    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b",  # noqa: E501
                                                is_available_online=False),
-    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
+    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
+                                           v0_only=True),
    "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
    "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
    "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@@ -300,7 +316,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
    "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
-                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
+                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"},  # noqa: E501
+                                                     v0_only=True),
    "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
                                                extras={"fork": "Isotr0py/deepseek-vl2-tiny"},  # noqa: E501
@@ -319,15 +336,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                      max_transformers_version="4.48",  # noqa: E501
                                      transformers_version_reason="HF model is not compatible."),  # noqa: E501
    "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
-                                         extras={"2B": "OpenGVLab/InternVL2-2B"},  # noqa: E501
+                                         extras={"2B": "OpenGVLab/InternVL2-2B",
+                                                 "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
                                         trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                                      extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
-                                                      trust_remote_code=True),
+                                                      trust_remote_code=True,
+                                                      v0_only=True),
    "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
-                                                      min_transformers_version="4.51"),
+                                                      min_transformers_version="4.51",
+                                                      max_model_len=10240),
    "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                     extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
                                                             "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
@@ -346,7 +366,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                extras={"2.6": "openbmb/MiniCPM-V-2_6"},  # noqa: E501
                                trust_remote_code=True),
    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
-                                              trust_remote_code=True),
+                                              trust_remote_code=True,
+                                              v0_only=True),
    "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503",  # noqa: E501
                                                        extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}),  # noqa: E501
    "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
@@ -379,6 +400,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
                                        min_transformers_version="4.52"),
+    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ",  # noqa: E501
+                                                           min_transformers_version="4.52"),
    "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
    "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -15,12 +15,12 @@ from .registry import HF_EXAMPLE_MODELS


 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
-def test_can_initialize(model_arch):
+def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

-    # Avoid OOM
+    # Avoid OOM and reduce initialization time by only using 1 layer
    def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
        hf_config.update(model_info.hf_overrides)

@@ -34,6 +34,12 @@ def test_can_initialize(model_arch):
            "num_local_experts": 2,
        })

+        if hasattr(hf_config, "vision_config"):
+            hf_config.vision_config.update({
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+            })
+
        return hf_config

    # Avoid calling model.forward()
@@ -46,7 +52,7 @@ def test_can_initialize(model_arch):
        scheduler_kv_cache_config = get_kv_cache_config(
            vllm_config,
            kv_cache_specs[0],
-            20 * GiB_bytes,
+            10 * GiB_bytes,
        )

        # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
@@ -55,7 +61,9 @@ def test_can_initialize(model_arch):
    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
                       _initialize_kv_caches_v0),
          patch.object(V1EngineCore, "_initialize_kv_caches",
-                       _initialize_kv_caches_v1)):
+                       _initialize_kv_caches_v1), monkeypatch.context() as m):
+        if model_info.v0_only:
+            m.setenv("VLLM_USE_V1", "0")
        LLM(
            model_info.default,
            tokenizer=model_info.tokenizer,
@@ -65,6 +73,7 @@ def test_can_initialize(model_arch):
                "num_speculative_tokens": 1,
            } if model_info.speculative_model else None,
            trust_remote_code=model_info.trust_remote_code,
+            max_model_len=model_info.max_model_len,
            load_format="dummy",
            hf_overrides=hf_overrides,
        )
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -4,6 +4,7 @@ import pytest

 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.multimodal.image import convert_image_mode

 from ..utils import create_new_process_for_each_test

@@ -58,7 +59,7 @@ def test_oot_registration_embedding(
            assert all(v == 0 for v in output.outputs.embedding)


-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")


 @create_new_process_for_each_test()

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
 # SPDX-License-Identifier: Apache-2.0
 """Test the functionality of the Transformers backend."""
+from typing import Any, Optional, Union
+
 import pytest

 from vllm.platforms import current_platform

 from ..conftest import HfRunner, VllmRunner
+from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
 from ..utils import multi_gpu_test
 from .utils import check_logprobs_close


 def check_implementation(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
+    runner_ref: type[Union[HfRunner, VllmRunner]],
+    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
+    kwargs_ref: Optional[dict[str, Any]] = None,
+    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
 ):
+    if kwargs_ref is None:
+        kwargs_ref = {}
+    if kwargs_test is None:
+        kwargs_test = {}
+
    max_tokens = 32
    num_logprobs = 5

-    with vllm_runner(model, **kwargs) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+    args = (example_prompts, max_tokens, num_logprobs)
+
+    with runner_test(model, **kwargs_test, **kwargs) as model_test:
+        outputs_test = model_test.generate_greedy_logprobs(*args)

-    with hf_runner(model) as hf_model:
-        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+    with runner_ref(model, **kwargs_ref) as model_ref:
+        if isinstance(model_ref, VllmRunner):
+            outputs_ref = model_ref.generate_greedy_logprobs(*args)
+        else:
+            outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)

    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
+        outputs_0_lst=outputs_ref,
+        outputs_1_lst=outputs_test,
+        name_0="ref",
+        name_1="test",
    )


@@ -58,6 +71,18 @@ def test_models(
                         model_impl=model_impl)


+def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
+    prompts, _, _ = prep_prompts(4, (800, 801))
+    kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
+    kwargs_test = {"model_impl": "transformers", **kwargs_ref}
+    check_implementation(vllm_runner,
+                         vllm_runner,
+                         prompts,
+                         model="hmellor/tiny-random-Gemma2ForCausalLM",
+                         kwargs_ref=kwargs_ref,
+                         kwargs_test=kwargs_test)
+
+
 @multi_gpu_test(num_gpus=2)
 def test_distributed(
    hf_runner: type[HfRunner],
@@ -65,8 +90,11 @@ def test_distributed(
    example_prompts,
 ):
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
-    check_implementation(hf_runner, vllm_runner, example_prompts,
-                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
+    check_implementation(hf_runner,
+                         vllm_runner,
+                         example_prompts,
+                         "meta-llama/Llama-3.2-1B-Instruct",
+                         kwargs_test=kwargs)


 @pytest.mark.skipif(

--- a/tests/models/test_utils.py
+++ b/tests/models/test_utils.py
@@ -77,3 +77,73 @@ def test_module_with_child_containing_batchnorm_can_autoload():
    assert torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
+
+
+def test_module_skip_prefix():
+    """Ensure the auto weight loader can skip prefix."""
+    mod = ModuleWithNestedBatchNorm()
+    # Run some data through the module with batchnorm
+    mod(torch.Tensor([[1, 2], [3, 4]]))
+
+    # Try to load the weights to a new instance
+    def weight_generator():
+        # weights that need to be filtered out
+        redundant_weights = {
+            "prefix.bn.weight": torch.Tensor([1, 2]),
+            "prefix.bn.bias": torch.Tensor([3, 4]),
+        }
+        yield from (mod.state_dict() | redundant_weights).items()
+
+    new_mod = ModuleWithNestedBatchNorm()
+
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
+
+    loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."])
+    loader.load_weights(weight_generator())
+
+    # Ensure the stats are updated
+    assert torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
+
+
+def test_module_skip_substr():
+    """Ensure the auto weight loader can skip substr."""
+    mod = ModuleWithNestedBatchNorm()
+    # Run some data through the module with batchnorm
+    mod(torch.Tensor([[1, 2], [3, 4]]))
+
+    # Try to load the weights to a new instance
+    def weight_generator():
+        # weights that need to be filtered out
+        redundant_weights = {
+            "nested_mod.0.substr.weight": torch.Tensor([1, 2]),
+            "nested_mod.0.substr.bias": torch.Tensor([3, 4]),
+            "nested_mod.substr.weight": torch.Tensor([1, 2]),
+            "nested_mod.substr.bias": torch.Tensor([3, 4]),
+        }
+        yield from (mod.state_dict() | redundant_weights).items()
+
+    new_mod = ModuleWithNestedBatchNorm()
+
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
+
+    loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
+    loader.load_weights(weight_generator())
+
+    # Ensure the stats are updated
+    assert torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
--- a/tests/multimodal/assets/rgba.png
+++ b/tests/multimodal/assets/rgba.png
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import numpy as np
+from PIL import Image, ImageChops
+
+from vllm.multimodal.image import convert_image_mode
+
+ASSETS_DIR = Path(__file__).parent / "assets"
+assert ASSETS_DIR.exists()
+
+
+def test_rgb_to_rgb():
+    # Start with an RGB image.
+    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
+    converted_image = convert_image_mode(original_image, "RGB")
+
+    # RGB to RGB should be a no-op.
+    diff = ImageChops.difference(original_image, converted_image)
+    assert diff.getbbox() is None
+
+
+def test_rgba_to_rgb():
+    original_image = Image.open(ASSETS_DIR / "rgba.png")
+    original_image_numpy = np.array(original_image)
+
+    converted_image = convert_image_mode(original_image, "RGB")
+    converted_image_numpy = np.array(converted_image)
+
+    for i in range(original_image_numpy.shape[0]):
+        for j in range(original_image_numpy.shape[1]):
+            # Verify that all transparent pixels are converted to white.
+            if original_image_numpy[i][j][3] == 0:
+                assert converted_image_numpy[i][j][0] == 255
+                assert converted_image_numpy[i][j][1] == 255
+                assert converted_image_numpy[i][j][2] == 255
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -10,6 +10,7 @@ import numpy as np
 import pytest
 from PIL import Image, ImageChops

+from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
                                   merge_and_sort_multimodal_metadata)
@@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]:


 def _image_equals(a: Image.Image, b: Image.Image) -> bool:
-    return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
+    return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all()


 @pytest.mark.asyncio

--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+import shutil
+import tempfile
+
+import torch
+from huggingface_hub import snapshot_download
+from safetensors import safe_open
+
+from vllm import LLM, SamplingParams
+
+
+def patch_eagle_draft_with_lm_head(target_model_id: str,
+                                   draft_model_id: str) -> str:
+    # In NxDI, draft model checkpoint must include lm_head weights from target
+    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
+    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
+    # #eagle-checkpoint-compatibility
+    final_draft_dir = "/tmp/patched_eagle_draft"
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        target_dir = snapshot_download(repo_id=target_model_id,
+                                       local_dir=os.path.join(
+                                           tmp_dir, "target"))
+        draft_dir = snapshot_download(repo_id=draft_model_id,
+                                      local_dir=os.path.join(tmp_dir, "draft"))
+
+        lm_head_key = "lm_head.weight"
+        index_path = os.path.join(target_dir, "model.safetensors.index.json")
+        with open(index_path) as f:
+            index = json.load(f)
+        shard_name = index["weight_map"][lm_head_key]
+        target_safetensor_path = os.path.join(target_dir, shard_name)
+
+        with safe_open(target_safetensor_path, framework="pt") as f:
+            target_lm_head = f.get_tensor(lm_head_key)
+
+        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
+        draft_state_dict = torch.load(draft_path, map_location="cpu")
+        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
+        torch.save(draft_state_dict, draft_path)
+
+        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
+
+    return final_draft_dir
+
+
+def test_eagle():
+    patched_draft_path = patch_eagle_draft_with_lm_head(
+        target_model_id="meta-llama/Llama-2-7b-hf",
+        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
+    llm = LLM(
+        model="meta-llama/Llama-2-7b-hf",
+        speculative_config={
+            "model": patched_draft_path,
+            "num_speculative_tokens": 5,
+            "max_model_len": 128
+        },
+        max_num_seqs=1,
+        max_model_len=128,
+        tensor_parallel_size=2,
+        override_neuron_config={
+            "enable_eagle_speculation": True,
+            "enable_fused_speculation": True,
+            "fused_qkv": True
+        },
+    )
+    prompts = [
+        "The president of the United States is",
+    ]
+    outputs = llm.generate(prompts, SamplingParams(top_k=1))
+    expected_output = " the head of state and head of government of " \
+    "the United States. The president direct"
+
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
+        assert (expected_output == generated_text)
+
+    print("Neuron Eagle speculation test passed.")
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
@@ -7,26 +7,58 @@ def test_mistral():
    llm = LLM(model="mistralai/Mistral-7B-v0.1",
              tensor_parallel_size=2,
              max_num_seqs=4,
-              max_model_len=512,
+              max_model_len=128,
              use_v2_block_manager=True,
              override_neuron_config={
                  "sequence_parallel_enabled": False,
                  "skip_warmup": True
-              },
-              device="neuron")
+              })

+    # Send more prompts than the compiled batch size (4) and request
+    # varying generation lengths to test accuracy related to Neuron
+    # specific sequence id sorting.
    prompts = [
        "The president of the United States is",
        "The capital of France is",
+        "What is Annapurna labs?",
+        "I believe the meaning of life is",
+        "Tell me a story about a brave knight",
+        "Hello, my name is Llama",
    ]
-    outputs = llm.generate(prompts, SamplingParams(top_k=1))
+
+    sampling_params = [
+        SamplingParams(top_k=1, max_tokens=10),
+        SamplingParams(top_k=1, max_tokens=20),
+        SamplingParams(top_k=1, max_tokens=30),
+        SamplingParams(top_k=1, max_tokens=40),
+        SamplingParams(top_k=1, max_tokens=50),
+        SamplingParams(top_k=1, max_tokens=60)
+    ]
+
+    outputs = llm.generate(prompts, sampling_params)

    expected_outputs = [
-        " the most powerful person in the world. He is the head of state "
-        "and head",
-        " a city of many faces. It is a city of history, culture, art"
+        " the most powerful person in the world. He is",
+        " a city of many faces. It is a city of history, culture, art, "
+        "fashion, and",
+        "\n\nAnnapurna Labs is a semiconductor company that was founded "
+        "in 2013 by Amazon. The company is",
+        " to be happy.\n\nI believe that happiness is a choice.\n\nI "
+        "believe that happiness is a state of mind.\n\nI believe that "
+        "happiness is a journey.\n\nI believe",
+        " who rescued a princess from a dragon.\n\nTell me a story about"
+        " a princess who rescued herself from a dragon.\n\nTell me a "
+        "story about a princess who rescued herself from a dragon and "
+        "then rescued a knight from",
+        " and I am a 10 year old male. I am a very friendly and "
+        "affectionate boy who loves to be around people. I am a very "
+        "active boy who loves to play and run around. I am a very smart "
+        "boy who loves to learn new things. I am a very loyal boy"
    ]

    for expected_output, output in zip(expected_outputs, outputs):
        generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
        assert (expected_output == generated_text)
+
+    print("Neuron Mistral test passed.")
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -29,5 +29,5 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
    # ignore the backend env variable if it is set
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
-        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+        backend = get_attn_backend(16, torch.float16, "auto", 16, False)
        assert backend.get_name() == "Dummy_Backend"
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -37,12 +37,6 @@ models_pre_quant_8bit_to_test = [
    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]

-models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8',
-     'read pre-quantized llama 8-bit model'),
-    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
-]
-

 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')

--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/runai_model_streamer_test/test_weight_utils.py
@@ -23,10 +23,11 @@ def test_runai_model_loader():
        runai_model_streamer_tensors = {}
        hf_safetensors_tensors = {}

-        for name, tensor in runai_safetensors_weights_iterator(safetensors):
+        for name, tensor in runai_safetensors_weights_iterator(
+                safetensors, True):
            runai_model_streamer_tensors[name] = tensor

-        for name, tensor in safetensors_weights_iterator(safetensors):
+        for name, tensor in safetensors_weights_iterator(safetensors, True):
            hf_safetensors_tensors[name] = tensor

        assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)

--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
@@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig


-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Tensorizer only tested on V0 so far.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-
-
 @pytest.fixture(autouse=True)
 def cleanup():
    cleanup_dist_env_and_memory(shutdown_ray=True)