Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -4,7 +4,7 @@ import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -19,7 +19,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, object],
    expected_toks_per_img: int,

--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -5,7 +5,7 @@ from transformers import SmolVLMConfig

 from vllm.multimodal import MULTIMODAL_REGISTRY

-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context


@@ -21,7 +21,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, object],
    expected_toks_per_img: int,

--- a/tests/models/embedding/language/__init__.py
+++ b/tests/models/embedding/language/__init__.py
--- a/tests/models/decoder_only/language/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of a AQLM model between vLLM and HF Transformers
-
-Run `pytest tests/models/test_aqlm.py`.
-"""
-
 import pytest

 from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform

 # These ground truth generations were generated using `transformers==4.38.1
 # aqlm==1.1.0 torch==2.2.0`
@@ -39,8 +35,9 @@ ground_truth_generations = [
 ]


-@pytest.mark.quant_model
-@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
+@pytest.mark.skipif(not is_quant_method_supported("aqlm")
+                    or current_platform.is_rocm()
+                    or not current_platform.is_cuda(),
                    reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -7,8 +7,8 @@ import torch

 from vllm.multimodal.image import rescale_image_size

-from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
-from ...utils import check_logprobs_close
+from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
+from ..utils import check_logprobs_close

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -20,7 +20,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({

 def run_awq_test(
    vllm_runner: type[VllmRunner],
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
    source_model: str,
    quant_model: str,
    *,
@@ -85,7 +85,6 @@ def run_awq_test(
        )


-@pytest.mark.quant_model
 @pytest.mark.parametrize(
    ("source_model", "quant_model"),
    [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],

--- a/tests/models/test_bitblas.py
+++ b/tests/models/test_bitblas.py
@@ -8,14 +8,12 @@ bitblas/GPTQ models are in the top 3 selections of each other.
 Note: bitblas internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for bitblas. As a result, we re-run the 
 test up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_bitblas.py`.
 """
 from dataclasses import dataclass

 import pytest

-from .utils import check_logprobs_close
+from ..utils import check_logprobs_close


 @dataclass

--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -4,20 +4,15 @@
 """Tests fp8 models against ground truth generation
 Note: these tests will only pass on L4 GPU.
 """
-import os
-from typing import Optional
-
 import pytest

-from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
 from vllm.platforms import current_platform
 from vllm.utils import STR_BACKEND_ENV_VAR

-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close


-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize(
@@ -60,6 +55,14 @@ def test_models(
    Only checks log probs match to cover the discrepancy in
    numerical sensitive kernels.
    """
+
+    if backend == "FLASHINFER" and current_platform.is_rocm():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+
+    if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
+        pytest.skip(
+            f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
+
    with monkeypatch.context() as m:
        m.setenv("TOKENIZERS_PARALLELISM", 'true')
        m.setenv(STR_BACKEND_ENV_VAR, backend)

--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -14,9 +14,9 @@ from transformers import AutoTokenizer

 from tests.quantization.utils import is_quant_method_supported

-from ....conftest import VllmRunner
-from ....utils import multi_gpu_test
-from ...utils import check_logprobs_close
+from ...conftest import VllmRunner
+from ...utils import multi_gpu_test
+from ..utils import check_logprobs_close

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -38,7 +38,6 @@ LLAMA_CONFIG = GGUFTestConfig(
    original_model="meta-llama/Llama-3.2-1B-Instruct",
    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
-    marks=[pytest.mark.quant_model],
 )

 QWEN2_CONFIG = GGUFTestConfig(

--- a/tests/models/test_gptq_bitblas.py
+++ b/tests/models/test_gptq_bitblas.py
@@ -8,14 +8,12 @@ bitblas/GPTQ models are in the top 3 selections of each other.
 Note: bitblas internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for bitblas. As a result, we re-run the 
 test up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_bitblas.py`.
 """
 from dataclasses import dataclass

 import pytest

-from .utils import check_logprobs_close
+from ..utils import check_logprobs_close


 @dataclass

--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compares the outputs of gptq vs gptq_marlin 
+"""Compares the outputs of gptq vs gptq_marlin.
+
 Note: GPTQ and Marlin do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the
 Marlin/GPTQ models are in the top 5 selections of each other.
 Note: Marlin internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for Marlin. As a result, we re-run the test
 up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_gptq_marlin.py`.
 """
 import os

@@ -15,8 +14,9 @@ import pytest

 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
+from vllm.platforms import current_platform

-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -34,9 +34,10 @@ MODELS = [
 ]


-@pytest.mark.quant_model
 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
+                    or current_platform.is_rocm()
+                    or not current_platform.is_cuda(),
                    reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])

--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -4,16 +4,15 @@
 Note: GPTQ and Marlin_24 do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the
 Marlin/GPTQ models are in the top 3 selections of each other.
-
-Run `pytest tests/models/test_marlin_24.py`.
 """
 from dataclasses import dataclass

 import pytest

 from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform

-from ...utils import check_logprobs_close
+from ..utils import check_logprobs_close


 @dataclass
@@ -39,9 +38,10 @@ model_pairs = [
 ]


-@pytest.mark.quant_model
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
+                    or current_platform.is_rocm()
+                    or not current_platform.is_cuda(),
                    reason="Marlin24 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
 @pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build.")
-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)

--- a/tests/models/quantization/test_mxfp4.py
+++ b/tests/models/quantization/test_mxfp4.py
+# SPDX-License-Identifier: Apache-2.0
+# flake8: noqa
+"""Tests Quark mxfp4 models against ground truth generation
+"""
+import pytest
+
+from vllm import LLM, SamplingParams
+
+# Checkpoints exercised by ``test_models`` (loaded with quantization="quark").
+MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
+
+# Golden completions keyed by model name. ``test_models`` compares entry ``i``
+# of the list with the text generated for the ``i``-th example prompt.
+EXPECTED_STRS_MAP = {
+    "amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
+        '\n### Key Features\n\n* **High-throughput Inference**: vLL',
+        '\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
+        'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
+        'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
+        '\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
+        '\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
+        'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
+        " everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
+    ]
+}
+
+
+@pytest.mark.skip(reason="Model to be released in the future")
+@pytest.mark.quant_model
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    """Compare greedy generations of a Quark model with golden strings.
+
+    The model is loaded with an fp8 KV cache and ``quantization="quark"``;
+    each generated text must equal the matching ``EXPECTED_STRS_MAP`` entry.
+    """
+    # temperature=0 selects greedy decoding; 20 new tokens per prompt.
+    greedy_params = SamplingParams(max_tokens=20, temperature=0)
+    engine = LLM(model=model_name,
+                 kv_cache_dtype="fp8",
+                 quantization="quark")
+
+    for idx, result in enumerate(
+            engine.generate(example_prompts, greedy_params)):
+        expected_str = EXPECTED_STRS_MAP[model_name][idx]
+        output_str = result.outputs[0].text
+        assert expected_str == output_str, (
+            f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
--- a/tests/models/decoder_only/language/test_nvfp4.py
+++ b/tests/models/decoder_only/language/test_nvfp4.py
@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
    reason=
    "Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system.")
-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
                    reason="nvfp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -72,12 +72,15 @@ class _HfExamplesInfo:
            return

        current_version = TRANSFORMERS_VERSION
+        cur_base_version = Version(current_version).base_version
        min_version = self.min_transformers_version
        max_version = self.max_transformers_version
        msg = f"`transformers=={current_version}` installed, but `transformers"
-        if min_version and Version(current_version) < Version(min_version):
+        # Only check the base version for the min/max version, otherwise preview
+        # models cannot be run because `x.yy.0.dev0`<`x.yy.0`
+        if min_version and Version(cur_base_version) < Version(min_version):
            msg += f">={min_version}` is required to run this model."
-        elif max_version and Version(current_version) > Version(max_version):
+        elif max_version and Version(cur_base_version) > Version(max_version):
            msg += f"<={max_version}` is required to run this model."
        else:
            return
@@ -120,7 +123,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
                                         trust_remote_code=True),
-    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"),
+    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B",
+                                        extras={"tiny": "hmellor/bamba-tiny-random"}),  # noqa: E501
    "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                        {"1b": "bigscience/bloomz-1b1"}),
    "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
@@ -162,6 +166,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                          {"1b": "EleutherAI/pythia-1.4b"}),
    "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
    "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview",  # noqa: E501
+                                                   min_transformers_version="4.52.0"),  # noqa: E501
    "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
    "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                             trust_remote_code=True),
@@ -176,7 +182,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
    "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
                                        extras={"tiny": "ai21labs/Jamba-tiny-dev"}),  # noqa: E501
-    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
+    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
+                                        extras={"guard": "meta-llama/Llama-Guard-3-1B",  # noqa: E501
+                                                "hermes": "NousResearch/Hermes-3-Llama-3.1-8B"}),  # noqa: E501
    "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                        is_available_online=False),
    "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
@@ -191,13 +199,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                                trust_remote_code=True),
    "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
    "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1",  # noqa: E501
-                                          {"falcon3": "ehristoforu/Falcon3-MoE-2x7B-Insruct"}),  # noqa: E501
+                                          {"tiny": "TitanML/tiny-mixtral"}),  # noqa: E501
    "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"),  # noqa: E501
    "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
    "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
    "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
    "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
-    "Olmo2ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"),
+    "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
    "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
    "OPTForCausalLM": _HfExamplesInfo("facebook/opt-125m",
                                      {"1b": "facebook/opt-iml-max-1.3b"}),
@@ -217,16 +225,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-0.5B-Instruct",
                                        extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501
    "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
-    "Qwen3ForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-8B",
-        is_available_online=False,
-        min_transformers_version="4.51"
-    ),
-    "Qwen3MoeForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-MoE-15B-A2B",
-        is_available_online=False,
-        min_transformers_version="4.51"
-    ),
+    "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
+    "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
                                     is_available_online=False),
    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b",  # noqa: E501
@@ -242,6 +242,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         is_available_online=False,
                                         trust_remote_code=True),
    "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
+    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
+                                        trust_remote_code=True),
    # [Encoder-decoder]
    "BartModel": _HfExamplesInfo("facebook/bart-base"),
    "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
@@ -254,11 +256,17 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
    "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                               trust_remote_code=True),
+    "GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5",
+                                   trust_remote_code=True,
+                                   hf_overrides={"architectures":
+                                                     ["GteNewModel"]}),
    "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
                                               trust_remote_code=True),
    "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
    "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
    "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
+    "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
+                                trust_remote_code=True),
    "NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long",  # noqa: E501
                                               trust_remote_code=True),
    "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
@@ -337,6 +345,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
                                extras={"2.6": "openbmb/MiniCPM-V-2_6"},  # noqa: E501
                                trust_remote_code=True),
+    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
+                                              trust_remote_code=True),
    "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503",  # noqa: E501
                                                        extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}),  # noqa: E501
    "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
@@ -353,6 +363,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        max_transformers_version="4.48",
                                        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
                                        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}),  # noqa: E501
+    "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
+                            extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
+                                    "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
                                        trust_remote_code=True),
    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
@@ -364,8 +377,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
-    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B",  # noqa: E501
-                                                                  min_transformers_version="4.52"),  # noqa: E501
+    "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
+                                        min_transformers_version="4.52"),
    "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
    "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
@@ -375,7 +388,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    # Therefore, we borrow the BartTokenizer from the original Bart model
    "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
                                                         tokenizer="Isotr0py/Florence-2-tokenizer",
-                                                         trust_remote_code=True),  # noqa: E501
+                                                         trust_remote_code=True,),  # noqa: E501
    "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
    "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct"),  # noqa: E501
    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
@@ -399,6 +412,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                            trust_remote_code=True,
                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
+    "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
+                                    trust_remote_code=True,
+                                    speculative_model="XiaomiMiMo/MiMo-7B-RL")
 }

 _TRANSFORMERS_MODELS = {

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
 # SPDX-License-Identifier: Apache-2.0
-"""Test the functionality of the Transformers backend.
-
-Run `pytest tests/models/test_transformers.py`.
-"""
+"""Test the functionality of the Transformers backend."""
 import pytest

+from vllm.platforms import current_platform
+
 from ..conftest import HfRunner, VllmRunner
 from ..utils import multi_gpu_test
 from .utils import check_logprobs_close
@@ -36,6 +35,9 @@ def check_implementation(
    )


+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
 @pytest.mark.parametrize(
    "model,model_impl",
    [
@@ -67,6 +69,9 @@ def test_distributed(
                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)


+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="bitsandbytes quantization is currently not supported in rocm.")
 @pytest.mark.parametrize("model, quantization_kwargs", [
    (
        "meta-llama/Llama-3.2-1B-Instruct",

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -2,9 +2,10 @@

 import warnings
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union

 import torch
+import torch.nn.functional as F

 from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

 from .registry import HF_EXAMPLE_MODELS

+if TYPE_CHECKING:
+    from ..conftest import HfRunner
+
 TokensText = tuple[list[int], str]


@@ -291,3 +295,64 @@ def build_model_context(
        **model_config_kwargs,
    )
    return InputContext(model_config)
+
+
+def check_embeddings_close(
+    *,
+    embeddings_0_lst: Sequence[list[float]],
+    embeddings_1_lst: Sequence[list[float]],
+    name_0: str,
+    name_1: str,
+    tol: float = 1e-3,
+) -> None:
+    """Assert that two lists of embeddings agree pairwise.
+
+    Args:
+        embeddings_0_lst: Embeddings from the first source, one per prompt.
+        embeddings_1_lst: Embeddings from the second source, in the same
+            order as ``embeddings_0_lst``.
+        name_0: Label for the first source in failure messages.
+        name_1: Label for the second source in failure messages.
+        tol: Largest allowed shortfall of the cosine similarity from 1.
+
+    Raises:
+        AssertionError: If the lists or any pair of embeddings differ in
+            length, or a pair's cosine similarity is below ``1 - tol``.
+    """
+    assert len(embeddings_0_lst) == len(embeddings_1_lst)
+
+    paired = zip(embeddings_0_lst, embeddings_1_lst)
+    for prompt_idx, pair in enumerate(paired):
+        embeddings_0, embeddings_1 = pair
+        assert len(embeddings_0) == len(embeddings_1), (
+            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
+
+        vec_0 = torch.tensor(embeddings_0)
+        vec_1 = torch.tensor(embeddings_1)
+        # dim=0 because each embedding is a flat vector at this point.
+        sim = F.cosine_similarity(vec_0, vec_1, dim=0)
+
+        # Only the first 16 values are shown to keep the message readable.
+        fail_msg = (f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
+                    f"\n{name_1}:\t{embeddings_1[:16]!r}")
+
+        assert sim >= 1 - tol, fail_msg
+
+
+def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
+    """Truncate embeddings to their leading features and re-normalize.
+
+    Args:
+        tensor: Embeddings with the feature axis last. Anything accepted by
+            ``torch.as_tensor`` works (a tensor, nested lists, an array).
+        dimensions: Number of leading features to keep per embedding.
+
+    Returns:
+        A tensor holding the first ``dimensions`` features of each
+        embedding, rescaled to unit L2 norm along the feature axis.
+    """
+    # ``torch.as_tensor`` avoids the copy-construct UserWarning that
+    # ``torch.tensor`` raises when the input is already a tensor.
+    tensor = torch.as_tensor(tensor)
+    tensor = tensor[..., :dimensions]
+    # Normalize along the axis that was just sliced. For a 2-D batch this is
+    # the same as ``dim=1``; it also lets a single 1-D embedding through.
+    return F.normalize(tensor, p=2, dim=-1)
+
+
+class EmbedModelInfo(NamedTuple):
+    """Immutable record describing one embedding model for the tests.
+
+    NOTE(review): field meanings below are inferred from the names only;
+    confirm against the callers that build and read these records.
+    """
+    # Model identifier; the only field without a default.
+    name: str
+    # Presumably marks models that support truncated (matryoshka) outputs.
+    is_matryoshka: bool = False
+    # Presumably the truncation sizes to test; None when not specified.
+    matryoshka_dimensions: Optional[list[int]] = None
+    # Architecture name; empty string when not specified.
+    architecture: str = ""
+    # Dtype string; defaults to "auto".
+    dtype: str = "auto"
+    # Presumably lets a model be listed but excluded from test runs.
+    enable_test: bool = True
+
+
+def run_embedding_correctness_test(
+    hf_model: "HfRunner",
+    inputs: list[str],
+    vllm_outputs: Sequence[list[float]],
+    dimensions: Optional[int] = None,
+):
+    """Check vLLM embeddings against those produced by ``hf_model``.
+
+    Args:
+        hf_model: Runner whose ``encode`` yields the reference embeddings.
+        inputs: Prompts to encode with ``hf_model``.
+        vllm_outputs: Embeddings to validate, one per prompt.
+        dimensions: When truthy, the reference embeddings are first passed
+            through ``matryoshka_fy`` with this size.
+
+    Raises:
+        AssertionError: Propagated from ``check_embeddings_close``.
+    """
+    reference_outputs = hf_model.encode(inputs)
+    # A falsy value (None or 0) means the full-size embeddings are compared.
+    if dimensions:
+        reference_outputs = matryoshka_fy(reference_outputs, dimensions)
+
+    # Looser tolerance than the default of ``check_embeddings_close``.
+    check_embeddings_close(embeddings_0_lst=reference_outputs,
+                           embeddings_1_lst=vllm_outputs,
+                           name_0="hf",
+                           name_1="vllm",
+                           tol=1e-2)
--- a/tests/multimodal/assets/image1.png
+++ b/tests/multimodal/assets/image1.png
--- a/tests/multimodal/assets/image2.png
+++ b/tests/multimodal/assets/image2.png
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image, ImageDraw
+
+from vllm.multimodal.hasher import MultiModalHasher
+
+# Directory next to this file holding the PNG fixtures opened by the
+# palette-collision test.
+ASSETS_DIR = Path(__file__).parent / "assets"
+# Fail at import time if the fixture directory is missing.
+assert ASSETS_DIR.exists()
+
+
+# NOTE: Images that are the same visually are allowed to have the same hash
+@pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")])
+def test_hash_collision_image_mode(mode_pair):
+    """Same-size, same-color images in different modes must hash apart."""
+    first_mode, second_mode = mode_pair
+    first_image = Image.new(first_mode, size=(10, 10), color=1)
+    second_image = Image.new(second_mode, size=(10, 10), color=1)
+
+    first_hash = MultiModalHasher.hash_kwargs(image=first_image)
+    second_hash = MultiModalHasher.hash_kwargs(image=second_image)
+    assert first_hash != second_hash
+
+
+def test_hash_collision_image_palette():
+    """The two PNG fixtures must produce different hashes."""
+    # These images differ only in Image.palette._palette
+    first_image = Image.open(ASSETS_DIR / "image1.png")
+    second_image = Image.open(ASSETS_DIR / "image2.png")
+
+    first_hash = MultiModalHasher.hash_kwargs(image=first_image)
+    second_hash = MultiModalHasher.hash_kwargs(image=second_image)
+    assert first_hash != second_hash
+
+
+def test_hash_collision_image_transpose():
+    """A 10x20 image and a 20x10 image with swapped lines must hash apart."""
+    tall_image = Image.new("1", size=(10, 20))
+    ImageDraw.Draw(tall_image).line([(0, 0), (10, 0)])
+
+    wide_image = Image.new("1", size=(20, 10))
+    ImageDraw.Draw(wide_image).line([(0, 0), (0, 10)])
+
+    tall_hash = MultiModalHasher.hash_kwargs(image=tall_image)
+    wide_hash = MultiModalHasher.hash_kwargs(image=wide_image)
+    assert tall_hash != wide_hash
+
+
+def test_hash_collision_tensor_shape():
+    """All-zero tensors of different shapes must hash differently."""
+    # The hash should be different though the data is the same when flattened
+    first_tensor = torch.zeros((5, 10, 20, 3))
+    second_tensor = torch.zeros((10, 20, 5, 3))
+
+    first_hash = MultiModalHasher.hash_kwargs(data=first_tensor)
+    second_hash = MultiModalHasher.hash_kwargs(data=second_tensor)
+    assert first_hash != second_hash
+
+
+def test_hash_collision_array_shape():
+    """All-zero NumPy arrays of different shapes must hash differently."""
+    # The hash should be different though the data is the same when flattened
+    first_array = np.zeros((5, 10, 20, 3))
+    second_array = np.zeros((10, 20, 5, 3))
+
+    first_hash = MultiModalHasher.hash_kwargs(data=first_array)
+    second_hash = MultiModalHasher.hash_kwargs(data=second_array)
+    assert first_hash != second_hash