Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
 # SPDX-License-Identifier: Apache-2.0

 from collections import OrderedDict
+from typing import NamedTuple, Optional
 from unittest.mock import patch

 import pytest
@@ -9,52 +10,96 @@ from torch import nn

 from vllm.lora.utils import (get_adapter_absolute_path,
                             parse_fine_tuned_lora_name, replace_submodule)
+from vllm.model_executor.models.utils import WeightsMapper
+
+
+class LoRANameParserTestConfig(NamedTuple):
+    name: str
+    module_name: str
+    is_lora_a: bool
+    is_bias: bool
+    weights_mapper: Optional[WeightsMapper] = None


 def test_parse_fine_tuned_lora_name_valid():
-    fixture = {
-        ("base_model.model.lm_head.lora_A.weight", "lm_head", True, False),
-        ("base_model.model.lm_head.lora_B.weight", "lm_head", False, False),
-        (
+    fixture = [
+        LoRANameParserTestConfig("base_model.model.lm_head.lora_A.weight",
+                                 "lm_head", True, False),
+        LoRANameParserTestConfig("base_model.model.lm_head.lora_B.weight",
+                                 "lm_head", False, False),
+        LoRANameParserTestConfig(
            "base_model.model.model.embed_tokens.lora_embedding_A",
            "model.embed_tokens",
            True,
            False,
        ),
-        (
+        LoRANameParserTestConfig(
            "base_model.model.model.embed_tokens.lora_embedding_B",
            "model.embed_tokens",
            False,
            False,
        ),
-        (
+        LoRANameParserTestConfig(
            "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
            "model.layers.9.mlp.down_proj",
            True,
            False,
        ),
-        (
+        LoRANameParserTestConfig(
            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
            "model.layers.9.mlp.down_proj",
            False,
            False,
        ),
-        (
+        LoRANameParserTestConfig(
            "language_model.layers.9.mlp.down_proj.lora_A.weight",
            "language_model.layers.9.mlp.down_proj",
            True,
            False,
        ),
-        (
+        LoRANameParserTestConfig(
            "language_model.layers.9.mlp.down_proj.lora_B.weight",
            "language_model.layers.9.mlp.down_proj",
            False,
            False,
        ),
-    }
-    for name, module_name, is_lora_a, is_bias in fixture:
+        # Test with WeightsMapper
+        LoRANameParserTestConfig(
+            "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
+            "language_model.model.layers.9.mlp.down_proj",
+            True,
+            False,
+            weights_mapper=WeightsMapper(
+                orig_to_new_prefix={"model.": "language_model.model."}),
+        ),
+        LoRANameParserTestConfig(
+            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
+            "language_model.model.layers.9.mlp.down_proj",
+            False,
+            False,
+            weights_mapper=WeightsMapper(
+                orig_to_new_prefix={"model.": "language_model.model."}),
+        ),
+        LoRANameParserTestConfig(
+            "model.layers.9.mlp.down_proj.lora_A.weight",
+            "language_model.model.layers.9.mlp.down_proj",
+            True,
+            False,
+            weights_mapper=WeightsMapper(
+                orig_to_new_prefix={"model.": "language_model.model."}),
+        ),
+        LoRANameParserTestConfig(
+            "model.layers.9.mlp.down_proj.lora_B.weight",
+            "language_model.model.layers.9.mlp.down_proj",
+            False,
+            False,
+            weights_mapper=WeightsMapper(
+                orig_to_new_prefix={"model.": "language_model.model."}),
+        ),
+    ]
+    for name, module_name, is_lora_a, is_bias, weights_mapper in fixture:
        assert (module_name, is_lora_a,
-                is_bias) == parse_fine_tuned_lora_name(name)
+                is_bias) == parse_fine_tuned_lora_name(name, weights_mapper)


 def test_parse_fine_tuned_lora_name_invalid():

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -58,13 +58,19 @@ def test_worker_apply_lora(sql_lora_files):
            download_dir=None,
            load_format="dummy",
        ),
-        parallel_config=ParallelConfig(1, 1, False),
+        parallel_config=ParallelConfig(
+            pipeline_parallel_size=1,
+            tensor_parallel_size=1,
+            data_parallel_size=1,
+        ),
        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
        device_config=DeviceConfig("cuda"),
-        cache_config=CacheConfig(block_size=16,
-                                 gpu_memory_utilization=1.,
-                                 swap_space=0,
-                                 cache_dtype="auto"),
+        cache_config=CacheConfig(
+            block_size=16,
+            gpu_memory_utilization=1.0,
+            swap_space=0,
+            cache_dtype="auto",
+        ),
        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
                               max_loras=32),
    )

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
 # SPDX-License-Identifier: Apache-2.0

 import pytest
+import torch

 from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import (GeluAndMul,
                                                   ReLUSquaredActivation,
                                                   SiluAndMul)
-from vllm.model_executor.layers.fused_moe.fused_moe import (
-    dispatch_fused_experts_func, dispatch_topk_func,
-    torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts,
-    vllm_topk_softmax)
+from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
+                                                            vllm_topk_softmax)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.layernorm import (
    RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
    rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform


@@ -98,35 +99,45 @@ def test_enabled_ops_invalid(env: str):
            RMSNorm(1024).enabled()


+@pytest.mark.skipif(
+    not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
+    reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
+@pytest.mark.parametrize("use_cutlass", [True, False])
 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
+@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
+def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
+                                  use_rocm_aiter_gemm_w8a8_blockscale: str,
+                                  monkeypatch):
+
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    topk_func = dispatch_topk_func()
-    is_rocm_aiter_moe_enabled.cache_clear()
-    if current_platform.is_rocm() and int(use_rocm_aiter):
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            rocm_aiter_topk_softmax)
-        assert topk_func == rocm_aiter_topk_softmax
+    monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
+                       use_rocm_aiter_gemm_w8a8_blockscale)
+
+    use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
+        int(use_rocm_aiter_gemm_w8a8_blockscale)))
+    block_scale_func = dispatch_w8a8_blockscale_func(
+        use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
+    if use_cutlass:
+        assert block_scale_func == cutlass_scaled_mm
+    elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
+            use_rocm_aiter_gemm_w8a8_blockscale):
+        assert block_scale_func == (
+            torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
    else:
-        assert topk_func == vllm_topk_softmax
+        assert block_scale_func == w8a8_block_fp8_matmul


 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-@pytest.mark.parametrize("inplace", [True, False])
-def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool,
-                                monkeypatch):
-
+def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
+    topk_func = dispatch_topk_func()
    is_rocm_aiter_moe_enabled.cache_clear()
-    fused_experts_func = dispatch_fused_experts_func(inplace)
    if current_platform.is_rocm() and int(use_rocm_aiter):
        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            rocm_aiter_fused_experts)
-        assert fused_experts_func == rocm_aiter_fused_experts
-    elif inplace:
-        assert fused_experts_func == torch_vllm_inplace_fused_experts
+            rocm_aiter_topk_softmax)
+        assert topk_func == rocm_aiter_topk_softmax
    else:
-        assert fused_experts_func == torch_vllm_outplace_fused_experts
+        assert topk_func == vllm_topk_softmax


 @pytest.mark.parametrize("add_residual", [True, False])

--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -202,12 +202,15 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):

 def test_guided_decoding_backend_options():
    """Test backend-specific options"""
-    params = GuidedDecodingParams(
-        backend="xgrammar:option-1,option-2,option-3")
-    assert params.backend_options() == ["option-1", "option-2", "option-3"]
-
-    no_fallback = GuidedDecodingParams(backend="xgrammar:option-1,no-fallback")
-    assert no_fallback.no_fallback()
+    with pytest.warns(DeprecationWarning):
+        guided_decoding_params = GuidedDecodingParams(
+            backend=
+            "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties"
+        )
+    assert guided_decoding_params.backend == "xgrammar"
+    assert guided_decoding_params.disable_fallback
+    assert guided_decoding_params.disable_any_whitespace
+    assert guided_decoding_params.disable_additional_properties


 def test_pickle_xgrammar_tokenizer_data():

--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
    try:
        # enable hf hub transfer if available
        import hf_transfer  # type: ignore # noqa
-        HF_TRANFER_ACTIVE = True
+        HF_TRANSFER_ACTIVE = True
    except ImportError:
-        HF_TRANFER_ACTIVE = False
+        HF_TRANSFER_ACTIVE = False
    assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
-            HF_TRANFER_ACTIVE)
+            HF_TRANSFER_ACTIVE)


 def test_download_weights_from_hf():

--- a/tests/models/embedding/utils.py
+++ b/tests/models/embedding/utils.py
-# SPDX-License-Identifier: Apache-2.0
-
-from collections.abc import Sequence
-from typing import NamedTuple, Optional
-
-import torch
-import torch.nn.functional as F
-
-
-def check_embeddings_close(
-    *,
-    embeddings_0_lst: Sequence[list[float]],
-    embeddings_1_lst: Sequence[list[float]],
-    name_0: str,
-    name_1: str,
-    tol: float = 1e-3,
-) -> None:
-    assert len(embeddings_0_lst) == len(embeddings_1_lst)
-
-    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
-            zip(embeddings_0_lst, embeddings_1_lst)):
-        assert len(embeddings_0) == len(embeddings_1), (
-            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
-
-        sim = F.cosine_similarity(torch.tensor(embeddings_0),
-                                  torch.tensor(embeddings_1),
-                                  dim=0)
-
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
-                    f"\n{name_1}:\t{embeddings_1[:16]!r}")
-
-        assert sim >= 1 - tol, fail_msg
-
-
-def matryoshka_fy(tensor, dimensions):
-    tensor = torch.tensor(tensor)
-    tensor = tensor[..., :dimensions]
-    tensor = F.normalize(tensor, p=2, dim=1)
-    return tensor
-
-
-class EmbedModelInfo(NamedTuple):
-    name: str
-    is_matryoshka: bool
-    matryoshka_dimensions: Optional[list[int]] = None
-    architecture: str = ""
-    enable_test: bool = True
-
-
-def correctness_test(hf_model,
-                     inputs,
-                     vllm_outputs: Sequence[list[float]],
-                     dimensions: Optional[int] = None):
-
-    hf_outputs = hf_model.encode(inputs)
-    if dimensions:
-        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
-
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
--- a/tests/models/encoder_decoder/vision_language/__init__.py
+++ b/tests/models/encoder_decoder/vision_language/__init__.py
--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from ....utils import multi_gpu_test
-
-
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@pytest.mark.parametrize("model", [
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
-])
-def test_models(hf_runner, vllm_runner, image_assets,
-                distributed_executor_backend, model) -> None:
-
-    dtype = "half"
-    max_tokens = 5
-    num_logprobs = 5
-    tensor_parallel_size = 2
-
-    if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
-        from .test_mllama import models, run_test
-    else:
-        raise NotImplementedError(f"Unsupported model: {model}")
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model=models[0],
-        size_factors=[0.25, 0.5, 1.0],
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-    )
--- a/tests/models/decoder_only/__init__.py
+++ b/tests/models/decoder_only/__init__.py
--- a/tests/models/decoder_only/audio_language/__init__.py
+++ b/tests/models/decoder_only/audio_language/__init__.py
--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/encoder_decoder/language/test_bart.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
-
-Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
-"""
 from typing import Optional

 import pytest

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-Run `pytest tests/models/test_models.py`.
-"""
+import os
+from typing import Optional

 import pytest
 import torch
@@ -29,7 +27,8 @@ AITER_MODEL_LIST = [
    "openbmb/MiniCPM3-4B",
    "Qwen/Qwen-7B-Chat",
    "Qwen/Qwen2.5-0.5B-Instruct",
-    "ehristoforu/Falcon3-MoE-2x7B-Insruct",
+    "TitanML/tiny-mixtral",
+    "Qwen/Qwen3-8B",
 ]


@@ -80,12 +79,14 @@ AITER_MODEL_LIST = [
            "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
            marks=[pytest.mark.core_model],
        ),
+        pytest.param(
+            "Qwen/Qwen3-8B",  # qwen (text-only)
+        ),
        pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
        pytest.param(
-            "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
-            marks=[pytest.mark.cpu_model,
-                   large_gpu_mark(min_gb=48)],
+            "TitanML/tiny-mixtral",  # mixtral
+            marks=[pytest.mark.cpu_model],
        )
    ])
 @pytest.mark.parametrize("max_tokens", [32])
@@ -112,19 +113,38 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

+    use_prompt_embeds = os.getenv("VLLM_USE_V1") == "0"
+
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

+        prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
+                                                       else None)
+
+        prompt_token_ids = []
+        for prompt in example_prompts:
+            token_ids = hf_model.tokenizer(prompt,
+                                           return_tensors="pt").input_ids.to(
+                                               hf_model.model.device)
+            prompt_token_ids.append(token_ids)
+            if prompt_embeds is not None:
+                prompt_embeds.append(hf_model.model.get_input_embeddings()(
+                    token_ids).squeeze(0))
+
    with vllm_runner(
            model,
            tokenizer_name=model_info.tokenizer or model,
            tokenizer_mode=model_info.tokenizer_mode,
            trust_remote_code=model_info.trust_remote_code,
            max_num_seqs=2,
+            enable_prompt_embeds=use_prompt_embeds,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
+        if prompt_embeds is not None:
+            vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
+                prompt_embeds, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
@@ -132,6 +152,14 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
        name_0="hf",
        name_1="vllm",
    )
+    if prompt_embeds is not None:
+        check_logprobs_close(
+            outputs_0_lst=vllm_outputs,
+            outputs_1_lst=vllm_outputs_from_embeds,
+            name_0="vllm",
+            name_1="vllm_from_embeds",
+        )
+
    if use_rocm_aiter:
        # this is to ensure that vllm engine
        # has deallocated the memory before running the next

--- a/tests/models/decoder_only/language/test_granite.py
+++ b/tests/models/decoder_only/language/test_granite.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
-
-Run `pytest tests/models/test_granite.py`.
-"""
 import pytest

 from ...utils import check_logprobs_close

--- a/tests/models/language/generation/test_granitemoehybrid.py
+++ b/tests/models/language/generation/test_granitemoehybrid.py
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from ...utils import check_logprobs_close
+
+# Path of the checkpoints
+MODELS = [
+    "ibm-granite/granite-4.0-tiny-preview",
+]
+
+
+@pytest.mark.skip(
+    reason="Granite 4.0 is not yet available in huggingface transformers")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_model_equivalence_to_hf_greedy(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+):
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/decoder_only/language/test_hybrid.py
+++ b/tests/models/decoder_only/language/test_hybrid.py
@@ -23,12 +23,15 @@ SSM_MODELS = [

 HYBRID_MODELS = [
    "ai21labs/Jamba-tiny-dev",
+    # NOTE: ibm-granite/granite-4.0-tiny-preview are skipped currently as
+    # it is not yet available in huggingface transformers
+    # "ibm-granite/granite-4.0-tiny-preview",
    # NOTE: Running Plamo2 in transformers implementation requires to install
    # causal-conv1d package, which is not listed as a test dependency as it's
    # not compatible with pip-compile.
    "pfnet/plamo-2-1b",
    "Zyphra/Zamba2-1.2B-instruct",
-    "ibm-ai-platform/Bamba-9B",
+    "hmellor/bamba-tiny-random",
 ]

 # Avoid OOM
@@ -289,23 +292,25 @@ def test_multistep_correctness(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 @pytest.mark.parametrize("max_tokens", [64])
-def test_hybrid_distributed_produces_identical_generation(
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_distributed_correctness(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    with vllm_runner(model, tensor_parallel_size=2,
+    with vllm_runner(model, tensor_parallel_size=1,
                     max_num_seqs=2) as vllm_model:
-        vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
-                                                       max_tokens)
+        vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

-    with vllm_runner(model, tensor_parallel_size=1,
+    with vllm_runner(model, tensor_parallel_size=2,
                     max_num_seqs=2) as vllm_model:
-        vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts,
-                                                       max_tokens)
+        vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

-    check_outputs_equal(
+    check_logprobs_close(
        outputs_0_lst=vllm_outputs_tp_1,
        outputs_1_lst=vllm_outputs_tp_2,
        name_0="vllm_tp_1",

--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
-
-Run `pytest tests/models/test_mistral.py`.
-"""
 import copy
 import json


--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
-
-Run `pytest tests/models/test_phimoe.py`.
-"""
 import pytest
 import torch


--- a/tests/models/decoder_only/language/__init__.py
+++ b/tests/models/decoder_only/language/__init__.py
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
+# SPDX-License-Identifier: Apache-2.0
+import math
+from collections.abc import Sequence
+
+import mteb
+import numpy as np
+import pytest
+
+from tests.models.utils import EmbedModelInfo
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+
+# Most models on the STS12 task (See #17175):
+# - Model implementation and minor changes in tensor dtype
+#   results in differences less than 1e-4
+# - Different model results in differences more than 1e-3
+# 1e-4 is a good tolerance threshold
+MTEB_EMBED_TASKS = ["STS12"]
+MTEB_EMBED_TOL = 1e-4
+
+
+class VllmMtebEncoder(mteb.Encoder):
+
+    def __init__(self, vllm_model):
+        super().__init__()
+        self.model = vllm_model
+        self.rng = np.random.default_rng(seed=42)
+
+    def encode(
+        self,
+        sentences: Sequence[str],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+        outputs = self.model.encode(sentences, use_tqdm=False)
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
+
+
+class OpenAIClientMtebEncoder(mteb.Encoder):
+
+    def __init__(self, model_name: str, client):
+        super().__init__()
+        self.model_name = model_name
+        self.client = client
+        self.rng = np.random.default_rng(seed=42)
+
+    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+
+        embeddings = self.client.embeddings.create(model=self.model_name,
+                                                   input=sentences)
+        outputs = [d.embedding for d in embeddings.data]
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds
+
+
+def run_mteb_embed_task(encoder, tasks):
+    tasks = mteb.get_tasks(tasks=tasks)
+    evaluation = mteb.MTEB(tasks=tasks)
+    results = evaluation.run(encoder, verbosity=0, output_folder=None)
+
+    main_score = results[0].scores["test"][0]["main_score"]
+    return main_score
+
+
+def run_mteb_embed_task_st(model_name, tasks):
+    from sentence_transformers import SentenceTransformer
+    model = SentenceTransformer(model_name)
+    return run_mteb_embed_task(model, tasks)
+
+
+def mteb_test_embed_models(hf_runner,
+                           vllm_runner,
+                           model_info: EmbedModelInfo,
+                           vllm_extra_kwargs=None):
+    if not model_info.enable_test:
+        # A model family has many models with the same architecture,
+        # and we don't need to test each one.
+        pytest.skip("Skipping test.")
+
+    vllm_extra_kwargs = vllm_extra_kwargs or {}
+
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     max_model_len=None,
+                     dtype=model_info.dtype,
+                     **vllm_extra_kwargs) as vllm_model:
+
+        if model_info.architecture:
+            assert (model_info.architecture
+                    in vllm_model.model.llm_engine.model_config.architectures)
+
+        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
+                                              MTEB_EMBED_TASKS)
+        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        model_dtype = getattr(
+            vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype",
+            vllm_dtype)
+
+    with set_default_torch_dtype(model_dtype) and hf_runner(
+            model_info.name, is_sentence_transformer=True,
+            dtype=model_dtype) as hf_model:
+        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+
+    print("VLLM:", vllm_dtype, vllm_main_score)
+    print("SentenceTransformer:", model_dtype, st_main_score)
+    print("Difference:", st_main_score - vllm_main_score)
+
+    assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL)
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the classification outputs of HF and vLLM models.
-
-Run `pytest tests/models/test_cls_models.py`.
-"""
 import pytest
 import torch
 from transformers import AutoModelForSequenceClassification
@@ -19,7 +15,7 @@ from vllm.platforms import current_platform
 )
 @pytest.mark.parametrize("dtype",
                         ["half"] if current_platform.is_rocm() else ["float"])
-def test_classification_models(
+def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,