Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -9,7 +9,7 @@ import vllm.lora.ops.torch_ops as torch_ops
 import vllm.lora.ops.triton_ops as triton_ops
 from vllm.lora.ops.triton_ops import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 from .utils import PunicaTensors, assert_close, generate_data_for_nslices

@@ -395,7 +395,7 @@ def test_kernels(
    Tests LoRA kernels.
    """
    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    if op_type == "shrink":
        check_lora_shrink_kernel(
@@ -447,7 +447,7 @@ def test_kernels_hidden_size(
    Tests SGMV and LoRA kernels.
    """
    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    if op_type == "shrink":
        check_lora_shrink_kernel(

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -2,10 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass

+import os
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import BeamSearchParams
+from ..utils import models_path_prefix


 @dataclass
@@ -14,9 +16,12 @@ class TestConfig:
    lora_path: str
    max_num_seqs: int = 2
    max_loras: int = 2
-    max_lora_rank: int = 16
-    max_model_len: int = 4096
+    max_lora_rank: int = 32
+    enable_tower_connector_lora: bool = False
+    max_model_len: int = 8192
+    gpu_memory_utilization: float = 0.85
    mm_processor_kwargs: dict[str, int] | None = None
+    mm_processor_cache_gb: float = 4

    def __post_init__(self):
        if self.mm_processor_kwargs is None:
@@ -48,8 +53,11 @@ class Qwen2VLTester:
            enable_lora=True,
            max_loras=self.config.max_loras,
            max_lora_rank=self.config.max_lora_rank,
+            enable_tower_connector_lora=self.config.enable_tower_connector_lora,
            trust_remote_code=True,
+            gpu_memory_utilization=self.config.gpu_memory_utilization,
            mm_processor_kwargs=self.config.mm_processor_kwargs,
+            mm_processor_cache_gb=self.config.mm_processor_cache_gb,
            max_model_len=self.config.max_model_len,
        )

@@ -58,6 +66,7 @@ class Qwen2VLTester:
        images: list[ImageAsset],
        expected_outputs: list[str],
        lora_id: int | None = None,
+        lora_name: str | None = None,
        temperature: float = 0,
        max_tokens: int = 5,
    ):
@@ -73,10 +82,11 @@ class Qwen2VLTester:
            for asset in images
        ]

-        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        lora_request = LoRARequest(
+            lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
+        )
        outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
        generated_texts = [output.outputs[0].text.strip() for output in outputs]
-
        # Validate outputs
        for generated, expected in zip(generated_texts, expected_outputs):
            assert expected.startswith(generated), (
@@ -127,6 +137,22 @@ EXPECTED_OUTPUTS = [
    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
 ]

+EXPECTED_OUTPUTS_LANGUAGE = [
+    "A stop sign is shown in an Asian city, with buildings and a car in the "
+    "background.",
+    "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
+]
+
+EXPECTED_OUTPUTS_VISION = [
+    "A stop sign in front of oriental buildings.",
+    "A tree with pink flowers in front of it and a blue sky behind the flowers.",
+]
+
+EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
+    "A stop sign is located on the street of a Chinese neighborhood.",
+    "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
+]
+
 # NOTE - beam search .text contains the whole text
 EXPECTED_BEAM_SEARCH_OUTPUTS = [
    [
@@ -137,6 +163,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [

 QWEN2VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
 QWEN25VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
+QWEN3VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen3-VL-4B-Instruct")


 def test_qwen2vl_lora(qwen2vl_lora_files):
@@ -175,3 +202,99 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
+
+
+def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
+    config = TestConfig(
+        model_path=QWEN25VL_MODEL_PATH,
+        lora_path=qwen25vl_vision_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS,
+            lora_id=lora_id,
+        )
+
+
+def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
+    config = TestConfig(
+        model_path=QWEN3VL_MODEL_PATH,
+        lora_path=qwen3vl_vision_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS,
+            lora_id=lora_id,
+        )
+
+
+def test_qwen2vl_multiple_lora_types(
+    qwen2vl_language_lora_files,
+    qwen2vl_vision_tower_connector_lora_files,
+    qwen2vl_vision_tower_lora_files,
+):
+    """
+    Test multiple LoRA adapter types (language, vision tower + connector,
+    vision tower only) using the same LLM instance to verify mm_encoder_cache
+    behavior with different LoRA requests.
+
+    By reusing the same LLM instance across different LoRA requests, we ensure that
+    the multimodal encoder cache correctly manages state transitions between
+    language-only and vision-enabled LoRA adapters.
+    """
+    config = TestConfig(
+        model_path=QWEN2VL_MODEL_PATH,
+        # We'll override the lora_path for each specific test, but need to provide
+        # an initial path for initialization
+        lora_path=qwen2vl_language_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+
+    # Test 1: Language-only LoRA adapter
+    tester.config.lora_path = qwen2vl_language_lora_files
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_LANGUAGE,
+            lora_id=lora_id,
+            lora_name="language_only",
+        )
+
+    # Test 2: Vision tower + connector LoRA adapter
+    tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files
+    for lora_id in [3, 4]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION,
+            lora_id=lora_id,
+            lora_name="vision_tower_connector",
+        )
+
+    # Test 3: Vision tower only LoRA adapter (no connector)
+    tester.config.lora_path = qwen2vl_vision_tower_lora_files
+    for lora_id in [5, 6]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
+            lora_id=lora_id,
+            lora_name="vision_tower",
+        )
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -3,7 +3,7 @@

 from collections import OrderedDict
 from typing import NamedTuple
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch

 import pytest
 from huggingface_hub.utils import HfHubHTTPError
@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
    # Hugging Face model identifier with download error
    path = "org/repo"
    mock_exist.return_value = False
-    mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
+    mock_snapshot_download.side_effect = HfHubHTTPError(
+        "failed to query model info",
+        response=MagicMock(),
+    )
    assert get_adapter_absolute_path(path) == path
--- a/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
+++ b/tests/model_executor/model_loader/runai_streamer_loader/conftest.py
@@ -29,11 +29,7 @@ class RunaiDummyExecutor(UniProcExecutor):
            is_driver_worker=is_driver_worker,
        )

-        wrapper_kwargs = {
-            "vllm_config": self.vllm_config,
-        }
-
-        self.driver_worker = WorkerWrapperBase(**wrapper_kwargs)
+        self.driver_worker = WorkerWrapperBase()

        self.collective_rpc("init_worker", args=([worker_rpc_kwargs],))
        self.collective_rpc("init_device")
--- a/tests/model_executor/model_loader/tensorizer_loader/conftest.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
@@ -67,7 +67,7 @@ def assert_from_collective_rpc(engine: LLM, closure: Callable, closure_kwargs: d
 class DummyExecutor(UniProcExecutor):
    def _init_executor(self) -> None:
        """Initialize the worker and load the model."""
-        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
+        self.driver_worker = WorkerWrapperBase(rpc_rank=0)
        distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
        local_rank = 0
        # set local rank as the device index if specified

--- a/tests/model_executor/test_eagle_quantization.py
+++ b/tests/model_executor/test_eagle_quantization.py
@@ -55,7 +55,7 @@ def test_get_draft_quant_config_without_draft_model():

 @torch.inference_mode()
 @pytest.mark.parametrize("device", DEVICES)
-def test_fc_layer_quant_config_usage(dist_init, device) -> None:
+def test_fc_layer_quant_config_usage(default_vllm_config, dist_init, device) -> None:
    import torch

    from vllm.model_executor.layers.linear import ReplicatedLinear

--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -5,12 +5,8 @@ import os

 import pytest

-from vllm.model_executor.layers.pooler import (
-    CLSPool,
-    DispatchPooler,
-    MeanPool,
-    PoolingType,
-)
+from vllm.model_executor.layers.pooler import DispatchPooler
+from vllm.model_executor.layers.pooler.seqwise import CLSPool, MeanPool
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -51,8 +47,9 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
        assert model_config.encoder_config["do_lower_case"]

        # asserts on the pooling config files
-        assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
-        assert model_config.pooler_config.normalize
+        assert model_config.pooler_config.seq_pooling_type == "CLS"
+        assert model_config.pooler_config.tok_pooling_type == "ALL"
+        assert model_config.pooler_config.use_activation

        # asserts on the tokenizer loaded
        assert model_config.tokenizer == os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5")
@@ -95,8 +92,9 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
        assert not model_config.encoder_config["do_lower_case"]

        # asserts on the pooling config files
-        assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
-        assert model_config.pooler_config.normalize
+        assert model_config.pooler_config.seq_pooling_type == "MEAN"
+        assert model_config.pooler_config.tok_pooling_type == "ALL"
+        assert model_config.pooler_config.use_activation

        # asserts on the tokenizer loaded
        assert model_config.tokenizer == os.path.join(models_path_prefix, "intfloat/multilingual-e5-base")

--- a/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
+++ b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
+[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]]
\ No newline at end of file
--- a/tests/models/language/generation/conftest.py
+++ b/tests/models/language/generation/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM language generation tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_sessionstart(session):
+    """Configure ROCm-specific settings before test session starts.
+
+    Returns immediately unless ``current_platform.is_rocm()`` is true. On
+    ROCm it turns off the flash and memory-efficient SDP backends, turns on
+    the math SDP backend, and emits a ``UserWarning`` describing the change.
+
+    Args:
+        session: The session object pytest passes to this hook; it is not
+            used by this function.
+    """
+    if not current_platform.is_rocm():
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+    # Surface the backend change so it is visible in the test output.
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -12,6 +12,11 @@ from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix

+# Models that require embedding scaling for prompt_embeds test
+EMBED_SCALING_MODELS = {
+    "openbmb/MiniCPM4.1-8B",
+}
+
 # This list contains the model that are using AITER kernel.
 # Skip model that are not using AITER tests.
 # When more AITER kernels are added, this list will not be
@@ -66,8 +71,8 @@ AITER_MODEL_LIST = [
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
-            os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
-            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
+            os.path.join(models_path_prefix, "openbmb/MiniCPM4.1-8B"),  # minicpm
+            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=48)],
        ),
        pytest.param(
            os.path.join(models_path_prefix, "facebook/opt-125m"),  # opt
@@ -137,16 +142,20 @@ def test_models(

        prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None

-        prompt_token_ids = []
        for prompt in example_prompts:
            token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
                hf_model.model.device
            )
-            prompt_token_ids.append(token_ids)
            if prompt_embeds is not None:
-                prompt_embeds.append(
-                    hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
-                )
+                embed = hf_model.model.get_input_embeddings()(token_ids)
+
+                # MiniCPM models apply scale_emb to embeddings internally.
+                # vLLM expects pre-scaled embeddings when using inputs_embeds.
+                if model in EMBED_SCALING_MODELS:
+                    config = hf_model.model.config
+                    embed = embed * config.scale_emb
+
+                prompt_embeds.append(embed.squeeze(0))

    with vllm_runner(
        model,

--- a/tests/models/language/generation/test_grok.py
+++ b/tests/models/language/generation/test_grok.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from ...utils import dummy_hf_overrides
+
+MODELS = ["xai-org/grok-2"]
+
+
+def _grok2_dummy_overrides(hf_config):
+    """Apply dummy overrides and small model dimensions to a Grok-2 config.
+
+    Passes ``hf_config`` through ``dummy_hf_overrides`` with
+    ``model_arch="Grok1ForCausalLM"``, then updates the object returned by
+    ``get_text_config()`` with the reduced hidden, intermediate, MoE and
+    attention-head sizes listed below.
+
+    Args:
+        hf_config: The Hugging Face config object to override.
+
+    Returns:
+        The config returned by ``dummy_hf_overrides`` after its text config
+        has been updated.
+    """
+    hf_config = dummy_hf_overrides(hf_config, model_arch="Grok1ForCausalLM")
+    text_config = hf_config.get_text_config()
+    # NOTE(review): presumably these sizes are chosen only to keep the
+    # dummy-weight model small — confirm they satisfy the model's constraints.
+    text_config.update(
+        {
+            "hidden_size": 256,
+            "intermediate_size": 512,
+            "moe_intermediate_size": 256,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 2,
+            "head_dim": 64,
+        }
+    )
+    return hf_config
+
+
+@pytest.mark.parametrize("model", MODELS)
+def test_dummy_generate(vllm_runner, monkeypatch, model: str) -> None:
+    """Smoke-test greedy generation for each model in ``MODELS``.
+
+    Loads the model with ``load_format="dummy"`` and the config overrides from
+    ``_grok2_dummy_overrides``, generates a single token for one prompt, and
+    checks that the returned token ids are longer than the encoded prompt and
+    that an output string was produced.
+    """
+    with monkeypatch.context() as m:
+        # NOTE(review): presumably required because ``hf_overrides`` is a
+        # callable that has to be serialized for the engine — confirm.
+        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        with vllm_runner(
+            model,
+            load_format="dummy",
+            max_model_len=128,
+            hf_overrides=_grok2_dummy_overrides,
+            enforce_eager=True,
+        ) as llm:
+            prompt = "Hello from Grok-2"
+            tokenizer = llm.get_llm().get_tokenizer()
+            prompt_len = len(tokenizer.encode(prompt))
+            outputs = llm.generate_greedy([prompt], max_tokens=1)
+            output_ids, output_str = outputs[0]
+            # NOTE(review): this comparison assumes ``output_ids`` contains the
+            # prompt tokens followed by the generated token — confirm against
+            # ``generate_greedy``.
+            assert len(output_ids) > prompt_len
+            assert output_str is not None
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -62,6 +62,19 @@ def test_phimoe_routing_function():
        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])


+# There is a known issue that triggers `AttributeError: 'DynamicCache'
+# object has no attribute 'seen_tokens'` when running:
+# `tests/models/language/generation/test_phimoe.py::test_models
+#   [5-64-bfloat16-microsoft/Phi-3.5-MoE-instruct]`
+# This issue is being investigated and tracked in:
+#   https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58
+# It is platform-agnostic. Therefore, we skip this test on all platforms for now.
+@pytest.mark.skip(
+    reason="Skipping due to known issue: "
+    "'DynamicCache' object has no attribute 'seen_tokens'. See: "
+    "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58 "
+    "for details.",
+)
 @pytest.mark.skipif(
    condition=current_platform.is_cpu(),
    reason="This test takes a lot time to run on CPU, "

--- a/tests/models/language/pooling/conftest.py
+++ b/tests/models/language/pooling/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM language pooling tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_sessionstart(session):
+    """Configure ROCm-specific settings before test session starts.
+
+    Returns immediately unless ``current_platform.is_rocm()`` is true. On
+    ROCm it turns off the flash and memory-efficient SDP backends, turns on
+    the math SDP backend, sets the float32 matmul precision to ``"high"``,
+    and emits a ``UserWarning`` describing the SDP change.
+
+    Args:
+        session: The session object pytest passes to this hook; it is not
+            used by this function.
+    """
+    if not current_platform.is_rocm():
+        return
+
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+    # NOTE(review): the warning below does not mention this precision change;
+    # presumably it is also meant to keep results close to the HF reference —
+    # confirm.
+    torch.set_float32_matmul_precision("high")
+    warnings.warn(
+        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+        "to avoid HuggingFace Transformers accuracy issues",
+        UserWarning,
+        stacklevel=1,
+    )
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -61,7 +61,7 @@ def test_models(
    vllm_extra_kwargs = {}
    if model == os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"):
        vllm_extra_kwargs["pooler_config"] = PoolerConfig(
-            pooling_type="MEAN", normalize=False
+            seq_pooling_type="MEAN", normalize=False
        )

    max_model_len: int | None = 512

--- a/tests/models/language/pooling/test_mm_classifier_conversion.py
+++ b/tests/models/language/pooling/test_mm_classifier_conversion.py
@@ -88,7 +88,7 @@ def test_gemma_multimodal(
        convert="classify",
        load_format="auto",
        hf_overrides=update_config,
-        pooler_config=PoolerConfig(pooling_type="LAST"),
+        pooler_config=PoolerConfig(seq_pooling_type="LAST"),
        max_model_len=512,
        enforce_eager=True,
        tensor_parallel_size=1,

--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
    ) as vllm_model:
        wo_normalize = torch.tensor(vllm_model.embed(example_prompts))

@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
    ) as vllm_model:
        w_normalize = torch.tensor(vllm_model.embed(example_prompts))

@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
    ) as vllm_model:
        wo_normalize = vllm_model.token_embed(example_prompts)

@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
        model,
        max_model_len=512,
        dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
    ) as vllm_model:
        w_normalize = vllm_model.token_embed(example_prompts)


--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from typing import TYPE_CHECKING

 import pytest
 import torch
@@ -9,7 +11,18 @@ from transformers import AutoModel
 from vllm.platforms import current_platform

 from ....conftest import HfRunner
-from ...utils import check_transformers_version
+from ....utils import VLLM_PATH
+from ...registry import HF_EXAMPLE_MODELS
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
+
+
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+FIXTURE_REWARD_RESULT = {
+    "Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH / "qwen2_5_math_prm_reward_step.json",
+}


 @pytest.fixture
@@ -60,6 +73,16 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
    return hf_model


+def dump_reward_outputs(outputs: list[list[float]], filename: "StrPath"):
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(outputs, f)
+
+
+def load_reward_outputs(filename: "StrPath") -> list[list[float]]:
+    with open(filename, encoding="utf-8") as f:
+        return json.load(f)
+
+
 @pytest.mark.parametrize(
    "model",
    [
@@ -77,9 +100,8 @@ def test_prm_models(
    model: str,
    dtype: str,
 ) -> None:
-    check_transformers_version(
-        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
-    )
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_transformers_version(on_fail="skip")

    if current_platform.is_cpu():
        pytest.skip("CPU only supports V1")
@@ -91,9 +113,46 @@ def test_prm_models(
        hf_model = step_reward_patch_hf_model(hf_model)
        hf_outputs = hf_model.reward(math_step_prompts)

+    dump_reward_outputs(
+        hf_outputs,
+        FIXTURE_REWARD_RESULT[model],
+    )
+
    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = torch.tensor(hf_output).float()
        vllm_output = torch.tensor(vllm_output).float()

        assert torch.allclose(hf_output, vllm_output, 1.5e-2)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "Qwen/Qwen2.5-Math-PRM-7B",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_prm_models_with_golden_outputs(
+    vllm_runner,
+    math_step_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    if not FIXTURE_REWARD_RESULT.get(model):
+        pytest.skip(f"No available golden outputs for {model}.")
+
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.reward(math_step_prompts)
+
+    golden_outputs = load_reward_outputs(FIXTURE_REWARD_RESULT[model])
+
+    # check logits difference
+    for golden_output, vllm_output in zip(golden_outputs, vllm_outputs):
+        golden_output = torch.tensor(golden_output).float()
+        vllm_output = torch.tensor(vllm_output).float()
+
+        assert torch.allclose(golden_output, vllm_output, 1.5e-2)
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -5,6 +5,7 @@ import torch
 from transformers import AutoModelForTokenClassification

 from tests.models.utils import softmax
+from vllm.platforms import current_platform


 @pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
@@ -21,8 +22,17 @@ def test_bert_models(
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

+    # Use eager attention on ROCm to avoid HF Transformers flash attention
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    hf_model_kwargs = {}
+    if current_platform.is_rocm():
+        hf_model_kwargs["attn_implementation"] = "eager"
+
    with hf_runner(
-        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+        model,
+        dtype=dtype,
+        auto_cls=AutoModelForTokenClassification,
+        model_kwargs=hf_model_kwargs,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
@@ -34,9 +44,9 @@ def test_bert_models(

    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
-        assert torch.allclose(hf_output, vllm_output, 1e-2)
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
+        torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)


 @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@@ -52,8 +62,17 @@ def test_modernbert_models(
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

+    # Use eager attention on ROCm to avoid HF Transformers flash attention
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    hf_model_kwargs = {}
+    if current_platform.is_rocm():
+        hf_model_kwargs["attn_implementation"] = "eager"
+
    with hf_runner(
-        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+        model,
+        dtype=dtype,
+        auto_cls=AutoModelForTokenClassification,
+        model_kwargs=hf_model_kwargs,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
@@ -65,9 +84,9 @@ def test_modernbert_models(

    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
-        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
+        torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)


 @pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
@@ -96,6 +115,6 @@ def test_auto_conversion(

    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import tempfile
-
 import mteb
 import numpy as np
-import requests
 import torch
 from mteb.models import ModelMeta
 from mteb.types import Array
@@ -14,7 +11,6 @@ from torch.utils.data import DataLoader
 import tests.ci_envs as ci_envs
 from tests.models.utils import (
    EmbedModelInfo,
-    RerankModelInfo,
    check_embeddings_close,
    get_vllm_extra_kwargs,
 )
@@ -23,14 +19,10 @@ from tests.models.utils import (
 # - Model implementation and minor changes in tensor dtype
 #   results in differences less than 1e-4
 # - Different model results in differences more than 1e-3
-# 1e-4 is a good tolerance threshold
+# 5e-4 is a good tolerance threshold
 MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 1e-4
+MTEB_EMBED_TOL = 5e-4

-# See #19344
-MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["eng"]
-MTEB_RERANK_TOL = 2e-3

 _empty_model_meta = ModelMeta(
    loader=None,
@@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta(
 )


-class VllmMtebEncoder(mteb.EncoderProtocol):
+class MtebEmbedMixin(mteb.EncoderProtocol):
    mteb_model_meta = _empty_model_meta

-    def __init__(self, vllm_model):
-        self.llm = vllm_model
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-        outputs = self.llm.embed(sentences, use_tqdm=False)
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
    def similarity(
        self,
        embeddings1: np.ndarray,
@@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
        return sim


-class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta
-
+class VllmMtebEncoder(MtebEmbedMixin):
    def __init__(self, vllm_model):
        self.llm = vllm_model
        self.rng = np.random.default_rng(seed=42)

-    def predict(
+    def encode(
        self,
-        inputs1: DataLoader[mteb.types.BatchedInput],
-        inputs2: DataLoader[mteb.types.BatchedInput],
+        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
-        queries = [text for batch in inputs1 for text in batch["text"]]
-        corpus = [text for batch in inputs2 for text in batch["text"]]
-
-        outputs = self.llm.score(
-            queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
-        )
-        scores = np.array(outputs)
-        return scores
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        sentences = [text for batch in inputs for text in batch["text"]]
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+        outputs = self.llm.embed(sentences, use_tqdm=False)
+        embeds = np.array(outputs)
+        embeds = embeds[np.argsort(r)]
+        return embeds


-class OpenAIClientMtebEncoder(VllmMtebEncoder):
+class OpenAIClientMtebEncoder(MtebEmbedMixin):
    def __init__(self, model_name: str, client):
        self.model_name = model_name
        self.client = client
@@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
        return embeds


-class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta
-
-    def __init__(self, model_name: str, url):
-        self.model_name = model_name
-        self.url = url
-        self.rng = np.random.default_rng(seed=42)
-
-    def predict(
-        self,
-        inputs1: DataLoader[mteb.types.BatchedInput],
-        inputs2: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        queries = [text for batch in inputs1 for text in batch["text"]]
-        full_corpus = [text for batch in inputs2 for text in batch["text"]]
-
-        outputs = []
-        for query, corpus in zip(queries, full_corpus):
-            outputs.append(self.get_score(query, corpus))
-
-        scores = np.array(outputs)
-        return scores
-
-    def get_score(self, query, corpus):
-        response = requests.post(
-            self.url,
-            json={
-                "model": self.model_name,
-                "text_1": query,
-                "text_2": corpus,
-                "truncate_prompt_tokens": -1,
-            },
-        ).json()
-        return response["data"][0]["score"]
-
-
-class RerankClientMtebEncoder(ScoreClientMtebEncoder):
-    def get_score(self, query, corpus):
-        response = requests.post(
-            self.url,
-            json={
-                "model": self.model_name,
-                "query": query,
-                "documents": [corpus],
-                "truncate_prompt_tokens": -1,
-            },
-        ).json()
-        return response["results"][0]["relevance_score"]
-
-
 def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
    tasks = mteb.get_tasks(tasks=tasks)
    results = mteb.evaluate(
@@ -243,12 +161,24 @@ def mteb_test_embed_models(
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
+        # Confirm whether the important configs in model_config are correct.
+        pooler_config = model_config.pooler_config
+        if model_info.seq_pooling_type is not None:
+            assert pooler_config.seq_pooling_type == model_info.seq_pooling_type
+        if model_info.tok_pooling_type is not None:
+            assert pooler_config.tok_pooling_type == model_info.tok_pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )

        vllm_main_score = run_mteb_embed_task(
            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
@@ -299,117 +229,3 @@ def mteb_test_embed_models(
    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol
-
-
-def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
-    with tempfile.TemporaryDirectory() as prediction_folder:
-        bm25s = mteb.get_model("bm25s")
-        eval_splits = ["test"]
-
-        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
-            tasks=tasks, languages=languages, eval_splits=eval_splits
-        )
-
-        mteb.evaluate(
-            bm25s,
-            mteb_tasks,
-            prediction_folder=prediction_folder,
-            show_progress_bar=False,
-            # don't save results for test runs
-            cache=None,
-            overwrite_strategy="always",
-        )
-
-        second_stage_tasks = []
-        for task in mteb_tasks:
-            second_stage_tasks.append(
-                task.convert_to_reranking(
-                    prediction_folder,
-                    top_k=10,
-                )
-            )
-
-        results = mteb.evaluate(
-            cross_encoder,
-            second_stage_tasks,
-            show_progress_bar=False,
-            cache=None,
-        )
-        main_score = results[0].scores["test"][0]["main_score"]
-    return main_score
-
-
-def mteb_test_rerank_models_hf(
-    hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
-):
-    with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
-
-        st_main_score = run_mteb_rerank(
-            hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
-        )
-        st_dtype = next(hf_model.model.model.parameters()).dtype
-    return st_main_score, st_dtype
-
-
-def mteb_test_rerank_models(
-    hf_runner,
-    vllm_runner,
-    model_info: RerankModelInfo,
-    vllm_extra_kwargs=None,
-    hf_model_callback=None,
-    vllm_mteb_encoder=VllmMtebCrossEncoder,
-    atol=MTEB_RERANK_TOL,
-):
-    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
-
-    with vllm_runner(
-        model_info.name,
-        runner="pooling",
-        max_model_len=None,
-        max_num_seqs=8,
-        **vllm_extra_kwargs,
-    ) as vllm_model:
-        model_config = vllm_model.llm.llm_engine.model_config
-
-        # Confirm whether vllm is using the correct architecture
-        if model_info.architecture:
-            assert model_info.architecture in model_config.architectures
-
-        # Score API is only enabled for num_labels == 1
-        assert model_config.hf_config.num_labels == 1
-
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
-        vllm_main_score = run_mteb_rerank(
-            vllm_mteb_encoder(vllm_model),
-            tasks=MTEB_RERANK_TASKS,
-            languages=MTEB_RERANK_LANGS,
-        )
-        vllm_dtype = model_config.dtype
-        head_dtype = model_config.head_dtype
-
-    # Accelerate mteb test by setting
-    # SentenceTransformers mteb score to a constant
-    if model_info.mteb_score is None:
-        st_main_score, st_dtype = mteb_test_rerank_models_hf(
-            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
-        )
-    else:
-        st_main_score = model_info.mteb_score
-        st_dtype = "Constant"
-
-    print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
-    print("SentenceTransformers:", st_dtype, st_main_score)
-    print("Difference:", st_main_score - vllm_main_score)
-
-    # We are not concerned that the vllm mteb results are better
-    # than SentenceTransformers, so we only perform one-sided testing.
-    assert st_main_score - vllm_main_score < atol
--- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import mteb
+import numpy as np
+import requests
+import torch
+from mteb.models import ModelMeta
+from torch.utils.data import DataLoader
+
+from tests.conftest import HfRunner
+from tests.models.utils import (
+    RerankModelInfo,
+    get_vllm_extra_kwargs,
+)
+
+# Rerank benchmark configuration.
+# See #19344
+MTEB_RERANK_TASKS = ["NFCorpus"]
+MTEB_RERANK_LANGS = ["eng"]
+# Default `atol` of mteb_test_rerank_models: the vLLM main score must not be
+# lower than the reference main score by this much or more.
+MTEB_RERANK_TOL = 2e-3
+
+# Directory holding score chat templates: five directory levels above this
+# file, then "examples/pooling/score/template".
+template_home = (
+    Path(__file__).parent.parent.parent.parent.parent
+    / "examples/pooling/score/template"
+)
+
+# Placeholder ModelMeta exposed as `mteb_model_meta` by the cross-encoder
+# wrappers in this module; almost every descriptive field is left None/empty.
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
+
+class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
+    """Shared base for the cross-encoder wrappers defined in this module.
+
+    It only pins ``mteb_model_meta`` to the placeholder metadata; the
+    subclasses in this file add their own ``predict`` implementation.
+    """
+
+    mteb_model_meta = _empty_model_meta
+
+
+class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
+    """Cross-encoder wrapper that scores text pairs with ``vllm_model.score``."""
+
+    def __init__(self, vllm_model):
+        self.llm = vllm_model
+        # Fixed seed so the shuffling in predict() is reproducible.
+        self.rng = np.random.default_rng(seed=42)
+        # Optional template; None when the runner object carries none.
+        self.chat_template: str | None = getattr(vllm_model, "chat_template", None)
+
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Score the i-th text of ``inputs1`` against the i-th text of ``inputs2``.
+
+        The pairs are submitted in a shuffled order and the scores are put
+        back into the original order before being returned.
+        """
+        # Flatten the batched "text" fields of both loaders.
+        queries = []
+        for batch in inputs1:
+            queries.extend(batch["text"])
+        corpus = []
+        for batch in inputs2:
+            corpus.extend(batch["text"])
+
+        # Hoping to discover potential scheduling
+        # issues by randomizing the order.
+        order = self.rng.permutation(len(queries))
+        shuffled_queries = [queries[idx] for idx in order]
+        shuffled_corpus = [corpus[idx] for idx in order]
+
+        raw_scores = self.llm.score(
+            shuffled_queries,
+            shuffled_corpus,
+            truncate_prompt_tokens=-1,
+            use_tqdm=False,
+            chat_template=self.chat_template,
+        )
+
+        # argsort of a permutation is its inverse: undo the shuffle.
+        restore = np.argsort(order)
+        return np.array(raw_scores)[restore]
+
+
+class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
+    """Cross-encoder wrapper that scores pairs by POSTing them to ``url``.
+
+    ``mteb_model_meta`` is inherited from ``MtebCrossEncoderMixin``.
+    Subclasses can override ``get_score`` to use a different request and
+    response layout.
+    """
+
+    def __init__(self, model_name: str, url):
+        self.model_name = model_name
+        self.url = url
+
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Return one score per (query, document) pair, in input order.
+
+        Pairs are formed positionally from the flattened "text" fields of
+        the two loaders; one request is issued per pair.
+        """
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        full_corpus = [text for batch in inputs2 for text in batch["text"]]
+
+        scores = [
+            self.get_score(query, corpus)
+            for query, corpus in zip(queries, full_corpus)
+        ]
+        return np.array(scores)
+
+    def get_score(self, query, corpus):
+        """POST a single pair and return the ``data[0].score`` field.
+
+        Raises:
+            requests.HTTPError: if the server answers with a 4xx/5xx status.
+        """
+        response = requests.post(
+            self.url,
+            json={
+                "model": self.model_name,
+                "text_1": query,
+                "text_2": corpus,
+                "truncate_prompt_tokens": -1,
+            },
+        )
+        # Surface the HTTP status instead of failing later with an opaque
+        # KeyError / JSON decode error on an error payload.
+        response.raise_for_status()
+        return response.json()["data"][0]["score"]
+
+
+class RerankClientMtebEncoder(ScoreClientMtebEncoder):
+    """Variant of the score client that uses a query/documents request body."""
+
+    def get_score(self, query, corpus):
+        """POST one query with a single document; return its relevance score.
+
+        Raises:
+            requests.HTTPError: if the server answers with a 4xx/5xx status.
+        """
+        response = requests.post(
+            self.url,
+            json={
+                "model": self.model_name,
+                "query": query,
+                "documents": [corpus],
+                "truncate_prompt_tokens": -1,
+            },
+        )
+        # Surface the HTTP status instead of failing later with an opaque
+        # KeyError / JSON decode error on an error payload.
+        response.raise_for_status()
+        return response.json()["results"][0]["relevance_score"]
+
+
+class HFMtebCrossEncoder(MtebCrossEncoderMixin, HfRunner):
+    """``HfRunner`` set up as a cross-encoder and exposed through ``predict``."""
+
+    # When set, pairs are rendered through this template and classified;
+    # otherwise the raw (query, document) pairs go to HfRunner.predict.
+    chat_template: str | None = None
+
+    def __init__(self, model_name: str, dtype: str = "auto", **kwargs: Any) -> None:
+        HfRunner.__init__(
+            self,
+            model_name=model_name,
+            is_cross_encoder=True,
+            dtype=dtype,
+            **kwargs,
+        )
+
+    @torch.no_grad
+    def predict(
+        self,
+        inputs1: DataLoader[mteb.types.BatchedInput],
+        inputs2: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Score positionally paired texts from the two loaders."""
+        queries = [text for batch in inputs1 for text in batch["text"]]
+        corpus = [text for batch in inputs2 for text in batch["text"]]
+        pairs = list(zip(queries, corpus))
+
+        # No template: hand the raw pairs to the runner's own predict().
+        if self.chat_template is None:
+            pair_scores = HfRunner.predict(self, pairs, show_progress_bar=False)
+            return pair_scores.cpu().numpy()
+
+        # Template path: render each pair to a prompt string, then classify.
+        tokenizer = self.model.tokenizer
+        prompts = [
+            tokenizer.apply_chat_template(
+                conversation=[
+                    {"role": "query", "content": query},
+                    {"role": "document", "content": document},
+                ],
+                tools=None,
+                chat_template=self.chat_template,
+                tokenize=False,
+            )
+            for query, document in pairs
+        ]
+        classified = HfRunner.classify(self, prompts)
+        return np.array(classified).squeeze(-1)
+
+
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+    """Run a two-stage rerank evaluation and return its main score.
+
+    Stage one evaluates the "bm25s" model on the requested tasks and writes
+    its predictions to a temporary folder. Stage two converts each task to a
+    reranking task over the top 10 predictions and evaluates ``cross_encoder``
+    on them. The returned value is the first result's "test" ``main_score``.
+    """
+    with tempfile.TemporaryDirectory() as prediction_folder:
+        first_stage_model = mteb.get_model("bm25s")
+
+        retrieval_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+            tasks=tasks, languages=languages, eval_splits=["test"]
+        )
+
+        mteb.evaluate(
+            first_stage_model,
+            retrieval_tasks,
+            prediction_folder=prediction_folder,
+            show_progress_bar=False,
+            # don't save results for test runs
+            cache=None,
+            overwrite_strategy="always",
+        )
+
+        rerank_tasks = [
+            task.convert_to_reranking(prediction_folder, top_k=10)
+            for task in retrieval_tasks
+        ]
+
+        rerank_results = mteb.evaluate(
+            cross_encoder,
+            rerank_tasks,
+            show_progress_bar=False,
+            cache=None,
+        )
+        return rerank_results[0].scores["test"][0]["main_score"]
+
+
+def mteb_test_rerank_models(
+    vllm_runner,
+    model_info: RerankModelInfo,
+    hf_runner=HFMtebCrossEncoder,
+    vllm_extra_kwargs=None,
+    vllm_mteb_encoder=VllmMtebCrossEncoder,
+    atol=MTEB_RERANK_TOL,
+):
+    """Compare a model's vLLM rerank main score against a reference score.
+
+    The model is loaded through ``vllm_runner`` with ``runner="pooling"``,
+    its configuration is checked against the expectations in ``model_info``,
+    and its rerank main score is computed with ``run_mteb_rerank``. The
+    reference is ``model_info.mteb_score`` when set; otherwise it is computed
+    by running the same benchmark through ``hf_runner``.
+
+    Args:
+        vllm_runner: Context-manager factory yielding the vLLM model wrapper.
+        model_info: Model name plus expected configuration values; fields
+            left as None are not checked.
+        hf_runner: Context-manager factory for the reference model, called
+            with the model name and ``dtype``.
+        vllm_extra_kwargs: Passed to ``get_vllm_extra_kwargs`` together with
+            ``model_info``; the result is forwarded to ``vllm_runner``.
+        vllm_mteb_encoder: Callable wrapping the vLLM model into the object
+            handed to ``run_mteb_rerank``.
+        atol: The assertion requires reference minus vLLM score to be
+            strictly below this value.
+
+    Raises:
+        AssertionError: if a configuration check fails or the vLLM score is
+            lower than the reference score by ``atol`` or more.
+    """
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
+
+    # Maybe load chat_template.
+    chat_template: str | None = None
+    if model_info.chat_template_name is not None:
+        chat_template = (template_home / model_info.chat_template_name).read_text()
+
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=None,
+        max_num_seqs=8,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
+        model_config = vllm_model.llm.llm_engine.model_config
+        # Stored on the runner object; VllmMtebCrossEncoder reads it back
+        # with getattr(vllm_model, "chat_template", None).
+        vllm_model.chat_template = chat_template
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert model_info.architecture in model_config.architectures
+
+        # Score API is only enabled for num_labels == 1
+        assert model_config.hf_config.num_labels == 1
+
+        # Confirm whether the important configs in model_config are correct.
+        # Each expectation is optional: None in model_info skips that check.
+        pooler_config = model_config.pooler_config
+        if model_info.seq_pooling_type is not None:
+            assert pooler_config.seq_pooling_type == model_info.seq_pooling_type
+        if model_info.tok_pooling_type is not None:
+            assert pooler_config.tok_pooling_type == model_info.tok_pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
+        vllm_main_score = run_mteb_rerank(
+            vllm_mteb_encoder(vllm_model),
+            tasks=MTEB_RERANK_TASKS,
+            languages=MTEB_RERANK_LANGS,
+        )
+        # Captured inside the context so they can be printed after it exits.
+        vllm_dtype = model_config.dtype
+        head_dtype = model_config.head_dtype
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model:
+            hf_model.chat_template = chat_template
+            st_main_score = run_mteb_rerank(
+                hf_model,
+                tasks=MTEB_RERANK_TASKS,
+                languages=MTEB_RERANK_LANGS,
+            )
+            st_dtype = next(hf_model.model.model.parameters()).dtype
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
+
+    print("Model:", model_info.name)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
+    print("Difference:", st_main_score - vllm_main_score)
+
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol