Merge tag 'v0.9.2' into v0.9.2-ori

99324e25 · zhuwenwen · cc7f22a8 · a5dd03c1 · cc7f22a8 · 99324e25
Commit 99324e25 authored Jul 12, 2025 by zhuwenwen
20 changed files
--- a/tests/models/language/generation/test_granitemoehybrid.py
+++ b/tests/models/language/generation/test_granitemoehybrid.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from ...utils import check_logprobs_close
-
-# Path of the checkpoints
-MODELS = [
-    "ibm-granite/granite-4.0-tiny-preview",
-]
-
-
-@pytest.mark.skip(
-    reason="Granite 4.0 is not yet available in huggingface transformers")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
-@pytest.mark.parametrize("max_tokens", [64])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_model_equivalence_to_hf_greedy(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -3,12 +3,16 @@

 import pytest

+from tests.models.registry import HF_EXAMPLE_MODELS
 from tests.utils import multi_gpu_test
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import SamplingParams

 from ...utils import check_logprobs_close, check_outputs_equal

+# Mark all tests as hybrid
+pytestmark = pytest.mark.hybrid_model
+
 # NOTE: The first model in each list is taken as the primary model,
 # meaning that it will be used in all tests in this file
 # The rest of the models will only be tested by test_models
@@ -16,25 +20,55 @@ from ...utils import check_logprobs_close, check_outputs_equal
 SSM_MODELS = [
    "state-spaces/mamba-130m-hf",
    "tiiuae/falcon-mamba-tiny-dev",
-    # TODO: Compare to a Mamba2 model. The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups.
-    # See https://github.com/huggingface/transformers/pull/35943
-    # "mistralai/Mamba-Codestral-7B-v0.1",
+    "mistralai/Mamba-Codestral-7B-v0.1",
 ]

 HYBRID_MODELS = [
    "ai21labs/Jamba-tiny-dev",
-    # NOTE: ibm-granite/granite-4.0-tiny-preview are skipped currently as
-    # it is not yet available in huggingface transformers
-    # "ibm-granite/granite-4.0-tiny-preview",
    # NOTE: Running Plamo2 in transformers implementation requires to install
    # causal-conv1d package, which is not listed as a test dependency as it's
    # not compatible with pip-compile.
    "pfnet/plamo-2-1b",
    "Zyphra/Zamba2-1.2B-instruct",
    "hmellor/tiny-random-BambaForCausalLM",
+    "ibm-ai-platform/Bamba-9B-v1",
+    "nvidia/Nemotron-H-8B-Base-8K",
+    "ibm-granite/granite-4.0-tiny-preview",
+    "tiiuae/Falcon-H1-0.5B-Base",
+]
+
+HF_UNSUPPORTED_MODELS = [
+    # The HF transformers implementation of
+    # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
+    # doesn't compare vLLM output with HF output.
+    # See https://github.com/huggingface/transformers/pull/35943
+    "mistralai/Mamba-Codestral-7B-v0.1",
+    # Note: I'm not seeing the same output from vLLM V0 vs. HF transformers
+    # for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1
+    "nvidia/Nemotron-H-8B-Base-8K",
+    # NOTE: Currently the test fails due to HF transformers issue fixed in:
+    # https://github.com/huggingface/transformers/pull/39033
+    # We will enable vLLM test for Granite after next HF transformers release.
+    "ibm-granite/granite-4.0-tiny-preview",
 ]

+V1_SUPPORTED_MODELS = [
+    "mistralai/Mamba-Codestral-7B-v0.1",
+    "ibm-ai-platform/Bamba-9B-v1",
+    "Zyphra/Zamba2-1.2B-instruct",
+    "nvidia/Nemotron-H-8B-Base-8K",
+    "ibm-granite/granite-4.0-tiny-preview",
+    "tiiuae/Falcon-H1-0.5B-Base",
+]
+
+ATTN_BLOCK_SIZES = {
+    "ibm-ai-platform/Bamba-9B-v1": 528,
+    "Zyphra/Zamba2-1.2B-instruct": 80,
+    "nvidia/Nemotron-H-8B-Base-8K": 528,
+    "ibm-granite/granite-4.0-tiny-preview": 400,
+    "tiiuae/Falcon-H1-0.5B-Base": 800,
+}
+
 # Avoid OOM
 MAX_NUM_SEQS = 4

@@ -46,24 +80,67 @@ def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
+    monkeypatch,
    model: str,
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
+
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
    with hf_runner(model) as hf_model:
-        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+        if model not in HF_UNSUPPORTED_MODELS:
+            hf_outputs = hf_model.generate_greedy_logprobs_limit(
+                example_prompts, max_tokens, num_logprobs)
+        else:
+            hf_outputs = None

    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
+        vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    if model in V1_SUPPORTED_MODELS:
+        if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
+            block_size = ATTN_BLOCK_SIZES[model]
+        else:
+            block_size = 16
+
+        with monkeypatch.context() as m:
+            m.setenv("VLLM_USE_V1", "1")
+            if model in HYBRID_MODELS:
+                # required due to reorder_batch behaviour
+                m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
+            with vllm_runner(model,
+                             max_num_seqs=MAX_NUM_SEQS,
+                             enforce_eager=True,
+                             enable_prefix_caching=False,
+                             block_size=block_size) as vllm_model:
+                vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+                    example_prompts, max_tokens, num_logprobs)
+    else:
+        vllm_v1_outputs = None
+
+    if hf_outputs is not None:
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_v0_outputs,
+            name_0="hf",
+            name_1="vllm-v0",
+        )
+
+    if model in V1_SUPPORTED_MODELS:
+        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
+        check_logprobs_close(
+            outputs_0_lst=ref_outputs,
+            outputs_1_lst=vllm_v1_outputs,
+            name_0="hf" if hf_outputs is not None else "vllm-v0",
+            name_1="vllm-v1",
+        )


 @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
@@ -76,6 +153,14 @@ def test_batching(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
+
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        pass
+
    for_loop_outputs = []
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
        for prompt in example_prompts:

--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -10,6 +10,7 @@ import pytest
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
    MistralToolCall, MistralToolParser)
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.transformers_utils.tokenizer import MistralTokenizer

 from ...utils import check_logprobs_close

@@ -318,3 +319,53 @@ def test_mistral_guided_decoding(
                                schema=SAMPLE_JSON_SCHEMA)
        except jsonschema.exceptions.ValidationError:
            pytest.fail("Generated response is not valid with JSON schema")
+
+
+def test_mistral_function_call_nested_json():
+    """Ensure that the function-name regex captures the entire outer-most
+    JSON block, including nested braces."""
+
+    # Create a minimal stub tokenizer that provides the few attributes the
+    # parser accesses (`version` and `get_vocab`).
+    class _StubMistralTokenizer(MistralTokenizer):
+        version = 11  # Satisfy the version check
+
+        def __init__(self):
+            pass  # Intentionally skip MistralTokenizer.__init__
+
+        @staticmethod
+        def get_vocab():
+            # Provide the special TOOL_CALLS token expected by the parser.
+            return {"[TOOL_CALLS]": 0}
+
+    tokenizer = _StubMistralTokenizer()
+    parser = MistralToolParser(tokenizer)
+
+    # Craft a model output featuring nested JSON inside the arguments.
+    args_dict = {
+        "city": "Dallas",
+        "state": "TX",
+        "unit": "fahrenheit",
+        "sub_dict": {
+            "foo": "bar",
+            "inner": {
+                "x": 1,
+                "y": 2
+            }
+        },
+    }
+    # Layout: bot token, then the function name, then the JSON arguments.
+    model_output = (
+        f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
+
+    parsed = parser.extract_tool_calls(model_output, None)
+
+    # Assertions: the tool call is detected and the full nested JSON is parsed
+    # without truncation.
+    assert parsed.tools_called
+
+    assert MistralToolCall.is_valid_id(parsed.tool_calls[0].id)
+    assert parsed.tool_calls[0].function.name == "get_current_weather"
+    assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
+    # No additional content outside the tool call should be returned.
+    assert parsed.content is None
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -55,7 +55,7 @@ def correctness_test_embed_models(hf_runner,
                     task="embed",
                     max_model_len=None,
                     **vllm_extra_kwargs) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.embed(example_prompts)

    with hf_runner(
            model_info.name,

--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import tempfile
 from collections.abc import Sequence
+from typing import Optional

 import mteb
 import numpy as np
 import pytest
+import requests

-from tests.models.utils import EmbedModelInfo
+from tests.models.utils import EmbedModelInfo, RerankModelInfo

-# Most models on the STS12 task (See #17175):
+# Most embedding models on the STS12 task (See #17175):
 # - Model implementation and minor changes in tensor dtype
 #   results in differences less than 1e-4
 # - Different model results in differences more than 1e-3
@@ -16,6 +20,11 @@ from tests.models.utils import EmbedModelInfo
 MTEB_EMBED_TASKS = ["STS12"]
 MTEB_EMBED_TOL = 1e-4

+# See #19344
+MTEB_RERANK_TASKS = ["NFCorpus"]
+MTEB_RERANK_LANGS = ["en"]
+MTEB_RERANK_TOL = 1e-3
+

 class VllmMtebEncoder(mteb.Encoder):

@@ -34,11 +43,32 @@ class VllmMtebEncoder(mteb.Encoder):
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
-        outputs = self.model.encode(sentences, use_tqdm=False)
+        outputs = self.model.embed(sentences, use_tqdm=False)
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds

+    def predict(
+        self,
+        sentences: list[tuple[str, str,
+                              Optional[str]]],  # query, corpus, prompt
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        r = self.rng.permutation(len(sentences))  # random scoring order
+        sentences = [sentences[i] for i in r]
+        # Only query and corpus are scored; the prompt element is dropped.
+        queries = [s[0] for s in sentences]
+        corpus = [s[1] for s in sentences]
+
+        outputs = self.model.score(queries,
+                                   corpus,
+                                   truncate_prompt_tokens=-1,
+                                   use_tqdm=False)
+        scores = np.array(outputs)
+        scores = scores[np.argsort(r)]  # restore the caller's input order
+        return scores
+

 class OpenAIClientMtebEncoder(mteb.Encoder):

@@ -62,21 +92,72 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
        return embeds


+class ScoreClientMtebEncoder(mteb.Encoder):
+    """MTEB encoder whose predict() scores each pair via HTTP POST to `url`."""
+
+    def __init__(self, model_name: str, url):
+        super().__init__()
+        self.model_name = model_name
+        self.url = url
+        self.rng = np.random.default_rng(seed=42)
+
+    def predict(
+        self,
+        sentences: list[tuple[str, str,
+                              Optional[str]]],  # query, corpus, prompt
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Score the pairs in shuffled order; return scores in input order."""
+        r = self.rng.permutation(len(sentences))
+        sentences = [sentences[i] for i in r]
+
+        # One request per pair; the prompt element is not sent.
+        outputs = [self.get_score(q, c) for q, c, _ in sentences]
+
+        scores = np.array(outputs)
+        scores = scores[np.argsort(r)]  # undo the shuffle
+        return scores
+
+    def get_score(self, query, corpus):
+        """POST one (query, corpus) pair and return the reported score."""
+        payload = {"model": self.model_name, "text_1": query,
+                   "text_2": corpus, "truncate_prompt_tokens": -1}
+        response = requests.post(self.url, json=payload)
+        # Surface HTTP failures here rather than as a KeyError on the body.
+        response.raise_for_status()
+        return response.json()['data'][0]["score"]
+
+
+class RerankClientMtebEncoder(ScoreClientMtebEncoder):
+    """Variant posting a rerank-style payload; reads `relevance_score`."""
+
+    def get_score(self, query, corpus):
+        payload = {"model": self.model_name, "query": query,
+                   "documents": [corpus], "truncate_prompt_tokens": -1}
+        response = requests.post(self.url, json=payload)
+        # Surface HTTP failures here rather than as a KeyError on the body.
+        response.raise_for_status()
+        # Exactly one document is sent, so only the first result is read.
+        return response.json()['results'][0]["relevance_score"]
+
+
 def run_mteb_embed_task(encoder, tasks):
    tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.MTEB(tasks=tasks)
-    results = evaluation.run(encoder, verbosity=0, output_folder=None)
+    results = evaluation.run(
+        encoder,
+        verbosity=0,
+        output_folder=None,
+        encode_kwargs={
+            "show_progress_bar": False,
+        },
+    )

    main_score = results[0].scores["test"][0]["main_score"]
    return main_score


-def run_mteb_embed_task_st(model_name, tasks):
-    from sentence_transformers import SentenceTransformer
-    model = SentenceTransformer(model_name)
-    return run_mteb_embed_task(model, tasks)
-
-
 def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
@@ -118,3 +199,105 @@ def mteb_test_embed_models(hf_runner,
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
+
+
+def run_mteb_rerank(cross_encoder, tasks, languages):
+    """Rerank bm25s stage-1 results of tasks[0]; return its test main_score."""
+    # The stage-1 predictions file is looked up by task name, so take the
+    # name from the argument instead of hard-coding "NFCorpus".
+    task_name, subset, eval_splits = tasks[0], "default", ["test"]
+    with tempfile.TemporaryDirectory() as results_folder:
+        stage1 = f"{results_folder}/stage1"
+        bm25s = mteb.get_model("bm25s")
+        tasks = mteb.get_tasks(tasks=tasks, languages=languages)
+        evaluation = mteb.MTEB(tasks=tasks)
+        evaluation.run(
+            bm25s,
+            verbosity=0,
+            eval_splits=eval_splits,
+            save_predictions=True,
+            output_folder=stage1,
+            encode_kwargs={"show_progress_bar": False},
+        )
+
+        results = evaluation.run(
+            cross_encoder,
+            verbosity=0,
+            eval_splits=eval_splits,
+            top_k=10,
+            save_predictions=True,
+            output_folder=f"{results_folder}/stage2",
+            previous_results=f"{stage1}/{task_name}_{subset}_predictions.json",
+            encode_kwargs={"show_progress_bar": False},
+        )
+        main_score = results[0].scores["test"][0]["main_score"]
+    return main_score
+
+
+def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
+    """Run MTEB reranking on the float32 HF model; return (score, dtype)."""
+    with hf_runner(model_name, is_cross_encoder=True,
+                   dtype="float32") as hf_model:
+        original_predict = hf_model.predict
+
+        def _predict(
+            sentences: list[tuple[str, str,
+                                  Optional[str]]],  # query, corpus, prompt
+            *args,
+            **kwargs,
+        ):
+            # vllm and st both remove the prompt, fair comparison.
+            prompts = [(s[0], s[1]) for s in sentences]
+            # Assign (not pass) batch_size so a caller-supplied one can't clash.
+            kwargs["batch_size"] = 8
+            return original_predict(prompts, *args, **kwargs)
+
+        hf_model.predict = _predict
+        hf_model.original_predict = original_predict
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
+        st_main_score = run_mteb_rerank(
+            hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS)
+        st_dtype = next(hf_model.model.model.parameters()).dtype
+    return st_main_score, st_dtype
+
+
+def mteb_test_rerank_models(hf_runner,
+                            vllm_runner,
+                            model_info: RerankModelInfo,
+                            vllm_extra_kwargs=None,
+                            hf_model_callback=None):
+    """Assert vLLM and HF rerank main scores agree within MTEB_RERANK_TOL."""
+    if not model_info.enable_test:
+        # A model family has many models with the same architecture,
+        # and we don't need to test each one.
+        pytest.skip("Skipping test.")
+
+    # Copy before adding dtype so the caller's dict is never mutated.
+    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
+    vllm_extra_kwargs["dtype"] = model_info.dtype
+
+    with vllm_runner(model_info.name,
+                     task="score",
+                     max_model_len=None,
+                     max_num_seqs=8,
+                     **vllm_extra_kwargs) as vllm_model:
+        model_config = vllm_model.model.llm_engine.model_config
+        if model_info.architecture:
+            assert (model_info.architecture in model_config.architectures)
+        assert model_config.hf_config.num_labels == 1
+
+        vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model),
+                                          tasks=MTEB_RERANK_TASKS,
+                                          languages=MTEB_RERANK_LANGS)
+        vllm_dtype = model_config.dtype
+
+    st_main_score, st_dtype = mteb_test_rerank_models_hf(
+        hf_runner, model_info.name, hf_model_callback)
+
+    print("VLLM:", vllm_dtype, vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
+    print("Difference:", st_main_score - vllm_main_score)
+
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
--- a/tests/models/language/pooling/test_baai.py
+++ b/tests/models/language/pooling/test_baai.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

-from .embed_utils import EmbedModelInfo, correctness_test_embed_models
-from .mteb_utils import mteb_test_embed_models
+from ...utils import EmbedModelInfo, RerankModelInfo
+from .embed_utils import correctness_test_embed_models
+from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models

 MODELS = [
    ########## BertModel
@@ -57,6 +58,20 @@ MODELS = [
                   enable_test=True),
 ]

+RERANK_MODELS = [
+    ########## XLMRobertaForSequenceClassification
+    RerankModelInfo("BAAI/bge-reranker-base",
+                    architecture="XLMRobertaForSequenceClassification",
+                    enable_test=True),
+    RerankModelInfo("BAAI/bge-reranker-large",
+                    architecture="XLMRobertaForSequenceClassification",
+                    enable_test=False),
+    RerankModelInfo("BAAI/bge-reranker-v2-m3",
+                    architecture="XLMRobertaForSequenceClassification",
+                    dtype="float32",
+                    enable_test=False)
+]
+

 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
@@ -70,3 +85,9 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
                                  example_prompts) -> None:
    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
                                  example_prompts)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+def test_rerank_models_mteb(hf_runner, vllm_runner,
+                            model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -6,6 +6,14 @@ from transformers import AutoModelForSequenceClassification

 from vllm.platforms import current_platform

+# TODO: enable when float32 is supported by V1
+# @pytest.fixture(autouse=True)
+# def v1(run_with_both_engines):
+#     # Simple autouse wrapper to run both engines for each test
+#     # This can be promoted up to conftest.py to run for every
+#     # test in a package
+#     pass
+

 @pytest.mark.parametrize(
    "model",
@@ -29,7 +37,7 @@ def test_models(
        # switch to use ROCm CK FA backend
        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.classify(example_prompts)

    with hf_runner(model,

--- a/tests/models/language/pooling/test_cross_encoder.py
+++ b/tests/models/language/pooling/test_cross_encoder.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
+
+RERANK_MODELS = [
+    RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                    architecture="BertForSequenceClassification"),
+    RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+                    architecture="Qwen3ForSequenceClassification")
+]
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+def test_rerank_models_mteb(hf_runner, vllm_runner,
+                            model_info: RerankModelInfo) -> None:
+    """Run the shared MTEB rerank check for each entry in RERANK_MODELS."""
+    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import Optional
+
 import pytest

 from vllm.config import PoolerConfig
@@ -8,6 +11,14 @@ from vllm.platforms import current_platform
 from ...utils import check_embeddings_close


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.parametrize(
    "model",
    [
@@ -20,15 +31,27 @@ from ...utils import check_embeddings_close
                     marks=[pytest.mark.core_model]),
        pytest.param("intfloat/e5-mistral-7b-instruct",
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
-        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
+        # the qwen models interfere with each other (see PR
+        # https://github.com/vllm-project/vllm/pull/18720).
+        # To avoid this problem, for now we skip v0 since it will be
+        # deprecated anyway.
+        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
+                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
        # [Encoder-only]
        pytest.param("BAAI/bge-base-en-v1.5",
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
-        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
-        pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
+                     marks=[
+                         pytest.mark.core_model, pytest.mark.cpu_model,
+                         pytest.mark.skip_v1
+                     ]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2",
+                     marks=[pytest.mark.skip_v1]),
+        pytest.param("intfloat/multilingual-e5-small",
+                     marks=[pytest.mark.skip_v1]),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+                     marks=[pytest.mark.skip_v1]),
        # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+        pytest.param("sentence-transformers/stsb-roberta-base-v2",
+                     marks=[pytest.mark.skip_v1]),
    ],
 )
 def test_models(
@@ -38,6 +61,9 @@ def test_models(
    model,
    monkeypatch,
 ) -> None:
+    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
+    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
+        pytest.skip("CPU V1 doesn't support sliding window")

    if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
@@ -49,6 +75,13 @@ def test_models(
        vllm_extra_kwargs["override_pooler_config"] = \
            PoolerConfig(pooling_type="MEAN", normalize=False)

+    max_model_len: Optional[int] = 512
+    if model in [
+            "sentence-transformers/all-MiniLM-L12-v2",
+            "sentence-transformers/stsb-roberta-base-v2"
+    ]:
+        max_model_len = None
+
    # The example_prompts has ending "\n", for example:
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers will strip the input texts, see:
@@ -62,9 +95,9 @@ def test_models(

    with vllm_runner(model,
                     task="embed",
-                     max_model_len=None,
+                     max_model_len=max_model_len,
                     **vllm_extra_kwargs) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.embed(example_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,

--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -45,12 +45,27 @@ MODELS = [
    EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
                   architecture="ModernBertModel",
                   enable_test=True),
+    ########## Qwen3ForCausalLM
+    EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
+                   architecture="Qwen3ForCausalLM",
+                   dtype="float32",
+                   enable_test=True),
+    EmbedModelInfo("Qwen/Qwen3-Embedding-4B",
+                   architecture="Qwen3ForCausalLM",
+                   dtype="float32",
+                   enable_test=False),
+]
+
+V1FlashAttentionImpNotSupported = [
+    "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
 ]


 @pytest.mark.parametrize("model_info", MODELS)
-def test_embed_models_mteb(hf_runner, vllm_runner,
-                           model_info: EmbedModelInfo) -> None:
+def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
+                           monkeypatch) -> None:
+    if model_info.name in V1FlashAttentionImpNotSupported:
+        monkeypatch.setenv("VLLM_USE_V1", "0")

    vllm_extra_kwargs: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":
@@ -62,8 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner,

 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_correctness(hf_runner, vllm_runner,
-                                  model_info: EmbedModelInfo,
-                                  example_prompts) -> None:
+                                  model_info: EmbedModelInfo, example_prompts,
+                                  monkeypatch) -> None:
+    if model_info.name in V1FlashAttentionImpNotSupported:
+        monkeypatch.setenv("VLLM_USE_V1", "0")

    vllm_extra_kwargs: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":

--- a/tests/models/language/pooling/test_intfloat.py
+++ b/tests/models/language/pooling/test_intfloat.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

 from ...utils import EmbedModelInfo

--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -6,28 +6,10 @@ import pytest

 from vllm import PoolingParams

-from .embed_utils import (EmbedModelInfo, check_embeddings_close,
+from ...utils import EmbedModelInfo, RerankModelInfo
+from .embed_utils import (check_embeddings_close,
                          correctness_test_embed_models, matryoshka_fy)
-from .mteb_utils import mteb_test_embed_models
-
-SCORING_MODELS = [
-    "jinaai/jina-reranker-v2-base-multilingual",  # Roberta
-]
-
-TEXTS_1 = ["Organic skincare products for sensitive skin"]
-
-TEXTS_2 = [
-    "Organic skincare for sensitive skin with aloe vera and chamomile.",
-    "New makeup trends focus on bold colors and innovative techniques",
-    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
-    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",  # noqa: E501
-    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",  # noqa: E501
-    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",  # noqa: E501
-    "针对敏感肌专门设计的天然有机护肤产品",
-    "新的化妆趋势注重鲜艳的颜色和创新的技巧",
-    "敏感肌のために特別に設計された天然有機スキンケア製品",
-    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
-]
+from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models

 EMBEDDING_MODELS = [
    EmbedModelInfo("jinaai/jina-embeddings-v3",
@@ -35,47 +17,13 @@ EMBEDDING_MODELS = [
                   is_matryoshka=True)
 ]

-
-@pytest.fixture(scope="module", params=SCORING_MODELS)
-def model_name(request):
-    yield request.param
-
-
-@pytest.mark.parametrize("dtype", ["half"])
-def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
-
-    text_pair = [TEXTS_1[0], TEXTS_2[0]]
-
-    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
-        hf_outputs = hf_model.predict([text_pair]).tolist()
-
-    with vllm_runner(model_name, task="score", dtype=dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
-
-    assert len(vllm_outputs) == 1
-    assert len(hf_outputs) == 1
-
-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
-
-
-@pytest.mark.parametrize("dtype", ["half"])
-def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
-
-    text_pairs = [[TEXTS_1[0], text] for text in TEXTS_2]
-
-    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
-        hf_outputs = hf_model.predict(text_pairs).tolist()
-
-    with vllm_runner(model_name, task="score", dtype=dtype,
-                     max_model_len=None) as vllm_model:
-        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
-
-    assert len(vllm_outputs) == 10
-    assert len(hf_outputs) == 10
-
-    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
-    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
+RERANK_MODELS = [
+    RerankModelInfo(
+        "jinaai/jina-reranker-v2-base-multilingual",
+        architecture="XLMRobertaForSequenceClassification",
+        dtype="float32",
+    )
+]


 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@@ -106,6 +54,12 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
                                  hf_model_callback=hf_model_callback)


+@pytest.mark.parametrize("model_info", RERANK_MODELS)  # one case per reranker
+def test_rerank_models_mteb(hf_runner, vllm_runner,
+                            model_info: RerankModelInfo) -> None:
+    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)  # mteb_utils
+
+
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("dimensions", [16, 32])
@@ -144,11 +98,11 @@ def test_matryoshka(

        if dimensions not in matryoshka_dimensions:
            with pytest.raises(ValueError):
-                vllm_model.encode(
+                vllm_model.embed(
                    example_prompts,
                    pooling_params=PoolingParams(dimensions=dimensions))
        else:
-            vllm_outputs = vllm_model.encode(
+            vllm_outputs = vllm_model.embed(
                example_prompts,
                pooling_params=PoolingParams(dimensions=dimensions))


--- a/tests/models/language/pooling/test_mxbai_rerank.py
+++ b/tests/models/language/pooling/test_mxbai_rerank.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import pytest
+import torch
+
+from tests.conftest import HfRunner
+
+from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
+
+# Rerankers that parametrize test_rerank_models_mteb below. Both entries
+# declare the Qwen2ForSequenceClassification architecture and float32.
+# NOTE(review): presumably enable_test=False makes the MTEB helper skip the
+# large checkpoint - confirm in mteb_utils.
+RERANK_MODELS = [
+    RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
+                    architecture="Qwen2ForSequenceClassification",
+                    dtype="float32",
+                    enable_test=True),
+    RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
+                    architecture="Qwen2ForSequenceClassification",
+                    dtype="float32",
+                    enable_test=False)
+]
+
+
+class MxbaiRerankerHfRunner(HfRunner):
+    """HfRunner that scores text pairs with a causal LM checkpoint.
+
+    The model is loaded through ``AutoModelForCausalLM``. The score of a
+    pair is ``sigmoid(logit("1") - logit("0"))``, read from the logits at
+    the last sequence position.
+    """
+
+    def __init__(self,
+                 model_name: str,
+                 dtype: str = "auto",
+                 *args: Any,
+                 **kwargs: Any) -> None:
+        """Load the model and a left-padding tokenizer for ``model_name``.
+
+        Extra positional and keyword arguments are accepted but are not
+        forwarded to ``HfRunner.__init__``.
+        """
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                                       padding_side='left')
+        # Vocabulary ids of the two answer tokens compared in predict().
+        self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
+        self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
+
+    def predict(self, prompts: list[list[str]], *args,
+                **kwargs) -> torch.Tensor:
+        """Score every entry of ``prompts`` and return a 1-D tensor.
+
+        Each prompt is tokenized and run through the model on its own, so
+        the result holds one score per prompt in input order. Additional
+        positional and keyword arguments are ignored.
+        """
+
+        def process_inputs(pairs):
+            # Tokenize unpadded, then pad to tensors and move every tensor
+            # to the device the model lives on.
+            inputs = self.tokenizer(pairs,
+                                    padding=False,
+                                    truncation='longest_first',
+                                    return_attention_mask=False)
+            inputs = self.tokenizer.pad(inputs,
+                                        padding=True,
+                                        return_tensors="pt")
+            for key in inputs:
+                inputs[key] = inputs[key].to(self.model.device)
+            return inputs
+
+        @torch.no_grad()
+        def compute_logits(inputs):
+            # Only the logits of the final position are used.
+            logits = self.model(**inputs).logits[:, -1, :]
+            yes_logits = logits[:, self.yes_loc]
+            no_logits = logits[:, self.no_loc]
+            logits = yes_logits - no_logits
+            scores = logits.float().sigmoid()
+            return scores
+
+        scores = []
+        for prompt in prompts:
+            inputs = process_inputs([prompt])
+            score = compute_logits(inputs)
+            scores.append(score[0].item())
+        return torch.Tensor(scores)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    """Run the mteb_utils rerank check with MxbaiRerankerHfRunner.
+
+    Entries declaring Qwen2ForSequenceClassification get ``hf_overrides``
+    passed to the vLLM runner; any other entry gets no extra kwargs.
+    """
+    needs_overrides = (
+        model_info.architecture == "Qwen2ForSequenceClassification")
+
+    vllm_extra_kwargs: dict[str, Any] = {}
+    if needs_overrides:
+        hf_overrides = {
+            "architectures": ["Qwen2ForSequenceClassification"],
+            "classifier_from_token": ["0", "1"],
+            "method": "from_2_way_softmax",
+        }
+        vllm_extra_kwargs.update(hf_overrides=hf_overrides)
+
+    mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
+                            vllm_extra_kwargs)
--- a/tests/models/language/pooling/test_qwen3_reranker.py
+++ b/tests/models/language/pooling/test_qwen3_reranker.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import pytest
+import torch
+
+from tests.conftest import HfRunner
+
+from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
+
+# Rerankers that parametrize test_rerank_models_mteb below. Both entries
+# declare the Qwen3ForSequenceClassification architecture and float32.
+# NOTE(review): presumably enable_test=False makes the MTEB helper skip the
+# 4B checkpoint - confirm in mteb_utils.
+RERANK_MODELS = [
+    RerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
+                    architecture="Qwen3ForSequenceClassification",
+                    dtype="float32",
+                    enable_test=True),
+    RerankModelInfo("Qwen/Qwen3-Reranker-4B",
+                    architecture="Qwen3ForSequenceClassification",
+                    dtype="float32",
+                    enable_test=False)
+]
+
+
+class Qwen3RerankerHfRunner(HfRunner):
+    """HfRunner that scores text pairs with a causal LM checkpoint.
+
+    The model is loaded through ``AutoModelForCausalLM``. The score of a
+    pair is the softmax probability of "yes" over the two logits
+    ("no", "yes"), read from the logits at the last sequence position.
+    """
+
+    def __init__(self,
+                 model_name: str,
+                 dtype: str = "auto",
+                 *args: Any,
+                 **kwargs: Any) -> None:
+        """Load the model and a left-padding tokenizer for ``model_name``.
+
+        Extra positional and keyword arguments are accepted but are not
+        forwarded to ``HfRunner.__init__``.
+        """
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                                       padding_side='left')
+        # Vocabulary ids of the two answer tokens compared in predict().
+        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
+
+    def predict(self, prompts: list[list[str]], *args,
+                **kwargs) -> torch.Tensor:
+        """Score every entry of ``prompts`` and return a 1-D tensor.
+
+        Each prompt is tokenized and run through the model on its own, so
+        the result holds one score per prompt in input order. Additional
+        positional and keyword arguments are ignored.
+        """
+
+        def process_inputs(pairs):
+            # Tokenize unpadded, then pad to tensors and move every tensor
+            # to the device the model lives on.
+            inputs = self.tokenizer(pairs,
+                                    padding=False,
+                                    truncation='longest_first',
+                                    return_attention_mask=False)
+            inputs = self.tokenizer.pad(inputs,
+                                        padding=True,
+                                        return_tensors="pt")
+            for key in inputs:
+                inputs[key] = inputs[key].to(self.model.device)
+            return inputs
+
+        @torch.no_grad()
+        def compute_logits(inputs):
+            # Only the logits of the final position are used.
+            batch_scores = self.model(**inputs).logits[:, -1, :]
+            true_vector = batch_scores[:, self.token_true_id]
+            false_vector = batch_scores[:, self.token_false_id]
+            # Column 0 is "no", column 1 is "yes"; exp(log_softmax) turns
+            # the pair of logits into probabilities.
+            batch_scores = torch.stack([false_vector, true_vector], dim=1)
+            batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+            scores = batch_scores[:, 1].exp()
+            return scores
+
+        scores = []
+        for prompt in prompts:
+            inputs = process_inputs([prompt])
+            score = compute_logits(inputs)
+            scores.append(score[0].item())
+        return torch.Tensor(scores)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
+    """Run the mteb_utils rerank check with Qwen3RerankerHfRunner.
+
+    Every entry must declare Qwen3ForSequenceClassification. The 4B
+    checkpoint additionally runs with ``max_num_seqs=1``.
+    """
+    assert model_info.architecture == "Qwen3ForSequenceClassification"
+
+    hf_overrides = {
+        "architectures": ["Qwen3ForSequenceClassification"],
+        "classifier_from_token": ["no", "yes"],
+        "is_original_qwen3_reranker": True,
+    }
+    vllm_extra_kwargs: dict[str, Any] = {"hf_overrides": hf_overrides}
+
+    is_4b_checkpoint = model_info.name == "Qwen/Qwen3-Reranker-4B"
+    if is_4b_checkpoint:
+        vllm_extra_kwargs.update(max_num_seqs=1)
+
+    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
+                            vllm_extra_kwargs)
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel
+
+from vllm.platforms import current_platform
+
+from ....conftest import HfRunner
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """Autouse wrapper that runs each test in this module with both engines.
+
+    It only requests ``run_with_both_engines`` and does nothing else. It
+    could be promoted to conftest.py to cover every test in a package.
+    """
+
+
+@pytest.fixture
+def math_step_prompts():
+    """Return a one-element list holding a step-by-step math prompt.
+
+    The prompt is assembled from a system message, a user query and an
+    assistant answer in which every reasoning step is followed by the
+    "<extra_0>" separator.
+    """
+    # ruff: noqa: E501
+    data = {
+        "system":
+        "Please reason step by step, and put your final answer within \\boxed{}. ",
+        "query":
+        "Sue lives in a fun neighborhood.  One weekend, the neighbors decided to play a prank on Sue.  On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard.  On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard.  Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
+        "response": [
+            "To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
+            "On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
+            "On Sunday, the neighbors add another 18 pink plastic flamingos to Sue's front yard. By the end of Sunday morning, Sue has (18 + 18 = 36) pink flamingos and still 6 white flamingos.",
+            "To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
+        ],
+    }
+    # Join the steps with the separator and append one more after the last
+    # step, so each step is terminated by "<extra_0>".
+    answer = "<extra_0>".join(data['response']) + "<extra_0>"
+    prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
+    return [prompt]
+
+
+def step_reward_patch_hf_model(hf_model: HfRunner) -> HfRunner:
+    """Attach a ``reward(prompts)`` function to ``hf_model`` and return it.
+
+    ``reward`` runs the HF model on the prompts and returns, per prompt,
+    the softmax probabilities found at the "<extra_0>" separator positions.
+    """
+
+    # Patch the hf_runner to use the step reward function
+    def make_step_rewards(logits: torch.Tensor,
+                          token_masks: torch.Tensor) -> list[list[float]]:
+        # Softmax over the last dimension, then zero every position whose
+        # mask entry is False.
+        probabilities = F.softmax(logits, dim=-1)
+        probabilities = probabilities * token_masks.unsqueeze(-1)
+
+        all_scores_res: list[list[float]] = []
+        for i in range(probabilities.size(0)):
+            sample = probabilities[i]  # seq_len, num_labels
+            # Keep the non-zero entries and regroup them in rows of two.
+            # NOTE(review): this relies on the model emitting exactly two
+            # labels per position - confirm against the checkpoint config.
+            positive_probs = sample[sample != 0].view(-1, 2)
+            non_zero_elements_list = positive_probs.cpu().tolist()
+            all_scores_res.append(non_zero_elements_list)
+        return all_scores_res
+
+    def reward(prompts: list[str]) -> list[list[float]]:
+        input_ids = hf_model.tokenizer(prompts, return_tensors="pt").input_ids
+        input_ids = hf_model.wrap_device(input_ids)
+        outputs = hf_model.model(input_ids=input_ids)
+
+        # The first id produced by encoding "<extra_0>" is the separator id.
+        step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
+        token_masks = (input_ids == step_sep_id)
+        # outputs[0] is passed on as the logits tensor.
+        return make_step_rewards(outputs[0], token_masks)
+
+    hf_model.reward = reward  # type: ignore[attr-defined]
+
+    return hf_model
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_prm_models(
+    hf_runner,
+    vllm_runner,
+    math_step_prompts,
+    model: str,
+    dtype: str,
+    monkeypatch,
+) -> None:
+    """Compare vLLM ``encode`` outputs with the patched HF step rewards."""
+    on_rocm = current_platform.is_rocm()
+    if on_rocm:
+        # ROCm Triton FA does not currently support sliding window attention
+        # switch to use ROCm CK FA backend
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    # The vLLM runner is opened and closed before the HF runner starts.
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.encode(math_step_prompts)
+
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
+        patched_model = step_reward_patch_hf_model(hf_model)
+        hf_outputs = patched_model.reward(math_step_prompts)
+
+    # check logits difference
+    for expected, actual in zip(hf_outputs, vllm_outputs):
+        expected_tensor = torch.tensor(expected)
+        actual_tensor = torch.tensor(actual)
+
+        assert torch.allclose(expected_tensor, actual_tensor, rtol=1.5e-2)
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -33,9 +33,6 @@ if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

 REQUIRES_V0_MODELS = [
-    # V1 Test: no way to fall back for head_dim = 80
-    # https://github.com/vllm-project/vllm/issues/14524
-    "qwen_vl",
    # V1 Test: not enough KV cache space in C1.
    "fuyu",
 ]
@@ -107,6 +104,8 @@ VLM_TEST_SETTINGS = {
            ),
            limit_mm_per_prompt={"image": 4},
        )],
+        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
+        dtype="bfloat16" if current_platform.is_cpu() else "auto",
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
@@ -219,8 +218,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "blip2": VLMTestInfo(
-        # TODO: Change back to 2.7b once head_dim = 80 is supported
-        models=["Salesforce/blip2-opt-6.7b"],
+        models=["Salesforce/blip2-opt-2.7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
@@ -307,11 +305,38 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
        marks=[large_gpu_mark(min_gb=32)],
    ),
+    "glm4_1v": VLMTestInfo(
+        models=["THUDM/GLM-4.1V-9B-Thinking"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
+        max_model_len=2048,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
+        num_logprobs=10,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        auto_cls=AutoModelForImageTextToText,
+    ),
+    "glm4_1v-video": VLMTestInfo(
+        models=["THUDM/GLM-4.1V-9B-Thinking"],
+        # GLM4.1V requires video metadata to be included in the input
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
+        custom_test_opts=[CustomTestOptions(
+            inputs=custom_inputs.video_with_metadata_glm4_1v(),
+            limit_mm_per_prompt={"video": 1},
+        )],
+        # This is needed to run on machine with 24GB VRAM
+        vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
+    ),
    "h2ovl": VLMTestInfo(
        models = [
            "h2oai/h2ovl-mississippi-800m",
-            # TODO: Re-enable once head_dim = 80 is supported
-            # "h2oai/h2ovl-mississippi-2b",
+            "h2oai/h2ovl-mississippi-2b",
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501

--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -203,6 +203,9 @@ def build_embedding_inputs_from_test_info(

    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
+    if test_info.dtype != "auto":
+        dtype = getattr(torch, test_info.dtype)  # type: ignore
+        embeds = [e.to(dtype=dtype) for e in embeds]
    assert len(images) == len(model_prompts)

    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)

--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -129,3 +129,23 @@ def windows_attention_image_qwen2_5_vl():

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)
+
+
+def video_with_metadata_glm4_1v():  # builds (frames, metadata) video inputs
+    video_array = VIDEO_ASSETS[0].np_ndarrays
+    metadata = VIDEO_ASSETS[0].metadata
+    question = "Describe the video."
+    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
+    formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
+
+    scales = [0.1, 0.2, 0.25]  # one rescaled copy of the clip per factor
+    video_input = [[(rescale_video_size(video_array, scale), metadata)]
+                   for scale in scales]
+    prompts = [formatted_prompt] * len(video_input)  # same prompt per scale
+
+    return [
+        PromptWithMultiModalInput(
+            prompts=prompts,
+            video_data=video_input,
+        )
+    ]
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -16,9 +16,11 @@ import torch
 from PIL.Image import Image
 from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
                          GenerationConfig, GenerationMixin)
+from transformers.video_utils import VideoMetadata

 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
+from vllm.utils import is_list_of

 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
 from .types import RunnerOutput
@@ -373,6 +375,28 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


+def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for GLM4.1V."""
+    hf_processor = hf_model.processor  # original processor, called below
+
+    def processor(*args, videos=None, **kwargs):
+        if videos is not None and is_list_of(videos, tuple):
+            # Each tuple is treated as (video_array, metadata): split them so
+            # frames and VideoMetadata reach the HF processor separately.
+            video_metadata = [[VideoMetadata(**video[1])] for video in videos]
+            videos = [[video[0]] for video in videos]
+        else:
+            video_metadata = None  # nothing to unpack
+
+        return hf_processor(*args,
+                            videos=videos,
+                            video_metadata=video_metadata,
+                            **kwargs)
+
+    hf_model.processor = processor  # replace with the wrapper above
+    return hf_model
+
+
 def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for H2OVL."""


--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -98,7 +98,7 @@ def _run_test(
                     max_model_len=8192) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        texts = [
-            # this is necessary because vllm_model.encode will not apply any
+            # this is necessary because vllm_model.embed will not apply any
            # templating to the prompt, and therefore lacks an image_pad
            # token unless one is inserted beforehand (the (28,28) image
            # above is converted to an image pad token by the chat template).
@@ -109,7 +109,7 @@ def _run_test(
            # vllm will replace the pad token with the actual image,
            # which may be a placeholder image, later.
        ]
-        vllm_outputs = vllm_model.encode(texts, images=input_images)
+        vllm_outputs = vllm_model.embed(texts, images=input_images)

    hf_outputs = []
    with hf_runner(model,