Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

d2b52805 · zhuwenwen · 9a521c23 · 5438967f · d2b52805 · 9a521c23
Commit d2b52805 authored Sep 07, 2025 by zhuwenwen
20 changed files
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -50,7 +50,6 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
        max_loras=4,
        distributed_executor_backend="ray",
        tensor_parallel_size=tp_size,
-        enable_chunked_prefill=True,
    )

    expected_lora_output = [

--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import vllm
-from vllm.lora.request import LoRARequest
-
-MODEL_PATH = "microsoft/phi-2"
-
-PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
-    prompts = [
-        PROMPT_TEMPLATE.format(
-            sql_prompt=
-            "Which catalog publisher has published the most catalogs?",
-            context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"),
-        PROMPT_TEMPLATE.format(
-            sql_prompt=
-            "Which trip started from the station with the largest dock count? Give me the trip id.",  # noqa: E501
-            context=
-            "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);"  # noqa: E501
-        ),
-        PROMPT_TEMPLATE.format(
-            sql_prompt=
-            "How many marine species are found in the Southern Ocean?",  # noqa: E501
-            context=
-            "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));"  # noqa: E501
-        ),
-    ]
-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=64,
-                                          stop="### End")
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
-    )
-    # Print the outputs.
-    generated_texts: list[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-def test_phi2_lora(phi2_lora_files):
-    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
-    # Otherwise, the lora-test will fail due to CUDA OOM.
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=2,
-                   enforce_eager=True,
-                   enable_chunked_prefill=True)
-
-    expected_lora_output = [
-        "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;",  # noqa: E501
-        "SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);",  # noqa: E501
-        "SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';",  # noqa: E501
-    ]
-
-    output1 = do_sample(llm, phi2_lora_files, lora_id=1)
-    for i in range(len(expected_lora_output)):
-        assert output1[i].startswith(expected_lora_output[i])
-    output2 = do_sample(llm, phi2_lora_files, lora_id=2)
-    for i in range(len(expected_lora_output)):
-        assert output2[i].startswith(expected_lora_output[i])
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -4,17 +4,14 @@
 import os
 import random
 import tempfile
-from typing import Union
 from unittest.mock import patch

-import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ParallelConfig, SchedulerConfig,
                         VllmConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
-from vllm.v1.worker.gpu_worker import Worker as V1Worker
-from vllm.worker.worker import Worker
+from vllm.v1.worker.gpu_worker import Worker

 NUM_LORAS = 16

@@ -22,18 +19,11 @@ NUM_LORAS = 16
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):

-    def set_active_loras(worker: Union[Worker, V1Worker],
-                         lora_requests: list[LoRARequest]):
+    def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
        lora_mapping = LoRAMapping([], [])
-        if isinstance(worker, Worker):
-            # v0 case
-            worker.model_runner.set_active_loras(lora_requests, lora_mapping)
-        else:
-            # v1 case
-            worker.model_runner.lora_manager.set_active_adapters(
-                lora_requests, lora_mapping)

-    worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker
+        worker.model_runner.lora_manager.set_active_adapters(
+            lora_requests, lora_mapping)

    vllm_config = VllmConfig(
        model_config=ModelConfig(
@@ -62,7 +52,7 @@ def test_worker_apply_lora(sql_lora_files):
                               max_cpu_loras=NUM_LORAS,
                               max_loras=NUM_LORAS),
    )
-    worker = worker_cls(
+    worker = Worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,

--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import json
+import os
 from dataclasses import dataclass
 from typing import Optional, Union

 import torch
+from safetensors.torch import save_file

 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights

@@ -340,3 +343,76 @@ def generate_data_for_nslices(
        seq_len_tensor,
        indices,
    )
+
+
+def create_peft_lora(
+    model: torch.nn.Module,
+    save_dir: str,
+    target_modules: list[str],
+    rank: int = 8,
+    alpha: int = 16,
+    dropout: float = 0.1,
+    lora_dtype: torch.dtype = torch.float16,
+) -> dict[str, torch.Tensor]:
+    """Generate LoRA weights for `target_modules` and save them to disk.
+
+    Writes `adapter_config.json` and `adapter_model.safetensors` into
+    `save_dir`. Nothing here creates `save_dir`, so it must already exist.
+
+    Args:
+        model: Root module; each target is looked up on it by attribute.
+        save_dir: Directory that receives the two adapter files.
+        target_modules: Dotted attribute paths relative to `model`. Also
+            stored verbatim under "target_modules" in the config.
+        rank: LoRA rank; stored as "r" and used as a tensor dimension.
+        alpha: Stored as "lora_alpha" in the config only.
+        dropout: Stored as "lora_dropout" in the config only.
+        lora_dtype: dtype of the generated tensors.
+
+    Returns:
+        Dict mapping "base_model.model.<name>.lora_A.weight" to a
+        (rank, in_features) tensor and "...lora_B.weight" to an
+        (out_features, rank) tensor, for every name in `target_modules`.
+
+    Raises:
+        ValueError: If a target has neither `input_size`/`output_size`
+            nor `embedding_dim`/`num_embeddings`.
+        AttributeError: If a dotted path does not resolve on `model`.
+    """
+    lora_weights = {}
+    adapter_config = {
+        "peft_type": "LORA",
+        "auto_mapping": None,
+        "base_model_name_or_path": "dummy_model",
+        "revision": None,
+        "task_type": "CAUSAL_LM",
+        "inference_mode": False,
+        "r": rank,
+        "lora_alpha": alpha,
+        "lora_dropout": dropout,
+        "fan_in_fan_out": False,
+        "bias": "none",
+        "modules_to_save": None,
+        "init_lora_weights": True,
+        "layers_to_transform": None,
+        "layers_pattern": None,
+        "target_modules": target_modules,
+        "exclude_modules": None,
+        "use_rslora": False,
+        "use_dora": False,
+        "loftq_config": None,
+    }
+
+    for module_name in target_modules:
+
+        # Resolve the dotted path one attribute at a time from the root.
+        module = model
+        for attr in module_name.split("."):
+            module = getattr(module, attr)
+
+        # Two ways of reading the (in, out) sizes are supported.
+        if hasattr(module, "input_size") and hasattr(module, "output_size"):
+
+            in_features = module.input_size
+            out_features = module.output_size
+
+        elif hasattr(module, "embedding_dim") and hasattr(
+                module, "num_embeddings"):
+            # ParallelLMHead
+            in_features = module.embedding_dim
+            out_features = module.num_embeddings
+        else:
+            raise ValueError(
+                f"Unable to determine dimensions for module {module_name}")
+
+        lora_A = torch.randn(rank, in_features, dtype=lora_dtype)
+
+        # In-place re-initialisation (a = sqrt(5)); this replaces the randn
+        # values drawn on the line above.
+        torch.nn.init.kaiming_uniform_(lora_A, a=5**0.5)
+
+        lora_B = torch.zeros(out_features, rank, dtype=lora_dtype)
+
+        # PEFT style
+        lora_weights[f"base_model.model.{module_name}.lora_A.weight"] = lora_A
+        lora_weights[f"base_model.model.{module_name}.lora_B.weight"] = lora_B
+
+    config_path = os.path.join(save_dir, "adapter_config.json")
+    with open(config_path, "w", encoding="utf-8") as f:
+        json.dump(adapter_config, f, indent=2, ensure_ascii=False)
+
+    weights_path = os.path.join(save_dir, "adapter_model.safetensors")
+    save_file(lora_weights, weights_path)
+
+    return lora_weights
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -92,7 +92,8 @@ AITER_MODEL_LIST = [
        pytest.param(
            "allenai/OLMoE-1B-7B-0924-Instruct",
            marks=[pytest.mark.cpu_model],
-        )
+        ),
+        pytest.param("swiss-ai/Apertus-8B"),  # apertus
    ])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])

--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -31,6 +31,7 @@ HYBRID_MODELS = [
    "hmellor/tiny-random-BambaForCausalLM",
    "ibm-granite/granite-4.0-tiny-preview",
    "tiiuae/Falcon-H1-0.5B-Base",
+    "LiquidAI/LFM2-1.2B",
 ]

 HF_UNSUPPORTED_MODELS = [
@@ -52,18 +53,21 @@ V1_SUPPORTED_MODELS = [
    "hmellor/tiny-random-BambaForCausalLM",
    "ibm-granite/granite-4.0-tiny-preview",
    "tiiuae/Falcon-H1-0.5B-Base",
+    "LiquidAI/LFM2-1.2B",
 ]

-# Avoid OOM
-MAX_NUM_SEQS = 4
-
-# Once we add support for FCG in Mamba1, this list will be removed and tests
-# all test cases will use enforce_eager=False
-ENFORCE_EAGER_MODELS_V1 = [
-    "state-spaces/mamba-130m-hf",
+FULL_CUDA_GRAPH_MODELS = [
    "ai21labs/Jamba-tiny-dev",
+    "Zyphra/Zamba2-1.2B-instruct",
 ]

+V0_UNSUPPORTED_MODELS = [
+    "LiquidAI/LFM2-1.2B",
+]
+
+# Avoid OOM
+MAX_NUM_SEQS = 4
+

 @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
@@ -96,31 +100,23 @@ def test_models(
        else:
            hf_outputs = None

-    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        if model not in V0_UNSUPPORTED_MODELS:
+            with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+                vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+                    example_prompts, max_tokens, num_logprobs)
+        else:
+            vllm_v0_outputs = None

    if model in V1_SUPPORTED_MODELS:
-        enforce_eager = False
-        with monkeypatch.context() as m:
-            m.setenv("VLLM_USE_V1", "1")
-            if model in HYBRID_MODELS:
-                # required due to reorder_batch behaviour
-                m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
-
-            if model in ENFORCE_EAGER_MODELS_V1:
-                enforce_eager = True
-
-            with vllm_runner(model,
-                             max_num_seqs=MAX_NUM_SEQS,
-                             enforce_eager=enforce_eager,
-                             enable_prefix_caching=False) as vllm_model:
-                vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
-                    example_prompts, max_tokens, num_logprobs)
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, num_logprobs)
    else:
        vllm_v1_outputs = None

-    if hf_outputs is not None:
+    if hf_outputs is not None and vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@@ -130,6 +126,7 @@ def test_models(

    if model in V1_SUPPORTED_MODELS:
        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
+        assert ref_outputs is not None
        check_logprobs_close(
            outputs_0_lst=ref_outputs,
            outputs_1_lst=vllm_v1_outputs,
@@ -138,7 +135,7 @@ def test_models(
        )


-@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_batching(
@@ -148,7 +145,6 @@ def test_batching(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-
    try:
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
        model_info.check_available_online(on_fail="skip")
@@ -186,29 +182,32 @@ def test_chunked_prefill(
    max_tokens: int,
    num_logprobs: int,
    chunked_prefill_token_size: int,
+    monkeypatch,
 ) -> None:
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

-    with vllm_runner(model,
-                     enable_chunked_prefill=True,
-                     max_num_batched_tokens=max_num_batched_tokens,
-                     max_num_seqs=max_num_seqs) as vllm_model:
-        chunked = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                      max_tokens, num_logprobs)
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        with vllm_runner(model,
+                         enable_chunked_prefill=True,
+                         max_num_batched_tokens=max_num_batched_tokens,
+                         max_num_seqs=max_num_seqs) as vllm_model:
+            chunked = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, num_logprobs)

-    with vllm_runner(model,
-                     enable_chunked_prefill=False,
-                     max_num_seqs=max_num_seqs) as vllm_model:
-        non_chunked = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+        with vllm_runner(model,
+                         enable_chunked_prefill=False,
+                         max_num_seqs=max_num_seqs) as vllm_model:
+            non_chunked = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, num_logprobs)

-    check_logprobs_close(
-        outputs_0_lst=chunked,
-        outputs_1_lst=non_chunked,
-        name_0="chunked",
-        name_1="non_chunked",
-    )
+        check_logprobs_close(
+            outputs_0_lst=chunked,
+            outputs_1_lst=non_chunked,
+            name_0="chunked",
+            name_1="non_chunked",
+        )


 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -279,25 +278,29 @@ def test_models_preemption_recompute(
    example_prompts,
    model: str,
    max_tokens: int,
+    monkeypatch,
 ) -> None:
    """
    Tests that outputs are identical with and w/o preemptions (recompute).
    """
-    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        scheduler = vllm_model.llm.llm_engine.scheduler[0]
-        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
-        preempt_vllm_outputs = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=preempt_vllm_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="vllm_preepmtions",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+            scheduler = vllm_model.llm.llm_engine.scheduler[0]
+            scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
+            preempt_vllm_outputs = vllm_model.generate_greedy(
+                example_prompts, max_tokens)
+
+            scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=preempt_vllm_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="vllm_preepmtions",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -373,7 +376,7 @@ def test_distributed_correctness(
    )


-@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
+@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_full_cuda_graph(
@@ -400,23 +403,20 @@ def test_full_cuda_graph(
        else:
            hf_outputs = None

+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        if model not in V0_UNSUPPORTED_MODELS:
+            with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+                vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+                    example_prompts, max_tokens, num_logprobs)
+        else:
+            vllm_v0_outputs = None
+
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        if model in HYBRID_MODELS:
-            # required due to reorder_batch behaviour
-            m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
-        with vllm_runner(model,
-                         max_num_seqs=MAX_NUM_SEQS,
-                         compilation_config={'full_cuda_graph': True},
-                         enable_prefix_caching=False) as vllm_model:
-            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
-    if hf_outputs is not None:
+    if hf_outputs is not None and vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@@ -425,6 +425,7 @@ def test_full_cuda_graph(
        )

    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
+    assert ref_outputs is not None
    check_logprobs_close(
        outputs_0_lst=ref_outputs,
        outputs_1_lst=vllm_v1_outputs,
@@ -460,24 +461,20 @@ def test_fp32_state(
        else:
            hf_outputs = None

-    with vllm_runner(model,
-                     max_num_seqs=MAX_NUM_SEQS,
-                     mamba_ssm_cache_dtype="float32") as vllm_model:
-        vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        if model in HYBRID_MODELS:
-            # required due to reorder_batch behaviour
-            m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
+        m.setenv("VLLM_USE_V1", "0")
        with vllm_runner(model,
                         max_num_seqs=MAX_NUM_SEQS,
-                         mamba_ssm_cache_dtype="float32",
-                         enable_prefix_caching=False) as vllm_model:
-            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+                         mamba_ssm_cache_dtype="float32") as vllm_model:
+            vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, num_logprobs)

+    with vllm_runner(model,
+                     max_num_seqs=MAX_NUM_SEQS,
+                     mamba_ssm_cache_dtype="float32") as vllm_model:
+        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
    if hf_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,

--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -51,6 +51,9 @@ def correctness_test_embed_models(hf_runner,
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = model_info.dtype

+    if model_info.hf_overrides is not None:
+        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
+
    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,

--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -172,6 +172,9 @@ def mteb_test_embed_models(hf_runner,
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = model_info.dtype

+    if model_info.hf_overrides is not None:
+        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
+
    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
@@ -284,6 +287,9 @@ def mteb_test_rerank_models(hf_runner,
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = model_info.dtype

+    if model_info.hf_overrides is not None:
+        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
+
    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,

--- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
@@ -13,7 +13,14 @@ from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models

 RERANK_MODELS = [
    LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
-                               architecture="GemmaForSequenceClassification"),
+                               architecture="GemmaForSequenceClassification",
+                               hf_overrides={
+                                   "architectures":
+                                   ["GemmaForSequenceClassification"],
+                                   "classifier_from_token": ["Yes"],
+                                   "method":
+                                   "no_post_processing",
+                               }),
 ]

 PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."  # noqa: E501
@@ -119,22 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):


 @pytest.mark.parametrize("model_info", RERANK_MODELS)
-def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo,
-                            monkeypatch) -> None:
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    assert model_info.architecture == "GemmaForSequenceClassification"
-
-    vllm_extra_kwargs: dict[str, Any] = {
-        "hf_overrides": {
-            "architectures": ["GemmaForSequenceClassification"],
-            "classifier_from_token": ["Yes"],
-            "method": "no_post_processing",
-        }
-    }
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:

    mteb_test_rerank_models(GemmaRerankerHfRunner,
                            vllm_runner,
                            model_info,
-                            vllm_extra_kwargs,
                            vllm_mteb_encoder=GemmaMtebEncoder)
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -10,14 +10,6 @@ from vllm.platforms import current_platform
 from ...utils import check_embeddings_close, check_transformers_version


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.mark.parametrize(
    "model",
    [
@@ -32,21 +24,15 @@ def v1(run_with_both_engines):
            "intfloat/e5-mistral-7b-instruct",
            # CPU v1 doesn't support sliding window
            marks=[pytest.mark.core_model]),
-        # the qwen models interfere with each other (see PR
-        # https://github.com/vllm-project/vllm/pull/18720).
-        # To avoid this problem, for now we skip v0 since it will be
-        # deprecated anyway.
        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
-                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
+                     marks=[pytest.mark.cpu_model]),
        # [Encoder-only]
        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
        pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
        # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
    ],
 )
 def test_models(

--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -14,6 +14,7 @@ from ....utils import RemoteOpenAIServer

 MODEL_NAME = "parasail-ai/GritLM-7B-vllm"
 MAX_MODEL_LEN = 4000
+ATOL = 0.002


 def _arr(arr):
@@ -97,16 +98,16 @@ def get_test_data():

 def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
    cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
-    assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001)
+    assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=ATOL)

    cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1])
-    assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001)
+    assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=ATOL)

    cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0])
-    assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001)
+    assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=ATOL)

    cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
-    assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001)
+    assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=ATOL)


 def test_gritlm_offline_embedding(vllm_runner):

--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any

 import pytest

@@ -33,12 +32,15 @@ MODELS = [
    ########### NewModel
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
                             architecture="GteNewModel",
+                             hf_overrides={"architectures": ["GteNewModel"]},
                             enable_test=True),
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
                             architecture="GteNewModel",
+                             hf_overrides={"architectures": ["GteNewModel"]},
                             enable_test=True),
    CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
                             architecture="GteNewModel",
+                             hf_overrides={"architectures": ["GteNewModel"]},
                             enable_test=True),
    ########### Qwen2ForCausalLM
    LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
@@ -60,11 +62,16 @@ MODELS = [
 ]

 RERANK_MODELS = [
-    # classifier_pooling: mean
    CLSPoolingRerankModelInfo(
+        # classifier_pooling: mean
        "Alibaba-NLP/gte-reranker-modernbert-base",
        architecture="ModernBertForSequenceClassification",
        enable_test=True),
+    CLSPoolingRerankModelInfo(
+        "Alibaba-NLP/gte-multilingual-reranker-base",
+        architecture="GteNewForSequenceClassification",
+        hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
+        enable_test=True),
 ]


@@ -75,12 +82,7 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
        check_transformers_version(model_info.name,
                                   max_transformers_version="4.53.2")

-    vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.architecture == "GteNewModel":
-        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
-
-    mteb_test_embed_models(hf_runner, vllm_runner, model_info,
-                           vllm_extra_kwargs)
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)


 @pytest.mark.parametrize("model_info", MODELS)
@@ -91,12 +93,8 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
        check_transformers_version(model_info.name,
                                   max_transformers_version="4.53.2")

-    vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.architecture == "GteNewModel":
-        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
-
    correctness_test_embed_models(hf_runner, vllm_runner, model_info,
-                                  example_prompts, vllm_extra_kwargs)
+                                  example_prompts)


 @pytest.mark.parametrize("model_info", RERANK_MODELS)

--- a/tests/models/language/pooling/test_multilabel_classification_support.py
+++ b/tests/models/language/pooling/test_multilabel_classification_support.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModelForSequenceClassification
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["Rami/multi-label-class-classification-on-github-issues"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_classify_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Compare vLLM `classify()` outputs with the HF reference outputs.
+
+    Both runners classify the same `example_prompts`; each pair of
+    per-prompt outputs must be close under `torch.allclose`.
+    """
+    with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.classify(example_prompts)
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    # NOTE: zip() stops at the shorter sequence, so a length mismatch
+    # between the two output lists would not be detected by this loop.
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output)
+        vllm_output = torch.tensor(vllm_output)
+
+        # The third positional argument of torch.allclose is rtol; atol
+        # keeps its default. dtype is only parametrized as "half" above,
+        # so the 1e-2 branch is the one exercised.
+        assert torch.allclose(hf_output, vllm_output,
+                              1e-3 if dtype == "float" else 1e-2)
--- a/tests/models/language/pooling/test_mxbai_rerank.py
+++ b/tests/models/language/pooling/test_mxbai_rerank.py
@@ -10,12 +10,20 @@ from tests.conftest import HfRunner
 from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
 from .mteb_utils import mteb_test_rerank_models

+mxbai_rerank_hf_overrides = {
+    "architectures": ["Qwen2ForSequenceClassification"],
+    "classifier_from_token": ["0", "1"],
+    "method": "from_2_way_softmax",
+}
+
 RERANK_MODELS = [
    LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
                               architecture="Qwen2ForSequenceClassification",
+                               hf_overrides=mxbai_rerank_hf_overrides,
                               enable_test=True),
    LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
                               architecture="Qwen2ForSequenceClassification",
+                               hf_overrides=mxbai_rerank_hf_overrides,
                               enable_test=False)
 ]

@@ -71,13 +79,4 @@ class MxbaiRerankerHfRunner(HfRunner):

 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
-    vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.architecture == "Qwen2ForSequenceClassification":
-        vllm_extra_kwargs["hf_overrides"] = {
-            "architectures": ["Qwen2ForSequenceClassification"],
-            "classifier_from_token": ["0", "1"],
-            "method": "from_2_way_softmax",
-        }
-
-    mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
-                            vllm_extra_kwargs)
+    mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info)
--- a/tests/models/language/pooling/test_qwen3_reranker.py
+++ b/tests/models/language/pooling/test_qwen3_reranker.py
@@ -11,12 +11,20 @@ from tests.utils import multi_gpu_test
 from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
 from .mteb_utils import mteb_test_rerank_models

+qwen3_reranker_hf_overrides = {
+    "architectures": ["Qwen3ForSequenceClassification"],
+    "classifier_from_token": ["no", "yes"],
+    "is_original_qwen3_reranker": True,
+}
+
 RERANK_MODELS = [
    LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
                               architecture="Qwen3ForSequenceClassification",
+                               hf_overrides=qwen3_reranker_hf_overrides,
                               enable_test=True),
    LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
                               architecture="Qwen3ForSequenceClassification",
+                               hf_overrides=qwen3_reranker_hf_overrides,
                               enable_test=False)
 ]

@@ -74,18 +82,7 @@ class Qwen3RerankerHfRunner(HfRunner):
 @pytest.mark.parametrize("model_info", RERANK_MODELS)
 def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:

-    assert model_info.architecture == "Qwen3ForSequenceClassification"
-
-    vllm_extra_kwargs: dict[str, Any] = {
-        "hf_overrides": {
-            "architectures": ["Qwen3ForSequenceClassification"],
-            "classifier_from_token": ["no", "yes"],
-            "is_original_qwen3_reranker": True,
-        }
-    }
-
-    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
-                            vllm_extra_kwargs)
+    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)


 @pytest.mark.parametrize("model_info", RERANK_MODELS)
@@ -96,16 +93,8 @@ def test_rerank_models_mteb_tp(vllm_runner,
    assert model_info.architecture == "Qwen3ForSequenceClassification"

    vllm_extra_kwargs: dict[str, Any] = {
-        "hf_overrides": {
-            "architectures": ["Qwen3ForSequenceClassification"],
-            "classifier_from_token": ["no", "yes"],
-            "is_original_qwen3_reranker": True,
-        },
        "tensor_parallel_size": 2,
    }

-    mteb_test_rerank_models(Qwen3RerankerHfRunner,
-                            vllm_runner,
-                            model_info,
-                            vllm_extra_kwargs,
-                            atol=1.2e-2)
+    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
+                            vllm_extra_kwargs)
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -13,14 +13,6 @@ from ....conftest import HfRunner
 from ...utils import check_transformers_version


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture
 def math_step_prompts():
    # ruff: noqa: E501

--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
@@ -23,15 +23,6 @@ TEXTS_2 = [
    "The capital of Germany is Berlin.",
 ]

-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 DTYPE = "half"



--- a/tests/models/language/pooling/test_st_projector.py
+++ b/tests/models/language/pooling/test_st_projector.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from .mteb_utils import mteb_test_embed_models
+
+# ST models with projector (Dense) layers
+# Each entry is handed to test_embed_models_mteb through
+# pytest.mark.parametrize, one test invocation per entry.
+# NOTE(review): "ST" presumably stands for Sentence-Transformers -- confirm.
+ST_PROJECTOR_MODELS = [
+    CLSPoolingEmbedModelInfo(
+        "TencentBAC/Conan-embedding-v1",
+        architecture="BertModel",
+        # NOTE(review): enable_test is interpreted outside this file;
+        # presumably it opts this entry into the MTEB run -- confirm.
+        enable_test=True,
+    ),
+]
+
+
+@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
+def test_embed_models_mteb(hf_runner, vllm_runner,
+                           model_info: EmbedModelInfo) -> None:
+    """Run the shared MTEB embedding check on one ST projector model.
+
+    Invoked once per entry of ST_PROJECTOR_MODELS. The three arguments are
+    forwarded unchanged, in order, to mteb_test_embed_models (imported from
+    .mteb_utils), which performs the actual check; this function adds no
+    logic of its own.
+
+    Args:
+        hf_runner: supplied by pytest (fixture defined outside this file).
+        vllm_runner: supplied by pytest (fixture defined outside this file).
+        model_info: the parametrized model description under test.
+    """
+
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -189,23 +189,21 @@ VLM_TEST_SETTINGS = {
        },
        marks=[pytest.mark.core_model],
    ),
-    # FIXME(Isotr0py): Enable this test after
-    # https://github.com/huggingface/transformers/pull/39470 released
-    # "idefics3-transformers": VLMTestInfo(
-    #     models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
-    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-    #     prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
-    #     img_idx_to_prompt=lambda idx: "<image>",
-    #     max_model_len=8192,
-    #     max_num_seqs=2,
-    #     auto_cls=AutoModelForImageTextToText,
-    #     hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
-    #     image_size_factors=[(0.25, 0.5, 1.0)],
-    #     vllm_runner_kwargs={
-    #         "model_impl": "transformers",
-    #     },
-    #     marks=[pytest.mark.core_model],
-    # ),
+    "idefics3-transformers": VLMTestInfo(
+        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>",
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+        },
+        marks=[pytest.mark.core_model],
+    ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -222,21 +220,6 @@ VLM_TEST_SETTINGS = {
        },
        marks=[large_gpu_mark(min_gb=32)],
    ),
-    # Check "auto" with fallback to transformers
-    "internvl-transformers": VLMTestInfo(
-        models=["OpenGVLab/InternVL3-1B-hf"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
-        max_model_len=4096,
-        use_tokenizer_eos=True,
-        image_size_factors=[(0.25, 0.5, 1.0)],
-        vllm_runner_kwargs={
-            "model_impl": "auto",
-        },
-        auto_cls=AutoModelForImageTextToText,
-        marks=[pytest.mark.core_model],
-    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=["rhymes-ai/Aria"],
@@ -337,10 +320,6 @@ VLM_TEST_SETTINGS = {
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        # FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
-        # should enable this again after the fix is released:
-        # https://github.com/huggingface/transformers/pull/39915
-        marks=[pytest.mark.skip("HF model is broken")],
    ),
    "gemma3": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
@@ -461,6 +440,20 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
+    "intern_vl-hf": VLMTestInfo(
+        models=["OpenGVLab/InternVL3-1B-hf"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO,
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
+        video_idx_to_prompt=lambda idx: "<video>",
+        max_model_len=8192,
+        use_tokenizer_eos=True,
+        auto_cls=AutoModelForImageTextToText,
+    ),
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -621,6 +614,23 @@ VLM_TEST_SETTINGS = {
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
+    "ovis2_5": VLMTestInfo(
+        models=["AIDC-AI/Ovis2.5-2B"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<video>\n",
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype="half",
+        num_logprobs=10,
+        patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
+        hf_model_kwargs={"revision": "refs/pr/5"},
+    ),
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

--- a/tests/models/multimodal/generation/test_mllama.py
+++ b/tests/models/multimodal/generation/test_mllama.py
@@ -5,6 +5,7 @@ from typing import Optional, overload

 import pytest
 import torch
+from packaging.version import Version
 from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
 from transformers import __version__ as TRANSFORMERS_VERSION

@@ -287,8 +288,8 @@ def clear_cache():
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
 @pytest.mark.skipif(
-    TRANSFORMERS_VERSION == "4.55.0",
-    reason="Transformers v4.55.0 has a regression issue on mllama, "
+    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
+    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
 def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
                                     model, sizes, dtype, max_tokens,
@@ -319,8 +320,8 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
 @pytest.mark.skipif(
-    TRANSFORMERS_VERSION == "4.55.0",
-    reason="Transformers v4.55.0 has a regression issue on mllama, "
+    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
+    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
 def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
                                     model, dtype, max_tokens, num_logprobs,
@@ -372,8 +373,8 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
 @pytest.mark.skipif(
-    TRANSFORMERS_VERSION == "4.55.0",
-    reason="Transformers v4.55.0 has a regression issue on mllama, "
+    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
+    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
 def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
                                   dtype, max_tokens, num_logprobs,
@@ -416,8 +417,8 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.skipif(
-    TRANSFORMERS_VERSION == "4.55.0",
-    reason="Transformers v4.55.0 has a regression issue on mllama, "
+    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
+    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
 def test_models_distributed(
    hf_runner,