Merge tag 'v0.10.0' into v0.10.0-dev

711aa9d5 · zhuwenwen · 751c492c · 6d8d0a24 · 711aa9d5 · 711aa9d5
Commit 711aa9d5 authored Jul 30, 2025 by zhuwenwen
20 changed files
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -38,8 +38,8 @@ ERROR_CASES = [
 ]


-def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
-    peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1,
+def test_peft_helper_pass(sql_lora_files, tmp_path):
+    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
                                            max_position_embeddings=4096)
    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
    peft_helper.validate_legal(lora_config)
@@ -56,15 +56,12 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
        "embed_tokens",
        "lm_head",
    ]
-    assert peft_helper.context_length == 16384
    assert peft_helper.vllm_max_position_embeddings == 4096
-    assert peft_helper.vllm_long_context_scaling_factor == float(
-        math.ceil(peft_helper.context_length /
-                  peft_helper.vllm_max_position_embeddings))
+
    # test RSLoRA
    rslora_config = dict(use_rslora=True)
    test_dir = tmp_path / "test_rslora"
-    shutil.copytree(long_context_lora_files_16k_1, test_dir)
+    shutil.copytree(sql_lora_files, test_dir)

    # Load and modify configuration
    config_path = test_dir / "adapter_config.json"

--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+
 import os
-import pytest

 import vllm
 from vllm.lora.request import LoRARequest
@@ -51,9 +51,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


-# Skipping for V1 for now as we are hitting,
-# "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
 def test_phi2_lora(phi2_lora_files):
    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
    # Otherwise, the lora-test will fail due to CUDA OOM.

--- a/tests/lora/test_transformers_model.py
+++ b/tests/lora/test_transformers_model.py
@@ -9,7 +9,7 @@ from vllm.platforms import current_platform

 from ..utils import create_new_process_for_each_test, multi_gpu_test

-MODEL_PATH = "ArthurZ/ilama-3.2-1B"
+MODEL_PATH = "hmellor/Ilama-3.2-1B"

 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import os
-import time

+import os
 import pytest
 import ray
 from prometheus_client import REGISTRY

 import vllm.envs as envs
 from vllm import EngineArgs, LLMEngine
-from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.metrics import RayPrometheusStatLogger
@@ -48,7 +46,7 @@ def test_metric_counter_prompt_tokens(
                     dtype=dtype,
                     disable_log_stats=False,
                     gpu_memory_utilization=0.4) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
        prompt_token_counts = [
            len(tokenizer.encode(p)) for p in example_prompts
        ]
@@ -60,7 +58,7 @@ def test_metric_counter_prompt_tokens(
        vllm_prompt_token_count = sum(prompt_token_counts)

        _ = vllm_model.generate_greedy(example_prompts, max_tokens)
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
        metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
            **stat_logger.labels)._value.get()

@@ -84,8 +82,8 @@ def test_metric_counter_generation_tokens(
                     disable_log_stats=False,
                     gpu_memory_utilization=0.4) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        tokenizer = vllm_model.model.get_tokenizer()
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        tokenizer = vllm_model.llm.get_tokenizer()
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
            **stat_logger.labels)._value.get()
        vllm_generation_count = 0
@@ -120,8 +118,8 @@ def test_metric_counter_generation_tokens_multi_step(
            disable_async_output_proc=disable_async_output_proc,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        tokenizer = vllm_model.model.get_tokenizer()
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        tokenizer = vllm_model.llm.get_tokenizer()
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
            **stat_logger.labels)._value.get()
        vllm_generation_count = 0
@@ -152,7 +150,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                     disable_log_stats=False,
                     gpu_memory_utilization=0.3,
                     served_model_name=served_model_name) as vllm_model:
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
        metrics_tag_content = stat_logger.labels["model_name"]

    if envs.VLLM_CI_USE_S3:
@@ -236,149 +234,6 @@ def test_engine_log_metrics_regression(
    assert_metrics(model, engine, disable_log_stats, len(example_prompts))


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [10])
-def test_metric_spec_decode(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    k = 5
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            disable_log_stats=False,
-            gpu_memory_utilization=0.4,
-            speculative_config={
-                "model": model,
-                "num_speculative_tokens": k,
-            },
-    ) as vllm_model:
-
-        # Force log interval to be 0 to catch all metrics.
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
-        stat_logger.local_interval = 0
-
-        # Note that the purpose of this test is to verify spec decode
-        # metrics instead of functional correctness, so the expected values
-        # are intended to be loose.
-        metric_name_to_expected_fn = {
-            "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
-            "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
-            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
-            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
-            "counter_spec_decode_num_emitted_tokens":
-            lambda v: 0 <= v <= k + 1,
-        }
-
-        # Use one request to better inspect the metrics.
-        prompts = example_prompts[:1]
-
-        _ = vllm_model.generate_greedy(prompts, max_tokens)
-        for metric_name, is_expected in metric_name_to_expected_fn.items():
-            metric_val = getattr(
-                stat_logger.metrics,
-                metric_name).labels(**stat_logger.labels)._value.get()
-            assert is_expected(metric_val), (
-                f"the value of metric {metric_name} ({metric_val}) "
-                "does not meet expectation")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [10])
-@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
-def test_metric_spec_decode_interval(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    log_interval: int,
-) -> None:
-    k = 5
-
-    engine_args = EngineArgs(
-        model=model,
-        dtype=dtype,
-        disable_log_stats=False,
-        gpu_memory_utilization=0.4,
-        speculative_config={
-            "model": model,
-            "num_speculative_tokens": k,
-        },
-        enforce_eager=True,
-    )
-
-    engine = LLMEngine.from_engine_args(engine_args)
-
-    try:
-
-        engine.add_request(
-            "request-id-0",
-            example_prompts[0],
-            SamplingParams(max_tokens=max_tokens),
-        )
-
-        # set log internal
-        stat_logger = engine.stat_loggers['prometheus']
-        stat_logger.local_interval = log_interval
-
-        # prefill
-        engine.step()
-
-        # wait for 5 seconds to ensure that spec decode metrics
-        # get triggered in first decode step
-        time.sleep(5)
-
-        # first decode step should trigger async collection of metrics
-        engine.step()
-
-        # wait one second to allow H2D transfer to finish
-        time.sleep(1)
-
-        # second decode step should now be able to collect the spec
-        # decode stats and the request should also be finished
-        engine.step()
-
-        # must have finisehd now
-        assert not engine.has_unfinished_requests()
-
-        # wait to ensure logging occurs
-        time.sleep(log_interval)
-
-        # force logging
-        engine.step()
-
-        # Note that the purpose of this test is to verify spec decode
-        # metrics instead of functional correctness, so the expected values
-        # are intended to be loose.
-        metric_name_to_expected_fn = {
-            "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
-            "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
-            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
-            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
-            "counter_spec_decode_num_emitted_tokens":
-            lambda v: 0 <= v <= k + 1,
-        }
-
-        for metric_name, is_expected in metric_name_to_expected_fn.items():
-            metric_val = getattr(
-                stat_logger.metrics,
-                metric_name).labels(**stat_logger.labels)._value.get()
-            assert is_expected(metric_val), (
-                f"the value of metric {metric_name} ({metric_val}) "
-                "does not meet expectation")
-
-    finally:
-        del engine
-        cleanup_dist_env_and_memory()
-
-
 def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
                   num_requests: int) -> None:
    if disable_log_stats:

--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -50,20 +50,15 @@ def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex,
                                  whitespace_pattern=None,
                                  reasoner=None)

-    token_ids = zephyr_7B_tokenzer.encode(
-        f"Give an example IPv4 address with this regex: {sample_regex}")
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
-    regex_LP(token_ids, tensor)
+    tensor = regex_LP([], tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

-    token_ids = zephyr_7B_tokenzer.encode(
-        f"Give an employee profile that fits this schema: {sample_json_schema}"
-    )
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
-    json_LP(token_ids, tensor)
+    tensor = json_LP([], tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

@@ -85,8 +80,6 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
        seed=0,
        dtype="bfloat16",
    )
-    token_ids = zephyr_7B_tokenzer.encode(
-        f"Give an example IPv4 address with this regex: {sample_regex}")
    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)

    regex_lp = get_local_guided_decoding_logits_processor(
@@ -96,13 +89,11 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
    assert regex_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
-    tensor = regex_lp(token_ids, tensor)
+    # allowed tokens at state 0
+    tensor = regex_lp([], tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

-    token_ids = zephyr_7B_tokenzer.encode(
-        f"Give an employee profile that fits this schema: {sample_json_schema}"
-    )
    json_request = GuidedDecodingParams(json=sample_json_schema,
                                        backend=backend)
    json_lp = await get_guided_decoding_logits_processor(
@@ -110,7 +101,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
    assert json_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
-    tensor = json_lp(token_ids, tensor)
+    tensor = json_lp([], tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

@@ -134,7 +125,6 @@ async def test_guided_logits_processor_with_reasoning(
        dtype="bfloat16",
    )
    token_ids = deepseek_r1_qwen_tokenizer.encode(
-        f"Give an example IPv4 address with this regex: {sample_regex}."
        "<think>here is the thinking process")
    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)

@@ -145,14 +135,13 @@ async def test_guided_logits_processor_with_reasoning(
                    regex_request, deepseek_r1_qwen_tokenizer, config,
                    reasoning_backend)
    assert regex_lp is not None
-    tensor = torch.rand(32000)
+    tensor = torch.rand(151664)
    original_tensor = torch.clone(tensor)
    tensor = regex_lp(token_ids, tensor)
    assert tensor.shape == original_tensor.shape
    assert torch.allclose(tensor, original_tensor)

    token_ids = deepseek_r1_qwen_tokenizer.encode(
-        f"Give an employee profile that fits this schema: {sample_json_schema}."
        "<think>here is the thinking process")
    json_request = GuidedDecodingParams(json=sample_json_schema,
                                        backend=backend)
@@ -162,7 +151,7 @@ async def test_guided_logits_processor_with_reasoning(
        await get_guided_decoding_logits_processor(
            json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
    assert json_lp is not None
-    tensor = torch.rand(32000)
+    tensor = torch.rand(151664)
    original_tensor = torch.clone(tensor)
    tensor = json_lp(token_ids, tensor)
    assert tensor.shape == original_tensor.shape
@@ -170,8 +159,7 @@ async def test_guided_logits_processor_with_reasoning(

    # Thinking is over, so the tensor should change.
    token_ids = deepseek_r1_qwen_tokenizer.encode(
-        f"Give an employee profile that fits this schema: {sample_json_schema}."
-        "<think>here is the thinking process</think> Then")
+        "<think>here is the thinking process</think>")
    json_request = GuidedDecodingParams(json=sample_json_schema,
                                        backend=backend)
    json_lp = get_local_guided_decoding_logits_processor(
@@ -180,7 +168,7 @@ async def test_guided_logits_processor_with_reasoning(
        await get_guided_decoding_logits_processor(
            json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
    assert json_lp is not None
-    tensor = torch.rand(32000)
+    tensor = torch.rand(151664)
    original_tensor = torch.clone(tensor)
    tensor = json_lp(token_ids, tensor)
    assert tensor.shape == original_tensor.shape
@@ -205,19 +193,6 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
        GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")


-def test_guided_decoding_backend_options():
-    """Test backend-specific options"""
-    with pytest.warns(DeprecationWarning):
-        guided_decoding_params = GuidedDecodingParams(
-            backend=
-            "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties"
-        )
-    assert guided_decoding_params.backend == "xgrammar"
-    assert guided_decoding_params.disable_fallback
-    assert guided_decoding_params.disable_any_whitespace
-    assert guided_decoding_params.disable_additional_properties
-
-
 def test_pickle_xgrammar_tokenizer_data():
    try:
        import xgrammar as xgr

--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -5,7 +5,8 @@ import os

 import pytest

-from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType
+from vllm.model_executor.layers.pooler import (CLSPool, DispatchPooler,
+                                               MeanPool, PoolingType)
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -33,8 +34,8 @@ def test_model_loading_with_params(vllm_runner):
        output = vllm_model.embed("Write a short story about a robot that"
                                  " dreams for the first time.\n")

-        model_config = vllm_model.model.llm_engine.model_config
-        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        model_config = vllm_model.llm.llm_engine.model_config
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer

        # asserts on the bert model config file
        assert model_config.encoder_config["max_seq_length"] == 512
@@ -50,7 +51,8 @@ def test_model_loading_with_params(vllm_runner):

        def check_model(model):
            assert isinstance(model, BertEmbeddingModel)
-            assert isinstance(model._pooler, CLSPool)
+            assert isinstance(pooler := model.pooler, DispatchPooler)
+            assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool)

        vllm_model.apply_model(check_model)

@@ -71,8 +73,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
        output = vllm_model.embed("Write a short story about a robot that"
                                  " dreams for the first time.\n")

-        model_config = vllm_model.model.llm_engine.model_config
-        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        model_config = vllm_model.llm.llm_engine.model_config
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer

        # asserts on the bert model config file
        assert model_config.encoder_config["max_seq_length"] == 512
@@ -88,7 +90,9 @@ def test_roberta_model_loading_with_params(vllm_runner):

        def check_model(model):
            assert isinstance(model, RobertaEmbeddingModel)
-            assert isinstance(model._pooler, MeanPool)
+            assert isinstance(pooler := model.pooler, DispatchPooler)
+            assert isinstance(pooler.poolers_by_task["embed"].pooling,
+                              MeanPool)

        vllm_model.apply_model(check_model)

@@ -109,13 +113,14 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
        output = vllm_model.embed("Write a short story about a robot that"
                                  " dreams for the first time.\n")

-        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer
        assert model_tokenizer.tokenizer_id == model_name

        def check_model(model):
            assert isinstance(model, RobertaEmbeddingModel)
            assert not hasattr(model, "lm_head")
-            assert isinstance(model._pooler, CLSPool)
+            assert isinstance(pooler := model.pooler, DispatchPooler)
+            assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool)

        vllm_model.apply_model(check_model)


--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -41,7 +41,7 @@ AITER_MODEL_LIST = [
    [
        pytest.param(
            os.path.join(models_path_prefix, "bigscience/bloom-560m"),  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
        ),
        pytest.param(
            os.path.join(models_path_prefix, "openai-community/gpt2"),  # gpt2
@@ -89,7 +89,11 @@ AITER_MODEL_LIST = [
        pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),  # starcoder2
        pytest.param(
            os.path.join(models_path_prefix, "TitanML/tiny-mixtral"),  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
+        ),
+        pytest.param(
+            os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924-Instruct"),
+            marks=[pytest.mark.cpu_model],
        )
    ])
 @pytest.mark.parametrize("max_tokens", [32])

--- a/tests/models/language/generation/test_gemma.py
+++ b/tests/models/language/generation/test_gemma.py
@@ -15,13 +15,13 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
                load_format="dummy",
        ) as llm:
            if model == "google/gemma-3-4b-it":
-                normalizers = llm.model.collective_rpc(
+                normalizers = llm.llm.collective_rpc(
                    lambda self: self.model_runner.model.language_model.model.
                    normalizer.cpu().item())
-                config = llm.model.llm_engine.model_config.hf_config.text_config
+                config = llm.llm.llm_engine.model_config.hf_config.text_config
            else:
-                normalizers = llm.model.collective_rpc(
+                normalizers = llm.llm.collective_rpc(
                    lambda self: self.model_runner.model.model.normalizer.cpu(
                    ).item())
-                config = llm.model.llm_engine.model_config.hf_config
+                config = llm.llm.llm_engine.model_config.hf_config
            assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -63,13 +63,6 @@ V1_SUPPORTED_MODELS = [
    os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
 ]

-ATTN_BLOCK_SIZES = {
-    os.path.join(models_path_prefix,"ibm-ai-platform/Bamba-9B-v1"): 528,
-    os.path.join(models_path_prefix,"Zyphra/Zamba2-1.2B-instruct"): 80,
-    os.path.join(models_path_prefix,"nvidia/Nemotron-H-8B-Base-8K"): 528,
-    os.path.join(models_path_prefix,"ibm-granite/granite-4.0-tiny-preview"): 400,
-    os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"): 800,
-}

 # Avoid OOM
 MAX_NUM_SEQS = 4
@@ -107,11 +100,6 @@ def test_models(
            example_prompts, max_tokens, num_logprobs)

    if model in V1_SUPPORTED_MODELS:
-        if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
-            block_size = ATTN_BLOCK_SIZES[model]
-        else:
-            block_size = 16
-
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1")
            if model in HYBRID_MODELS:
@@ -119,9 +107,7 @@ def test_models(
                m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
            with vllm_runner(model,
                             max_num_seqs=MAX_NUM_SEQS,
-                             enforce_eager=True,
-                             enable_prefix_caching=False,
-                             block_size=block_size) as vllm_model:
+                             enable_prefix_caching=False) as vllm_model:
                vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                    example_prompts, max_tokens, num_logprobs)
    else:
@@ -291,7 +277,7 @@ def test_models_preemption_recompute(
    Tests that outputs are identical with and w/o preemptions (recompute).
    """
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        scheduler = vllm_model.model.llm_engine.scheduler[0]
+        scheduler = vllm_model.llm.llm_engine.scheduler[0]
        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -240,8 +240,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
                     load_format="mistral") as vllm_model:
        for prompt in SYMBOLIC_LANG_PROMPTS:
            msg = {"role": "user", "content": prompt}
-            outputs = vllm_model.model.chat([msg],
-                                            sampling_params=SAMPLING_PARAMS)
+            outputs = vllm_model.llm.chat([msg],
+                                          sampling_params=SAMPLING_PARAMS)
            assert "�" not in outputs[0].outputs[0].text.strip()


@@ -255,11 +255,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
                     load_format="mistral") as vllm_model:

        msgs = copy.deepcopy(MSGS)
-        outputs = vllm_model.model.chat(msgs,
-                                        tools=TOOLS,
-                                        sampling_params=SAMPLING_PARAMS)
+        outputs = vllm_model.llm.chat(msgs,
+                                      tools=TOOLS,
+                                      sampling_params=SAMPLING_PARAMS)

-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
        tool_parser = MistralToolParser(tokenizer)

        model_output = outputs[0].outputs[0].text.strip()
@@ -310,7 +310,7 @@ def test_mistral_guided_decoding(
                f"Give an example JSON for an employee profile that "
                f"fits this schema: {SAMPLE_JSON_SCHEMA}"
            }]
-            outputs = vllm_model.model.chat(messages, sampling_params=params)
+            outputs = vllm_model.llm.chat(messages, sampling_params=params)

        generated_text = outputs[0].outputs[0].text
        json_response = json.loads(generated_text)

--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -23,14 +23,14 @@ MTEB_EMBED_TOL = 1e-4
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
 MTEB_RERANK_LANGS = ["en"]
-MTEB_RERANK_TOL = 1e-3
+MTEB_RERANK_TOL = 2e-3


 class VllmMtebEncoder(mteb.Encoder):

    def __init__(self, vllm_model):
        super().__init__()
-        self.model = vllm_model
+        self.llm = vllm_model
        self.rng = np.random.default_rng(seed=42)

    def encode(
@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
-        outputs = self.model.embed(sentences, use_tqdm=False)
+        outputs = self.llm.embed(sentences, use_tqdm=False)
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds
@@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
        queries = [s[0] for s in sentences]
        corpus = [s[1] for s in sentences]

-        outputs = self.model.score(queries,
-                                   corpus,
-                                   truncate_prompt_tokens=-1,
-                                   use_tqdm=False)
+        outputs = self.llm.score(queries,
+                                 corpus,
+                                 truncate_prompt_tokens=-1,
+                                 use_tqdm=False)
        scores = np.array(outputs)
        scores = scores[np.argsort(r)]
        return scores
@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,

        if model_info.architecture:
            assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+                    in vllm_model.llm.llm_engine.model_config.architectures)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype

    with hf_runner(model_info.name,
                   is_sentence_transformer=True,
@@ -267,7 +267,9 @@ def mteb_test_rerank_models(hf_runner,
                            vllm_runner,
                            model_info: RerankModelInfo,
                            vllm_extra_kwargs=None,
-                            hf_model_callback=None):
+                            hf_model_callback=None,
+                            vllm_mteb_encoder=VllmMtebEncoder,
+                            atol=MTEB_RERANK_TOL):
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
@@ -282,13 +284,13 @@ def mteb_test_rerank_models(hf_runner,
                     max_num_seqs=8,
                     **vllm_extra_kwargs) as vllm_model:

-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config

        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)
        assert model_config.hf_config.num_labels == 1

-        vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model),
+        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
        vllm_dtype = model_config.dtype
@@ -300,4 +302,4 @@ def mteb_test_rerank_models(hf_runner,
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
--- a/tests/models/language/pooling/test_baai.py
+++ b/tests/models/language/pooling/test_baai.py
@@ -68,7 +68,6 @@ RERANK_MODELS = [
                    enable_test=False),
    RerankModelInfo("BAAI/bge-reranker-v2-m3",
                    architecture="XLMRobertaForSequenceClassification",
-                    dtype="float32",
                    enable_test=False)
 ]


--- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional
+
+import numpy as np
+import pytest
+import torch
+
+from tests.conftest import HfRunner
+
+from .mteb_utils import (RerankModelInfo, VllmMtebEncoder,
+                         mteb_test_rerank_models)
+
+RERANK_MODELS = [
+    RerankModelInfo("BAAI/bge-reranker-v2-gemma",
+                    architecture="GemmaForSequenceClassification"),
+]
+
+PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."  # noqa: E501
+
+
+class GemmaRerankerHfRunner(HfRunner):
+    """Hugging Face reference scorer used by the Gemma reranker MTEB test.
+
+    The checkpoint is loaded with ``AutoModelForCausalLM`` and every
+    (query, passage) pair is scored as the sigmoid of the "Yes" token logit
+    taken at the last position of the assembled prompt.
+    """
+
+    def __init__(self,
+                 model_name: str,
+                 dtype: str = "auto",
+                 *args: Any,
+                 **kwargs: Any) -> None:
+        """Load the model and a left-padding tokenizer.
+
+        Args:
+            model_name: Name or path handed to ``HfRunner`` and to
+                ``AutoTokenizer.from_pretrained``.
+            dtype: Forwarded to ``HfRunner``.
+            *args: Accepted but not forwarded to ``HfRunner``.
+            **kwargs: Accepted but not forwarded to ``HfRunner``.
+        """
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                                       padding_side='left')
+        # Vocabulary id of the "Yes" token whose logit is read as the score.
+        self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
+
+    @torch.no_grad()
+    def predict(self, prompts: list[list[str]], *args,
+                **kwargs) -> torch.Tensor:
+        """Score every prompt with one forward pass per prompt.
+
+        Args:
+            prompts: Sequences whose first two items are the query and the
+                document; any further items are ignored.
+            *args: Unused.
+            **kwargs: Unused.
+
+        Returns:
+            A 1-D tensor holding one sigmoid score per prompt.
+        """
+
+        def get_inputs(pairs, tokenizer, prompt=None):
+            # Fall back to the module-level instruction when none is given.
+            if prompt is None:
+                prompt = PROMPT
+
+            sep = "\n"
+            prompt_inputs = tokenizer(prompt,
+                                      return_tensors=None,
+                                      add_special_tokens=False)["input_ids"]
+            sep_inputs = tokenizer(sep,
+                                   return_tensors=None,
+                                   add_special_tokens=False)["input_ids"]
+            inputs = []
+            for query, passage in pairs:
+                query_inputs = tokenizer(
+                    f"A: {query}",
+                    return_tensors=None,
+                    add_special_tokens=False,
+                    truncation=True,
+                )
+                passage_inputs = tokenizer(
+                    f"B: {passage}",
+                    return_tensors=None,
+                    add_special_tokens=False,
+                    truncation=True,
+                )
+                # First sequence: BOS + query ids. Second sequence:
+                # separator + passage ids; truncation is limited to it.
+                item = tokenizer.prepare_for_model(
+                    [tokenizer.bos_token_id] + query_inputs["input_ids"],
+                    sep_inputs + passage_inputs["input_ids"],
+                    truncation="only_second",
+                    padding=False,
+                    return_attention_mask=False,
+                    return_token_type_ids=False,
+                    add_special_tokens=False,
+                )
+                # The instruction goes last, after another separator.
+                item["input_ids"] = item[
+                    "input_ids"] + sep_inputs + prompt_inputs
+                item["attention_mask"] = [1] * len(item["input_ids"])
+                inputs.append(item)
+            return tokenizer.pad(
+                inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+
+        scores = []
+        for query, doc, *_ in prompts:
+            pairs = [(query, doc)]
+            inputs = get_inputs(pairs, self.tokenizer)
+            inputs = inputs.to(self.model.device)
+            logits = self.model(**inputs, return_dict=True).logits
+            # Last-position logit of the "Yes" token, squashed to (0, 1).
+            _scores = (logits[:, -1,
+                              self.yes_loc].view(-1, ).float().sigmoid())
+            scores.append(_scores[0].item())
+        return torch.Tensor(scores)
+
+
+class GemmaMtebEncoder(VllmMtebEncoder):
+    """vLLM-side MTEB encoder that applies the Gemma reranker templates.
+
+    Queries and documents are rendered through ``query_template`` and
+    ``document_template`` before being handed to ``VllmMtebEncoder``.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Forward all arguments to the base encoder and set the templates."""
+        super().__init__(*args, **kwargs)
+        self.prompt = PROMPT
+        self.query_template = "A: {query}\n"
+        self.document_template = "B: {doc}\n{prompt}"
+
+    def predict(
+        self,
+        sentences: list[tuple[str, str,
+                              Optional[str]]],  # query, corpus, prompt
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Apply the templates to every sentence and delegate to the base.
+
+        Args:
+            sentences: ``(query, corpus, prompt)`` triples; the third item
+                may be ``None`` and is passed through unchanged.
+            *args: Forwarded to ``VllmMtebEncoder.predict``.
+            **kwargs: Forwarded to ``VllmMtebEncoder.predict``.
+
+        Returns:
+            Whatever ``VllmMtebEncoder.predict`` returns for the templated
+            sentences.
+        """
+        # Always embed the fixed instruction stored in ``self.prompt``: the
+        # per-sentence prompt is Optional and would be rendered as the text
+        # "None", while GemmaRerankerHfRunner always appends PROMPT.
+        _sentences = [
+            (self.query_template.format(query=query),
+             self.document_template.format(doc=corpus, prompt=self.prompt),
+             prompt) for query, corpus, prompt in sentences
+        ]
+
+        return super().predict(_sentences, *args, **kwargs)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo,
+                            monkeypatch) -> None:
+    """Compare vLLM and the HF reference on the MTEB rerank task.
+
+    The test sets ``VLLM_USE_V1=0`` and plugs in the Gemma-specific HF
+    runner and MTEB encoder.
+    """
+    monkeypatch.setenv("VLLM_USE_V1", "0")
+
+    assert model_info.architecture == "GemmaForSequenceClassification"
+
+    # Overrides handed to vLLM through the ``hf_overrides`` engine argument.
+    hf_overrides: dict[str, Any] = {
+        "architectures": ["GemmaForSequenceClassification"],
+        "classifier_from_token": ["Yes"],
+        "method": "no_post_processing",
+    }
+    vllm_extra_kwargs: dict[str, Any] = {"hf_overrides": hf_overrides}
+
+    mteb_test_rerank_models(
+        GemmaRerankerHfRunner,
+        vllm_runner,
+        model_info,
+        vllm_extra_kwargs=vllm_extra_kwargs,
+        vllm_mteb_encoder=GemmaMtebEncoder,
+    )
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 from typing import Optional

+import os
 import pytest

 from vllm.config import PoolerConfig
@@ -31,8 +31,10 @@ def v1(run_with_both_engines):
        # [Decoder-only]
        pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
                     marks=[pytest.mark.core_model]),
-        pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param(
+            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
+            # CPU v1 doesn't support sliding window
+            marks=[pytest.mark.core_model]),
        # the qwen models interfere with each other (see PR
        # https://github.com/vllm-project/vllm/pull/18720).
        # To avoid this problem, for now we skip v0 since it will be
@@ -40,11 +42,13 @@ def v1(run_with_both_engines):
        pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"),
                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
        # [Encoder-only]
-        pytest.param(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
-                     marks=[
-                         pytest.mark.core_model, pytest.mark.cpu_model,
-                         pytest.mark.skip_v1
-                     ]),
+        pytest.param(
+            os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
+            marks=[
+                # CPU only supports V1
+                pytest.mark.core_model,
+                pytest.mark.skip_v1
+            ]),
        pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2"),
                     marks=[pytest.mark.skip_v1]),
        pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
@@ -66,10 +70,6 @@ def test_models(
    model,
    monkeypatch,
 ) -> None:
-    if model == os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct") and current_platform.is_cpu(
-    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
-        pytest.skip("CPU V1 doesn't support sliding window")
-
    if model == os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2") and current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend

--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -2,10 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations

-import importlib.util
-from array import array
-
 import os
+import numpy as np
+
 import openai
 import pytest
 from scipy.spatial.distance import cosine
@@ -16,9 +15,6 @@ from vllm.config import ModelConfig
 from ....utils import RemoteOpenAIServer
 from ....utils import models_path_prefix

-# GritLM embedding implementation is only supported by XFormers backend.
-pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
-                                reason="GritLM requires XFormers")

 MODEL_NAME = os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")
 MAX_MODEL_LEN = 4000
@@ -28,11 +24,11 @@ def _arr(arr):
    """
    Convert a list of integers to an array of integers.
    """
-    return array("i", arr)
+    return np.array(arr)


 def test_find_array():
-    from vllm.model_executor.models.gritlm import GritLMPooler
+    from vllm.model_executor.models.gritlm import GritLMMeanPool

    model_config = ModelConfig(
        MODEL_NAME,
@@ -43,17 +39,19 @@ def test_find_array():
        dtype="bfloat16",
        seed=0,
    )
-    pooler = GritLMPooler(model_config=model_config)
+    pooling = GritLMMeanPool(model_config=model_config)

    arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

-    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
-    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
-    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
-    assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
+    assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
+    assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
+    assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
+    assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=3) == -1
+    assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=4) == 3
+    assert pooling._find_array(arr, _arr([3, 5]), start_idx=0) == -1

    with pytest.raises(ValueError):
-        pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
+        pooling._find_array(arr, _arr([3, 4, 5]), start_idx=-1)


 def run_llm_encode(
@@ -126,7 +124,7 @@ def test_gritlm_offline_embedding(vllm_runner):
            task="embed",
            max_model_len=MAX_MODEL_LEN,
    ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm

        d_rep = run_llm_encode(
            llm,
@@ -173,7 +171,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
            task="generate",
            max_model_len=MAX_MODEL_LEN,
    ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm

        sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
        outputs = llm.generate(input, sampling_params=sampling_params)

--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -18,11 +18,8 @@ EMBEDDING_MODELS = [
 ]

 RERANK_MODELS = [
-    RerankModelInfo(
-        "jinaai/jina-reranker-v2-base-multilingual",
-        architecture="XLMRobertaForSequenceClassification",
-        dtype="float32",
-    )
+    RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual",
+                    architecture="XLMRobertaForSequenceClassification")
 ]


@@ -90,10 +87,10 @@ def test_matryoshka(
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
-        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+        assert vllm_model.llm.llm_engine.model_config.is_matryoshka

        matryoshka_dimensions = (
-            vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
+            vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
        assert matryoshka_dimensions is not None

        if dimensions not in matryoshka_dimensions:

--- a/tests/models/language/pooling/test_mxbai_rerank.py
+++ b/tests/models/language/pooling/test_mxbai_rerank.py
@@ -12,11 +12,9 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
 RERANK_MODELS = [
    RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
                    architecture="Qwen2ForSequenceClassification",
-                    dtype="float32",
                    enable_test=True),
    RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
                    architecture="Qwen2ForSequenceClassification",
-                    dtype="float32",
                    enable_test=False)
 ]


--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
 def test_default(model_info, vllm_runner):
    with vllm_runner(model_info.name, task="embed",
                     max_model_len=None) as vllm_model:
-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config
        if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
            # For nomic-embed-text-v2-moe the length is set to 512
            # by sentence_bert_config.json.
@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
    # set max_model_len <= 512
    with vllm_runner(model_info.name, task="embed",
                     max_model_len=256) as vllm_model:
-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.max_model_len == 256

    # set 512 < max_model_len <= 2048
@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
    else:
        with vllm_runner(model_info.name, task="embed",
                         max_model_len=1024) as vllm_model:
-            model_config = vllm_model.model.llm_engine.model_config
+            model_config = vllm_model.llm.llm_engine.model_config
            assert model_config.max_model_len == 1024



--- a/tests/models/language/pooling/test_qwen3_reranker.py
+++ b/tests/models/language/pooling/test_qwen3_reranker.py
@@ -6,17 +6,16 @@ import pytest
 import torch

 from tests.conftest import HfRunner
+from tests.utils import multi_gpu_test

 from .mteb_utils import RerankModelInfo, mteb_test_rerank_models

 RERANK_MODELS = [
    RerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
                    architecture="Qwen3ForSequenceClassification",
-                    dtype="float32",
                    enable_test=True),
    RerankModelInfo("Qwen/Qwen3-Reranker-4B",
                    architecture="Qwen3ForSequenceClassification",
-                    dtype="float32",
                    enable_test=False)
 ]

@@ -89,3 +88,29 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:

    mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
                            vllm_extra_kwargs)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+@multi_gpu_test(num_gpus=2)
+def test_rerank_models_mteb_tp(vllm_runner,
+                               model_info: RerankModelInfo) -> None:
+    """Run the MTEB rerank comparison with tensor_parallel_size=2."""
+    assert model_info.architecture == "Qwen3ForSequenceClassification"
+
+    hf_overrides: dict[str, Any] = {
+        "architectures": ["Qwen3ForSequenceClassification"],
+        "classifier_from_token": ["no", "yes"],
+        "is_original_qwen3_reranker": True,
+    }
+    vllm_extra_kwargs: dict[str, Any] = {
+        "hf_overrides": hf_overrides,
+        "tensor_parallel_size": 2,
+    }
+    if model_info.name == "Qwen/Qwen3-Reranker-4B":
+        vllm_extra_kwargs["max_num_seqs"] = 1
+
+    mteb_test_rerank_models(Qwen3RerankerHfRunner,
+                            vllm_runner,
+                            model_info,
+                            vllm_extra_kwargs=vllm_extra_kwargs,
+                            atol=1.2e-2)
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -84,6 +86,9 @@ def test_prm_models(
    dtype: str,
    monkeypatch,
 ) -> None:
+    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+        pytest.skip("CPU only supports V1")
+
    if current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend