Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -11,8 +11,8 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
                                       native_w8a8_block_matmul)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor, per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul)
+    cutlass_scaled_mm, get_col_major_tma_aligned_tensor,
+    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
@@ -98,6 +98,54 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
    assert rel_diff < 0.001


+@torch.inference_mode()
+def test_w8a8_block_fp8_cutlass_matmul():
+    # Test simple case where weight.shape % 128 != 0,
+    # like in DSV3 kv_a_proj_with_mqa
+    M = 32
+    N = 576
+    K = 7168
+    block_size = [128, 128]
+    out_dtype = torch.bfloat16
+    seed = 0
+
+    torch.manual_seed(seed)
+    factor_for_scale = 1e-2
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+    # Hopper requires row-major format for scales
+    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(
+        90) else Bs
+
+    A_fp8, As = per_token_group_quant_fp8(A_fp32,
+                                          block_size[1],
+                                          column_major_scales=False)
+    # CUTLASS uses column-major format for scales
+    A_fp8_cutlass, As_cutlass = per_token_group_quant_fp8(
+        A_fp32, block_size[1], column_major_scales=True)
+
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
+                                       out_dtype)
+    out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass,
+                            block_size, out_dtype)
+
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.001
+
+
 @pytest.mark.parametrize(
    "M,N,K,block_size,out_dtype,seed",
    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))

--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
@@ -111,6 +111,49 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
        torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)


+def onednn_gemm_test_helper(primitive_cache_size: int,
+                            m: int,
+                            n: int,
+                            k: int,
+                            use_bias: bool,
+                            use_stride: bool,
+                            dtype: torch.dtype = torch.bfloat16,
+                            device: str = "cpu"):
+    if use_stride:
+        a = torch.rand((m, 2 * k), dtype=dtype, device=device) * 1.5
+        a = a[:, :k]
+    else:
+        a = torch.rand((m, k), dtype=dtype, device=device) * 1.5
+
+    b = torch.rand((n, k), dtype=dtype, device=device) * 1.5
+
+    if use_bias:
+        bias = torch.rand((n, ), device=device, dtype=dtype) * 5
+        bias_f32 = bias.float()
+    else:
+        bias = None
+        bias_f32 = None
+
+    handler = ops.create_onednn_mm(
+        b.t(),
+        primitive_cache_size,
+    )
+
+    out = ops.onednn_mm(handler, a, bias)
+    baseline = torch.nn.functional.linear(a.float(), b.float(),
+                                          bias_f32).to(dtype=a.dtype)
+
+    torch.testing.assert_close(out, baseline)
+
+    if use_bias:
+        # To test runtime bias setting
+        out = ops.onednn_mm(handler, a, None)
+        baseline = torch.nn.functional.linear(a.float(), b.float(),
+                                              None).to(dtype=a.dtype)
+
+        torch.testing.assert_close(out, baseline)
+
+
 @pytest.mark.parametrize("n,k", NK_FACTORS)
 @pytest.mark.parametrize("m_list", M_FACTORS)
 @pytest.mark.parametrize("per_tensor_a_scale", [True, False])
@@ -142,3 +185,30 @@ def test_onednn_int8_scaled_gemm(
            use_azp=use_azp,
            out_dtype=output_type,
        )
+
+
+@pytest.mark.parametrize("n,k", NK_FACTORS)
+@pytest.mark.parametrize("m_list", M_FACTORS)
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("use_stride", [True, False])
+@pytest.mark.parametrize("dtype", DTYPE)
+@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
+def test_onednn_gemm(
+    n: int,
+    k: int,
+    m_list: tuple[int],
+    use_bias: bool,
+    use_stride: bool,
+    dtype: torch.dtype,
+    primitive_cache_size: int,
+):
+    for m in m_list:
+        onednn_gemm_test_helper(
+            primitive_cache_size=primitive_cache_size,
+            m=m,
+            n=n,
+            k=k,
+            use_bias=use_bias,
+            use_stride=use_stride,
+            dtype=dtype,
+        )
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:

    # We treat N-dimensional group scaling as extended numpy-style broadcasting
-    # in numpy simply stretches dimensions with an extent of 1 to match the
+    # in numpy simply stretches dimensions with an extent of 1 to match
    # the target shape by repeating the data along that dimension (broadcasting)
    # , we extend these semantics to say if the extent of a dimension in the
    # source shape is not 1 and does not match the target shape we repeat each
@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
    # then we would expand a to:
    #       a = [[1, 1, 2, 2],
    #            [3, 3, 4, 4]]
-    # NOTE this function this function does not explicitly broadcast dimensions
+    # NOTE this function does not explicitly broadcast dimensions
    # with an extent of 1, since this can be done implicitly by pytorch
    def group_broadcast(t, shape):
        for i, s in enumerate(shape):

--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -128,7 +128,7 @@ if __name__ == "__main__":
    print(f"initialized! My rank is {my_rank}")

    config = KVTransferConfig(
-        kv_connector='PyNcclConnector',
+        kv_connector='P2pNcclConnector',
        kv_buffer_device='cuda',
        kv_buffer_size=1e9,
        kv_rank=my_rank,

--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -137,7 +137,7 @@ if __name__ == "__main__":
    )

    config = KVTransferConfig(
-        kv_connector='PyNcclConnector',
+        kv_connector='P2pNcclConnector',
        kv_buffer_device='cuda',
        kv_buffer_size=1e9,
        kv_rank=my_rank,

--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -59,10 +59,10 @@ async def requests_processing_time(llm,
 @pytest.mark.asyncio
 async def test_add_lora(chatglm3_lora_files):
    """ 
-    The add_lora function is used to pre-load some LoRA adapters into the
+    The add_lora function is used to preload some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process some requests - We
-    do it twice, once with add_lora() pre-loading and once without.
+    do it twice, once with add_lora() preloading and once without.

    We measure the request processing time in both cases and expect the time 
    to be lesser in the case with add_lora() calls.

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -11,21 +11,21 @@ import pytest
 import torch
 import torch.nn.functional as F

-from vllm.config import LoRAConfig
-from vllm.lora.fully_sharded_layers import (
-    ColumnParallelLinearWithShardedLoRA,
-    MergedColumnParallelLinearWithShardedLoRA,
-    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA,
-    RowParallelLinearWithShardedLoRA)
+from vllm.config.lora import LoRAConfig
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
+                              ColumnParallelLinearWithShardedLoRA,
                              LogitsProcessorWithLoRA, LoRAMapping,
                              MergedColumnParallelLinearWithLoRA,
+                              MergedColumnParallelLinearWithShardedLoRA,
                              MergedQKVParallelLinearWithLoRA,
+                              MergedQKVParallelLinearWithShardedLoRA,
                              QKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithShardedLoRA,
                              ReplicatedLinearWithLoRA,
                              RowParallelLinearWithLoRA,
+                              RowParallelLinearWithShardedLoRA,
                              VocabParallelEmbeddingWithLoRA)
 # yapf: enable
 from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
@@ -60,9 +60,9 @@ DEVICES = ([
 # prefill stage(True) or decode stage(False)
 STAGES = [True, False]

-NUM_RANDOM_SEEDS = 6
+NUM_RANDOM_SEEDS = 2

-VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
+VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 2


 @pytest.fixture(autouse=True)

--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@@ -3,8 +3,8 @@

 import pytest

-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         VllmConfig)
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
@@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
    adapters that define additional tokens.
    """

-    # Setup a base model compatible with the sql_lora_files adapter and
+    # Set up a base model compatible with the sql_lora_files adapter and
    # a known number of tokens in the base model.
    model_config = ModelConfig(
        model=llama_2_7b_base_huggingface_id,
@@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
    adapters that do not define additional tokens.
    """

-    # Setup a base model compatible with the qwen25vl_lora_files adapter and
+    # Set up a base model compatible with the qwen25vl_lora_files adapter and
    # a known number of tokens in the base model.
    model_config = ModelConfig(
        model=qwen25vl_base_huggingface_id,

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -8,7 +8,7 @@ import torch
 from safetensors.torch import load_file
 from torch import nn

-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                              MergedColumnParallelLinearWithLoRA,
                              RowParallelLinearWithLoRA)

--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -7,7 +7,7 @@ import shutil

 import pytest

-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.peft_helper import PEFTHelper

 ERROR_CASES = [

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,9 +6,10 @@ import random
 import tempfile
 from unittest.mock import patch

-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VllmConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker

--- a/tests/model_executor/model_loader/test_registry.py
+++ b/tests/model_executor/model_loader/test_registry.py
@@ -4,7 +4,8 @@
 import pytest
 from torch import nn

-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader import (get_model_loader,
                                              register_model_loader)
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -13,13 +13,15 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
                                                            vllm_topk_softmax)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    is_rocm_aiter_moe_enabled)
-from vllm.model_executor.layers.layernorm import (
-    RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
-    rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
+from vllm.model_executor.layers.layernorm import (RMSNorm,
+                                                  dispatch_rocm_rmsnorm_func,
+                                                  fused_add_rms_norm, rms_norm)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform

+RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
+

 # Registered subclass for test
 @CustomOp.register("relu3")
@@ -149,24 +151,27 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):


 @pytest.mark.parametrize("add_residual", [True, False])
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
 @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
 @pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="AITER is a feature exclusive for ROCm")
-def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
-                           use_rocm_aiter_norm: str, monkeypatch):
+def test_rms_norm_dispatch(add_residual: bool, dtype: torch.dtype,
+                           use_rocm_aiter: str, use_rocm_aiter_norm: str,
+                           monkeypatch):
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
    monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
-    rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)
-
-    if not add_residual:
-        if current_platform.is_rocm() and int(use_rocm_aiter) and int(
-                use_rocm_aiter_norm):
-            assert rms_norm_func == rocm_aiter_rms_norm
-        else:
-            assert rms_norm_func == rms_norm
-    elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
-            use_rocm_aiter_norm):
-        assert rms_norm_func == rocm_aiter_fused_add_rms_norm
-    else:
+    rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype)
+
+    should_use_rocm_aiter = current_platform.is_rocm() and int(use_rocm_aiter) \
+        and int(use_rocm_aiter_norm) and dtype in RMS_NORM_SUPPORTED_DTYPES
+
+    if add_residual and should_use_rocm_aiter:
+        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+    elif should_use_rocm_aiter:
+        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rms_norm
+    elif add_residual:
        assert rms_norm_func == fused_add_rms_norm
+    else:
+        assert rms_norm_func == rms_norm
--- a/tests/models/language/generation/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
@@ -178,6 +178,7 @@ def run_test(
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:

@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,

--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close

 # These have unsupported head_dim for FA. We do not
-# not have a clean way to fall back, so we fail with
+# have a clean way to fall back, so we fail with
 # a clear msg when it happens.
 # https://github.com/vllm-project/vllm/issues/14524
 REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
@@ -93,7 +93,7 @@ AITER_MODEL_LIST = [
            "allenai/OLMoE-1B-7B-0924-Instruct",
            marks=[pytest.mark.cpu_model],
        ),
-        pytest.param("swiss-ai/Apertus-8B"),  # apertus
+        pytest.param("swiss-ai/Apertus-8B-2509"),  # apertus
    ])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])

--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -25,8 +25,7 @@ SSM_MODELS = [

 HYBRID_MODELS = [
    "ai21labs/Jamba-tiny-dev",
-    # skipping until vLLM implementation issues are resolved
-    # "pfnet/plamo-2-1b",
+    "pfnet/plamo-2-1b",
    "Zyphra/Zamba2-1.2B-instruct",
    "hmellor/tiny-random-BambaForCausalLM",
    "ibm-granite/granite-4.0-tiny-preview",
@@ -34,20 +33,10 @@ HYBRID_MODELS = [
    "LiquidAI/LFM2-1.2B",
 ]

-HF_UNSUPPORTED_MODELS = [
-    # The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
-    # doesn't compare vLLM output with HF output.
-    # See https://github.com/huggingface/transformers/pull/35943
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    # transformers 4.55 is still producing garbage for this model
-    # TODO(tdoublep): follow-up on transformers side
-    "ibm-granite/granite-4.0-tiny-preview"
-]
-
 V1_SUPPORTED_MODELS = [
    "state-spaces/mamba-130m-hf",
    "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
    "yujiepan/mamba2-codestral-v0.1-tiny-random",
    "Zyphra/Zamba2-1.2B-instruct",
    "hmellor/tiny-random-BambaForCausalLM",
@@ -58,6 +47,7 @@ V1_SUPPORTED_MODELS = [

 FULL_CUDA_GRAPH_MODELS = [
    "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
    "Zyphra/Zamba2-1.2B-instruct",
 ]

@@ -65,6 +55,11 @@ V0_UNSUPPORTED_MODELS = [
    "LiquidAI/LFM2-1.2B",
 ]

+FP32_STATE_MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "Zyphra/Zamba2-1.2B-instruct",
+]
+
 # Avoid OOM
 MAX_NUM_SEQS = 4

@@ -85,20 +80,13 @@ def test_models(
    try:
        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
        model_info.check_available_online(on_fail="skip")
-        hf_version_check = model_info.check_transformers_version(
-            on_fail="return")
+        model_info.check_transformers_version(on_fail="skip")
    except ValueError:
-        hf_version_check = None
-
-    if hf_version_check is not None:
-        print(f"Skipping transformers comparison because: {hf_version_check}")
+        pass

    with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@@ -116,7 +104,7 @@ def test_models(
    else:
        vllm_v1_outputs = None

-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@@ -125,12 +113,10 @@ def test_models(
        )

    if model in V1_SUPPORTED_MODELS:
-        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-        assert ref_outputs is not None
        check_logprobs_close(
-            outputs_0_lst=ref_outputs,
+            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v1_outputs,
-            name_0="hf" if hf_outputs is not None else "vllm-v0",
+            name_0="hf",
            name_1="vllm-v1",
        )

@@ -315,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    finished_requests_ids is larger than the maximum mamba block capacity.

    This could generally happen due to the fact that hybrid does support
-    statelessness mechanism where it can cleanup new incoming requests in
+    statelessness mechanism where it can clean up new incoming requests in
    a single step.
    """
    try:
@@ -336,7 +322,7 @@ def test_state_cleanup(
    This test is for verifying that the Hybrid state is cleaned up between
    steps.
    
-    If its not cleaned, an error would be expected.
+    If it's not cleaned, an error would be expected.
    """
    try:
        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
@@ -397,11 +383,8 @@ def test_full_cuda_graph(
        pass

    with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@@ -416,7 +399,7 @@ def test_full_cuda_graph(
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_v0_outputs,
@@ -424,17 +407,15 @@ def test_full_cuda_graph(
            name_1="vllm-v0",
        )

-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-    assert ref_outputs is not None
    check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
        name_1="vllm-v1",
    )


-@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
+@pytest.mark.parametrize("model", FP32_STATE_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_fp32_state(
@@ -455,11 +436,8 @@ def test_fp32_state(
        pass

    with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
@@ -475,18 +453,16 @@ def test_fp32_state(
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    if hf_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_v0_outputs,
+        name_0="hf",
+        name_1="vllm-v0",
+    )

-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
    check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
        name_1="vllm-v1",
    )
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    # uses the v3-Tekken tokenizer
    "mistralai/Ministral-8B-Instruct-2410",
-    # Mistral-Nemo is to big for CI, but passes locally
+    # Mistral-Nemo is too big for CI, but passes locally
    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]

@@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:


 def test_mistral_function_call_nested_json():
-    """Ensure that the function-name regex captures the entire outer-most
+    """Ensure that the function-name regex captures the entire outermost
    JSON block, including nested braces."""

    # Create a minimal stub tokenizer that provides the few attributes the

--- a/tests/models/language/generation_ppl_test/__init__.py
+++ b/tests/models/language/generation_ppl_test/__init__.py
--- a/tests/models/language/generation_ppl_test/ppl_utils.py
+++ b/tests/models/language/generation_ppl_test/ppl_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/docs/transformers/perplexity
+from typing import Optional, cast
+
+import pytest
+import torch
+from datasets import load_dataset
+
+from tests.models.utils import (GenerateModelInfo,
+                                TokensTextLogprobsPromptLogprobs)
+from vllm.logprobs import Logprob
+
+# See #24485
+PPL_TOL = 0.01
+MAX_LENGTH = 1024
+
+
+@torch.inference_mode
+def wikitext_ppl_test(hf_runner,
+                      vllm_runner,
+                      model_info: GenerateModelInfo,
+                      max_length=MAX_LENGTH,
+                      vllm_extra_kwargs=None,
+                      atol=PPL_TOL):
+
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
+    if not model_info.enable_test:
+        pytest.skip("Skipping test.")
+
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+
+    # Allow vllm to test using the given dtype, such as float32
+    vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype
+
+    # Allow vllm to test using hf_overrides
+    if model_info.hf_overrides is not None:
+        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
+
+    with vllm_runner(model_info.name,
+                     gpu_memory_utilization=0.7,
+                     max_model_len=max_length,
+                     max_num_seqs=1,
+                     enforce_eager=True,
+                     **vllm_extra_kwargs) as vllm_model:
+        # Use max_num_seqs=1 to avoid OOM,
+        # and to avoid batching different requests together.
+
+        model_config = vllm_model.llm.llm_engine.model_config
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert (model_info.architecture in model_config.architectures)
+
+        max_length = min(model_config.max_model_len - 1, max_length)
+        stride = max_length
+
+        tokenizer = vllm_model.llm.get_tokenizer()
+        tokens = tokenizer.encode("\n\n".join(dataset["text"]))
+        n_tokens = len(tokens)
+
+        chunks = []
+        for begin_loc in range(0, n_tokens, stride):
+            end_loc = min(begin_loc + max_length, n_tokens)
+            chunks.append(tokens[begin_loc:end_loc])
+
+        outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
+                                                      max_tokens=1,
+                                                      num_logprobs=None,
+                                                      num_prompt_logprobs=0,
+                                                      use_tqdm=False)
+        nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
+        n_tokens = 0
+        for output in outputs:
+            output = cast(TokensTextLogprobsPromptLogprobs, output)
+            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+
+            assert token_datas[0] is None
+            token_log_probs = []
+            for token_data in token_datas[1:]:
+                assert token_data is not None
+                assert len(token_data) == 1
+                token_log_prob = list(token_data.values())[0].logprob
+                token_log_probs.append(token_log_prob)
+
+            neg_log_likelihood = -torch.tensor(
+                token_log_probs, dtype=torch.float32, device="cpu").sum()
+            nll_sum += neg_log_likelihood
+            n_tokens += len(token_log_probs)
+        vllm_ppl = float(torch.exp(nll_sum / n_tokens))
+        vllm_dtype = model_config.dtype
+
+    # Speed up the test by using a precomputed Transformers PPL when given
+    if model_info.hf_ppl is None:
+        with hf_runner(
+                model_info.name,
+                dtype=model_info.hf_dtype,
+        ) as hf_model:
+            nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
+            n_tokens = 0
+            for chunk in chunks:
+                inputs = hf_model.wrap_device(
+                    {"input_ids": torch.tensor([chunk])})
+                input_ids = inputs["input_ids"]
+                outputs = hf_model.model(input_ids, labels=input_ids)
+                neg_log_likelihood = outputs.loss
+
+                neg_log_likelihood = neg_log_likelihood.to(torch.float32).cpu()
+
+                num_loss_tokens = len(chunk) - 1
+                nll_sum += neg_log_likelihood * num_loss_tokens
+                n_tokens += num_loss_tokens
+
+            hf_ppl = float(torch.exp(nll_sum / n_tokens))
+            hf_dtype = next(hf_model.model.parameters()).dtype
+    else:
+        hf_ppl = model_info.hf_ppl
+        hf_dtype = "Constant"
+
+    differ = (vllm_ppl - hf_ppl) / hf_ppl
+    print("Model:", model_info.name)
+    print("VLLM:", vllm_dtype, vllm_ppl)
+    print("Transformers:", hf_dtype, hf_ppl)
+    print("Difference (%):", differ * 100)
+
+    # The smaller the PPL, the better.
+    # A vLLM PPL lower than the Transformers PPL is not a concern,
+    # so we only perform a one-sided test.
+    assert differ < atol
--- a/tests/models/language/generation_ppl_test/test_gemma.py
+++ b/tests/models/language/generation_ppl_test/test_gemma.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from tests.models.utils import GenerateModelInfo
+
+from .ppl_utils import wikitext_ppl_test
+
+MODELS = [
+    GenerateModelInfo("google/gemma-2b"),
+    GenerateModelInfo("google/gemma-2-2b"),
+    GenerateModelInfo("google/gemma-3-4b-it"),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
+    wikitext_ppl_test(hf_runner, vllm_runner, model_info)