Merge tag 'v0.8.5' into v0.8.5-dev

dcb5624a · zhuwenwen · 55880ca2 · ba41cc90 · dcb5624a · 55880ca2
Commit dcb5624a authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/test_cutlass_mla_decode.py
+++ b/tests/kernels/test_cutlass_mla_decode.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+
+# Skip every test in this module at collection time unless the current
+# platform passes the has_device_capability(100) check.
+if not current_platform.has_device_capability(100):
+    pytest.skip(
+        reason="Cutlass MLA Requires compute capability of 10 or above.",
+        allow_module_level=True)
+
+
+def ref_mla(
+        out: Tensor,  # (bs, num_heads, v_head_dim)
+        query: Tensor,  # (bs, num_heads, head_dim)
+        kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+        scale: float,
+        block_tables: Tensor,  # (bs, max_num_blocks)
+        seq_lens: Tensor,  # (bs,)
+):
+    bs, num_heads, v_head_dim = out.shape
+    head_dim = query.shape[2]
+
+    for i in range(bs):
+        # gather and flatten KV-cache
+        kv = kv_cache[
+            block_tables[i]]  # (max_num_blocks, block_size, head_dim)
+        kv = kv.view(1, -1,
+                     head_dim)[:, :seq_lens[i]]  # (1, seq_len, head_dim)
+        v = kv[:, :, :v_head_dim]
+
+        q = query[i].view(num_heads, 1, head_dim)
+        o = F.scaled_dot_product_attention(q,
+                                           kv,
+                                           v,
+                                           scale=scale,
+                                           enable_gqa=True)
+        out[i] = o.view(num_heads, v_head_dim)
+
+    return out
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("mean_seq_len", [128, 1024, 4096])
+@pytest.mark.parametrize("bs", [1, 2, 4])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.parametrize("block_size", [16, 64, 128])
+def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
+                            varlen: bool, block_size: int):
+    torch.set_default_dtype(dtype)
+    torch.set_default_device('cuda')
+    torch.manual_seed(42)
+
+    d = 576
+    h_q = 128
+    dv = 512
+
+    q_nope_dim = 128
+    q_pe_dim = 64
+    scale = (q_nope_dim + q_pe_dim)**(-0.5)
+    if varlen:
+        seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
+        seq_lens = seq_lens.clip(2).to(torch.int32)
+    else:
+        seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
+    max_seq_len = seq_lens.max().item()
+    block_num = (max_seq_len + block_size - 1) // block_size
+
+    # Pad block_num so that small blocks can be packed into full 128-sized
+    # CUTLASS tiles. One 128-wide tile can hold (128 // block_size) small
+    # blocks.
+    pack_factor = 128 // block_size
+    block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
+
+    q = torch.randn(bs, h_q, d)
+    block_table = torch.randint(0,
+                                bs * block_num, (bs, block_num),
+                                dtype=torch.int32)
+
+    kv_cache = torch.randn(block_table.numel(), block_size, d)
+
+    out_ref = q.new_zeros(bs, h_q, dv)
+    ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
+    out_ans = torch.zeros_like(out_ref)
+    q_nope = q[:, :, :dv].clone()
+    q_pe = q[:, :, dv:].clone()
+    ops.cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache, seq_lens,
+                           block_table, scale)
+
+    torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_cutlass_moe.py
+++ b/tests/kernels/test_cutlass_moe.py
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
-from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
-                                                            fused_topk)
-from vllm.platforms import current_platform
-
-NUM_EXPERTS = [40, 64]
-TOP_KS = [6, 8]
-
-
-def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
-        w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
-        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
-    with set_current_vllm_config(
-            VllmConfig(parallel_config=ParallelConfig(
-                pipeline_parallel_size=1))):
-        return cutlass_moe_fp8(a,
-                               w1_q,
-                               w2_q,
-                               w1_scale,
-                               w2_scale,
-                               topk_weights,
-                               topk_ids,
-                               ab_strides1,
-                               c_strides1,
-                               ab_strides2,
-                               c_strides2,
-                               a1_scale=a_scale)
-
-
-@pytest.mark.parametrize("m", [2, 64, 224])
-@pytest.mark.parametrize("n", [1024, 3072])
-@pytest.mark.parametrize("k", [1024, 1536])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.skipif(
-    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
-def test_cutlass_moe_no_graph(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    per_act_token: bool,
-    per_out_ch: bool,
-):
-    current_platform.seed_everything(7)
-    with set_current_vllm_config(
-            VllmConfig(parallel_config=ParallelConfig(
-                pipeline_parallel_size=1))):
-
-        dtype = torch.half
-
-        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-
-        # Get the right scale for tests.
-        _, a_scale1 = ops.scaled_fp8_quant(
-            a, use_per_token_if_dynamic=per_act_token)
-        a_q, _ = ops.scaled_fp8_quant(a,
-                                      a_scale1,
-                                      use_per_token_if_dynamic=per_act_token)
-
-        a_d = a_q.float().mul(a_scale1).to(dtype)
-
-        n_b_scales = 2 * n if per_out_ch else 1
-        k_b_scales = k if per_out_ch else 1
-
-        w1_q = torch.empty((e, 2 * n, k),
-                           device="cuda",
-                           dtype=torch.float8_e4m3fn)
-        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
-        w1_scale = torch.empty((e, n_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
-        w2_scale = torch.empty((e, k_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
-
-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-
-        for expert in range(e):
-            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
-                w1[expert], use_per_token_if_dynamic=per_out_ch)
-            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
-                w2[expert], use_per_token_if_dynamic=per_out_ch)
-        w1_q = w1_q.transpose(1, 2)
-        w2_q = w2_q.transpose(1, 2)
-
-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-
-        w1_d = torch.empty_like(w1)
-        w2_d = torch.empty_like(w2)
-        for expert in range(e):
-            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
-            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()
-
-        score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
-
-        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)
-
-        cutlass_output = cutlass_moe_fp8(a,
-                                         w1_q,
-                                         w2_q,
-                                         w1_scale,
-                                         w2_scale,
-                                         topk_weights,
-                                         topk_ids,
-                                         ab_strides1,
-                                         c_strides1,
-                                         ab_strides2,
-                                         c_strides2,
-                                         a1_scale=a_scale1)
-
-        #print(triton_output)
-        #print(cutlass_output)
-        #print("*")
-
-        torch.testing.assert_close(triton_output,
-                                   cutlass_output,
-                                   atol=5e-2,
-                                   rtol=1e-2)
-
-
-@pytest.mark.parametrize("m", [2, 64, 224])
-@pytest.mark.parametrize("n", [1024, 3072])
-@pytest.mark.parametrize("k", [1024, 1536])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
-@pytest.mark.skipif(
-    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
-        current_platform.get_device_capability()),
-    reason="Grouped gemm is not supported on this GPU type.")
-def test_cutlass_moe_cuda_graph(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    per_act_token: bool,
-    per_out_ch: bool,
-):
-    current_platform.seed_everything(7)
-    with set_current_vllm_config(
-            VllmConfig(parallel_config=ParallelConfig(
-                pipeline_parallel_size=1))):
-
-        dtype = torch.half
-
-        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-
-        # Get the right scale for tests.
-        _, a_scale1 = ops.scaled_fp8_quant(
-            a, use_per_token_if_dynamic=per_act_token)
-        a_q, _ = ops.scaled_fp8_quant(a,
-                                      a_scale1,
-                                      use_per_token_if_dynamic=per_act_token)
-
-        a_d = a_q.float().mul(a_scale1).to(dtype)
-
-        n_b_scales = 2 * n if per_out_ch else 1
-        k_b_scales = k if per_out_ch else 1
-
-        w1_q = torch.empty((e, 2 * n, k),
-                           device="cuda",
-                           dtype=torch.float8_e4m3fn)
-        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
-        w1_scale = torch.empty((e, n_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
-        w2_scale = torch.empty((e, k_b_scales, 1),
-                               device="cuda",
-                               dtype=torch.float32)
-
-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-
-        for expert in range(e):
-            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
-                w1[expert], use_per_token_if_dynamic=per_out_ch)
-            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
-                w2[expert], use_per_token_if_dynamic=per_out_ch)
-        w1_q = w1_q.transpose(1, 2)
-        w2_q = w2_q.transpose(1, 2)
-
-        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
-        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
-        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
-
-        w1_d = torch.empty_like(w1)
-        w2_d = torch.empty_like(w2)
-        for expert in range(e):
-            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
-            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()
-
-        score = torch.randn((m, e), device="cuda", dtype=dtype)
-        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
-
-        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)
-
-        stream = torch.cuda.Stream()
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph, stream=stream):
-            cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale,
-                                 topk_weights, topk_ids, ab_strides1,
-                                 c_strides1, ab_strides2, c_strides2)
-        torch.cuda.synchronize()
-        graph.replay()
-        torch.cuda.synchronize()
-
-        #print(triton_output)
-        #print(cutlass_output)
-        #print("*")
-
-        torch.testing.assert_close(triton_output,
-                                   cutlass_output,
-                                   atol=9e-2,
-                                   rtol=1e-2)
--- a/tests/kernels/test_triton_flash_attention.py
+++ b/tests/kernels/test_triton_flash_attention.py
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the triton_flash_attention kernel
+
+Run `pytest tests/kernels/test_triton_flash_attention.py`.
+"""
+import pytest
+import torch
+
+from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
+                                                       MetaData,
+                                                       compute_alibi_tensor,
+                                                       scale_fp8,
+                                                       triton_attention_rocm)
+from vllm.platforms import current_platform
+
+
+class ReferenceAttention:
+
+    def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype,
+                 input_metadata):
+        self.Z = Z
+        self.HQ = HQ
+        self.HK = HK
+        self.N_CTX_Q = N_CTX_Q
+        self.N_CTX_K = N_CTX_K
+        self.D_HEAD = D_HEAD
+        self.use_alibi = use_alibi
+        self.dtype = dtype
+        self.input_metadata = input_metadata
+
+    def fwd(self, q, k, v):
+        scores = torch.einsum('bhqd,bhkd->bhqk', q,
+                              k).float() * self.input_metadata.sm_scale
+        if self.input_metadata.causal:
+            mask = torch.tril(torch.ones(self.N_CTX_Q,
+                                         self.N_CTX_K,
+                                         device="cuda"),
+                              diagonal=self.N_CTX_K - self.N_CTX_Q)
+            scores[:, :, mask == 0] = float("-inf")
+
+        if self.input_metadata.bias is not None:
+            scores += self.input_metadata.bias
+
+        if self.use_alibi:
+            scores += compute_alibi_tensor(self.input_metadata.alibi_slopes,
+                                           self.N_CTX_Q, self.N_CTX_K)
+
+        p = torch.softmax(scores, dim=-1)
+        if self.input_metadata.causal:
+            # If N_CTX_Q > N_CTX_K, there's at least one row of all -infs going
+            # into softmax. This creates a row of NaNs as -inf - -inf == NaN.
+            # So we fix this by converting the NaNs to 0s, which is what they
+            # should be out of the softmax.
+            nan_mask = torch.isnan(p)
+            p[nan_mask == 1] = 0
+        ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v)
+        # compare
+        if self.input_metadata.layout == 'bshd':
+            ref_out = ref_out.transpose(1, 2).clone()
+        return ref_out
+
+    def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
+        q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
+            self.dtype)
+        k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
+            self.dtype)
+        v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
+            self.dtype)
+        result = self.fwd(q, k, v)
+        if self.input_metadata.o_scale is not None:
+            result, _ = scale_fp8(result, self.input_metadata.o_scale)
+        return result
+
+    def fwd_fp8_kv(self, q, k_quantized, v_quantized):
+        k_descale, v_descale = (self.input_metadata.k_descale,
+                                self.input_metadata.v_descale)
+        k_dequantized = (k_quantized.to(torch.float32) *
+                         k_descale.to(torch.float32)).to(self.dtype)
+        v_dequantized = (v_quantized.to(torch.float32) *
+                         v_descale.to(torch.float32)).to(self.dtype)
+        return self.fwd(q, k_dequantized, v_dequantized)
+
+    def varlen_fwd(self, q, k, v, is_mqa=False):
+        ref_out = torch.empty_like(q)
+        if is_mqa:
+            # Make KV look like HQ/HK "groups" of HK. Later, we will reshape so
+            # the size aligns with Q.
+            k_ref = k.view(k.shape[0], k.shape[1], 1,
+                           k.shape[2]).expand(-1, -1, self.HQ // self.HK, -1)
+            v_ref = v.view(v.shape[0], v.shape[1], 1,
+                           v.shape[2]).expand(-1, -1, self.HQ // self.HK, -1)
+        else:
+            k_ref = k
+            v_ref = v
+
+        for i in range(0, self.input_metadata.num_contexts):
+            start_q, start_k = self.input_metadata.cu_seqlens_q[
+                i], self.input_metadata.cu_seqlens_k[i]
+            end_q, end_k = self.input_metadata.cu_seqlens_q[
+                i + 1], self.input_metadata.cu_seqlens_k[i + 1]
+            k_curr = k_ref[start_k:end_k]
+            v_curr = v_ref[start_k:end_k]
+            if is_mqa:
+                k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3])
+                v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3])
+            scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q],
+                                  k_curr).float()
+            p = torch.softmax(scores * self.input_metadata.sm_scale,
+                              dim=-1).half()
+            ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr)
+        return ref_out
+
+
+def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False):
+    q_descale = None
+    if not fp8_kv:
+        q, q_descale = scale_fp8(q)
+    k, k_descale = scale_fp8(k)
+    v, v_descale = scale_fp8(v)
+
+    # In real world use case, the p scale would be a parameter trained by the
+    # model.
+    p_scale = None
+
+    o_scale = torch.rand(1, device="cuda",
+                         requires_grad=False) if use_o_scale else None
+
+    return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale
+
+
+def input_helper(
+    Z,
+    HQ,
+    HK,
+    N_CTX_Q,
+    N_CTX_K,
+    D_HEAD,
+    dtype,
+    layout=None,
+    use_alibi=None,
+    causal=None,
+    is_fp8=False,
+    fp8_kv=False,
+    use_o_scale=False,
+    use_bias=False,
+):
+    assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
+
+    current_platform.seed_everything(0)
+
+    # Initialize q, k, v
+    if layout == 'bhsd':
+        q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD)
+        k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD)
+    elif layout == 'bshd':
+        q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD)
+        k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD)
+
+    if use_alibi:
+        # for n heads the set of slopes is the geometric sequence that starts
+        # 2^(-8/n)
+        alibi_slopes = torch.tensor(
+            [2**(-8 / HQ * i) for i in range(1, HQ + 1)],
+            dtype=torch.float32,
+            device="cuda").repeat(Z, 1)
+    else:
+        alibi_slopes = None
+
+    if use_bias:
+        bias = torch.randn((1, HQ, N_CTX_Q, N_CTX_K),
+                           dtype=dtype,
+                           device="cuda",
+                           requires_grad=False)
+    else:
+        bias = None
+
+    q = torch.randn(q_tensor_shape,
+                    dtype=dtype,
+                    device="cuda",
+                    requires_grad=False)
+    k = torch.randn(k_tensor_shape,
+                    dtype=dtype,
+                    device="cuda",
+                    requires_grad=False)
+    v = torch.randn(k_tensor_shape,
+                    dtype=dtype,
+                    device="cuda",
+                    requires_grad=False)
+
+    if is_fp8:
+        (q, k, v, q_descale, k_descale, v_descale, p_scale,
+         o_scale) = quantize_input(q,
+                                   k,
+                                   v,
+                                   use_o_scale=use_o_scale,
+                                   fp8_kv=fp8_kv)
+    else:
+        q_descale = k_descale = v_descale = p_scale = o_scale = None
+
+    input_metadata = MetaData(sm_scale=D_HEAD**-0.5,
+                              max_seqlens_q=N_CTX_Q,
+                              max_seqlens_k=N_CTX_K,
+                              layout=layout,
+                              alibi_slopes=alibi_slopes,
+                              alibi_batch=Z,
+                              alibi_nheads=HQ,
+                              q_descale=q_descale,
+                              k_descale=k_descale,
+                              v_descale=v_descale,
+                              p_scale=p_scale,
+                              o_scale=o_scale,
+                              bias=bias,
+                              seqlen_q=N_CTX_Q,
+                              seqlen_k=N_CTX_K)
+    return q, k, v, input_metadata
+
+
+def varlen_input_helper(Z,
+                        HQ,
+                        HK,
+                        N_CTX_Q,
+                        N_CTX_K,
+                        D_HEAD,
+                        dtype,
+                        equal_seqlens=False):
+    """Build packed variable-length q/k/v tensors and their ``MetaData``.
+
+    ``Z`` sequences are generated. With ``equal_seqlens`` every sequence has
+    length ``N_CTX // Z``; otherwise each length is drawn from
+    ``[1, N_CTX // Z]``. The tensors are (total_tokens, heads, D_HEAD) on
+    CUDA, and the cumulative sequence lengths are registered on the metadata
+    through ``set_varlen_params``.
+
+    Returns:
+        ``(q, k, v, input_metadata)``.
+    """
+    current_platform.seed_everything(0)
+
+    # Random sequence lengths. Using N_CTX as kind of max of sum of individual
+    # seqs
+    if not equal_seqlens:
+        max_seqlens_q = N_CTX_Q // Z
+        max_seqlens_k = N_CTX_K // Z
+        # randint's upper bound is exclusive, hence the + 1.
+        seqlens_q = torch.randint(1,
+                                  max_seqlens_q + 1, (Z, ),
+                                  dtype=torch.int32)
+        seqlens_k = torch.randint(1,
+                                  max_seqlens_k + 1, (Z, ),
+                                  dtype=torch.int32)
+    else:
+        seqlens_q = torch.full((Z, ), N_CTX_Q // Z)
+        seqlens_k = torch.full((Z, ), N_CTX_K // Z)
+
+    # Calculate cumulative sequence lengths
+    # A leading 0 is prepended so entry i / i + 1 bound sequence i.
+    cu_seqlens_q = torch.cat([
+        torch.tensor([0], dtype=torch.int32),
+        seqlens_q.cumsum(dim=0, dtype=torch.int32)
+    ])
+    cu_seqlens_k = torch.cat([
+        torch.tensor([0], dtype=torch.int32),
+        seqlens_k.cumsum(dim=0, dtype=torch.int32)
+    ])
+    cu_seqlens_q = cu_seqlens_q.to(device="cuda")
+    cu_seqlens_k = cu_seqlens_k.to(device="cuda")
+
+    # Initialize q, k, v with variable lengths
+    total_q = cu_seqlens_q[-1].item()
+    total_k = cu_seqlens_k[-1].item()
+    # The randn values are overwritten in place by normal_(std=0.5); both
+    # calls consume random numbers, so the final values depend on this order.
+    q = torch.randn((total_q, HQ, D_HEAD), dtype=dtype,
+                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
+    k = torch.randn((total_k, HK, D_HEAD), dtype=dtype,
+                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
+    v = torch.randn((total_k, HK, D_HEAD), dtype=dtype,
+                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
+    sm_scale = D_HEAD**-0.5
+    input_metadata = MetaData(sm_scale=sm_scale)
+    input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
+    return q, k, v, input_metadata
+
+
+@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [
+    (1, 48, 12, 1, 1, 64),
+    (4, 4, 4, 128, 128, 65),
+    (16, 48, 48, 1, 1, 128),
+    (64, 48, 24, 3, 3, 128),
+    (4, 4, 4, 113, 123, 1),
+])
+@pytest.mark.parametrize('causal', [True, False])
+@pytest.mark.parametrize('use_alibi', [True, False])
+@pytest.mark.parametrize('layout', ['bshd'])
+def test_op_fwd(Z,
+                HQ,
+                HK,
+                N_CTX_Q,
+                N_CTX_K,
+                D_HEAD,
+                causal,
+                use_alibi,
+                layout,
+                dtype=torch.float16):
+    """Compare ``triton_attention_rocm`` with ``ReferenceAttention.fwd``.
+
+    Covers the 'bshd' layout, with and without alibi, including cases where
+    the query and key head counts differ (HQ != HK).
+    """
+    current_platform.seed_everything(0)
+    q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
+                                           dtype, layout, use_alibi, causal)
+
+    o = torch.empty_like(q)
+
+    # triton implementation
+    tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata)
+
+    # Transpose here if layout is bshd so we have same reference code for all
+    # layouts
+    if layout == 'bshd':
+        q = q.transpose(1, 2).clone()
+        k = k.transpose(1, 2).clone()
+        v = v.transpose(1, 2).clone()
+    # Replicate K and V if using MQA/GQA
+    # (insert a size-1 group axis after the head axis, expand it to HQ // HK
+    # copies, then fold heads and groups back into a single axis)
+    if HQ != HK:
+        k = k.view(k.shape[0], k.shape[1], -1, k.shape[2],
+                   k.shape[3]).expand(-1, -1, HQ // HK, -1,
+                                      -1).reshape(k.shape[0], -1, k.shape[2],
+                                                  k.shape[3])
+        v = v.view(v.shape[0], v.shape[1], -1, v.shape[2],
+                   v.shape[3]).expand(-1, -1, HQ // HK, -1,
+                                      -1).reshape(v.shape[0], -1, v.shape[2],
+                                                  v.shape[3])
+
+    ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
+                                  use_alibi, dtype, input_metadata)
+    ref_out = ref_impl.fwd(q, k, v)
+
+    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
+
+
+@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
+    (4, 48, 1, 1, 64),
+    (4, 48, 1, 1, 128),
+    (4, 48, 3, 3, 128),
+    (4, 4, 128, 128, 65),
+])
+@pytest.mark.parametrize('causal', [True, False])
+@pytest.mark.parametrize('layout', ['bhsd'])
+@pytest.mark.parametrize('use_o_scale', [True, False])
+# NOTE(review): get_device_capability() is evaluated when this module is
+# collected, and the value compared against is a device compute capability
+# rather than a CUDA toolkit version as the reason text suggests.
+@pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0),
+                    reason="Triton FP8 requires CUDA 9.0 or higher")
+def test_op_fwd_fp8(Z,
+                    H,
+                    N_CTX_Q,
+                    N_CTX_K,
+                    D_HEAD,
+                    causal,
+                    layout,
+                    use_o_scale,
+                    dtype=torch.float32):
+    """Compare the kernel on fp8-quantized q/k/v with the dequantized
+    reference (``ReferenceAttention.fwd_fp8``), with and without an output
+    scale."""
+    current_platform.seed_everything(0)
+
+    # Gradients are disabled on the inputs (input_helper creates them with
+    # requires_grad=False) to save memory, so the test does not run out of
+    # memory on CI machines.
+
+    q_quantized, k_quantized, v_quantized, input_metadata = input_helper(
+        Z,
+        H,
+        H,
+        N_CTX_Q,
+        N_CTX_K,
+        D_HEAD,
+        dtype,
+        causal=causal,
+        layout=layout,
+        is_fp8=True,
+        use_o_scale=use_o_scale)
+
+    # An output buffer is only supplied when an output scale is in use.
+    o = torch.empty_like(q_quantized) if use_o_scale else None
+
+    tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized,
+                                       o, input_metadata)
+
+    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
+                                  dtype, input_metadata)
+    ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized)
+
+    # compare
+    # Both sides are cast to float32 before the comparison.
+    torch.testing.assert_close(ref_out.to(torch.float32),
+                               tri_out.to(torch.float32),
+                               atol=7e-2,
+                               rtol=2e-1)
+
+
+@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
+    (4, 48, 1, 1, 64),
+    (4, 48, 1, 1, 128),
+    (4, 48, 3, 3, 128),
+    (4, 4, 128, 128, 65),
+    (4, 4, 113, 123, 1),
+])
+@pytest.mark.parametrize('causal', [True, False])
+@pytest.mark.parametrize('layout', ['bhsd'])
+def test_op_fwd_fp8_kv(Z,
+                       H,
+                       N_CTX_Q,
+                       N_CTX_K,
+                       D_HEAD,
+                       causal,
+                       layout,
+                       dtype=torch.float32):
+    """Compare the kernel with an unquantized q and fp8-quantized k/v
+    against ``ReferenceAttention.fwd_fp8_kv``."""
+    current_platform.seed_everything(0)
+
+    # fp8_kv=True makes input_helper leave q unquantized.
+    q, k_quantized, v_quantized, input_metadata = input_helper(Z,
+                                                               H,
+                                                               H,
+                                                               N_CTX_Q,
+                                                               N_CTX_K,
+                                                               D_HEAD,
+                                                               dtype,
+                                                               causal=causal,
+                                                               layout=layout,
+                                                               is_fp8=True,
+                                                               fp8_kv=True)
+
+    o = torch.empty_like(q)
+
+    tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o,
+                                       input_metadata)
+
+    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
+                                  dtype, input_metadata)
+    ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized)
+
+    # NOTE(review): rtol=8e-1 is a very loose relative tolerance - confirm it
+    # is intentional.
+    torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1)
+
+
+@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
+    (4, 48, 1, 1, 64),
+    (4, 48, 1, 1, 128),
+    (4, 48, 3, 3, 128),
+    (4, 4, 128, 128, 65),
+])
+@pytest.mark.parametrize('causal', [True, False])
+@pytest.mark.parametrize('use_bias', [True])
+@pytest.mark.parametrize('dtype', [torch.bfloat16])
+def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
+    current_platform.seed_everything(0)
+    q, k, v, input_metadata = input_helper(Z,
+                                           H,
+                                           H,
+                                           N_CTX_Q,
+                                           N_CTX_K,
+                                           D_HEAD,
+                                           dtype,
+                                           layout='bhsd',
+                                           causal=causal,
+                                           use_bias=use_bias)
+    o = torch.empty_like(q)
+
+    # triton implementation
+    tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata)
+
+    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
+                                  dtype, input_metadata)
+    ref_out = ref_impl.fwd(q, k, v)
+
+    # compare
+    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
+
+
+# NOTE: Uses thd layout, so also tests thd.
+@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(1, 48, 256, 64),
+                                                 (4, 48, 512, 64),
+                                                 (16, 48, 512, 64),
+                                                 (64, 48, 128, 128)])
+@pytest.mark.parametrize('causal', [True, False])
+def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16):
+    """Check the Triton varlen forward pass with equal Q and KV head counts.
+
+    ``dtype`` is not parametrized, so the test always runs with the
+    ``torch.float16`` default.
+    """
+    # NOTE(review): ``causal`` is parametrized but never referenced in this
+    # body, so both parametrizations execute the same calls -- confirm
+    # whether it should be forwarded to the helper or the metadata.
+
+    q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX,
+                                                  D_HEAD, dtype)
+
+    # The kernel's return value is discarded; the comparison below reads
+    # the buffer that was passed in as the output argument.
+    tri_out = torch.empty_like(q)
+    triton_attention_rocm(q, k, v, tri_out, input_metadata)
+
+    ref_impl = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype,
+                                  input_metadata)
+    ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=False)
+
+    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
+
+
+# NOTE: Uses thd layout, so also tests thd.
+# Every case uses fewer key/value heads (HK) than query heads (HQ).
+@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [(2, 48, 24, 128, 64),
+                                                      (4, 48, 12, 256, 64),
+                                                      (4, 48, 4, 512, 64),
+                                                      (4, 64, 16, 128, 128)])
+@pytest.mark.parametrize('causal', [False])
+def test_op_varlen_mqa_fwd(Z,
+                           HQ,
+                           HK,
+                           N_CTX,
+                           D_HEAD,
+                           causal,
+                           dtype=torch.float16):
+    """Check the Triton varlen forward pass when HQ and HK differ.
+
+    The reference is computed with ``varlen_fwd(..., is_mqa=True)`` and must
+    match the Triton output within ``atol=2e-2`` / ``rtol=2e-2``. ``dtype``
+    is not parametrized and stays at the ``torch.float16`` default.
+    """
+    # NOTE(review): ``causal`` is accepted but not referenced in this body.
+    q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX,
+                                                  D_HEAD, dtype)
+
+    # The kernel's return value is discarded; the comparison below reads
+    # the buffer that was passed in as the output argument.
+    tri_out = torch.empty_like(q)
+    triton_attention_rocm(q, k, v, tri_out, input_metadata)
+
+    ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False,
+                                  dtype, input_metadata)
+    ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=True)
+
+    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
--- a/tests/kernels/test_utils.py
+++ b/tests/kernels/test_utils.py
-# SPDX-License-Identifier: Apache-2.0
-"""
-Tests for miscellaneous utilities
-"""
-
-import pytest
-import torch
-
-from tests.kernels.utils import opcheck
-from vllm.platforms import current_platform
-    
-
-# def test_convert_fp8_opcheck():
-#     data = torch.randn((256, 256), dtype=torch.float32, device="cuda")
-#     result = torch.empty_like(data, dtype=torch.float8_e4m3fn)
-#     opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
-
-
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="Only supported for CUDA")
-def test_cuda_utils_opcheck():
-    opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
-    opcheck(
-        torch.ops._C_cuda_utils.
-        get_max_shared_memory_per_block_device_attribute, (0, ))
--- a/tests/kernels/untest_machete_gemm.py
+++ b/tests/kernels/untest_machete_gemm.py
-"""Tests for the machete kernel.
-
-Run `pytest tests/kernels/test_machete_gemm.py`.
-"""
-
-import math
-from typing import Optional, Tuple
-
-import pytest
-import torch
-
-from tests.kernels.utils import opcheck
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    pack_rows, quantize_weights)
-from vllm.platforms import current_platform
-from vllm.scalar_type import ScalarType, scalar_types
-
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
-
-MNK_SHAPES = [
-    (1, 128, 128),
-    (1, 512, 1024),
-    (1, 4096, 4096),
-    (13, 8192, 4096),
-    (26, 4096, 8192),
-    (1, 4096, 4096),
-    (257, 128, 4096),
-    (257, 4224, 4160),
-    (257, 4096, 4096),
-    (64, 4096, 4096),
-    (1024, 4096, 8192),
-    (1024, 8192, 4096),
-]
-
-ACT_TYPES = [torch.float16, torch.bfloat16]
-WTYPE_ZEROPOINTS = [
-    # GPTQ style
-    (scalar_types.uint4b8, False),
-    (scalar_types.uint8b128, False),
-    # AWQ style
-    (scalar_types.uint4, True),
-    (scalar_types.uint8, True),
-]
-
-# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
-#  unit tests to a common utility function. Currently the use of
-#  `is_quant_method_supported` conflates kernels with quantization methods
-#  an assumption which is breaking down as quantizations methods can have
-#  have kernels and some kernels support multiple quantization methods.
-IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
-
-
-def rand_data(shape, dtype=torch.float16):
-    return 10 * (torch.rand(shape, dtype=dtype, device="cuda") - 0.3)
-
-
-def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor):
-    return zps if zps is None else -1 * s * (zps.to(s.dtype))
-
-
-def machete_quantize_and_pack(w: torch.Tensor,
-                              wtype: ScalarType,
-                              group_size: int,
-                              zero_points: bool = False):
-    assert wtype.is_integer(), "TODO: support floating point weights"
-
-    w_ref, w_q, w_s, w_zp = quantize_weights(
-        w,
-        wtype,
-        group_size,
-        zero_points=zero_points,
-        # to match how the kernel applies zps
-        ref_zero_points_after_scales=True)
-
-    w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
-    w_q = w_q.t().contiguous().t()  # convert to col major
-    w_q_machete = ops.machete_prepack_B(w_q, wtype)
-
-    opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype))
-
-    return w_ref, w_q_machete, w_s, w_zp
-
-
-def machete_gemm_test_helper(a: torch.Tensor, b: torch.Tensor,
-                             wtype: ScalarType, group_size: int,
-                             zero_points: bool):
-    w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
-        b, wtype, group_size, zero_points)
-
-    output_ref = torch.matmul(a, w_ref)
-
-    output = ops.machete_gemm(
-        a=a,
-        b_q=w_q_packed,
-        b_type=wtype,
-        b_scales=w_s,
-        b_zeros=maybe_convert_zeropoints(w_zp, w_s),
-        b_group_size=group_size,
-    )
-
-    # Relax atol as our reduction dim becomes larger (more rounding error)
-    # Relax atol when we have zeropoints since the way machete applies
-    #  zeropoints (after scales) causes noise around 0
-    atol = 1 if zero_points else min(5e-2 * math.sqrt(a.shape[1]), 1)
-    torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)
-
-
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-@pytest.mark.parametrize("shape",
-                         MNK_SHAPES,
-                         ids=lambda x: "x".join(str(v) for v in x))
-@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
-@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
-@pytest.mark.parametrize("group_size", [128, None])
-def test_machete_all_schedules(shape, atype: torch.dtype,
-                               wtype_zeropoints: Tuple[ScalarType, bool],
-                               group_size: Optional[int]):
-    m, n, k = shape
-    wtype, zero_points = wtype_zeropoints
-
-    if group_size is not None and k % group_size != 0:
-        return
-
-    print(f"MNK = {m} {n} {k}")
-
-    # Normalize group_size
-    if group_size is None:
-        group_size = k
-    assert group_size <= k
-
-    a = rand_data((m, k), atype)
-    w = rand_data((k, n), atype)
-
-    w_ref, w_q_machete, w_s, w_zp = machete_quantize_and_pack(
-        w, wtype, group_size, zero_points)
-
-    output_ref = torch.matmul(a, w_ref)
-
-    for schedule in ops.machete_supported_schedules(wtype):
-        print(f"Testing schedule {schedule}")
-        output = ops.machete_gemm(
-            a,
-            b_q=w_q_machete,
-            b_type=wtype,
-            b_scales=w_s,
-            b_zeros=maybe_convert_zeropoints(w_zp, w_s),
-            b_group_size=group_size,
-            schedule=schedule,
-        )
-
-        opcheck(torch.ops._C.machete_gemm,
-                (a, w_q_machete, wtype, w_s, maybe_convert_zeropoints(
-                    w_zp, w_s), group_size, None, None, None, schedule))
-
-        # Relax atol as our reduction dim becomes larger (more rounding error)
-        # Relax atol when we have zeropoints since the way machete applies
-        #  zeropoints (after scales) causes noise around 0
-        atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
-        torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol),\
-               f"Schedule failed {schedule}"
-
-
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-@pytest.mark.parametrize("shape",
-                         MNK_SHAPES,
-                         ids=lambda x: "x".join(str(v) for v in x))
-@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
-@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
-@pytest.mark.parametrize("group_size", [128, None])
-def test_machete_heuristic(shape, atype: torch.dtype,
-                           wtype_zeropoints: Tuple[ScalarType, bool],
-                           group_size: Optional[int]):
-    m, n, k = shape
-    wtype, zero_points = wtype_zeropoints
-
-    if group_size is not None and k % group_size != 0:
-        return
-
-    # Normalize group_size
-    if group_size is None:
-        group_size = k
-    assert group_size <= k
-
-    a = rand_data((m, k), atype)
-    b = rand_data((k, n), atype)
-
-    machete_gemm_test_helper(a, b, wtype, group_size, zero_points)
-
-
-# Test working on other devices
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_machete_devices(device: str):
-    m, n, k = 512, 4096, 4096
-    wtype = scalar_types.uint4b8
-    group_size = 128
-    zero_points = False
-
-    print(f"MNK = {m} {n} {k}, device = {device}")
-
-    a = rand_data((m, k), torch.float16).to(device)
-    b = rand_data((k, n), torch.float16).to(device)
-
-    machete_gemm_test_helper(a, b, wtype, group_size, zero_points)
-
-
-# Test working with a subset of A and B
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-def test_machete_subset():
-    big_m, big_n, big_k = 1024, 1024, 1024
-    m, n, k = 512, 512, 512
-    wtype = scalar_types.uint4b8
-    group_size = 128
-    zero_points = False
-
-    whole_a = rand_data((big_m, big_k), torch.float16)
-    whole_b = rand_data((big_k, big_n), torch.float16)
-
-    a = whole_a[0:m, 0:k]
-    b = whole_b[0:k, 0:n]
-
-    machete_gemm_test_helper(a, b, wtype, group_size, zero_points)
-
-
-# Test to make sure cuda graphs work
-class MacheteLayer(torch.nn.Module):
-
-    def __init__(self, **kwargs):
-        super().__init__()
-        self.kwargs = kwargs
-
-    def forward(self, a):
-        return ops.machete_gemm(**self.kwargs)
-
-
-@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
-                    reason="Machete is not supported on this GPU type.")
-def test_machete_cuda_graph():
-    m, n, k = 512, 4096, 4096
-
-    a = rand_data((m, k), torch.float16)
-    b = rand_data((k, n), torch.float16)
-    wtype = scalar_types.uint4b8
-    group_size = 128
-    zero_points = False
-
-    w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
-        b, wtype, group_size, zero_points)
-
-    # Construct a trivial model with a single layer that calls a machete kernel
-    model = MacheteLayer(
-        a=a,
-        b_q=w_q_packed,
-        b_type=wtype,
-        b_scales=w_s,
-        b_zeros=maybe_convert_zeropoints(w_zp, w_s),
-        b_group_size=group_size,
-    )
-
-    output_ref = torch.matmul(a, w_ref)
-
-    # Run the model with a cuda graph
-    stream = torch.cuda.Stream()
-    with torch.cuda.stream(stream):
-        g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g):
-            output = model(a)
-    output.zero_()
-    g.replay()
-
-    # Relax atol as our reduction dim becomes larger (more rounding error)
-    # Relax atol when we have zeropoints since the way machete applies
-    #  zeropoints (after scales) causes noise around 0
-    atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
-    torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)
--- a/tests/kernels/utils_block.py
+++ b/tests/kernels/utils_block.py
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-
-
-def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor,
-                             As: torch.Tensor, Bs: torch.Tensor, block_size,
-                             output_dtype):
-    """This function performs matrix multiplication with block-wise
-    quantization using native torch.
-    It is agnostic to the input data type and can be used for both int8 and
-    fp8 data types.
-
-    It takes two input tensors `A` and `B` (int8) with scales `As` and 
-    `Bs` (float32).
-    The output is returned in the specified `output_dtype`.
-    """
-    A = A.to(torch.float32)
-    B = B.to(torch.float32)
-    assert A.shape[-1] == B.shape[-1]
-    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
-    assert len(block_size) == 2
-    block_n, block_k = block_size[0], block_size[1]
-    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
-    assert A.shape[:-1] == As.shape[:-1]
-
-    M = A.numel() // A.shape[-1]
-    N, K = B.shape
-    origin_C_shape = A.shape[:-1] + (N, )
-    A = A.reshape(M, A.shape[-1])
-    As = As.reshape(M, As.shape[-1])
-    n_tiles = (N + block_n - 1) // block_n
-    k_tiles = (K + block_k - 1) // block_k
-    assert n_tiles == Bs.shape[0]
-    assert k_tiles == Bs.shape[1]
-
-    C_shape = (M, N)
-    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
-
-    A_tiles = [
-        A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
-    ]
-    B_tiles = [[
-        B[
-            j * block_n:min((j + 1) * block_n, N),
-            i * block_k:min((i + 1) * block_k, K),
-        ] for i in range(k_tiles)
-    ] for j in range(n_tiles)]
-    C_tiles = [
-        C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
-    ]
-    As_tiles = [As[:, i:i + 1] for i in range(k_tiles)]
-
-    for i in range(k_tiles):
-        for j in range(n_tiles):
-            a = A_tiles[i]
-            b = B_tiles[j][i]
-            c = C_tiles[j]
-            s = As_tiles[i] * Bs[j][i]
-            c[:, :] += torch.matmul(a, b.t()) * s
-
-    C = C.reshape(origin_C_shape).to(output_dtype)
-    return C
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -48,6 +48,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    ]
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
+                                          skip_special_tokens=False,
                                          stop=["[/assistant]"])
    outputs = llm.generate(
        prompts,

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -31,6 +31,8 @@ DEVICES = ([
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ] if current_platform.is_cuda_alike() else ["cpu"])

+DEFAULT_DTYPE = torch.get_default_dtype()
+

 @pytest.fixture(scope="function", autouse=True)
 def use_v0_only(monkeypatch: pytest.MonkeyPatch):
@@ -125,8 +127,10 @@ def test_replace_submodules(dist_init, dummy_model):
    model = dummy_model
    manager = LoRAModelManager(
        model, 1, 1, 1,
-        LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
-        torch.device(DEVICES[0]))
+        LoRAConfig(max_lora_rank=8,
+                   max_cpu_loras=8,
+                   max_loras=8,
+                   lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0]))
    model = manager.model
    assert isinstance(model.get_submodule("dense1"),
                      ColumnParallelLinearWithLoRA)
@@ -155,7 +159,8 @@ def test_lora_model_manager(dist_init, dummy_model, device):
                               2,
                               LoRAConfig(max_lora_rank=8,
                                          max_cpu_loras=3,
-                                          max_loras=2),
+                                          max_loras=2,
+                                          lora_dtype=DEFAULT_DTYPE),
                               device=device)
    assert all(x is None for x in manager.lora_index_to_id)
    assert manager.add_adapter(model_lora1)
@@ -221,7 +226,8 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
                                       2,
                                       LoRAConfig(max_lora_rank=8,
                                                  max_cpu_loras=3,
-                                                  max_loras=2),
+                                                  max_loras=2,
+                                                  lora_dtype=DEFAULT_DTYPE),
                                       device=device)
    assert all(x is None for x in manager.lora_index_to_id)
    assert manager.add_adapter(model_lora1)
@@ -316,7 +322,8 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
                                       2,
                                       LoRAConfig(max_lora_rank=8,
                                                  max_cpu_loras=2,
-                                                  max_loras=2),
+                                                  max_loras=2,
+                                                  lora_dtype=DEFAULT_DTYPE),
                                       device=device)

    assert all(x is None for x in manager.lora_index_to_id)
@@ -424,7 +431,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
 @pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                          sql_lora_files, device):
-    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
+    lora_config = LoRAConfig(max_lora_rank=8,
+                             max_cpu_loras=4,
+                             max_loras=4,
+                             lora_dtype=DEFAULT_DTYPE)
    worker_adapter_manager = LRUCacheWorkerLoRAManager(
        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
        lora_config.lora_extra_vocab_size, lora_config, device,
@@ -504,7 +514,10 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                sql_lora_files, device):
    # Should remove every LoRA not specified in the request.
-    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
+    lora_config = LoRAConfig(max_lora_rank=8,
+                             max_cpu_loras=4,
+                             max_loras=4,
+                             lora_dtype=DEFAULT_DTYPE)
    worker_adapter_manager = WorkerLoRAManager(
        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
        lora_config.lora_extra_vocab_size, lora_config, device,
@@ -600,7 +613,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
                               2,
                               LoRAConfig(max_lora_rank=8,
                                          max_cpu_loras=2,
-                                          max_loras=2),
+                                          max_loras=2,
+                                          lora_dtype=DEFAULT_DTYPE),
                               device=device)
    model = manager.model


--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -68,8 +68,12 @@ def test_minicpmv_lora(minicpmv_lora_files):
        max_loras=2,
        max_lora_rank=8,
        enforce_eager=True,
+        max_model_len=2048,
+        limit_mm_per_prompt={
+            "image": 2,
+            "video": 0
+        },
        trust_remote_code=True,
-        enable_chunked_prefill=True,
    )
    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for i in range(len(EXPECTED_OUTPUT)):
@@ -93,9 +97,11 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
        max_loras=4,
        max_lora_rank=64,
        tensor_parallel_size=4,
+        limit_mm_per_prompt={
+            "image": 2,
+            "video": 0
+        },
        trust_remote_code=True,
-        enforce_eager=True,
-        enable_chunked_prefill=True,
    )
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for i in range(len(EXPECTED_OUTPUT)):
@@ -117,8 +123,11 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
        max_lora_rank=8,
        tensor_parallel_size=4,
        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "image": 1,
+            "video": 0
+        },
        fully_sharded_loras=True,
-        enable_chunked_prefill=True,
    )
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for i in range(len(EXPECTED_OUTPUT)):

--- a/tests/lora/test_resolver.py
+++ b/tests/lora/test_resolver.py
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import pytest
+
+from vllm.lora.request import LoRARequest
+from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
+
+
+class DummyLoRAResolver(LoRAResolver):
+    """A dummy LoRA resolver for testing.
+
+    Resolves only the adapter name ``"test_lora"``; every other name
+    yields ``None``.
+    """
+
+    async def resolve_lora(self, base_model_name: str,
+                           lora_name: str) -> Optional[LoRARequest]:
+        """Return a LoRARequest for ``"test_lora"``, otherwise ``None``.
+
+        The returned request points at a synthetic path built from both
+        arguments; nothing is read from disk here.
+        """
+        if lora_name == "test_lora":
+            # NOTE(review): str hashes are salted per interpreter process
+            # unless PYTHONHASHSEED is fixed, so this id is deterministic
+            # within one run but not reproducible across runs.
+            return LoRARequest(
+                lora_name=lora_name,
+                lora_path=f"/dummy/path/{base_model_name}/{lora_name}",
+                lora_int_id=abs(hash(lora_name)))
+        return None
+
+
+def test_resolver_registry_registration():
+    """Test basic resolver registration functionality."""
+    # The imported name is used directly (it is not called here).
+    # NOTE(review): the "dummy" entry is not removed at the end of the
+    # test -- confirm whether registry state leaks into other tests.
+    registry = LoRAResolverRegistry
+    resolver = DummyLoRAResolver()
+
+    # Register a new resolver
+    registry.register_resolver("dummy", resolver)
+    assert "dummy" in registry.get_supported_resolvers()
+
+    # Get registered resolver
+    # Identity check: the very same object must come back, not a copy.
+    retrieved_resolver = registry.get_resolver("dummy")
+    assert retrieved_resolver is resolver
+
+
+def test_resolver_registry_duplicate_registration():
+    """Test registering a resolver with an existing name."""
+    registry = LoRAResolverRegistry
+    resolver1 = DummyLoRAResolver()
+    resolver2 = DummyLoRAResolver()
+
+    # Register two distinct instances under the same key, in order.
+    registry.register_resolver("dummy", resolver1)
+    registry.register_resolver("dummy", resolver2)
+
+    # Expected contract: the later registration replaces the earlier one.
+    assert registry.get_resolver("dummy") is resolver2
+
+
+def test_resolver_registry_unknown_resolver():
+    """Test getting a non-existent resolver."""
+    registry = LoRAResolverRegistry
+
+    # ``match`` is applied as a regex search on the exception text, so the
+    # KeyError message only has to contain "not found".
+    with pytest.raises(KeyError, match="not found"):
+        registry.get_resolver("unknown_resolver")
+
+
+@pytest.mark.asyncio
+async def test_dummy_resolver_resolve():
+    """Test the dummy resolver's resolve functionality."""
+    dummy_resolver = DummyLoRAResolver()
+    base_model_name = "base_model_test"
+    lora_name = "test_lora"
+
+    # Test successful resolution
+    # Only the name and path are checked; lora_int_id is not asserted.
+    result = await dummy_resolver.resolve_lora(base_model_name, lora_name)
+    assert isinstance(result, LoRARequest)
+    assert result.lora_name == lora_name
+    assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"
+
+    # Test failed resolution
+    # Any name other than "test_lora" must resolve to None.
+    result = await dummy_resolver.resolve_lora(base_model_name,
+                                               "nonexistent_lora")
+    assert result is None
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import get_lora_tokenizer
-from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
-import os
-from ..utils import RemoteOpenAIServer, models_path_prefix
-from ..conftest import get_tokenizer_pool_config
+
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+from ..utils import models_path_prefix


 @pytest.mark.asyncio
 @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
 async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
    reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
-    tokenizer_group = get_tokenizer_group(
-        get_tokenizer_pool_config(tokenizer_group_type),
+    tokenizer_group = TokenizerGroup(
        tokenizer_id=os.path.join(models_path_prefix,"gpt2"),
        enable_lora=True,
        max_num_seqs=1,
@@ -61,8 +60,7 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path):
 @pytest.mark.parametrize("max_num_seqs", [1, 2])
 @pytest.mark.parametrize("max_loras", [1, 2])
 def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras):
-    tokenizer_group = get_tokenizer_group(
-        get_tokenizer_pool_config(None),
+    tokenizer_group = TokenizerGroup(
        tokenizer_id="gpt2",
        enable_lora=enable_lora,
        max_num_seqs=max_num_seqs,

--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -9,7 +9,6 @@ from torch import nn

 from vllm.lora.utils import (get_adapter_absolute_path,
                             parse_fine_tuned_lora_name, replace_submodule)
-from vllm.utils import LRUCache


 def test_parse_fine_tuned_lora_name_valid():
@@ -40,6 +39,18 @@ def test_parse_fine_tuned_lora_name_valid():
            False,
            False,
        ),
+        (
+            "language_model.layers.9.mlp.down_proj.lora_A.weight",
+            "language_model.layers.9.mlp.down_proj",
+            True,
+            False,
+        ),
+        (
+            "language_model.layers.9.mlp.down_proj.lora_B.weight",
+            "language_model.layers.9.mlp.down_proj",
+            False,
+            False,
+        ),
    }
    for name, module_name, is_lora_a, is_bias in fixture:
        assert (module_name, is_lora_a,
@@ -85,114 +96,6 @@ def test_replace_submodule():
    assert dict(model.named_modules())["seq1.dense2"] == dense2


-class TestLRUCache(LRUCache):
-
-    def _on_remove(self, key, value):
-        if not hasattr(self, "_remove_counter"):
-            self._remove_counter = 0
-        self._remove_counter += 1
-
-
-def test_lru_cache():
-    cache = TestLRUCache(3)
-
-    cache.put(1, 1)
-    assert len(cache) == 1
-
-    cache.put(1, 1)
-    assert len(cache) == 1
-
-    cache.put(2, 2)
-    assert len(cache) == 2
-
-    cache.put(3, 3)
-    assert len(cache) == 3
-    assert set(cache.cache) == {1, 2, 3}
-
-    cache.put(4, 4)
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 3, 4}
-    assert cache._remove_counter == 1
-    assert cache.get(2) == 2
-
-    cache.put(5, 5)
-    assert set(cache.cache) == {2, 4, 5}
-    assert cache._remove_counter == 2
-
-    assert cache.pop(5) == 5
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.pop(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.get(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.put(6, 6)
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 4, 6}
-    assert 2 in cache
-    assert 4 in cache
-    assert 6 in cache
-
-    cache.remove_oldest()
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 6}
-    assert cache._remove_counter == 4
-
-    cache.clear()
-    assert len(cache) == 0
-    assert cache._remove_counter == 6
-
-    cache._remove_counter = 0
-
-    cache[1] = 1
-    assert len(cache) == 1
-
-    cache[1] = 1
-    assert len(cache) == 1
-
-    cache[2] = 2
-    assert len(cache) == 2
-
-    cache[3] = 3
-    assert len(cache) == 3
-    assert set(cache.cache) == {1, 2, 3}
-
-    cache[4] = 4
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 3, 4}
-    assert cache._remove_counter == 1
-    assert cache[2] == 2
-
-    cache[5] = 5
-    assert set(cache.cache) == {2, 4, 5}
-    assert cache._remove_counter == 2
-
-    del cache[5]
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache.pop(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-
-    cache[6] = 6
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 4, 6}
-    assert 2 in cache
-    assert 4 in cache
-    assert 6 in cache
-
-
 # Unit tests for get_adapter_absolute_path
 @patch('os.path.isabs')
 def test_get_adapter_absolute_path_absolute(mock_isabs):

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -11,6 +11,8 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
    dispatch_fused_experts_func, dispatch_topk_func,
    torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts,
    vllm_topk_softmax)
+from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+    is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.layernorm import (
    RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
    rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
@@ -100,11 +102,10 @@ def test_enabled_ops_invalid(env: str):
 def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
    topk_func = dispatch_topk_func()
-
+    is_rocm_aiter_moe_enabled.cache_clear()
    if current_platform.is_rocm() and int(use_rocm_aiter):
        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
            rocm_aiter_topk_softmax)
-
        assert topk_func == rocm_aiter_topk_softmax
    else:
        assert topk_func == vllm_topk_softmax
@@ -116,11 +117,11 @@ def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool,
                                monkeypatch):

    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
+    is_rocm_aiter_moe_enabled.cache_clear()
    fused_experts_func = dispatch_fused_experts_func(inplace)
    if current_platform.is_rocm() and int(use_rocm_aiter):
        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
            rocm_aiter_fused_experts)
-
        assert fused_experts_func == rocm_aiter_fused_experts
    elif inplace:
        assert fused_experts_func == torch_vllm_inplace_fused_experts

--- a/tests/models/decoder_only/audio_language/test_granite_speech.py
+++ b/tests/models/decoder_only/audio_language/test_granite_speech.py
+# SPDX-License-Identifier: Apache-2.0
+
+from collections.abc import Sequence
+from typing import Optional
+
+import pytest
+from transformers import AutoModelForSpeechSeq2Seq
+
+from vllm.lora.request import LoRARequest
+from vllm.sequence import SampleLogprobs
+
+from ....conftest import HfRunner, PromptAudioInput, VllmRunner, _AudioAssets
+from ...registry import HF_EXAMPLE_MODELS
+from ...utils import check_logprobs_close
+
+HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"  # noqa: E501
+
+
+def vllm_to_hf_output(
+    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    """Sanitize vllm output to be comparable with hf output.
+
+    Appends the "<|end_of_text|>" marker to the generated text; the token
+    ids and logprobs are returned unchanged.
+    """
+    output_ids, output_str, out_logprobs = vllm_output
+
+    hf_output_str = output_str + "<|end_of_text|>"
+
+    return output_ids, hf_output_str, out_logprobs
+
+
+MODEL_NAME = "ibm-granite/granite-speech-3.3-8b"
+# Audio lora co-exists directly in the model directory, but
+# currently still needs to be passed directly to vLLM.
+audio_lora_path = MODEL_NAME
+models = [MODEL_NAME]
+
+
+def run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    inputs: Sequence[tuple[list[str], PromptAudioInput]],
+    model: str,
+    *,
+    max_model_len: int,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+
+    All the audio fixtures for the test are from AUDIO_ASSETS.
+    For huggingface runner, we provide the audio as input.
+    For vllm runner, we provide MultiModalDataDict objects
+    and corresponding MultiModalConfig as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+
+    Each element of ``inputs`` is a ``(prompts, audios)`` pair; every pair
+    is run through both runners and the per-case results are compared with
+    ``check_logprobs_close``.
+    """
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    # max_model_len should be greater than the multimodal feature size
+    # (NOTE(review): inherited comment said image_feature_size; presumably
+    # the audio feature size is meant here -- confirm).
+    with vllm_runner(
+            model,
+            task="generate",
+            max_model_len=max_model_len,
+            max_num_seqs=1,
+            dtype=dtype,
+            limit_mm_per_prompt={"audio": 1},
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            enable_lora=True,
+            max_lora_rank=64,
+            enforce_eager=True,
+    ) as vllm_model:
+        # The audio LoRA is passed explicitly with every generate call.
+        lora_request = LoRARequest("audio", 1, audio_lora_path)
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                audios=audios,
+                                                lora_request=lora_request)
+            for prompts, audios in inputs
+        ]
+
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
+
+        hf_processor = hf_model.processor
+        eos_token_id = hf_processor.tokenizer.eos_token_id
+
+        # Unlike the vLLM call above, the HF runner receives the audios
+        # wrapped in an extra list.
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    audios=[audios],
+                                                    eos_token_id=eos_token_id)
+            for prompts, audios in inputs
+        ]
+
+    # Compare case by case; vLLM outputs are first passed through
+    # vllm_to_hf_output.
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+                                        vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(output) for output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_model_len", [2048])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_models(hf_runner, vllm_runner, model: str, audio_assets: _AudioAssets,
+                dtype: str, max_model_len: int, max_tokens: int,
+                num_logprobs: int) -> None:
+    """Run the HF-vs-vLLM comparison on the first audio asset.
+
+    The test is skipped when either of the model-info checks below fails
+    (both are called with on_fail="skip").
+    """
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
+    audio, sr = audio_assets[0].audio_and_sample_rate
+    # This model expects 16k sample rate, which our test audio
+    # already is; if this changes, it may break this test,
+    # so we check it directly
+    assert sr == 16000
+    # A single test case: one prompt paired with one audio clip.
+    run_test(
+        hf_runner,
+        vllm_runner,
+        [
+            ([HF_AUDIO_PROMPT], [audio]),
+        ],
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional
+import json
+from typing import Any, Optional

 import numpy as np
 import pytest
@@ -9,10 +10,11 @@ import os
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer

-from vllm.multimodal.audio import resample_audio
+from vllm.multimodal.audio import resample_audio_librosa
 from vllm.sequence import SampleLogprobs

-from ....conftest import HfRunner, VllmRunner
+
+from ....conftest import HfRunner, VllmRunner, _AudioAssets
 from ....utils import RemoteOpenAIServer, models_path_prefix
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
@@ -32,31 +34,34 @@ CHUNKED_PREFILL_KWARGS = {
 }


-@pytest.fixture(scope="session")
-def audio_assets():
-    from vllm.assets.audio import AudioAsset
-    return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
-
-
 @pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
 def audio(request):
    from vllm.assets.audio import AudioAsset
    return AudioAsset(request.param)


+def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
+    """Convert kwargs to CLI args (True -> bare flag, False -> omitted)."""
+    args = []
+    for key, value in params_kwargs.items():
+        if isinstance(value, bool):
+            if value:
+                args.append(f"--{key.replace('_','-')}")
+        else:
+            args.append(f"--{key.replace('_','-')}={value}")
+    return args
+
+
 @pytest.fixture(params=[
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
 ])
-def server(request, audio_assets):
+def server(request, audio_assets: _AudioAssets):
    args = [
-        "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
-        f"--limit-mm-per-prompt=audio={len(audio_assets)}",
-        "--trust-remote-code"
-    ] + [
-        f"--{key.replace('_','-')}={value}"
-        for key, value in request.param.items()
-    ]
+        "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
+        "--limit-mm-per-prompt",
+        json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
+    ] + params_kwargs_to_cli_args(request.param)

    with RemoteOpenAIServer(MODEL_NAME,
                            args,
@@ -137,9 +142,9 @@ def run_test(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
-                audios=[(resample_audio(audio[0],
-                                        orig_sr=audio[1],
-                                        target_sr=16000), 16000)])
+                audios=[(resample_audio_librosa(audio[0],
+                                                orig_sr=audio[1],
+                                                target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]

@@ -222,8 +227,9 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
 ])
-def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
-                                     max_tokens: int, num_logprobs: int,
+def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets,
+                                     dtype: str, max_tokens: int,
+                                     num_logprobs: int,
                                     vllm_kwargs: dict) -> None:

    vllm_prompt = _get_prompt(len(audio_assets),
@@ -242,7 +248,7 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,


 @pytest.mark.asyncio
-async def test_online_serving(client, audio_assets):
+async def test_online_serving(client, audio_assets: _AudioAssets):
    """Exercises online serving with/without chunked prefill enabled."""

    messages = [{

--- a/tests/models/decoder_only/language/test_hybrid.py
+++ b/tests/models/decoder_only/language/test_hybrid.py
@@ -7,76 +7,85 @@ from tests.utils import multi_gpu_test
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import SamplingParams

-from ...utils import check_outputs_equal
 from ....utils import models_path_prefix
-
-# This test is for the hybrid models
-MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "Zyphra/Zamba2-1.2B-instruct")]
-# Bamba at Fp32 is too big for the CI (L4 GPU).
-# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+from ...utils import check_logprobs_close, check_outputs_equal
+
+# NOTE: The first model in each list is taken as the primary model,
+# meaning that it will be used in all tests in this file.
+# The rest of the models will only be tested by test_models.
+
+SSM_MODELS = [
+    os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"),
+    os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev"),
+    # TODO: Compare to a Mamba2 model. The HF transformers implementation of
+    # Mamba2 is buggy for Codestral as it doesn't handle n_groups.
+    # See https://github.com/huggingface/transformers/pull/35943
+    # "mistralai/Mamba-Codestral-7B-v0.1",
+]
+
+HYBRID_MODELS = [
+    os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"),
+    # NOTE: Running Plamo2 with the transformers implementation requires
+    # installing the causal-conv1d package, which is not listed as a test
+    # dependency as it's not compatible with pip-compile.
+    os.path.join(models_path_prefix, "pfnet/plamo-2-1b"),
+    os.path.join(models_path_prefix, "Zyphra/Zamba2-1.2B-instruct"),
+    os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"),
+]
+
+# Avoid OOM
+MAX_NUM_SEQS = 4
+
+
+@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
-    dtype: str,
    max_tokens: int,
+    num_logprobs: int,
 ) -> None:
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

-    # numeric error produces different generation
-    if "Bamba" in model:
-        example_prompts.pop(3)
-
-    model_kwargs = {
-        "use_mamba_kernels": False,  # mamba kernels are not installed so HF 
-        # don't use them
-    }
-    if "Zamba2" in model:
-        # Zamba2 HF implementation automatically checks if mamba kernels are
-        # installed
-        model_kwargs = {}
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

-    with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
-    dtype: str,
    max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
    for_loop_outputs = []
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
        for prompt in example_prompts:
-            for_loop_outputs.append(
-                vllm_model.generate_greedy([prompt], max_tokens)[0])
+            single_output, = vllm_model.generate_greedy_logprobs([prompt],
+                                                                 max_tokens,
+                                                                 num_logprobs)
+            for_loop_outputs.append(single_output)

-        batched_outputs = vllm_model.generate_greedy(example_prompts,
-                                                     max_tokens)
+        batched_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

-    check_outputs_equal(
+    check_logprobs_close(
        outputs_0_lst=for_loop_outputs,
        outputs_1_lst=batched_outputs,
        name_0="for_loop_vllm",
@@ -84,74 +93,35 @@ def test_batching(
    )


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float16"])
-@pytest.mark.parametrize("max_tokens", [10])
-def test_mamba_prefill_chunking_with_parallel_sampling(
-        hf_runner, vllm_runner, example_prompts, model: str, dtype: str,
-        max_tokens: int) -> None:
-    # Tests prefill chunking in conjunction with n>1, in this case,
-    # prefill is populated with decoding tokens and we test that it
-    # doesn't fail This test might fail if cache is not allocated
-    # correctly for n > 1 decoding steps inside a
-    # chunked prefill forward pass (where we have both prefills
-    # and decoding together )
-    sampling_params = SamplingParams(n=3,
-                                     temperature=1,
-                                     seed=0,
-                                     max_tokens=max_tokens)
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enable_chunked_prefill=True,
-            max_num_batched_tokens=30,
-            max_num_seqs=10  # forces prefill chunks with decoding
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [7])
-def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
-                                model: str, dtype: str,
-                                max_tokens: int) -> None:
-    # numeric error during prefill chunking produces different generation
-    # compared to w/o prefill chunking for those examples, removed them for now
-    if "Jamba" in model:
-        example_prompts.pop(7)
-        example_prompts.pop(2)
-        example_prompts.pop(1)
-    elif "Bamba" in model:
-        example_prompts.pop(6)
-        example_prompts.pop(3)
-        example_prompts.pop(2)
-        dtype = "half"  # use a different dtype for Bamba
-    elif "Zamba2" in model:
-        example_prompts.pop(7)
-        dtype = "half"
-
-    model_kwargs = {
-        "use_mamba_kernels": False,  # mamba kernels are not installed so HF 
-        # don't use them
-    }
-    if "Zamba2" in model:
-        # Zamba2 HF implementation automatically checks if mamba kernels are
-        # installed
-        model_kwargs = {}
-
-    with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
-        non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+def test_chunked_prefill(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+    chunked_prefill_token_size: int,
+) -> None:
+    max_num_seqs = chunked_prefill_token_size
+    max_num_batched_tokens = chunked_prefill_token_size

    with vllm_runner(model,
-                     dtype=dtype,
                     enable_chunked_prefill=True,
-                     max_num_batched_tokens=5,
-                     max_num_seqs=2) as vllm_model:
-        chunked = vllm_model.generate_greedy(example_prompts,
-                                             max_tokens=max_tokens)
+                     max_num_batched_tokens=max_num_batched_tokens,
+                     max_num_seqs=max_num_seqs) as vllm_model:
+        chunked = vllm_model.generate_greedy_logprobs(example_prompts,
+                                                      max_tokens, num_logprobs)

-    check_outputs_equal(
+    with vllm_runner(model,
+                     enable_chunked_prefill=False,
+                     max_num_seqs=max_num_seqs) as vllm_model:
+        non_chunked = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
@@ -159,64 +129,59 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
    )


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [15])
-def test_parallel_sampling(
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_chunked_prefill_with_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
-    dtype: str,
    max_tokens: int,
 ) -> None:
-
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        for_loop_outputs = []
-        for _ in range(10):
-            for_loop_outputs.append(
-                # using example_prompts index 1 instead of 0 since with 0 the
-                # logprobs get really close and the test doesn't pass
-                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
-                [0])
-        sampling_params = SamplingParams(n=10,
-                                         temperature=0.001,
-                                         seed=0,
-                                         max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
-                                             sampling_params)
-    token_ids, texts = n_lt_1_outputs[0]
-    n_lt_1_outputs = [(token_id, text)
-                      for token_id, text in zip(token_ids, texts)]
-
-    check_outputs_equal(
-        outputs_0_lst=n_lt_1_outputs,
-        outputs_1_lst=for_loop_outputs,
-        name_0="vllm_n_lt_1_outputs",
-        name_1="vllm",
-    )
+    """
+    Tests chunked prefill in conjunction with n > 1. 
+    
+    In this case, prefill is populated with decoding tokens and
+    we test that it doesn't fail.
+
+    This test might fail if cache is not allocated correctly for n > 1
+    decoding steps inside a chunked prefill forward pass
+    (where we have both prefill and decode together)
+    """
+    sampling_params = SamplingParams(n=3,
+                                     temperature=1,
+                                     seed=0,
+                                     max_tokens=max_tokens)
+    with vllm_runner(
+            model,
+            enable_chunked_prefill=True,
+            # forces prefill chunks with decoding
+            max_num_batched_tokens=MAX_NUM_SEQS * 3,
+            max_num_seqs=MAX_NUM_SEQS,
+    ) as vllm_model:
+        vllm_model.generate(example_prompts, sampling_params)


-@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
-    dtype: str,
    max_tokens: int,
 ) -> None:
-    # This test is for verifying that mamba cache is padded to CG captured
-    # batch size. If it's not, a torch RuntimeError will be raised because
-    # tensor dimensions aren't compatible
-    vllm_config = EngineArgs(model=model).create_engine_config()
+    """
+    This test is for verifying that mamba cache is padded to CG captured
+    batch size. If it's not, a torch RuntimeError will be raised because
+    tensor dimensions aren't compatible.
+    """
+    vllm_config = EngineArgs(model=model,
+                             trust_remote_code=True).create_engine_config()
    while len(example_prompts) == vllm_config.pad_for_cudagraph(
            len(example_prompts)):
        example_prompts.append(example_prompts[0])

    try:
-        with vllm_runner(model, dtype=dtype) as vllm_model:
+        with vllm_runner(model) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
@@ -225,28 +190,24 @@ def test_mamba_cache_cg_padding(
            "Could be related to mamba cache not padded correctly")


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 @pytest.mark.parametrize("max_tokens", [20])
 def test_models_preemption_recompute(
-    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
-    dtype: str,
    max_tokens: int,
 ) -> None:
-    # Tests that outputs are identical with and w/o preemtions (recompute)
-    assert dtype == "float"
-
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.model.llm_engine.scheduler[
-            0].ENABLE_ARTIFICIAL_PREEMPT = True
+    """
+    Tests that outputs are identical with and w/o preemptions (recompute).
+    """
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+        scheduler = vllm_model.model.llm_engine.scheduler[0]
+        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

-        vllm_model.model.llm_engine.scheduler[
-            0].ENABLE_ARTIFICIAL_PREEMPT = False
+        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
@@ -257,40 +218,43 @@ def test_models_preemption_recompute(
    )


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
-    model: str,
-    dtype: str,
    example_prompts,
+    model: str,
 ) -> None:
-    # This test is for verifying that the hybrid inner state management doesn't
-    # collapse in case where the number of incoming requests and
-    # finished_requests_ids is larger than the maximum mamba block capacity.
-    # This could generally happen due to the fact that hybrid does support
-    # statelessness mechanism where it can cleanup new incoming requests in
-    # a single step.
+    """
+    This test is for verifying that the hybrid inner state management doesn't
+    collapse in case where the number of incoming requests and
+    finished_requests_ids is larger than the maximum mamba block capacity.
+
+    This could generally happen because hybrid supports a statelessness
+    mechanism where it can clean up new incoming requests in
+    a single step.
+    """
    try:
-        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        pytest.fail("Hybrid inner state wasn't cleaned up properly between"
                    "steps finished requests registered unnecessarily ")


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 def test_state_cleanup(
    vllm_runner,
-    model: str,
-    dtype: str,
    example_prompts,
+    model: str,
 ) -> None:
-    # This test is for verifying that the Hybrid state is cleaned up between
-    # steps, If its not cleaned, an error would be expected.
+    """ 
+    This test is for verifying that the Hybrid state is cleaned up between
+    steps.
+    
+    If it's not cleaned, an error would be expected.
+    """
    try:
-        with vllm_runner(model, dtype=dtype) as vllm_model:
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
            for _ in range(10):
                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
    except ValueError:
@@ -298,28 +262,14 @@ def test_state_cleanup(
                    "could be related to finished_requests_ids")


-@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_multistep(
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [64])
+def test_multistep_correctness(
    vllm_runner,
-    model: str,
-    dtype: str,
    example_prompts,
+    model: str,
+    max_tokens: int,
 ) -> None:
-    # This test is verifying that multistep works correctly
-    #on mamba-like models
-    with vllm_runner(model, num_scheduler_steps=8,
-                     max_num_seqs=2) as vllm_model:
-        vllm_model.generate_greedy([example_prompts[0]] * 10, 1)
-
-
-@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [64])
-def test_multistep_correctness(vllm_runner, model: str, dtype: str,
-                               max_tokens: int, example_prompts) -> None:
    with vllm_runner(model, num_scheduler_steps=8,
                     max_num_seqs=2) as vllm_model:
        vllm_outputs_multistep = vllm_model.generate_greedy(
@@ -339,18 +289,21 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,


 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 @pytest.mark.parametrize("max_tokens", [64])
 def test_hybrid_distributed_produces_identical_generation(
-        vllm_runner, model: str, dtype: str, max_tokens: int,
-        example_prompts) -> None:
-
-    with vllm_runner(model, dtype=dtype, tensor_parallel_size=2) as vllm_model:
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+) -> None:
+    with vllm_runner(model, tensor_parallel_size=2,
+                     max_num_seqs=2) as vllm_model:
        vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
                                                       max_tokens)

-    with vllm_runner(model, dtype=dtype, tensor_parallel_size=1) as vllm_model:
+    with vllm_runner(model, tensor_parallel_size=1,
+                     max_num_seqs=2) as vllm_model:
        vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts,
                                                       max_tokens)


--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -11,8 +11,8 @@ import jsonschema.exceptions
 import pytest
 import os

-from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (  # noqa
-    MistralToolParser)
+from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
+    MistralToolCall, MistralToolParser)
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

 from ...utils import check_logprobs_close
@@ -196,7 +196,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
    )


-@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
 @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
@@ -248,10 +247,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
            assert "�" not in outputs[0].outputs[0].text.strip()


-@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model",
-                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
    with vllm_runner(model,
                     dtype=dtype,
@@ -272,7 +269,8 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
        parsed_message = tool_parser.extract_tool_calls(model_output, None)

        assert parsed_message.tools_called
-        assert parsed_message.tool_calls[0].id == "0UAqFzWsD"
+
+        assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
        assert parsed_message.tool_calls[
            0].function.name == "get_current_weather"
        assert parsed_message.tool_calls[
@@ -283,28 +281,38 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("guided_backend",
                         ["outlines", "lm-format-enforcer", "xgrammar"])
-def test_mistral_guided_decoding(vllm_runner, model: str,
-                                 guided_backend: str) -> None:
-    with vllm_runner(model, dtype='bfloat16',
-                     tokenizer_mode="mistral") as vllm_model:
+def test_mistral_guided_decoding(
+    monkeypatch: pytest.MonkeyPatch,
+    vllm_runner,
+    model: str,
+    guided_backend: str,
+) -> None:
+    with monkeypatch.context() as m:
+        # Guided JSON not supported in xgrammar + V1 yet
+        m.setenv("VLLM_USE_V1", "0")

-        guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA,
-                                               backend=guided_backend)
-        params = SamplingParams(max_tokens=512,
-                                temperature=0.7,
-                                guided_decoding=guided_decoding)
-
-        messages = [{
-            "role": "system",
-            "content": "you are a helpful assistant"
-        }, {
-            "role":
-            "user",
-            "content":
-            f"Give an example JSON for an employee profile that "
-            f"fits this schema: {SAMPLE_JSON_SCHEMA}"
-        }]
-        outputs = vllm_model.model.chat(messages, sampling_params=params)
+        with vllm_runner(
+                model,
+                dtype='bfloat16',
+                tokenizer_mode="mistral",
+                guided_decoding_backend=guided_backend,
+        ) as vllm_model:
+            guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA)
+            params = SamplingParams(max_tokens=512,
+                                    temperature=0.7,
+                                    guided_decoding=guided_decoding)
+
+            messages = [{
+                "role": "system",
+                "content": "you are a helpful assistant"
+            }, {
+                "role":
+                "user",
+                "content":
+                f"Give an example JSON for an employee profile that "
+                f"fits this schema: {SAMPLE_JSON_SCHEMA}"
+            }]
+            outputs = vllm_model.model.chat(messages, sampling_params=params)

        generated_text = outputs[0].outputs[0].text
        json_response = json.loads(generated_text)

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -10,6 +10,8 @@ import torch

 from vllm.platforms import current_platform

+from ....utils import large_gpu_mark
+from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix

@@ -27,7 +29,7 @@ REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
 AITER_MODEL_LIST = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "openbmb/MiniCPM3-4B",
-    "Qwen/Qwen-7B",
+    "Qwen/Qwen-7B-Chat",
    "Qwen/Qwen2.5-0.5B-Instruct",
    "ehristoforu/Falcon3-MoE-2x7B-Insruct",
 ]
@@ -62,7 +64,8 @@ AITER_MODEL_LIST = [
        pytest.param(
            os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
            # fused_moe not supported on CPU
-            marks=[pytest.mark.core_model],
+            marks=[pytest.mark.core_model,
+                   large_gpu_mark(min_gb=32)],
        ),
        pytest.param(
            os.path.join(models_path_prefix, "facebook/opt-125m"),  # opt
@@ -73,7 +76,7 @@ AITER_MODEL_LIST = [
            marks=[pytest.mark.core_model],
        ),
        pytest.param(
-            os.path.join(models_path_prefix, "Qwen/Qwen-7B"),  # qwen (text-only)
+            os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"),  # qwen (text-only)
        ),
        pytest.param(
            os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"),  # qwen2
@@ -83,17 +86,21 @@ AITER_MODEL_LIST = [
        pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),  # starcoder2
        pytest.param(
            os.path.join(models_path_prefix, "ehristoforu/Falcon3-MoE-2x7B-Insruct"),  # mixtral
-            marks=[pytest.mark.cpu_model],
+            marks=[pytest.mark.cpu_model,
+                   large_gpu_mark(min_gb=48)],
        )
    ])
-@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
 def test_models(hf_runner, vllm_runner, example_prompts, model: str,
-                dtype: str, max_tokens: int, num_logprobs: int,
-                use_rocm_aiter: bool, monkeypatch) -> None:
+                max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
+                monkeypatch) -> None:
+
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")

    if model in REQUIRES_V0:
        monkeypatch.setenv("VLLM_USE_V1", "0")
@@ -107,15 +114,17 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

-    with hf_runner(model, dtype=dtype) as hf_model:
-        if model.startswith("THUDM/chatglm3"):
-            hf_model.model.get_output_embeddings = lambda: \
-                hf_model.model.transformer.output_layer
-
+    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(
+            model,
+            tokenizer_name=model_info.tokenizer or model,
+            tokenizer_mode=model_info.tokenizer_mode,
+            trust_remote_code=model_info.trust_remote_code,
+            max_num_seqs=2,
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)


--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -142,6 +142,23 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
+    "qwen2_5_omni": VLMTestInfo(
+        models=["Qwen/Qwen2.5-Omni-7B"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
@@ -321,6 +338,18 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
+    "kimi_vl": VLMTestInfo(
+        models=["moonshotai/Kimi-VL-A3B-Instruct"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
+        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
+        max_model_len=8192,
+        max_num_seqs=2,
+        dtype="bfloat16",
+        tensor_parallel_size=1,
+        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
+        marks=[large_gpu_mark(min_gb=48)],
+    ),
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501

--- a/tests/models/decoder_only/vision_language/test_phi4mm.py
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
@@ -181,7 +181,7 @@ def run_test(
    ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [4096])
+@pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@@ -225,7 +225,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
    ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [25600])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@@ -258,7 +258,7 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,

 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [12800])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,