Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

af7f4372 · zhuwenwen · 5e19cdef · 09c77926 · af7f4372 · af7f4372
Commit af7f4372 authored Sep 03, 2024 by zhuwenwen
20 changed files
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -327,7 +327,7 @@ def test_paged_attention(
    atol, rtol = 1e-3, 1e-5
    if kv_cache_dtype == "fp8":
        atol, rtol = 1e-2, 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)


 def ref_multi_query_kv_attention(
@@ -441,4 +441,4 @@ def test_varlen_blocksparse_attention_prefill(
        scale,
        dtype,
    )
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -99,10 +99,10 @@ def test_copy_blocks(

    # Compare the results.
    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
-        assert torch.allclose(key_cache, cloned_key_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
    for value_cache, cloned_value_cache in zip(value_caches,
                                               cloned_value_caches):
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)


 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -185,17 +185,17 @@ def test_reshape_and_cache(
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]

    if kv_cache_dtype == "fp8":
-        assert torch.allclose(result_key_cache,
+        torch.testing.assert_close(result_key_cache,
                                   cloned_key_cache,
                                   atol=0.001,
                                   rtol=0.1)
-        assert torch.allclose(result_value_cache,
+        torch.testing.assert_close(result_value_cache,
                                   cloned_value_cache,
                                   atol=0.001,
                                   rtol=0.1)
    else:
-        assert torch.allclose(key_cache, cloned_key_cache)
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)


 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -291,17 +291,17 @@ def test_reshape_and_cache_flash(
        cloned_value_cache[block_idx, block_offset, :, :] = value[i]

    if kv_cache_dtype == "fp8":
-        assert torch.allclose(result_key_cache,
+        torch.testing.assert_close(result_key_cache,
                                   cloned_key_cache,
                                   atol=0.001,
                                   rtol=0.1)
-        assert torch.allclose(result_value_cache,
+        torch.testing.assert_close(result_value_cache,
                                   cloned_value_cache,
                                   atol=0.001,
                                   rtol=0.1)
    else:
-        assert torch.allclose(key_cache, cloned_key_cache)
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)


 @pytest.mark.parametrize("direction", COPYING_DIRECTION)
@@ -373,9 +373,9 @@ def test_swap_blocks(
                    block_mapping_tensor)

    for src, dst in block_mapping:
-        assert torch.allclose(src_key_caches_clone[src].cpu(),
+        torch.testing.assert_close(src_key_caches_clone[src].cpu(),
                                   dist_key_caches[0][dst].cpu())
-        assert torch.allclose(src_value_caches_clone[src].cpu(),
+        torch.testing.assert_close(src_value_caches_clone[src].cpu(),
                                   dist_value_caches[0][dst].cpu())


@@ -412,4 +412,4 @@ def test_swap_blocks(
 #     converted_cache = torch.empty_like(cache)
 #     ops.convert_fp8(converted_cache, cache_fp8)

-#     assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
+#     torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -28,13 +28,16 @@ def to_int8(tensor: torch.Tensor):
    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)


+def rand_int8(shape: tuple, device: str = "cuda"):
+    return to_int8(torch.rand(shape, device=device) * 255 - 128)
+
+
 def baseline_scaled_mm(a: torch.Tensor,
                       b: torch.Tensor,
                       scale_a: torch.Tensor,
                       scale_b: torch.Tensor,
                       out_dtype: Type[torch.dtype],
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
    output = (scale_a * (scale_b * (torch.mm(
        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
    if bias is not None:
@@ -71,7 +74,7 @@ def cutlass_fp8_gemm_helper(m: int,
    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    assert torch.allclose(out, baseline, rtol=1e-2, atol=5e-2)
+    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=5e-2)


 def cutlass_int8_gemm_helper(m: int,
@@ -103,7 +106,7 @@ def cutlass_int8_gemm_helper(m: int,
    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)


 @pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
@@ -221,6 +224,124 @@ def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
                                     use_bias)


+@pytest.mark.parametrize("m", [32, 64, 128])
+@pytest.mark.parametrize("n", [16, 32, 64])
+@pytest.mark.parametrize("k", [64, 128, 256])
+@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.skip
+def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
+                                    out_dtype: torch.dtype):
+    # Currently, the test is failing because folding azp into
+    # 16-bit bias loses too much precision
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+    scale_b = torch.randn((1, n), device="cuda", dtype=torch.float32) / 10
+
+    aq_i8 = rand_int8((m, k))
+    bq_i8 = rand_int8((n, k)).t()
+
+    aq_i32 = aq_i8.to(dtype=torch.int32)
+    bq_i32 = bq_i8.to(dtype=torch.int32)
+
+    aq_f32 = aq_i8.to(dtype=torch.float32)
+    bq_f32 = bq_i8.to(dtype=torch.float32)
+
+    b_dq = scale_b * bq_f32
+
+    azp_a = torch.rand((1, ), device="cuda", dtype=torch.float32) * 10 + 1.5
+    azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8)
+    azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding
+
+    a_dq = scale_a * (aq_i32 + azp_aq_i8).to(dtype=torch.float32)
+    torch.testing.assert_close(a_dq, scale_a * aq_f32 + azp_a)
+
+    baseline_dq = torch.mm(a_dq, b_dq).to(out_dtype)
+
+    J = torch.ones((1, k), device="cuda", dtype=torch.float32)
+    azp_bias = (azp_a * scale_b * (J @ bq_f32)).to(out_dtype)
+    assert azp_bias.shape == (1, n)
+    assert azp_bias[0, :].shape == (n, )
+
+    baseline_q = (scale_a.to(device='cpu') * scale_b.to(device='cpu') * (
+        (aq_i32 + azp_aq_i8).to(device='cpu') @ bq_i32.to(device='cpu'))).to(
+            dtype=out_dtype, device='cuda')
+
+    out = ops.cutlass_scaled_mm(aq_i8,
+                                bq_i8,
+                                scale_a,
+                                scale_b,
+                                out_dtype=out_dtype,
+                                bias=azp_bias[0, :])
+    torch.testing.assert_close(out, baseline_dq, rtol=1e-2, atol=1e0)
+    torch.testing.assert_close(out, baseline_q, rtol=1e-2, atol=1e0)
+
+
+@pytest.mark.parametrize("m", [32, 64, 128])
+@pytest.mark.parametrize("n", [16, 32, 64])
+@pytest.mark.parametrize("k", [64, 128, 256])
+@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("azp_per_token", [True, False])
+def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
+                          use_bias: bool, azp_per_token: bool):
+    m_azp = m if azp_per_token else 1
+    scale_a = torch.randn((m_azp, 1), device="cuda", dtype=torch.float32) / 10
+    scale_b = torch.randn((1, n), device="cuda", dtype=torch.float32) / 10
+
+    aq_i8 = rand_int8((m, k))
+    aq_i32 = aq_i8.to(dtype=torch.int32)
+    aq_f32 = aq_i8.to(dtype=torch.float32)
+
+    bq_i8 = rand_int8((n, k)).t()
+    bq_i32 = bq_i8.to(dtype=torch.int32)
+    bq_f32 = bq_i8.to(dtype=torch.float32)
+    b_dq = scale_b * bq_f32
+
+    azp_a = torch.rand(
+        (m_azp, 1), device="cuda", dtype=torch.float32) * 10 + 1.5
+    azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8)
+    azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding
+
+    a_dq = scale_a * (aq_i32 - azp_aq_i8).to(dtype=torch.float32)
+    torch.testing.assert_close(a_dq,
+                               scale_a * aq_f32 - azp_a,
+                               rtol=1e-4,
+                               atol=1e-3)
+
+    if use_bias:
+        bias = torch.rand((1, n), device="cuda", dtype=out_dtype) * 10 + 2.5
+    else:
+        bias = torch.zeros((1, n), device="cuda", dtype=out_dtype)
+
+    baseline_dq = (torch.mm(a_dq, b_dq) + bias).to(out_dtype)
+
+    # int32 mm not supported on CUDA
+    a_noazp_i32_cpu = (aq_i32 - azp_aq_i8).to(device='cpu')
+    cq = (a_noazp_i32_cpu @ bq_i32.to(device='cpu')).to(device='cuda')
+    baseline_q = (scale_a * scale_b * cq + bias).to(dtype=out_dtype)
+
+    # Hadamard is just the sum of the cols: reduce bq over k (dim 0) -> (1, n)
+    azp_adj_i32 = bq_i32.sum(dim=0, keepdim=True, dtype=torch.int32)
+    azp_i32 = azp_aq_i8.to(dtype=torch.int32)
+    func_bias = bias if use_bias else None
+
+    if azp_per_token:
+        out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b,
+                                        out_dtype, azp_adj_i32, azp_i32,
+                                        func_bias)
+    else:
+        azp_with_adj_i32 = azp_i32 * azp_adj_i32
+        out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b,
+                                        out_dtype, azp_with_adj_i32, None,
+                                        func_bias)
+
+    # bfloat16 precision is 7-bit mantissa -> 2^-8 ~ 0.4%
+    # float16 precision is 10-bit mantissa -> 2^-11 ~ 0.05%
+    rtol = 1e-2 if out_dtype == torch.bfloat16 else 1e-3
+    atol = 1e-3
+    torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
+    torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
+
+
 # Test working with a subset of A and B
 def test_cutlass_subset():
    big_m, big_n, big_k = 1024, 1024, 1024
@@ -245,7 +366,7 @@ def test_cutlass_subset():
                                  scale_b,
                                  out_dtype=torch.bfloat16)

-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)


 # Test to make sure cuda graphs work
@@ -293,4 +414,4 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):

    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                        scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16)
-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -4,8 +4,6 @@ Tests:
 * E2E test of Encoder attention + Decoder self-attention +
      Encoder/decoder cross-attention (collectively
      "encoder/decoder attention")
-* Confirm enc/dec models will fail for chunked prefill
-* Confirm enc/dec models will fail for prefix caching

 """

@@ -15,19 +13,22 @@ import pytest
 import torch

 from tests.kernels.utils import *
-from tests.kernels.utils import make_causal_mask, maybe_make_long_tensor
-from vllm.attention import Attention, AttentionMetadata
-from vllm.attention.backends.abstract import AttentionBackend, AttentionType
+from vllm.attention import (Attention, AttentionBackend, AttentionMetadata,
+                            AttentionType)
 from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
+from vllm.attention.selector import (_Backend,
+                                     global_force_attn_backend_context_manager)
 from vllm.utils import is_hip

+# List of supported backends for encoder/decoder models
+LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS]
+
 HEAD_SIZES = [64, 256]

 NUM_HEADS = [1, 16]

 BATCH_SIZES = [1, 16]
 BLOCK_SIZES = [16]
-BACKEND_NAMES = [STR_XFORMERS_ATTN_VAL]
 CUDA_DEVICE = "cuda:0"

 MAX_DEC_SEQ_LENS = [128]
@@ -724,23 +725,58 @@ def _run_encoder_decoder_cross_attention_test(
 @pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
+@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
 @pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
 @pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
-def test_encoder_only(num_heads: int, head_size: int, backend_name: str,
-                      batch_size: int, block_size: int, max_dec_seq_len: int,
-                      max_enc_seq_len: int, monkeypatch):
+def test_encoder_only(
+    num_heads: int,
+    head_size: int,
+    attn_backend: _Backend,
+    batch_size: int,
+    block_size: int,
+    max_dec_seq_len: int,
+    max_enc_seq_len: int,
+):
+    '''
+    End-to-end encoder-only attention test:
+
+    * Construct fake test vectors for (1) encoder attention
+    * Construct (1) attention metadata structure with prefill-phase
+      encoder attention, and (2) an analogous attention metadata
+      structure but for decode-phase
+    * Test & validate encoder attention against ideal output
+
+    No KV cache is required for encoder-only attention.
+
+    Note on ROCm/HIP: currently encoder/decoder models are not supported on
+    AMD GPUs, therefore this test is simply skipped if is_hip().
+
+    This test globally forces an override of the usual backend
+    auto-selection process, forcing the specific backend-under-test
+    to be utilized.
+
+    Arguments:
+
+    * num_heads
+    * head_size
+    * attn_backend: The attention backend to employ for testing
+    * batch_size
+    * block_size: KV cache block size
+    * max_dec_seq_len: max length of decoder input sequences
+    * max_enc_seq_len: max length of encoder input sequences
+    '''

    # Force Attention wrapper backend
-    override_backend_env_variable(monkeypatch, backend_name)
+    with global_force_attn_backend_context_manager(attn_backend):

        # Note: KV cache size of 4096 is arbitrary & chosen intentionally
        # to be more than necessary, since exceeding the kv cache size
        # is not part of this test
-    test_pt = TestPoint(num_heads, head_size, backend_name, batch_size,
-                        block_size, max_dec_seq_len, max_enc_seq_len, 4096)
+        test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                            batch_size, block_size, max_dec_seq_len,
+                            max_enc_seq_len, 4096)

        # Attention scale factor, attention backend instance, attention wrapper
        # instance, KV cache init
@@ -774,7 +810,7 @@ def test_encoder_only(num_heads: int, head_size: int, backend_name: str,
 @pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
+@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
 @pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
@@ -782,12 +818,11 @@ def test_encoder_only(num_heads: int, head_size: int, backend_name: str,
 def test_e2e_enc_dec_attn(
    num_heads: int,
    head_size: int,
-    backend_name: str,
+    attn_backend: _Backend,
    batch_size: int,
    block_size: int,
    max_dec_seq_len: int,
    max_enc_seq_len: int,
-    monkeypatch,
 ) -> None:
    '''
    End-to-end encoder/decoder test:
@@ -820,8 +855,9 @@ def test_e2e_enc_dec_attn(
    cross-attention K/Vs are allowed to differ in seq len, as is often the case
    for cross-attention.

-    This test utilizes PyTest monkey patching to force the attention backend
-    via an environment variable.
+    This test globally forces an override of the usual backend
+    auto-selection process, forcing the specific backend-under-test
+    to be utilized.

    Note on ROCm/HIP: currently encoder/decoder models are not supported on
    AMD GPUs, therefore this test simply is skipped if is_hip(). 
@@ -830,23 +866,34 @@ def test_e2e_enc_dec_attn(
    all prefill-phase attention operations (encoder, decoder, enc/dec cross), 
    and a single one shared by all decode-phase attention operations
    (decoder & enc/dec cross.) This is intended to reflect the behavior
-    of ModelRunner, which constructs a single attention metadata structure for
-    each prefill or decode run. A realistic scenario would rely on the
-    attention backend to utilize the appropriate attention metadata fields
-    according to the value of attn_metadata.attention_type. Thus, this test is
-    organized so as to confirm that the backend-under-test can handle a
-    shared prefill attention metadata structure & a shared decode attention
-    metadata structure.
+    of EncoderDecoderModelRunner, which constructs a single attention metadata
+    structure for each prefill or decode run. A realistic scenario would rely
+    on the attention backend to utilize the appropriate attention metadata
+    fields according to the value of attn_metadata.attention_type. Thus,
+    this test is organized so as to confirm that the backend-under-test can
+    handle a shared prefill attention metadata structure & a shared decode
+    attention metadata structure.
+
+    Arguments:
+
+    * num_heads
+    * head_size
+    * attn_backend: The attention backend to employ for testing
+    * batch_size
+    * block_size: KV cache block size
+    * max_dec_seq_len: max length of decoder input sequences
+    * max_enc_seq_len: max length of encoder input sequences
    '''

    # Force Attention wrapper backend
-    override_backend_env_variable(monkeypatch, backend_name)
+    with global_force_attn_backend_context_manager(attn_backend):

        # Note: KV cache size of 4096 is arbitrary & chosen intentionally
        # to be more than necessary, since exceeding the kv cache size
        # is not part of this test
-    test_pt = TestPoint(num_heads, head_size, backend_name, batch_size,
-                        block_size, max_dec_seq_len, max_enc_seq_len, 4096)
+        test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                            batch_size, block_size, max_dec_seq_len,
+                            max_enc_seq_len, 4096)

        # Attention scale factor, attention backend instance, attention wrapper
        # instance, KV cache init
@@ -870,8 +917,9 @@ def test_e2e_enc_dec_attn(
            cross_block_base_addr,
        ) = _decoder_attn_setup(test_pt, test_rsrcs)

-    # Construct encoder/decoder cross-attention prefill-phase & decode-phase
-    # test params, including key/value tensors, cross-attention memory-mapping
+        # Construct encoder/decoder cross-attention prefill-phase
+        # & decode-phase test params, including key/value tensors,
+        # cross-attention memory-mapping

        (
            prephase_cross_test_params,

--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -2,13 +2,16 @@ from typing import List, Optional, Tuple

 import pytest
 import torch
-from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache

-NUM_HEADS = [(16, 16), (32, 8), (64, 8)]
+import vllm.attention.backends.flash_attn  # noqa: F401
+
+NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
 DTYPES = [torch.float16, torch.bfloat16]
-NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
+# One value large enough to test overflow in index calculation and
+# one value small enough to also run the opcheck schema test.
+NUM_BLOCKS = [32768, 2048]


 def ref_paged_attn(
@@ -72,6 +75,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @torch.inference_mode()
 def test_flash_attn_with_paged_kv(
    kv_lens: List[int],
@@ -80,6 +84,7 @@ def test_flash_attn_with_paged_kv(
    dtype: torch.dtype,
    block_size: int,
    soft_cap: Optional[float],
+    num_blocks: int,
 ) -> None:
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
@@ -91,7 +96,7 @@ def test_flash_attn_with_paged_kv(
    scale = head_size**-0.5

    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
-    key_cache = torch.randn(NUM_BLOCKS,
+    key_cache = torch.randn(num_blocks,
                            block_size,
                            num_kv_heads,
                            head_size,
@@ -101,14 +106,14 @@ def test_flash_attn_with_paged_kv(

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
-                                 NUM_BLOCKS,
+                                 num_blocks,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

-    output = flash_attn_with_kvcache(
-        q=query.unsqueeze(1),
-        k_cache=key_cache,
-        v_cache=value_cache,
+    output = torch.ops.vllm.flash_attn_with_kvcache(
+        decode_query=query.unsqueeze(1),
+        key_cache=key_cache,
+        value_cache=value_cache,
        softmax_scale=scale,
        causal=True,
        block_table=block_tables,
@@ -116,6 +121,25 @@ def test_flash_attn_with_paged_kv(
        softcap=soft_cap if soft_cap is not None else 0,
    ).squeeze(1)

+    if num_blocks <= 2048:
+        test_utils = ["test_faketensor", "test_schema"]
+    else:
+        test_utils = ["test_faketensor"]
+
+    torch.library.opcheck(torch.ops.vllm.flash_attn_with_kvcache,
+                          args=tuple(),
+                          kwargs=dict(
+                              decode_query=query.unsqueeze(1),
+                              key_cache=key_cache,
+                              value_cache=value_cache,
+                              softmax_scale=scale,
+                              causal=True,
+                              block_table=block_tables,
+                              cache_seqlens=kv_lens_tensor,
+                              softcap=soft_cap if soft_cap is not None else 0,
+                          ),
+                          test_utils=test_utils)
+
    ref_output = ref_paged_attn(
        query=query,
        key_cache=key_cache,
@@ -126,7 +150,7 @@ def test_flash_attn_with_paged_kv(
        scale=scale,
        soft_cap=soft_cap,
    )
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"


@@ -137,6 +161,7 @@ def test_flash_attn_with_paged_kv(
 @pytest.mark.parametrize("sliding_window", [None])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @torch.inference_mode()
 def test_varlen_with_paged_kv(
    seq_lens: List[Tuple[int, int]],
@@ -146,6 +171,7 @@ def test_varlen_with_paged_kv(
    dtype: torch.dtype,
    block_size: int,
    soft_cap: Optional[float],
+    num_blocks: int,
 ) -> None:
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
@@ -166,7 +192,7 @@ def test_varlen_with_paged_kv(
                        num_query_heads,
                        head_size,
                        dtype=dtype)
-    key_cache = torch.randn(NUM_BLOCKS,
+    key_cache = torch.randn(num_blocks,
                            block_size,
                            num_kv_heads,
                            head_size,
@@ -181,11 +207,11 @@ def test_varlen_with_paged_kv(

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
-                                 NUM_BLOCKS,
+                                 num_blocks,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

-    output = flash_attn_varlen_func(
+    output = torch.ops.vllm.flash_attn_varlen_func(
        q=query,
        k=key_cache,
        v=value_cache,
@@ -200,6 +226,29 @@ def test_varlen_with_paged_kv(
        softcap=soft_cap if soft_cap is not None else 0,
    )

+    if num_blocks <= 2048:
+        test_utils = ["test_faketensor", "test_schema"]
+    else:
+        test_utils = ["test_faketensor"]
+
+    torch.library.opcheck(torch.ops.vllm.flash_attn_varlen_func,
+                          args=tuple(),
+                          kwargs=dict(
+                              q=query,
+                              k=key_cache,
+                              v=value_cache,
+                              cu_seqlens_q=cu_query_lens,
+                              cu_seqlens_k=cu_kv_lens,
+                              max_seqlen_q=max_query_len,
+                              max_seqlen_k=max_kv_len,
+                              softmax_scale=scale,
+                              causal=True,
+                              window_size=window_size,
+                              block_table=block_tables,
+                              softcap=soft_cap if soft_cap is not None else 0,
+                          ),
+                          test_utils=test_utils)
+
    ref_output = ref_paged_attn(
        query=query,
        key_cache=key_cache,
@@ -211,5 +260,5 @@ def test_varlen_with_paged_kv(
        sliding_window=sliding_window,
        soft_cap=soft_cap,
    )
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -4,7 +4,7 @@ import flashinfer
 import pytest
 import torch

-NUM_HEADS = [(16, 16), (32, 8), (64, 8)]
+NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
 HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
 DTYPES = [torch.float16, torch.bfloat16]
@@ -123,7 +123,10 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.\
-        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
+                use_tensor_cores=(
+                    (num_query_heads//num_kv_heads) not in (1, 2, 4, 8))
+                )
    wrapper.begin_forward(kv_indptr,
                          kv_indices,
                          kv_last_page_lens,
@@ -144,7 +147,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"


@@ -244,5 +247,5 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
--- a/tests/kernels/test_fp8_quant.py
+++ b/tests/kernels/test_fp8_quant.py
@@ -2,7 +2,8 @@ import pytest
 import torch

 import vllm._custom_ops as ops
-from tests.kernels.quant_utils import (ref_dynamic_per_tensor_fp8_quant,
+from tests.kernels.quant_utils import (FP8_DTYPE,
+                                       ref_dynamic_per_tensor_fp8_quant,
                                       ref_dynamic_per_token_quant)

 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -31,14 +32,13 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,

    scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \
            if scale_ub else None
-    ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.float8_e4m3fn,
-                                                      scale_ub)
+    ref_out, ref_scales = ref_dynamic_per_token_quant(x, FP8_DTYPE, scale_ub)
    ops_out, ops_scales = ops.scaled_fp8_quant(x,
                                               scale_ub=scale_ub,
                                               use_per_token_if_dynamic=True)

-    assert torch.allclose(ref_scales, ops_scales)
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
+    torch.testing.assert_close(ref_scales, ops_scales)
+    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
                               ops_out.to(dtype=torch.float32))


@@ -57,8 +57,8 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
    ref_out, ref_scale = ref_dynamic_per_tensor_fp8_quant(x)
    ops_out, ops_scale = ops.scaled_fp8_quant(x)

-    assert torch.allclose(ref_scale, ops_scale)
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
+    torch.testing.assert_close(ref_scale, ops_scale)
+    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
                               ops_out.to(dtype=torch.float32))


@@ -84,4 +84,4 @@ def test_fp8_quant_large(seed: int) -> None:
    ref_out = ref_out.to(dtype=dtype)
    ops_out = ops_out.to(dtype=dtype)

-    assert torch.allclose(ref_out, ops_out)
+    torch.testing.assert_close(ref_out, ops_out)
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -29,9 +29,10 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
    # kernel
    ops_out, ops_scales = scaled_int8_quant(x)

-    assert torch.allclose(ops_scales, ref_scales)
-    assert torch.allclose(ops_out, ref_out,
-                          atol=1)  # big atol to account for rounding errors
+    torch.testing.assert_close(ops_scales, ref_scales)
+    torch.testing.assert_close(
+        ops_out, ref_out, atol=1,
+        rtol=0.0)  # big atol to account for rounding errors


 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -54,5 +55,6 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                     int8_traits.max).to(torch.int8)
    out2, _ = scaled_int8_quant(x, scale)

-    assert torch.allclose(out1, out2,
-                          atol=1)  # big atol to account for rounding errors
+    torch.testing.assert_close(
+        out1, out2, atol=1,
+        rtol=0.0)  # big atol to account for rounding errors
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -48,7 +48,7 @@ def test_rms_norm(
    # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
    if add_residual:
-        assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
-        assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
    else:
-        assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_machete_gemm.py
+++ b/tests/kernels/test_machete_gemm.py
+"""Tests for the machete kernel.
+
+Run `pytest tests/kernels/test_machete_gemm.py`.
+"""
+
+import math
+from typing import Optional, Tuple
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_rows, quantize_weights)
+from vllm.platforms import current_platform
+from vllm.scalar_type import ScalarType, scalar_types
+
+# CUDA devices used by the device-parametrized test: only "cuda:0" when
+# exactly one device is reported, otherwise "cuda:0" and "cuda:1".
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+# Problem sizes; each tuple is unpacked as (m, n, k) by the parametrized tests.
+# NOTE(review): (1, 4096, 4096) is listed twice -- confirm whether the
+# duplicate entry is intentional.
+MNK_SHAPES = [
+    (1, 128, 128),
+    (1, 512, 1024),
+    (1, 4096, 4096),
+    (13, 8192, 4096),
+    (26, 4096, 8192),
+    (1, 4096, 4096),
+    (257, 128, 4096),
+    (257, 4224, 4160),
+    (257, 4096, 4096),
+    (64, 4096, 4096),
+]
+
+# dtypes used to generate the random input tensors in the parametrized tests.
+ACT_TYPES = [torch.float16, torch.bfloat16]
+# (weight scalar type, whether zero points are used) pairs.
+WTYPE_ZEROPOINTS = [
+    # GPTQ style
+    (scalar_types.uint4b8, False),
+    (scalar_types.uint8b128, False),
+    # AWQ style
+    (scalar_types.uint4, True),
+    (scalar_types.uint8, True),
+]
+
+# TODO: in a future PR, refactor this and `is_quant_method_supported` in the
+#  kernel unit tests into a common utility function. Currently the use of
+#  `is_quant_method_supported` conflates kernels with quantization methods,
+#  an assumption which is breaking down as quantization methods can have
+#  multiple kernels and some kernels support multiple quantization methods.
+# True when the first component of the reported device capability is >= 9.
+IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
+
+
+def rand_data(shape, dtype=torch.float16):
+    """Return a random tensor of `shape` and `dtype` on the "cuda" device.
+
+    The values are `torch.rand` samples shifted by -0.3 and scaled by 10.
+    """
+    uniform = torch.rand(shape, dtype=dtype, device="cuda")
+    return 10 * (uniform - 0.3)
+
+
+def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor):
+    """Return `-1 * s * zps` with `zps` cast to the dtype of `s`.
+
+    `None` is passed through unchanged so callers can forward the result
+    directly when no zero points are in use.
+    """
+    if zps is None:
+        return zps
+    return -1 * s * (zps.to(s.dtype))
+
+
+def machete_quantize_and_pack(w: torch.Tensor,
+                              wtype: ScalarType,
+                              group_size: int,
+                              zero_points: bool = False):
+    """Quantize `w` and prepack the quantized weights for machete.
+
+    Returns a 4-tuple `(w_ref, w_q_machete, w_s, w_zp)`. The first, third and
+    fourth entries are returned by `quantize_weights` unchanged; the second is
+    the quantized tensor after `pack_rows`, a transpose/contiguous/transpose
+    round trip, and `ops.machete_prepack_B`.
+
+    Raises:
+        AssertionError: if `wtype.is_integer()` is false.
+    """
+    assert wtype.is_integer(), "TODO: support floating point weights"
+
+    # ref_zero_points_after_scales=True is requested to match how the kernel
+    # applies zps.
+    w_ref, quantized, w_s, w_zp = quantize_weights(
+        w,
+        wtype,
+        group_size,
+        zero_points=zero_points,
+        ref_zero_points_after_scales=True)
+
+    packed = pack_rows(quantized, wtype.size_bits, *quantized.shape)
+    col_major = packed.t().contiguous().t()  # convert to col major
+    prepacked = ops.machete_prepack_B(col_major, wtype)
+
+    return w_ref, prepacked, w_s, w_zp
+
+
+def machete_gemm_test_helper(a: torch.Tensor, b: torch.Tensor,
+                             wtype: ScalarType, group_size: int,
+                             zero_points: bool):
+    """Quantize `b`, run `ops.machete_gemm` on `a`, and check the result.
+
+    The expected value is `torch.matmul(a, w_ref)`, where `w_ref` is the
+    reference weight tensor returned by `machete_quantize_and_pack`. The
+    comparison raises via `torch.testing.assert_close` on mismatch.
+    """
+    ref_weights, packed_weights, scales, zps = machete_quantize_and_pack(
+        b, wtype, group_size, zero_points)
+
+    expected = torch.matmul(a, ref_weights)
+
+    actual = ops.machete_gemm(
+        a=a,
+        b_q=packed_weights,
+        b_type=wtype,
+        b_scales=scales,
+        b_zeros=maybe_convert_zeropoints(zps, scales),
+        b_group_size=group_size,
+    )
+
+    # Relax atol as our reduction dim becomes larger (more rounding error)
+    # Relax atol when we have zeropoints since the way machete applies
+    #  zeropoints (after scales) causes noise around 0
+    if zero_points:
+        atol = 1
+    else:
+        atol = min(5e-2 * math.sqrt(a.shape[1]), 1)
+    torch.testing.assert_close(actual, expected, rtol=1e-1, atol=atol)
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="Machete is not supported on this GPU type.")
+@pytest.mark.parametrize("shape",
+                         MNK_SHAPES,
+                         ids=lambda x: "x".join(str(v) for v in x))
+@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
+@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
+@pytest.mark.parametrize("group_size", [128, None])
+def test_machete_all_schedules(shape, atype: torch.dtype,
+                               wtype_zeropoints: Tuple[ScalarType, bool],
+                               group_size: Optional[int]):
+    """Check every schedule reported for `wtype` against a dense matmul.
+
+    For each schedule returned by `ops.machete_supported_schedules(wtype)`,
+    `ops.machete_gemm` is run with that schedule and compared to
+    `torch.matmul(a, w_ref)`. A `group_size` of None is replaced by `k`; the
+    test returns early when `k` is not a multiple of `group_size`.
+    """
+    m, n, k = shape
+    wtype, zero_points = wtype_zeropoints
+
+    if group_size is not None and k % group_size != 0:
+        return
+
+    print(f"MNK = {m} {n} {k}")
+
+    # Normalize group_size
+    if group_size is None:
+        group_size = k
+    assert group_size <= k
+
+    a = rand_data((m, k), atype)
+    w = rand_data((k, n), atype)
+
+    w_ref, w_q_machete, w_s, w_zp = machete_quantize_and_pack(
+        w, wtype, group_size, zero_points)
+
+    output_ref = torch.matmul(a, w_ref)
+
+    # Relax atol as our reduction dim becomes larger (more rounding error)
+    # Relax atol when we have zeropoints since the way machete applies
+    #  zeropoints (after scales) causes noise around 0
+    # The tolerance does not depend on the schedule, so compute it once.
+    atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
+
+    for schedule in ops.machete_supported_schedules(wtype):
+        output = ops.machete_gemm(
+            a,
+            b_q=w_q_machete,
+            b_type=wtype,
+            b_scales=w_s,
+            b_zeros=maybe_convert_zeropoints(w_zp, w_s),
+            b_group_size=group_size,
+            schedule=schedule,
+        )
+
+        # Pass the schedule name through `msg` so it shows up in the failure
+        # report; a trailing ", f'...'" after the call would only build a
+        # discarded tuple. The callable form keeps the generated mismatch
+        # details and prefixes them with the schedule.
+        torch.testing.assert_close(
+            output,
+            output_ref,
+            rtol=1e-1,
+            atol=atol,
+            msg=lambda details: f"Schedule failed {schedule}\n{details}")
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="Machete is not supported on this GPU type.")
+@pytest.mark.parametrize("shape",
+                         MNK_SHAPES,
+                         ids=lambda x: "x".join(str(v) for v in x))
+@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
+@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
+@pytest.mark.parametrize("group_size", [128, None])
+def test_machete_heuristic(shape, atype: torch.dtype,
+                           wtype_zeropoints: Tuple[ScalarType, bool],
+                           group_size: Optional[int]):
+    """Run `machete_gemm_test_helper` without naming a schedule.
+
+    A `group_size` of None is replaced by `k`; the test returns early when
+    `k` is not a multiple of an explicit `group_size`.
+    """
+    m, n, k = shape
+    wtype, zero_points = wtype_zeropoints
+
+    if group_size is None:
+        # Normalize group_size
+        group_size = k
+    elif k % group_size != 0:
+        return
+    assert group_size <= k
+
+    activations = rand_data((m, k), atype)
+    weights = rand_data((k, n), atype)
+
+    machete_gemm_test_helper(activations, weights, wtype, group_size,
+                             zero_points)
+
+
+# Test working on other devices
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="Machete is not supported on this GPU type.")
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_machete_devices(device: str):
+    """Run the gemm check with both inputs moved to `device`.
+
+    Uses fixed sizes (m=512, n=4096, k=4096), weight type `uint4b8`, a group
+    size of 128 and no zero points.
+    """
+    m, n, k = 512, 4096, 4096
+
+    print(f"MNK = {m} {n} {k}, device = {device}")
+
+    # rand_data allocates on "cuda"; the tensors are then moved to `device`.
+    a = rand_data((m, k), torch.float16).to(device)
+    b = rand_data((k, n), torch.float16).to(device)
+
+    machete_gemm_test_helper(a,
+                             b,
+                             wtype=scalar_types.uint4b8,
+                             group_size=128,
+                             zero_points=False)
+
+
+# Test working with a subset of A and B
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="Machete is not supported on this GPU type.")
+def test_machete_subset():
+    """Run the gemm check on slices of larger tensors.
+
+    Both inputs are the leading 512x512 slices of 1024x1024 random tensors,
+    so the helper receives views rather than freshly allocated tensors.
+    """
+    big_m, big_n, big_k = 1024, 1024, 1024
+    m, n, k = 512, 512, 512
+
+    whole_a = rand_data((big_m, big_k), torch.float16)
+    whole_b = rand_data((big_k, big_n), torch.float16)
+
+    sub_a = whole_a[:m, :k]
+    sub_b = whole_b[:k, :n]
+
+    machete_gemm_test_helper(sub_a,
+                             sub_b,
+                             wtype=scalar_types.uint4b8,
+                             group_size=128,
+                             zero_points=False)
+
+
+# Test to make sure cuda graphs work
+class MacheteLayer(torch.nn.Module):
+    """Minimal module whose forward pass is a single `ops.machete_gemm` call.
+
+    The keyword arguments given to the constructor are stored and forwarded
+    unchanged to `ops.machete_gemm` on every forward call.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.kwargs = kwargs
+
+    def forward(self, a):
+        # NOTE(review): `a` is not used here; the call is built only from the
+        # stored kwargs -- confirm this is intentional.
+        return ops.machete_gemm(**self.kwargs)
+
+
+@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
+                    reason="Machete is not supported on this GPU type.")
+def test_machete_cuda_graph():
+    """Capture a `MacheteLayer` call in a CUDA graph and check its replay.
+
+    One forward call is captured on a side stream, the captured output tensor
+    is zeroed, the graph is replayed, and the output is then compared with
+    `torch.matmul(a, w_ref)`.
+    """
+    m, n, k = 512, 4096, 4096
+
+    a = rand_data((m, k), torch.float16)
+    b = rand_data((k, n), torch.float16)
+    wtype = scalar_types.uint4b8
+    group_size = 128
+    zero_points = False
+
+    w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
+        b, wtype, group_size, zero_points)
+
+    # Construct a trivial model with a single layer that calls a machete kernel
+    model = MacheteLayer(
+        a=a,
+        b_q=w_q_packed,
+        b_type=wtype,
+        b_scales=w_s,
+        b_zeros=maybe_convert_zeropoints(w_zp, w_s),
+        b_group_size=group_size,
+    )
+
+    output_ref = torch.matmul(a, w_ref)
+
+    # Run the model with a cuda graph
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            output = model(a)
+    # Zero the captured output before replaying, so the comparison below is
+    # made against whatever the replay writes into it (presumably the point
+    # of this step -- confirm).
+    output.zero_()
+    g.replay()
+
+    # Relax atol as our reduction dim becomes larger (more rounding error)
+    # Relax atol when we have zeropoints since the way machete applies
+    #  zeropoints (after scales) causes noise around 0
+    atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
+    torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -122,7 +122,7 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
    )
    torch.cuda.synchronize()

-    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)


 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
@@ -174,7 +174,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
    )
    torch.cuda.synchronize()

-    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)


 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),

--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -50,7 +50,7 @@ def test_fused_moe(
    score = torch.randn((m, e), device='cuda', dtype=dtype)
    triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
    torch_output = torch_moe(a, w1, w2, score, topk)
-    assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
+    torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)


 @pytest.mark.parametrize("dtype",
@@ -95,7 +95,7 @@ def test_mixtral_moe(dtype: torch.dtype):
        torch.bfloat16: 1e-2,
    }

-    assert torch.allclose(hf_states.flatten(0, 1),
+    torch.testing.assert_close(hf_states.flatten(0, 1),
                               vllm_states,
                               rtol=mixtral_moe_tol[dtype],
                               atol=mixtral_moe_tol[dtype])
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -67,11 +67,11 @@ def test_rotary_embedding(
    ref_query, ref_key = rope.forward_native(positions, query, key)
    out_query, out_key = rope.forward(positions, query, key)
    # Compare the results.
-    assert torch.allclose(out_query,
+    torch.testing.assert_close(out_query,
                               ref_query,
                               atol=get_default_atol(out_query),
                               rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
+    torch.testing.assert_close(out_key,
                               ref_key,
                               atol=get_default_atol(out_key),
                               rtol=get_default_rtol(out_key))
@@ -129,11 +129,11 @@ def test_batched_rotary_embedding(
                                                          dtype=torch.long,
                                                          device=device))
    # Compare the results.
-    assert torch.allclose(out_query,
+    torch.testing.assert_close(out_query,
                               ref_query,
                               atol=get_default_atol(out_query),
                               rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
+    torch.testing.assert_close(out_key,
                               ref_key,
                               atol=get_default_atol(out_key),
                               rtol=get_default_rtol(out_key))
@@ -200,11 +200,11 @@ def test_batched_rotary_embedding_multi_lora(
    out_query, out_key = rope.forward(positions, query, key,
                                      query_offsets.flatten())
    # Compare the results.
-    assert torch.allclose(out_query,
+    torch.testing.assert_close(out_query,
                               ref_query,
                               atol=get_default_atol(out_query),
                               rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
+    torch.testing.assert_close(out_key,
                               ref_key,
                               atol=get_default_atol(out_key),
                               rtol=get_default_rtol(out_key))

--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -10,6 +10,7 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
 from vllm.attention.backends.xformers import _make_alibi_bias
 from vllm.attention.ops.prefix_prefill import context_attention_fwd
 from vllm.utils import  is_hip
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 8, 64]
@@ -19,12 +20,14 @@ CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
 SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
+KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"]


 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
 @torch.inference_mode()
@@ -34,6 +37,7 @@ def test_contexted_kv_attention(
    head_size: int,
    sliding_window: int,
    dtype: torch.dtype,
+    kv_cache_dtype: str,
    device: str,
 ) -> None:
    random.seed(0)
@@ -68,16 +72,20 @@ def test_contexted_kv_attention(
    kv.uniform_(-1e-3, 1e-3)
    key, value = kv.unbind(dim=1)

+    if kv_cache_dtype == "auto":
+        cache_dtype = dtype
+    else:
+        cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
    k_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
-                          dtype=dtype)
+                          dtype=cache_dtype)
    v_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
-                          dtype=dtype)
+                          dtype=cache_dtype)
    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
    values = torch.arange(0, cache_size, dtype=torch.long)
@@ -133,6 +141,7 @@ def test_contexted_kv_attention(
                          k,
                          v,
                          output,
+                          kv_cache_dtype,
                          k_cache,
                          v_cache,
                          block_table,
@@ -147,6 +156,7 @@ def test_contexted_kv_attention(
                          k,
                          v,
                          output,
+                          kv_cache_dtype,
                          k_cache,
                          v_cache,
                          block_table,
@@ -210,13 +220,15 @@ def test_contexted_kv_attention(
        end_time = time.time()
        print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
        output_ref = output_ref.reshape(output.shape)
-        assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
+        atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
+        torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)


 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_contexted_kv_attention_alibi(
@@ -224,6 +236,7 @@ def test_contexted_kv_attention_alibi(
    num_queries_per_kv: int,
    head_size: int,
    dtype: torch.dtype,
+    kv_cache_dtype: str,
    device: str,
 ) -> None:
    random.seed(0)
@@ -284,17 +297,20 @@ def test_contexted_kv_attention_alibi(
    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
    kv.uniform_(-1e-3, 1e-3)
    key, value = kv.unbind(dim=1)
-
+    if kv_cache_dtype == "auto":
+        cache_dtype = dtype
+    else:
+        cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
    k_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
-                          dtype=dtype)
+                          dtype=cache_dtype)
    v_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
-                          dtype=dtype)
+                          dtype=cache_dtype)
    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
    values = torch.arange(0, cache_size, dtype=torch.long)
@@ -350,6 +366,7 @@ def test_contexted_kv_attention_alibi(
                          k,
                          v,
                          output,
+                          kv_cache_dtype,
                          k_cache,
                          v_cache,
                          block_table,
@@ -364,6 +381,7 @@ def test_contexted_kv_attention_alibi(
                          k,
                          v,
                          output,
+                          kv_cache_dtype,
                          k_cache,
                          v_cache,
                          block_table,
@@ -420,6 +438,54 @@ def test_contexted_kv_attention_alibi(
        key = key.unsqueeze(0)
        value = value.unsqueeze(0)

+        attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
+        output_ref = torch.empty_like(output)
+        seq_start = 0
+        query_start = 0
+        start_time = time.time()
+        # Attention with alibi slopes.
+        # FIXME(DefTruth): Because xformers does not support dynamic sequence
+        # lengths with custom attention bias, we process each prompt one by
+        # one. This is inefficient, especially when we have many short prompts.
+        # modified from: vllm/attention/backends/xformers.py#L343
+        for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
+            seq_end = seq_start + seq_len
+            query_end = query_start + query_len
+            out = xops.memory_efficient_attention_forward(query[:,
+                                                                seq_start:seq_end],
+                                                        key[:,
+                                                            seq_start:seq_end],
+                                                        value[:,
+                                                                seq_start:seq_end],
+                                                        attn_bias=attn_bias[i],
+                                                        p=0.0,
+                                                        scale=scale)
+            out = out.view_as(query[:, seq_start:seq_end]).view(
+                seq_len, num_heads, head_size)
+            output_ref[query_start:query_end, ...].copy_(out[seq_len - query_len:,
+                                                            ...])
+            seq_start += seq_len
+            query_start += query_len
+        query = query_pad
+
+    if num_kv_heads != num_heads:
+        # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
+        # project the key and value tensors to the desired number of
+        # heads.
+        #
+        # see also: vllm/model_executor/layers/attention.py
+        query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
+                           query.shape[-1])
+        key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
+                                        num_queries_per_kv, key.shape[-1])
+        value = value[:, :,
+                      None, :].expand(value.shape[0], num_kv_heads,
+                                      num_queries_per_kv, value.shape[-1])
+
+    query = query.unsqueeze(0)
+    key = key.unsqueeze(0)
+    value = value.unsqueeze(0)
+
    attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
    output_ref = torch.empty_like(output)
    seq_start = 0
@@ -451,4 +517,5 @@ def test_contexted_kv_attention_alibi(
    torch.cuda.synchronize()
    end_time = time.time()
    print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
-        assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
+    atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
+    torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
--- a/tests/kernels/test_sampler.py
+++ b/tests/kernels/test_sampler.py
@@ -100,11 +100,11 @@ def test_sample_decoding_only(random_sampling, max_best_of,
        if modify_greedy_probs and not request_uses_random_sampling:
            # If we are modifying greedy probs and the request is greedy,
            # we want to make sure the probs tensor is modified in place
-            assert torch.allclose(
+            torch.testing.assert_close(
                probs[i][sampled_tokens[i]],
                torch.full_like(probs[i][sampled_tokens[i]], 1.0))
            assert torch.sum(probs[i]) == 1.0
-            assert torch.allclose(
+            torch.testing.assert_close(
                sampled_modified_probs[i][0],
                torch.full_like(sampled_modified_probs[i][0], 1.0))
        elif request_uses_random_sampling:
@@ -117,7 +117,7 @@ def test_sample_decoding_only(random_sampling, max_best_of,
            # If the request is greedy and we are not modifying greedy probs,
            # we want to make sure sampled_modified_probs tensor is the same as
            # the probs tensor.
-            assert torch.allclose(sampled_modified_probs[i][0],
+            torch.testing.assert_close(sampled_modified_probs[i],
                                       probs[i][sampled_tokens[i]])

    if save_logprobs:

--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -8,24 +8,10 @@ from typing import Any, List, NamedTuple, Optional, Tuple, Union
 import pytest
 import torch

-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata, AttentionType)
+from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
 from vllm.attention.backends.xformers import XFormersBackend
-from vllm.utils import make_tensor_with_pad
-
-# String name of register which may be set in order to
-# force auto-selection of attention backend by Attention
-# wrapper
-STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
-
-# Possible string values of STR_BACKEND_ENV_VAR
-# register, corresponding to possible backends
-STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
-STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA"
-STR_ROCM_FLASH_ATTN_VAL: str = "ROCM_FLASH"
-STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
-STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
-STR_INVALID_VAL: str = "INVALID"
+from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
+                        make_tensor_with_pad)


 class QKVInputs(NamedTuple):
@@ -938,5 +924,5 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
    * output_under_test: actually observed output value
    '''
    ideal_output = test_params.packed_qkvo.ideal_output
-    assert torch.allclose(ideal_output,
+    torch.testing.assert_close(ideal_output,
                               output_under_test.view_as(ideal_output))
--- a/tests/lora/test_layer_variation.py
+++ b/tests/lora/test_layer_variation.py
-import tempfile
-from random import sample
-from typing import List, Optional
-
-import peft
-import pytest
-from transformers import AutoModelForCausalLM
-
-import vllm
-from vllm.lora.request import LoRARequest
-
-from .conftest import cleanup
-
-MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
-PROMPTS = [
-    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
-    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
-    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
-]
-
-
-def get_lora_model(model_id: str, target_modules: List[str], rank: int):
-    model = AutoModelForCausalLM.from_pretrained(model_id)
-    lora_config = peft.tuners.lora.LoraConfig(target_modules, rank)
-    lora_model = peft.PeftModel(model, lora_config)
-    return lora_model
-
-
-def do_sample(llm: vllm.LLM,
-              lora_path: Optional[str] = None,
-              lora_id: Optional[int] = None,
-              logprobs: int = 0,
-              n_tokens: int = 256):
-    prompts = PROMPTS
-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=n_tokens,
-                                          logprobs=logprobs,
-                                          stop=["[/assistant]"])
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
-    # Print the outputs.
-    generated_texts: List[str] = []
-    generated_logprobs: List[List[List[int]]] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-        generated_logprobs.append([
-            list(logprob.keys()) for out in output.outputs
-            for logprob in out.logprobs
-        ])
-    return generated_logprobs if logprobs else generated_texts
-
-
-SUPPORTED_MODULES = [
-    "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
-    "lm_head"
-]
-TARGET_MODULES_LIST = []
-for length in range(2, 6):
-    TARGET_MODULES_LIST.extend(
-        [sample(SUPPORTED_MODULES, length) for _ in range(3)])
-
-
-# Test the correctness when layer and rank are varied
-# step 1: init a base model and serve with LoRA to get the reference results
-# step 2: merge the same LoRA to the base model, serve the merged model
-# step 3: compare the results from step 1 and step 2
-@pytest.mark.parametrize("tp_size", [1])
-@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST)
-@pytest.mark.parametrize("rank", [8, 16, 32, 64])
-def test_layer_variation_correctness(tp_size, target_modules, rank):
-    llm = vllm.LLM(MODEL_PATH,
-                   enable_lora=True,
-                   max_num_seqs=16,
-                   max_loras=4,
-                   tensor_parallel_size=tp_size,
-                   worker_use_ray=True)
-    model = get_lora_model(MODEL_PATH, target_modules, rank)
-    with tempfile.TemporaryDirectory() as tmpdir:
-        model.save_pretrained(tmpdir)
-        merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32)
-    del llm
-    cleanup()
-    reference_id_sets = [set(prob[0]) for prob in merged_probs]
-
-    model = get_lora_model(MODEL_PATH, target_modules, rank)
-    with tempfile.TemporaryDirectory() as tmpdir:
-        merged_model = model.merge_and_unload()
-        merged_model.save_pretrained(tmpdir)
-        llm = vllm.LLM(tmpdir,
-                       tokenizer=MODEL_PATH,
-                       enable_lora=False,
-                       max_num_seqs=16,
-                       tensor_parallel_size=tp_size,
-                       worker_use_ray=True)
-    probs = do_sample(llm, logprobs=5, n_tokens=32)
-    del llm
-    cleanup()
-    # verify the top-5 tokens are identical for each token
-    id_sets = [set(prob[0]) for prob in probs]
-    assert id_sets == reference_id_sets
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -247,7 +247,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -274,7 +274,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
        expected_result = embedding(torch.cat(inputs))

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -384,7 +384,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -411,7 +411,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
        expected_result = expanded_embedding(torch.cat(inputs))

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -420,7 +420,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
 def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
                                  stage) -> None:
@@ -541,7 +541,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
            embedding_bias=None)

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -614,7 +614,7 @@ def test_linear_replicated(dist_init, num_loras, device, stage) -> None:
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -642,7 +642,7 @@ def test_linear_replicated(dist_init, num_loras, device, stage) -> None:
        expected_result = linear(torch.cat(inputs))[0]

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -728,7 +728,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -756,7 +756,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
        expected_result = linear(torch.cat(inputs))[0]

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -868,7 +868,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)
@@ -900,7 +900,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
        expected_result = linear(torch.cat(inputs))[0]

        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
+        torch.testing.assert_close(lora_result,
                                   expected_result,
                                   rtol=rtol,
                                   atol=atol)

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -533,13 +533,13 @@ def test_packed_loras(dist_init, dummy_model_gate_up):
    packed_lora = model_lora.get_lora("gate_up_proj")
    assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)

-    assert torch.allclose(packed_lora.lora_a[0],
+    torch.testing.assert_close(packed_lora.lora_a[0],
                               model_lora.get_lora("gate_proj").lora_a)
-    assert torch.allclose(packed_lora.lora_b[0],
+    torch.testing.assert_close(packed_lora.lora_b[0],
                               model_lora.get_lora("gate_proj").lora_b)
-    assert torch.allclose(packed_lora.lora_a[1],
+    torch.testing.assert_close(packed_lora.lora_a[1],
                               model_lora.get_lora("up_proj").lora_a)
-    assert torch.allclose(packed_lora.lora_b[1],
+    torch.testing.assert_close(packed_lora.lora_b[1],
                               model_lora.get_lora("up_proj").lora_b)

    packed_lora1 = model_lora1.get_lora("gate_up_proj")
@@ -547,7 +547,7 @@ def test_packed_loras(dist_init, dummy_model_gate_up):

    assert packed_lora1.lora_a[0] is None
    assert packed_lora1.lora_b[0] is None
-    assert torch.allclose(packed_lora1.lora_a[1],
+    torch.testing.assert_close(packed_lora1.lora_a[1],
                               model_lora1.get_lora("up_proj").lora_a)
-    assert torch.allclose(packed_lora1.lora_b[1],
+    torch.testing.assert_close(packed_lora1.lora_b[1],
                               model_lora1.get_lora("up_proj").lora_b)
--- a/tests/lora/test_punica_sizes.py
+++ b/tests/lora/test_punica_sizes.py
@@ -98,7 +98,7 @@ HIDDEN_SIZES = [
    128256,
 ]
 #The size of TP
-divisibility = [1, 2, 4, 8, 16, 32, 64]
+divisibility = [1, 2, 8, 16, 64]

 all_hidden_size = []
 for div in divisibility: