Skip the cutlass FwOp / xformers CUDA backend reference path in the prefix-prefill tests when is_hip() is true

045b5ad2 · zhuwenwen · bbf9488b · 045b5ad2
Commit 045b5ad2 authored Jul 05, 2024 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 119 additions and 115 deletions

tests/kernels/test_prefix_prefill.py tests/kernels/test_prefix_prefill.py +119 -115

No files found.
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -9,6 +9,7 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask

 from vllm.attention.backends.xformers import _make_alibi_bias
 from vllm.attention.ops.prefix_prefill import context_attention_fwd
+from vllm.utils import  is_hip

 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 8, 64]
@@ -158,6 +159,7 @@ def test_contexted_kv_attention(
    end_time = time.time()
    print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")

+    if not is_hip():
        scale = float(1.0 / (head_size**0.5))

        attn_op = xops.fmha.cutlass.FwOp()
@@ -373,6 +375,8 @@ def test_contexted_kv_attention_alibi(
    torch.cuda.synchronize()
    end_time = time.time()
    print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
+    
+    if not is_hip():
        scale = float(1.0 / (head_size**0.5))

        # NOTE(DefTruth): In order to reuse _make_alibi_bias function,