Commit 045b5ad2 authored by zhuwenwen's avatar zhuwenwen
Browse files

skip cutlass fwop and xformers cuda backend

parent bbf9488b
......@@ -9,6 +9,7 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
from vllm.attention.backends.xformers import _make_alibi_bias
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.utils import is_hip
NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 8, 64]
......@@ -158,6 +159,7 @@ def test_contexted_kv_attention(
end_time = time.time()
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
if not is_hip():
scale = float(1.0 / (head_size**0.5))
attn_op = xops.fmha.cutlass.FwOp()
......@@ -373,6 +375,8 @@ def test_contexted_kv_attention_alibi(
torch.cuda.synchronize()
end_time = time.time()
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
if not is_hip():
scale = float(1.0 / (head_size**0.5))
# NOTE(DefTruth): In order to reuse _make_alibi_bias function,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment