vllm / commit d7a1c6d6 (unverified)

Fix paged attention testing. (#495)

Authored Jul 25, 2023 by Tao Peng; committed by GitHub Jul 24, 2023
Signed-off-by: Tao Peng <jiankeng.pt@alibaba-inc.com>
Parent: 7d5a155e
Showing 1 changed file with 9 additions and 0 deletions: tests/kernels/test_attention.py (+9, -0)
tests/kernels/test_attention.py @ d7a1c6d6

@@ -164,6 +164,7 @@ def run_single_query_cached_kv_attention(
     block_size: int,
     num_blocks: int,
     dtype: torch.dtype,
+    num_kv_heads: int = None,
 ) -> None:
     qkv = torch.empty(num_tokens,
                       3,
@@ -202,6 +203,14 @@ def run_single_query_cached_kv_attention(
     head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")
     scale = float(1.0 / (head_size**0.5))

+    num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+    assert num_heads % num_kv_heads == 0
+    num_queries_per_kv = num_heads // num_kv_heads
+    head_mapping = torch.repeat_interleave(
+        torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
+        num_queries_per_kv)
+
     output = torch.empty(num_tokens,
                          num_heads,
                          head_size,
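The two hunks above generalize the test helper from plain multi-head attention to multi-query/grouped-query attention: the new num_kv_heads parameter defaults to None (treated as num_kv_heads == num_heads, i.e. ordinary MHA), and head_mapping is rebuilt so that each query head points at the key/value head it shares. The following standalone sketch shows what that mapping evaluates to; the values num_heads = 8 and num_kv_heads = 2 are illustrative rather than taken from the commit, and it runs on CPU instead of the test's device="cuda".

import torch

# Illustrative values (not from the commit): 8 query heads sharing 2 KV heads.
num_heads = 8
num_kv_heads = 2

assert num_heads % num_kv_heads == 0
num_queries_per_kv = num_heads // num_kv_heads  # 4 query heads per KV head

# Same construction as the added test code, minus device="cuda":
# repeat each KV-head index once per query head that shares it.
head_mapping = torch.repeat_interleave(
    torch.arange(num_kv_heads, dtype=torch.int32), num_queries_per_kv)

print(head_mapping)  # tensor([0, 0, 0, 0, 1, 1, 1, 1], dtype=torch.int32)

With num_kv_heads left at its default, num_queries_per_kv is 1 and the mapping reduces to torch.arange(num_heads, dtype=torch.int32), so pre-existing MHA test cases behave exactly as before.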