Hotfix: pass the missing head_mapping argument to ALiBi attention (#496)

Co-authored-by: oliveryuan <oliveryuan@basemind.com>

Hotfix: pass the missing head_mapping argument to ALiBi attention (#496)
Co-authored-by: oliveryuan <oliveryuan@basemind.com>
bda41c70 · Song · GitHub · 453bafb9 · bda41c70 · bda41c70
Unverified Commit bda41c70 authored Jul 19, 2023 by Song Committed by GitHub Jul 18, 2023
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 3 additions and 0 deletions

tests/kernels/test_attention.py tests/kernels/test_attention.py +2 -0

vllm/model_executor/layers/attention.py vllm/model_executor/layers/attention.py +1 -0

No files found.
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -199,6 +199,7 @@ def run_single_query_cached_kv_attention(
        ]
        block_tables.append(block_table)
    block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
+    head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")

    scale = float(1.0 / (head_size**0.5))
    output = torch.empty(num_tokens,
@@ -211,6 +212,7 @@ def run_single_query_cached_kv_attention(
        query,
        key_cache,
        value_cache,
+        head_mapping,
        scale,
        block_tables,
        context_lens,

--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -408,6 +408,7 @@ class PagedAttentionWithALiBi(PagedAttention):
            query,
            key_cache,
            value_cache,
+            self.head_mapping,
            self.scale,
            input_metadata.block_tables,
            input_metadata.context_lens,