Commit 87e0bcd4, authored Feb 23, 2023 by Woosuk Kwon

Fix attention

parent 1ce13335
Showing 1 changed file with 12 additions and 7 deletions.
cacheflow/models/attention.py
@@ -44,19 +44,18 @@ class OPTCacheFlowAttention(nn.Module):
         # FIXME(woosuk): Replace the following with a custom op.
         for i in range(input_metadata.num_generation_tokens):
-            q = query[i]
+            q = query[i].unsqueeze(0)
             block_table = block_tables[i]
             context_len = int(input_metadata.context_lens[i])
             keys = []
             for j in range(context_len):
                 block_number = block_table[j // block_size]
                 block_offset = j % block_size
                 k = key_cache[block_number, :, :, block_offset, :]
-                k = k.view(num_heads, head_size)
+                k = k.reshape(num_heads, head_size)
                 keys.append(k)
-            keys = torch.stack(keys, dim=-1)
-            logits = q @ keys
-            attention_weights = torch.softmax(logits, dim=-1)
+            keys = torch.stack(keys, dim=0)
             values = []
             for j in range(context_len):
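Note on the hunk above (an illustration, not part of the commit): switching the stack dimension from -1 to 0 changes the layout of the gathered keys. A minimal sketch with placeholder sizes:

import torch

# Placeholder sizes, for illustration only.
num_heads, head_size, context_len = 12, 64, 5

# Each k gathered from the paged cache has shape [num_heads, head_size].
keys = [torch.randn(num_heads, head_size) for _ in range(context_len)]

# Before: stacking on the last dim gives [num_heads, head_size, context_len].
print(torch.stack(keys, dim=-1).shape)  # torch.Size([12, 64, 5])

# After: stacking on dim 0 gives [context_len, num_heads, head_size],
# i.e. a (tokens, heads, head_dim) layout.
print(torch.stack(keys, dim=0).shape)   # torch.Size([5, 12, 64])

The dim=0 layout lines up with the unsqueeze(0) calls in the next hunk, which add a leading batch dimension before the tensors are handed to xformers.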
@@ -64,8 +63,14 @@ class OPTCacheFlowAttention(nn.Module):
                 block_offset = j % block_size
                 v = value_cache[block_number, :, block_offset, :]
                 values.append(v)
-            values = torch.stack(values, dim=-1)
-            out = attention_weights @ values
+            values = torch.stack(values, dim=0)
+            q = q.unsqueeze(0)
+            keys = keys.unsqueeze(0)
+            values = values.unsqueeze(0)
+            out = xops.memory_efficient_attention(
+                q, keys, values, scale=self.scale)
+            out = out.view(num_heads, head_size)
             output[i].copy_(out, non_blocking=True)

     def forward(
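For reference (again illustrative, not part of the commit), the per-token computation the new code delegates to xops.memory_efficient_attention is single-query scaled-dot-product attention over the gathered context. A plain-PyTorch sketch with placeholder sizes:

import torch

# Placeholder sizes and scale, for illustration only.
num_heads, head_size, context_len = 12, 64, 5
scale = head_size ** -0.5

q = torch.randn(num_heads, head_size)                  # one generation token
keys = torch.randn(context_len, num_heads, head_size)  # stacked with dim=0
values = torch.randn(context_len, num_heads, head_size)

# Per head: softmax over the context of scaled dot products,
# then a weighted sum of the cached values.
logits = scale * torch.einsum("hd,jhd->hj", q, keys)   # [num_heads, context_len]
weights = torch.softmax(logits, dim=-1)
out = torch.einsum("hj,jhd->hd", weights, values)      # [num_heads, head_size]
print(out.shape)                                       # torch.Size([12, 64])

With xformers installed, calling xops.memory_efficient_attention on the same tensors unsqueezed to [1, 1, num_heads, head_size] and [1, context_len, num_heads, head_size], with scale=scale, should give the same result up to floating-point tolerance.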