Commit ba84b872 authored by Woosuk Kwon

Fix attention

parent 87e0bcd4
-from typing import Optional, Tuple
+from typing import Optional
 import torch
 import torch.nn as nn
...
@@ -24,8 +24,12 @@ class OPTCacheFlowAttention(nn.Module):
         key: torch.Tensor,
         value: torch.Tensor,
     ) -> None:
+        query = query.unsqueeze(0)
+        key = key.unsqueeze(0)
+        value = value.unsqueeze(0)
         out = xops.memory_efficient_attention(
             query, key, value, attn_bias=self.attention_mask, scale=self.scale)
+        out = out.squeeze(0)
         # FIXME(woosuk): Directly write the attention output.
         output.copy_(out, non_blocking=True)
...
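For context, the fix adapts the module's packed, batch-free tensors to the layout that xformers' memory_efficient_attention expects: a leading batch dimension, i.e. [batch, seq_len, num_heads, head_size]. The sketch below reproduces the same unsqueeze/squeeze pattern in isolation; the tensor sizes and the LowerTriangularMask used as a stand-in for self.attention_mask are illustrative assumptions, not taken from the commit.

import torch
import xformers.ops as xops

# Hypothetical sizes, chosen only for this example.
num_tokens, num_heads, head_size = 16, 12, 64
scale = head_size ** -0.5

# Packed (batch-free) tensors of shape [num_tokens, num_heads, head_size],
# as the attention module receives them. xformers kernels generally need CUDA.
query = torch.randn(num_tokens, num_heads, head_size,
                    dtype=torch.float16, device="cuda")
key = torch.randn_like(query)
value = torch.randn_like(query)

# memory_efficient_attention expects [batch, seq_len, num_heads, head_size].
# unsqueeze(0) supplies a batch of size 1; squeeze(0) drops it afterwards.
out = xops.memory_efficient_attention(
    query.unsqueeze(0), key.unsqueeze(0), value.unsqueeze(0),
    attn_bias=xops.LowerTriangularMask(),  # assumed stand-in for self.attention_mask
    scale=scale)
out = out.squeeze(0)
assert out.shape == (num_tokens, num_heads, head_size)

Wrapping and unwrapping a singleton batch dimension this way keeps the caller's packed layout unchanged while satisfying the kernel's input contract.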