OpenDAS / AutoAWQ / Commits

Commit e94b7f40
authored Oct 07, 2023 by Casper Hansen

Only apply attention mask if seqlen is greater than 1

parent 0baf5e18
Showing 1 changed file with 2 additions and 1 deletion

awq/modules/fused/attn.py  +2  -1
awq/modules/fused/attn.py  View file @ e94b7f40

@@ -176,7 +176,8 @@ class QuantAttentionFused(nn.Module):
         if self.use_alibi:
             scores = self.alibi.forward(scores, seqlen)
-        if attention_mask is not None:
+        # When seqlen is 1, there is nothing else to attend to
+        if attention_mask is not None and seqlen > 1:
             scores = scores + attention_mask  # (bs, n_local_heads, slen, cache_len + slen)
         scores = F.softmax(scores.float(), dim=-1).type_as(xq)
...
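Rationale for the change: during single-token decoding (seqlen == 1), the new query token is allowed to attend to every cached position, so the causal mask for that step is all zeros and adding it to the scores is wasted work. The condition seqlen > 1 keeps the mask applied during prefill, where it is actually non-zero above the diagonal. A minimal sketch below illustrates why the decode-step result is unchanged when the mask is skipped; the tensor names and shapes are illustrative assumptions, not taken from awq/modules/fused/attn.py.

import torch
import torch.nn.functional as F

# Hypothetical shapes for a single decode step (seqlen == 1).
bs, n_heads, head_dim, cache_len, seqlen = 1, 8, 64, 16, 1

xq = torch.randn(bs, n_heads, seqlen, head_dim)                # query for the new token
keys = torch.randn(bs, n_heads, cache_len + seqlen, head_dim)  # cached keys + new key

scores = torch.matmul(xq, keys.transpose(2, 3)) / head_dim ** 0.5

# For a single query token the causal mask is all zeros: the new token may
# see every cached position, so adding the mask changes nothing.
mask = torch.zeros(bs, 1, seqlen, cache_len + seqlen)

with_mask = F.softmax((scores + mask).float(), dim=-1).type_as(xq)
without_mask = F.softmax(scores.float(), dim=-1).type_as(xq)

assert torch.allclose(with_mask, without_mask)  # identical output; the add is redundant at seqlen == 1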