Unverified Commit 958678df authored by Casper's avatar Casper Committed by GitHub
Browse files

Fix condition when rolling cache (#150)

parent 92a403b2
...@@ -143,7 +143,7 @@ class QuantAttentionFused(nn.Module): ...@@ -143,7 +143,7 @@ class QuantAttentionFused(nn.Module):
will_cache_be_exceeded = self.start_pos + seqlen > self.max_seq_len will_cache_be_exceeded = self.start_pos + seqlen > self.max_seq_len
# Reset and avoid retaining state when processing context # Reset and avoid retaining state when processing context
if will_cache_be_exceeded: if will_cache_be_exceeded and seqlen > 1:
self.start_pos = self.cache.roll_kv_n_steps(self.start_pos, n=self.start_pos) self.start_pos = self.cache.roll_kv_n_steps(self.start_pos, n=self.start_pos)
# Slowly roll out old tokens without performance hit if exceeded during decoding # Slowly roll out old tokens without performance hit if exceeded during decoding
elif will_cache_be_exceeded and seqlen == 1: elif will_cache_be_exceeded and seqlen == 1:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment