Unverified Commit 5db86ec5 authored by Younes Belkada, committed by GitHub

New logic for passing past_key_value (#177)

parent 63e3fd83
@@ -215,5 +215,7 @@ class QuantAttentionFused(nn.Module):
         self.start_pos += seqlen
-        # past_key_value is replaced with cache_v, cache_k, returning empty data
-        past_key_value = [torch.Tensor([ [ [[0]], [[0]], [[0]] ] ])]
+        # we pass a dummy past kv cache for transformers to be able to retrieve the correct info
+        # about past key length
+        past_key_value = [torch.zeros(1, 1, self.start_pos, 1)]
         return attn_output, attention_weight, past_key_value
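
For context, transformers typically infers the past sequence length from dim 2 of the first cached tensor, following the usual [batch, num_heads, seq_len, head_dim] key/value layout. A minimal standalone sketch of why the new dummy shape works, assuming that convention (start_pos below is a stand-in for self.start_pos):

import torch

start_pos = 12  # stand-in for self.start_pos after a few decoding steps

# Old dummy: a fixed nested tensor; its shape carries no length information.
old_dummy = [torch.Tensor([[[[0]], [[0]], [[0]]]])]
print(old_dummy[0].shape)     # torch.Size([1, 3, 1, 1]) -- dim 2 is always 1

# New dummy: zeros shaped so that dim 2 equals the true past key length.
new_dummy = [torch.zeros(1, 1, start_pos, 1)]
print(new_dummy[0].shape[2])  # 12 -- the past key length transformers reads back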