Commit 8c80b3e0 authored by Casper Hansen

Move cache initialization back to init

parent c2cd6535
@@ -84,7 +84,8 @@ class QuantAttentionFused(nn.Module):
         self.cache_batch_size = int(os.getenv("AWQ_BATCH_SIZE", "1"))
         self.max_seq_len = max_seq_len
         self.attention_shapes = self._get_attention_shapes(attention_shapes, max_seq_len)
-        self._initialize_cache(dev)
+        self.cache_v = ( torch.zeros(self.attention_shapes["cache_v"]).to(dev).half() )
+        self.cache_k = ( torch.zeros(self.attention_shapes["cache_k"]).to(dev).half() )

         if use_alibi:
             alibi_slopes, alibi_bias = build_alibi_bias(self.n_heads, max_seq_len)
@@ -101,15 +102,6 @@ class QuantAttentionFused(nn.Module):
             self.alibi_slopes = None
             self.is_neox = True

-    def _initialize_cache(self, dev):
-        self.cache_v = (
-            torch.zeros(self.attention_shapes["cache_v"]).to(dev).half()
-        )
-        self.cache_k = (
-            torch.zeros(self.attention_shapes["cache_k"]).to(dev).half()
-        )
-
     def _get_attention_shapes(self, attention_shapes, max_seq_len):
         if attention_shapes is not None:
             attention_shapes = attention_shapes
......
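For context, a minimal runnable sketch of the layout this commit produces: the KV cache tensors are allocated directly in __init__ instead of through a separate _initialize_cache helper. The simplified constructor signature, the hard-coded cache shapes, and the class name below are illustrative assumptions, not the real QuantAttentionFused API.

# Sketch only: shows cache tensors allocated inline in __init__,
# mirroring the structure after this commit. Shapes and arguments
# are placeholders, not AutoAWQ's actual attention_shapes logic.
import os
import torch
import torch.nn as nn

class QuantAttentionFusedSketch(nn.Module):
    def __init__(self, n_heads, head_dim, max_seq_len, dev="cpu"):
        super().__init__()
        self.cache_batch_size = int(os.getenv("AWQ_BATCH_SIZE", "1"))
        self.max_seq_len = max_seq_len
        # Hypothetical stand-in for self._get_attention_shapes(...).
        self.attention_shapes = {
            "cache_v": (self.cache_batch_size, n_heads, max_seq_len, head_dim),
            "cache_k": (self.cache_batch_size, n_heads, max_seq_len, head_dim),
        }
        # Cache initialization now lives directly in __init__.
        self.cache_v = torch.zeros(self.attention_shapes["cache_v"]).to(dev).half()
        self.cache_k = torch.zeros(self.attention_shapes["cache_k"]).to(dev).half()

module = QuantAttentionFusedSketch(n_heads=8, head_dim=64, max_seq_len=128)
print(module.cache_k.shape, module.cache_k.dtype)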