"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "de52437cdb56243fdf44465d877a9dd2c548eb1a"
Unverified Commit ae24f424 authored by Younes Belkada, committed by GitHub

[`core`] Make AutoAWQ fused modules compatible with HF transformers (#244)

parent 09db054f
@@ -6,12 +6,22 @@ from torch.nn import functional as F
 from awq.modules.fused.cache import WindowedCache
 from awq.utils.fused_utils import get_attention_shapes

 try:
     import ft_inference_engine
     FT_INSTALLED = True
 except:
     FT_INSTALLED = False

+HF_NEW_CACHE_FORMAT = False
+
+import transformers
+
+# https://github.com/huggingface/transformers/pull/26681 introduced a new cache format
+HF_NEW_CACHE_FORMAT = hasattr(transformers, "cache_utils")
+if HF_NEW_CACHE_FORMAT:
+    from transformers.cache_utils import DynamicCache
+
 class RoPE(nn.Module):
     def __init__(self, hidden_size, n_heads, max_seq_len, device):
         super(RoPE, self).__init__()
@@ -223,4 +233,10 @@ class QuantAttentionFused(nn.Module):
             # we pass a dummy past kv cache for transformers to be able to retrieve the correct info
             # about past key length
             past_key_value = [torch.zeros(1, 1, self.start_pos, 1)]
+
+            if HF_NEW_CACHE_FORMAT and self.is_hf_transformers:
+                new_cache = DynamicCache()
+                new_cache.update(past_key_value[0], past_key_value[0], layer_idx=0)
+                past_key_value = new_cache
+
         return attn_output, attention_weight, past_key_value
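
For context, here is a minimal standalone sketch of the compatibility pattern the diff introduces: detect the new transformers cache format and wrap the dummy KV tensor in a `DynamicCache` so newer `generate()` code paths receive the cache type they expect. The `dummy_past_key_value` helper name and the default for `is_hf_transformers` are assumptions for illustration, not part of the commit.

```python
# Sketch only: mirrors the pattern added in this commit, outside the
# QuantAttentionFused class. Helper name is hypothetical.
import torch
import transformers

# transformers PR #26681 added transformers.cache_utils; its presence signals
# that the new Cache objects (e.g. DynamicCache) are available.
HF_NEW_CACHE_FORMAT = hasattr(transformers, "cache_utils")
if HF_NEW_CACHE_FORMAT:
    from transformers.cache_utils import DynamicCache


def dummy_past_key_value(start_pos: int, is_hf_transformers: bool = True):
    """Build a placeholder KV cache whose sequence dimension encodes start_pos,
    so transformers can infer the past key length without real keys/values."""
    past_key_value = [torch.zeros(1, 1, start_pos, 1)]
    if HF_NEW_CACHE_FORMAT and is_hf_transformers:
        # Wrap the dummy tensor in a DynamicCache for newer transformers versions.
        new_cache = DynamicCache()
        new_cache.update(past_key_value[0], past_key_value[0], layer_idx=0)
        past_key_value = new_cache
    return past_key_value


if __name__ == "__main__":
    cache = dummy_past_key_value(start_pos=8)
    if HF_NEW_CACHE_FORMAT:
        # The cache reports a past length of 8, matching start_pos.
        print(type(cache).__name__, cache.get_seq_length())
    else:
        print(type(cache).__name__, cache[0].shape)
```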