Commit 9b427ebc authored by Jiaming Tang

Add compatibility with GQA & optimize multi-GPU memory allocation

parent dc139757
@@ -5,6 +5,7 @@ import argparse
 import os
 import json
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model, load_checkpoint_in_model
+from accelerate.utils.modeling import get_balanced_memory
 from awq.utils.parallel import auto_parallel
 from awq.quantize.pre_quant import run_awq, apply_awq
 from awq.quantize.quantizer import pseudo_quantize_model_weight, real_quantize_model_weight
@@ -162,7 +163,7 @@ def build_model_and_enc(model_path):
         raise NotImplementedError
     # Move the model to GPU (as much as possible) for LM evaluation
-    kwargs = {"max_memory": max_memory} if len(max_memory) else {}
+    kwargs = {"max_memory": get_balanced_memory(model, max_memory if len(max_memory) > 0 else None)}
     device_map = infer_auto_device_map(
         model,
         # TODO: can we remove this?
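
For context on the hunk above: instead of passing the user-supplied `max_memory` dict straight to `infer_auto_device_map` (which tends to fill the first GPU before touching the rest), the commit routes it through accelerate's `get_balanced_memory`, which computes a per-device budget that spreads the layers evenly across the visible GPUs. A minimal sketch of how these calls fit together is below; the checkpoint name and the `no_split_module_classes` value are illustrative assumptions, not taken from this commit.

```python
# Sketch only: balanced multi-GPU dispatch with accelerate (names below are examples).
from accelerate import infer_auto_device_map, dispatch_model
from accelerate.utils.modeling import get_balanced_memory
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")  # illustrative checkpoint

# Derive a per-GPU budget that balances the layers across all visible devices;
# passing max_memory=None lets accelerate read each GPU's available memory itself.
max_memory = get_balanced_memory(
    model, max_memory=None, no_split_module_classes=["OPTDecoderLayer"]
)

# Place whole decoder layers under that budget, then move the weights into place.
device_map = infer_auto_device_map(
    model, max_memory=max_memory, no_split_module_classes=["OPTDecoderLayer"]
)
model = dispatch_model(model, device_map=device_map)
```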
@@ -213,11 +213,12 @@ def auto_scale_block(module, module_kwargs,
             module2inspect=module.self_attn, kwargs=module_kwargs,
         ))
         # attn out
-        scales_list.append(_auto_get_scale(
-            prev_op=module.self_attn.v_proj,
-            layers=[module.self_attn.o_proj],
-            inp=input_feat['self_attn.o_proj'],
-        ))
+        if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
+            scales_list.append(_auto_get_scale(
+                prev_op=module.self_attn.v_proj,
+                layers=[module.self_attn.o_proj],
+                inp=input_feat['self_attn.o_proj'],
+            ))
         # fc1
         scales_list.append(_auto_get_scale(
             prev_op=module.post_attention_layernorm,
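
The GQA change: pairing `prev_op=module.self_attn.v_proj` with `layers=[module.self_attn.o_proj]` absorbs the per-channel scales found for `o_proj`'s input into `v_proj`'s output channels. With plain multi-head attention those two dimensions coincide, but with grouped-query attention `v_proj` produces only `num_kv_heads * head_dim` channels while `o_proj` consumes `num_heads * head_dim`, so there is no one-to-one channel mapping and the commit skips that scaling pair whenever the weight shapes differ. A small sketch of the shape mismatch follows; the dimensions are illustrative (a Llama-2-70B-like GQA config), not taken from the commit.

```python
# Sketch only: the shape check that gates the attn-out scaling pair under GQA.
# Dimensions are illustrative, not from the commit.
import torch.nn as nn

hidden_size, num_heads, num_kv_heads = 8192, 64, 8
head_dim = hidden_size // num_heads

v_proj = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=False)  # weight: (1024, 8192)
o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=False)     # weight: (8192, 8192)

# MHA: both weights are (hidden_size, hidden_size), so o_proj's input scales can be
# divided out of v_proj's output channels. GQA: v_proj has fewer output channels than
# o_proj has input channels, so that fold has no one-to-one channel mapping.
print(v_proj.weight.shape == o_proj.weight.shape)  # False here; True for plain MHA
```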