"docs/git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "b076abd17bb93a61ef9cc7d4265dab8b1e154dd8"
Commit 95cd9c2d authored by Abhinav Kulkarni

[Major] Add CPU offloading support for apply_scale, apply_clip, pseudo_quantize_model_weight, real_quantize_model_weight
parent 8e7e9ccc
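
The commit applies one pattern throughout: weights stay on the CPU by default, a tensor or submodule is moved to the GPU only for the operation that needs it, and is moved back immediately afterwards. A minimal sketch of that idea, assuming a CUDA device is available (the on_gpu helper below is illustrative and not part of this commit):

import torch
import torch.nn as nn
from contextlib import contextmanager

@contextmanager
def on_gpu(module: nn.Module):
    # Temporarily move a module to the GPU, then offload it back to the CPU.
    module.cuda()
    try:
        yield module
    finally:
        module.cpu()
        torch.cuda.empty_cache()

# Example: clamp a layer's weights on the GPU without keeping them resident there.
layer = nn.Linear(16, 16)
with on_gpu(layer):
    layer.weight.data.clamp_(-1.0, 1.0)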
@@ -114,8 +114,8 @@ def build_model_and_enc(model_path):
         exit(0)
     else:
         # Inference with fake quant
-        # Init model on GPUs:
-        kwargs = {"device_map": "balanced", "torch_dtype": torch.float16}
+        # Init model on CPU:
+        kwargs = {"torch_dtype": torch.float16}
         model = AutoModelForCausalLM.from_pretrained(
             model_path, config=config, trust_remote_code=True, **kwargs)
 
@@ -146,6 +146,15 @@ def build_model_and_enc(model_path):
         exit(0)
     else:
         raise NotImplementedError
 
+    # Move the model to GPU (as much as possible) for LM evaluation
+    kwargs = {
+        "torch_dtype": torch.float16,
+        "device_map": "auto",
+        "max_memory": {0: "8GiB", "cpu": "99GiB"}
+    }
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, config=config, state_dict=model.state_dict(), trust_remote_code=True, **kwargs)
+
     return model, enc
 
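
With this change, the model is first built entirely on the CPU so the quantization passes do not need the full fp16 model in GPU memory, and only afterwards is it re-loaded with an automatic device map that fills the GPU up to a budget and spills the rest to CPU RAM. A sketch of that two-stage load, assuming a single GPU (device index 0) with roughly 8 GiB to spare and a transformers/accelerate version that supports device_map and max_memory; model_path and config are as in the diff:

import torch
from transformers import AutoModelForCausalLM

# Stage 1: materialize the model on the CPU in fp16 and quantize it in place.
model = AutoModelForCausalLM.from_pretrained(
    model_path, config=config, trust_remote_code=True,
    torch_dtype=torch.float16)
# ... apply_scale / apply_clip / pseudo_quantize_model_weight run here ...

# Stage 2: re-load the quantized state dict with an automatic device map,
# capping GPU 0 at 8 GiB and letting the remainder live in CPU RAM.
model = AutoModelForCausalLM.from_pretrained(
    model_path, config=config, trust_remote_code=True,
    state_dict=model.state_dict(),
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory={0: "8GiB", "cpu": "99GiB"})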
@@ -163,11 +172,10 @@ def main():
     # a hack here to auto set model group
     model, enc = build_model_and_enc(args.model_path)
 
-    lm_eval_model = LMEvalAdaptor(args.model_path, model, enc, args.batch_size)
-
     if args.tasks is not None:
         task_names = args.tasks.split(",")
 
+        lm_eval_model = LMEvalAdaptor(args.model_path, model, enc, args.batch_size)
         results = evaluator.simple_evaluate(
             model=lm_eval_model,
             tasks=task_names,
@@ -86,8 +86,10 @@ def apply_clip(module, clip_list):
     from ..utils.module import get_op_by_name
     for name, max_val in clip_list:
         layer = get_op_by_name(module, name)
+        layer.cuda()
         max_val = max_val.to(layer.weight.device)
         org_shape = layer.weight.shape
         layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
         layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
         layer.weight.data = layer.weight.data.reshape(org_shape)
+        layer.cpu()
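
The clipping itself is unchanged: the weight is reshaped to line up with the per-group maxima, clamped, and restored; the new cuda()/cpu() calls only bracket that work so each layer visits the GPU briefly. The same bracketing is applied below to apply_scale and pseudo_quantize_model_weight. A standalone sketch of the per-layer step (the clip_layer_weights wrapper is illustrative; the body follows the diff):

import torch
import torch.nn as nn

def clip_layer_weights(layer: nn.Linear, max_val: torch.Tensor) -> None:
    # Clamp a layer's weights to +/- max_val on the GPU, then offload back to the CPU.
    layer.cuda()
    max_val = max_val.to(layer.weight.device)
    org_shape = layer.weight.shape
    w = layer.weight.data.reshape(*max_val.shape[:2], -1)
    w = torch.clamp(w, -max_val, max_val)
    layer.weight.data = w.reshape(org_shape)
    layer.cpu()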
@@ -320,6 +320,10 @@ def apply_scale(module, scales_list, input_feat_dict=None):
     for prev_op_name, layer_names, scales in scales_list:
         prev_op = get_op_by_name(module, prev_op_name)
         layers = [get_op_by_name(module, name) for name in layer_names]
+
+        prev_op.cuda()
+        for layer in layers:
+            layer.cuda()
 
         if isinstance(prev_op, nn.Linear):
             assert len(layers) == 1
@@ -338,4 +342,8 @@ def apply_scale(module, scales_list, input_feat_dict=None):
         if input_feat_dict is not None:
             for layer_name in layer_names:
                 inp = input_feat_dict[layer_name]
-                inp.div_(scales.view(1, -1).to(inp.device))
\ No newline at end of file
+                inp.div_(scales.view(1, -1).to(inp.device))
+
+        prev_op.cpu()
+        for layer in layers:
+            layer.cpu()
@@ -98,7 +98,9 @@ def pseudo_quantize_model_weight(
     for i in tqdm(range(len(layers)), desc="pseudo weight quantization..."):
         named_linears = get_named_linears(layers[i])
         for n, m in named_linears.items():
+            m.cuda()
             m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, **q_config)
+            m.cpu()
 
 
 @torch.no_grad()
@@ -121,11 +123,15 @@ def real_quantize_model_weight(
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], True)
             else:
+                module.cuda()
                 module.weight.data, scales, zeros = pseudo_quantize_tensor(module.weight.data, n_bit=w_bit, get_scale_zp=True, **q_config)
                 scales = scales.t().contiguous()
                 zeros = zeros.t().contiguous()
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], False, scales, zeros)
+                module.cpu()
             set_op_by_name(layer, name, q_linear)
             torch.cuda.empty_cache()
             gc.collect()
+
+    model.tie_weights()
\ No newline at end of file
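
The final model.tie_weights() matters because set_op_by_name swaps nn.Linear modules for WQLinear ones, and replacing a module discards any parameter sharing (for example between the input embedding and the lm_head). A toy illustration of why re-tying is needed after such a swap (the modules here are placeholders, not the model's real ones):

import torch.nn as nn

emb = nn.Embedding(10, 4)
head = nn.Linear(4, 10, bias=False)
head.weight = emb.weight                 # tied: both names point at the same Parameter
assert head.weight is emb.weight

head = nn.Linear(4, 10, bias=False)      # swap in a new module, as set_op_by_name does
assert head.weight is not emb.weight     # the tie is gone and must be re-established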