Commit f22c2a35 authored by Jiaming Tang

[Minor] accelerate loading quantized model

parent 5f377eff
@@ -122,6 +122,8 @@ def real_quantize_model_weight(
             if init_only:
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], True)
+                q_linear.to(next(layer.parameters()).device)
+                set_op_by_name(layer, name, q_linear)
             else:
                 module.cuda()
                 module.weight.data, scales, zeros = pseudo_quantize_tensor(module.weight.data, n_bit=w_bit, get_scale_zp=True, **q_config)
@@ -130,7 +132,10 @@ def real_quantize_model_weight(
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], False, scales, zeros)
                 module.cpu()
-            q_linear.to(next(layer.parameters()).device)
-            set_op_by_name(layer, name, q_linear)
-            torch.cuda.empty_cache()
-            gc.collect()
+                q_linear.to(next(layer.parameters()).device)
+                set_op_by_name(layer, name, q_linear)
+                torch.cuda.empty_cache()
+                gc.collect()
+
+    torch.cuda.empty_cache()
+    gc.collect()
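
For context, the `init_only` path touched by this commit is the one used when loading an already-quantized checkpoint: it creates empty `WQLinear` modules whose weights are later filled in by `load_state_dict`, so the per-module `torch.cuda.empty_cache()` / `gc.collect()` calls (now confined to the on-the-fly quantization branch) were pure overhead there. A minimal usage sketch of that loading flow, assuming `model` and `q_config` are prepared as elsewhere in this repository and using a hypothetical checkpoint path:

import torch

# Sketch only: "quant_cache/awq-w4.pt" is a hypothetical checkpoint path;
# `model` and `q_config` are assumed to be set up as elsewhere in this repo.
# init_only=True creates empty WQLinear modules without quantizing weights.
real_quantize_model_weight(model, w_bit=4, q_config=q_config, init_only=True)

# Restore the pre-quantized weights into the freshly created WQLinear
# modules; after this commit, no cache flush runs per module on this path.
state_dict = torch.load("quant_cache/awq-w4.pt", map_location="cpu")
model.load_state_dict(state_dict)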