Commit f8273a0c authored by Casper Hansen

Multi-GPU support for quantized models

parent 7cf0d987
@@ -297,21 +297,29 @@ class BaseAWQForCausalLM(nn.Module):
         model.tie_weights()
 
+        device_map = infer_auto_device_map(
+            model,
+            no_split_module_classes=[self.layer_type],
+            dtype=torch_dtype
+        )
+
         # Load model weights
         if is_quantized:
-            model = load_checkpoint_and_dispatch(model, model_filename, device_map=device, no_split_module_classes=[self.layer_type])
+            model = load_checkpoint_and_dispatch(
+                model,
+                model_filename,
+                device_map=device_map,
+                no_split_module_classes=[self.layer_type]
+            )
 
             if fuse_layers:
                 self.fuse_layers(model)
+
+            from awq.utils.utils import simple_dispatch_model
+            model = simple_dispatch_model(model, device_map)
         else:
             # If not quantized, must load with AutoModelForCausalLM
-            device_map = infer_auto_device_map(
-                model,
-                no_split_module_classes=[self.layer_type],
-                dtype=torch_dtype
-            )
             del model
 
             # Load model weights
...
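For context, here is a minimal sketch (not the repository's code) of the loading pattern the commit adopts: build the model on the meta device, infer a device map that keeps each decoder layer whole, then stream the quantized checkpoint onto the mapped GPUs. It assumes accelerate's init_empty_weights / infer_auto_device_map / load_checkpoint_and_dispatch; the helper name load_quantized_across_gpus and the layer_type default are illustrative only.

# Minimal sketch of the multi-GPU loading pattern (illustrative, not the repo's code).
import torch
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

def load_quantized_across_gpus(model_path, checkpoint_file, layer_type="LlamaDecoderLayer"):
    # Build the model on the meta device so no weight memory is allocated yet.
    config = AutoConfig.from_pretrained(model_path)
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)
    model.tie_weights()

    # Ask accelerate to split the model across the visible GPUs,
    # keeping each decoder layer (layer_type) on a single device.
    device_map = infer_auto_device_map(
        model,
        no_split_module_classes=[layer_type],
        dtype=torch.float16,
    )

    # Stream the quantized checkpoint from disk directly onto the mapped devices.
    model = load_checkpoint_and_dispatch(
        model,
        checkpoint_file,
        device_map=device_map,
        no_split_module_classes=[layer_type],
    )
    return model

In the commit itself, the quantized path additionally calls awq.utils.utils.simple_dispatch_model(model, device_map) after layer fusion, presumably so that any modules replaced during fusion end up on the devices chosen by device_map.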