Unverified commit 46415f5a, authored by Blake Wyatt, committed by GitHub
Browse files

Add CPU-loaded multi-GPU quantization (#289)

parent 2edb3f6f
...@@ -81,9 +81,15 @@ class AwqQuantizer: ...@@ -81,9 +81,15 @@ class AwqQuantizer:
# Move module and inputs to correct device # Move module and inputs to correct device
common_device = next(self.modules[i].parameters()).device common_device = next(self.modules[i].parameters()).device
if common_device is None or str(common_device) == "cpu": if common_device is None or str(common_device) == "cpu":
self.modules[i] = self.modules[i].cuda() self.modules[i] = self.modules[i].cuda("cuda:" + str(i % torch.cuda.device_count()))
common_device = next(self.modules[i].parameters()).device common_device = next(self.modules[i].parameters()).device
if self.module_kwargs.get("position_ids") is not None:
self.module_kwargs["position_ids"] = self.module_kwargs["position_ids"].to(common_device)
if self.module_kwargs.get("attention_mask") is not None:
self.module_kwargs["attention_mask"] = self.module_kwargs["attention_mask"].to(common_device)
self.inps = self.inps.to(common_device) self.inps = self.inps.to(common_device)
# [STEP 1]: Get layer, extract linear modules, extract input features # [STEP 1]: Get layer, extract linear modules, extract input features
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment