Unverified Commit e1bea67f authored by Jeff Rasley, committed by GitHub

attach empty grad to its param to ensure it's copied after reduction (#316)

parent 6855ba1c
...
@@ -1013,10 +1013,10 @@ class DeepSpeedLight(Module):
                 # rank is reducing the same size. In some cases it may make
                 # sense in the future to support the ability to average not
                 # w.r.t. world size but with a different value.
-                grads.append(
-                    torch.zeros(param.size(),
-                                dtype=param.dtype,
-                                device=param.device))
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=param.device)
+                grads.append(param.grad.data)
             else:
                 grad_data = param.grad.data
                 if self.sparse_gradients_enabled(
...
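For readers without the surrounding context: the old code appended a free-standing zeros tensor to grads, so the in-place all-reduce updated a buffer that nothing referenced afterwards, and param.grad stayed None. Attaching the zeros tensor to param.grad first means the reduced (averaged) values land on the parameter itself. The sketch below illustrates the fixed pattern; allreduce_dense_grads is a hypothetical helper for illustration, not DeepSpeed's actual reduction path, and it assumes torch.distributed is already initialized.

import torch
import torch.distributed as dist

def allreduce_dense_grads(params):
    # Hypothetical helper (not the DeepSpeedLight method) showing why the
    # zero buffer must be attached to param.grad before reduction.
    grads = []
    for param in params:
        if param.grad is None:
            # This param received no gradient this step (e.g. it was unused
            # in the forward pass). Every rank must still reduce a buffer of
            # the same size, so create one filled with zeros. Attaching it to
            # param.grad (the fix in this commit) means the in-place
            # all_reduce below writes the averaged result onto the param;
            # a free-floating zeros tensor would be reduced, then discarded.
            param.grad = torch.zeros(param.size(),
                                     dtype=param.dtype,
                                     device=param.device)
        grads.append(param.grad.data)
    world_size = dist.get_world_size()
    for buf in grads:
        dist.all_reduce(buf)   # in-place sum across ranks
        buf.div_(world_size)   # average w.r.t. world size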