Unverified commit e1bea67f authored by Jeff Rasley, committed by GitHub

attach empty grad to its param to ensure it's copied after reduction (#316)

parent 6855ba1c
@@ -1013,10 +1013,10 @@ class DeepSpeedLight(Module):
                 # rank is reducing the same size. In some cases it may make
                 # sense in the future to support the ability to average not
                 # w.r.t. world size but with a different value.
-                grads.append(
-                    torch.zeros(param.size(),
-                                dtype=param.dtype,
-                                device=param.device))
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=param.device)
+                grads.append(param.grad.data)
             else:
                 grad_data = param.grad.data
                 if self.sparse_gradients_enabled(
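For context, here is a minimal standalone sketch of the pattern this commit adopts. It is an assumption-laden illustration, not DeepSpeed's actual API: the helper name `average_gradients` and the `world_size` parameter are hypothetical, and an initialized `torch.distributed` process group is assumed. The point it demonstrates is that when a parameter has no gradient, the zero buffer must be attached as `param.grad` before reduction; a free-floating `torch.zeros` tensor would be allreduced and then discarded, so the reduced values would never be visible where the optimizer looks.

```python
# Hypothetical sketch of the fix in this commit; assumes
# torch.distributed.init_process_group() has already been called.
import torch
import torch.distributed as dist


def average_gradients(model, world_size):
    grads = []
    for param in model.parameters():
        if param.grad is None:
            # Every rank must contribute a tensor of the same size to the
            # allreduce, so params that saw no gradient get a zero buffer.
            # Attaching it to param.grad ensures the reduced result is
            # copied back where the optimizer will read it.
            param.grad = torch.zeros(param.size(),
                                     dtype=param.dtype,
                                     device=param.device)
        grads.append(param.grad.data)

    for grad in grads:
        # In-place allreduce: the sum is written into the same storage
        # that param.grad points at, then averaged across ranks.
        dist.all_reduce(grad)
        grad.div_(world_size)
```

The key detail is that `grads` holds views into each `param.grad`, so the in-place `all_reduce` updates exactly the tensors the optimizer reads; the pre-commit code instead appended a detached zero tensor, whose reduced values were lost after the collective.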