Unverified Commit e1bea67f authored by Jeff Rasley, committed by GitHub

attach empty grad to its param to ensure it's copied after reduction (#316)

parent 6855ba1c
...
@@ -1013,10 +1013,10 @@ class DeepSpeedLight(Module):
                 # rank is reducing the same size. In some cases it may make
                 # sense in the future to support the ability to average not
                 # w.r.t. world size but with a different value.
-                grads.append(
-                    torch.zeros(param.size(),
-                                dtype=param.dtype,
-                                device=param.device))
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=param.device)
+                grads.append(param.grad.data)
             else:
                 grad_data = param.grad.data
                 if self.sparse_gradients_enabled(
...
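For readers without the surrounding context: the old code appended a free-standing zeros tensor to grads, so the in-place all-reduce updated a buffer that nothing referenced afterwards, and param.grad stayed None. Attaching the zeros tensor to param.grad first means the reduced (averaged) values land on the parameter itself. The sketch below illustrates the fixed pattern; allreduce_dense_grads is a hypothetical helper for illustration, not DeepSpeed's actual reduction path, and it assumes torch.distributed is already initialized.

import torch
import torch.distributed as dist

def allreduce_dense_grads(params):
    # Hypothetical helper (not the DeepSpeedLight method) showing why the
    # zero buffer must be attached to param.grad before reduction.
    grads = []
    for param in params:
        if param.grad is None:
            # This param received no gradient this step (e.g. it was unused
            # in the forward pass). Every rank must still reduce a buffer of
            # the same size, so create one filled with zeros. Attaching it to
            # param.grad (the fix in this commit) means the in-place
            # all_reduce below writes the averaged result onto the param;
            # a free-floating zeros tensor would be reduced, then discarded.
            param.grad = torch.zeros(param.size(),
                                     dtype=param.dtype,
                                     device=param.device)
        grads.append(param.grad.data)
    world_size = dist.get_world_size()
    for buf in grads:
        dist.all_reduce(buf)   # in-place sum across ranks
        buf.div_(world_size)   # average w.r.t. world size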