Unverified Commit 6e65c2cc authored by Olatunji Ruwase, committed by GitHub

Deprecate client ability to disable gradient reduction (#552)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 1ef5cd23
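
For reference, a minimal sketch of the client-facing behavior this commit changes; the `engine`, `batch`, and `loss` names below are illustrative placeholders, not code from this diff:

```python
# Hypothetical client training step; `engine` is a DeepSpeedEngine returned by
# deepspeed.initialize(), and `batch` / `loss` are placeholders.
loss = engine(batch)

# Before this commit, allreduce_gradients=False skipped gradient averaging
# entirely. After it, the flag is ignored, a deprecation warning is logged,
# and gradients are still all-reduced whenever
# engine.enable_backward_allreduce is True.
engine.backward(loss, allreduce_gradients=False)

engine.step()
```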
@@ -878,6 +878,11 @@ class DeepSpeedEngine(Module):
             allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True.
         """
+        if not allreduce_gradients:
+            logger.warning(
+                f'Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed'
+            )
+
         # scale loss w.r.t. gradient accumulation if needed
         if self.gradient_accumulation_steps() > 1:
             loss = self._scale_loss(loss.float())
@@ -931,7 +936,7 @@ class DeepSpeedEngine(Module):
             self.timers('backward_allreduce_microstep').start()
             self.timers('backward_allreduce').start()

-        if allreduce_gradients and self.enable_backward_allreduce:
+        if self.enable_backward_allreduce:
             self.allreduce_gradients()

         if self.wall_clock_breakdown():
......
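
A condensed sketch (not the actual DeepSpeed implementation) of the backward control flow after this change: the deprecated client argument only triggers a warning, and whether gradients are all-reduced is decided solely by the engine-side `enable_backward_allreduce` switch. The `EngineSketch` class below is invented for illustration.

```python
import logging

logger = logging.getLogger(__name__)


class EngineSketch:
    """Illustrative stand-in for DeepSpeedEngine; not the real class."""

    def __init__(self, enable_backward_allreduce=True):
        self.enable_backward_allreduce = enable_backward_allreduce

    def backward(self, loss, allreduce_gradients=True):
        if not allreduce_gradients:
            # The argument is now ignored; it only emits a deprecation warning.
            logger.warning(
                'Argument `allreduce_gradients` is deprecated, ignored, '
                'and will soon be removed')

        loss.backward()

        # Reduction is governed solely by the engine-side switch.
        if self.enable_backward_allreduce:
            self.allreduce_gradients()

    def allreduce_gradients(self):
        # Placeholder for averaging gradients across data-parallel ranks.
        ...
```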
@@ -955,6 +955,12 @@ class FP16_DeepSpeedZeroOptimizer(object):
         with torch.cuda.stream(stream):
             for _, param, param_id in self.params_in_ipg_bucket:
+                assert self.params_already_reduced[param_id] == False, \
+                    f"The parameter {param_id} has already been reduced. \
+                    Gradient computed twice for this partition. \
+                    Multiple gradient reduction is currently not supported"
+                self.params_already_reduced[param_id] = True
+
                 if not self.is_param_in_current_partition[param_id]:
......
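
The assert above is a reduce-exactly-once guard. Below is a standalone sketch of that bookkeeping pattern, assuming per-parameter integer IDs and a reset at the end of each optimizer step; the `ReduceOnceTracker` class and its field names are illustrative, not DeepSpeed internals.

```python
class ReduceOnceTracker:
    """Illustrative guard against reducing the same parameter's gradient twice."""

    def __init__(self, num_params):
        self.already_reduced = [False] * num_params

    def mark_reduced(self, param_id):
        # Fail loudly if a gradient for this parameter was already reduced
        # in the current step; multiple reductions are not supported.
        assert not self.already_reduced[param_id], (
            f"Parameter {param_id} has already been reduced; a gradient was "
            f"computed twice for this partition.")
        self.already_reduced[param_id] = True

    def reset(self):
        # Called once per optimizer step, after all buckets are flushed.
        self.already_reduced = [False] * len(self.already_reduced)
```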