Unverified Commit 6e65c2cc authored by Olatunji Ruwase, committed by GitHub

Deprecate client ability to disable gradient reduction (#552)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 1ef5cd23
@@ -878,6 +878,11 @@ class DeepSpeedEngine(Module):
             allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True.
         """
+        if not allreduce_gradients:
+            logger.warning(
+                f'Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed'
+            )
         # scale loss w.r.t. gradient accumulation if needed
         if self.gradient_accumulation_steps() > 1:
             loss = self._scale_loss(loss.float())
@@ -931,7 +936,7 @@ class DeepSpeedEngine(Module):
             self.timers('backward_allreduce_microstep').start()
             self.timers('backward_allreduce').start()
-        if allreduce_gradients and self.enable_backward_allreduce:
+        if self.enable_backward_allreduce:
             self.allreduce_gradients()
         if self.wall_clock_breakdown():
...
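For context on the engine-side change: `backward()` still accepts `allreduce_gradients` so existing call sites keep working, but the flag no longer gates reduction; only the engine's `enable_backward_allreduce` switch does. A minimal runnable sketch of this deprecation pattern (`ToyEngine` and its methods are illustrative stand-ins, not the DeepSpeed source):

import logging
import torch

logger = logging.getLogger(__name__)

class ToyEngine:
    """Hypothetical stand-in for DeepSpeedEngine.backward(); not the real source."""

    def __init__(self, enable_backward_allreduce=True):
        self.enable_backward_allreduce = enable_backward_allreduce

    def allreduce_gradients(self):
        # Placeholder for the real cross-rank gradient averaging.
        print('averaging gradients across data-parallel ranks')

    def backward(self, loss, allreduce_gradients=True):
        # The flag is still accepted for API compatibility but is ignored.
        if not allreduce_gradients:
            logger.warning('Argument `allreduce_gradients` is deprecated, '
                           'ignored, and will soon be removed')
        loss.backward()
        # Reduction is now gated solely by the engine-level switch.
        if self.enable_backward_allreduce:
            self.allreduce_gradients()

engine = ToyEngine()
loss = (torch.randn(4, requires_grad=True) ** 2).sum()
engine.backward(loss, allreduce_gradients=False)  # warns, then still reduces

Under this change, a client that previously skipped reduction per-call would instead have to disable it at the engine level, which is exactly the narrowing the commit title describes.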
@@ -955,6 +955,12 @@ class FP16_DeepSpeedZeroOptimizer(object):
         with torch.cuda.stream(stream):
             for _, param, param_id in self.params_in_ipg_bucket:
+                assert self.params_already_reduced[param_id] == False, \
+                    f"The parameter {param_id} has already been reduced. \
+                    Gradient computed twice for this partition. \
+                    Multiple gradient reduction is currently not supported"
                 self.params_already_reduced[param_id] = True
                 if not self.is_param_in_current_partition[param_id]:
...
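The ZeRO hunk turns silent double reduction into a hard failure: if a parameter id in the current IPG bucket is already marked as reduced, the optimizer now asserts instead of averaging its gradient a second time. A self-contained sketch of that bookkeeping (class and method names here are illustrative assumptions, not the optimizer's API):

class ReduceOnceTracker:
    """Sketch of the `params_already_reduced` bookkeeping; names are assumptions."""

    def __init__(self, param_ids):
        self.already_reduced = {pid: False for pid in param_ids}

    def mark_reduced(self, param_id):
        # Fail loudly instead of silently averaging a gradient twice.
        assert not self.already_reduced[param_id], (
            f"Parameter {param_id} has already been reduced; "
            "multiple gradient reduction is not supported")
        self.already_reduced[param_id] = True

    def reset(self):
        # Cleared once per step, after every bucket has been flushed.
        for pid in self.already_reduced:
            self.already_reduced[pid] = False

tracker = ReduceOnceTracker(param_ids=[0, 1, 2])
tracker.mark_reduced(0)    # first reduction of param 0: allowed
tracker.reset()
tracker.mark_reduced(0)    # allowed again after the per-step reset

Failing fast here is the safer design: a gradient that is all-reduced twice would be silently scaled wrong, which is much harder to debug than an assertion at the point of the duplicate reduction.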