Unverified commit a75d971b authored by Olatunji Ruwase, committed by GitHub

ZeRO Stage 2: Clear reduced gradients (#856)



* Ensure gradients of other partitions are cleared after reduction

* Remove redundant code
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 46018859
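
The change targets a race that exists when overlap_comm is enabled: gradient reduction runs on a separate CUDA stream, so gradients belonging to other partitions cannot be freed the moment they are bucketed. Instead they are remembered in previous_reduced_grads and only released after the next torch.cuda.synchronize(). Below is a minimal, self-contained sketch of that deferred-clearing pattern; the class name DeferredGradClearer and its methods mark_reduced / clear_after_sync are illustrative stand-ins, not DeepSpeed's API.

class DeferredGradClearer:
    """Sketch of the deferred-clearing pattern (illustrative, not DeepSpeed API)."""

    def __init__(self, overlap_comm):
        self.overlap_comm = overlap_comm
        self.previous_reduced_grads = None  # params whose grads still await clearing

    def mark_reduced(self, param):
        # Grads of params owned by other partitions are not needed locally, but
        # with overlap_comm the reduction may still be in flight on a side stream,
        # so record the param instead of clearing its grad immediately.
        if self.overlap_comm:
            if self.previous_reduced_grads is None:
                self.previous_reduced_grads = []
            self.previous_reduced_grads.append(param)
        else:
            param.grad = None

    def clear_after_sync(self):
        # Call only after torch.cuda.synchronize(): the previous reduction has
        # finished, so the recorded grads can be dropped safely.
        if self.previous_reduced_grads is not None:
            for param in self.previous_reduced_grads:
                param.grad = None
            self.previous_reduced_grads = None

The commit applies this ordering in two places: both the gradient-partition epilogue and allreduce_and_copy call torch.cuda.synchronize() first and _clear_previous_reduced_grads() immediately afterwards.
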
@@ -37,7 +37,7 @@ def split_half_float_double(tensors):
     ]
     buckets = []
     for i, dtype in enumerate(dtypes):
-        bucket = [t for t in tensors if t is not None and t.type() == dtype]
+        bucket = [t for t in tensors if t.type() == dtype]
         if bucket:
             buckets.append(bucket)
     return buckets
@@ -477,6 +477,8 @@ class FP16_DeepSpeedZeroOptimizer(object):
         if self.overlap_comm:
             torch.cuda.synchronize()
+            # It is safe to clear previously reduced grads of other partitions
+            self._clear_previous_reduced_grads()
 
         if self.cpu_offload is False:
             for i, _ in enumerate(self.fp16_groups):
@@ -638,6 +640,9 @@ class FP16_DeepSpeedZeroOptimizer(object):
             param.grad.data = new_grad_tensor.data.view_as(param.grad)
 
         self.elements_in_ipg_bucket += param.numel()
+
+        assert param.grad is not None, f"rank {dist.get_rank()} - Invalid to reduce Param {param_id} with None gradient"
+
         self.grads_in_ipg_bucket.append(param.grad)
         self.params_in_ipg_bucket.append((i, param, param_id))
@@ -965,7 +970,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
                 if not self.is_param_in_current_partition[param_id]:
                     if self.overlap_comm and self.contiguous_gradients is False:
-                        # Clear the previous grads during the next reduction
+                        # Clear grads of other partitions during the next reduction
                         # to avoid clearing them before the reduction is complete.
                         if self.previous_reduced_grads is None:
                             self.previous_reduced_grads = []
@@ -1078,16 +1083,18 @@ class FP16_DeepSpeedZeroOptimizer(object):
         return tensor
 
-    #if rank is specified do a reduction instead of an allreduce
-    def allreduce_and_copy(self, small_bucket, rank=None, log=None):
-        if self.overlap_comm:
-            torch.cuda.synchronize()
+    def _clear_previous_reduced_grads(self):
+        if self.previous_reduced_grads is not None:
+            # previous_reduced_grads has the previous reduced grads,
+            # now it is safe to clear.
+            for param in self.previous_reduced_grads:
+                param.grad = None
+            self.previous_reduced_grads = None
+
+    #if rank is specified do a reduction instead of an allreduce
+    def allreduce_and_copy(self, small_bucket, rank=None, log=None):
+        if self.overlap_comm:
+            torch.cuda.synchronize()
+            # It is safe to clear the previously reduced grads of other partitions
+            self._clear_previous_reduced_grads()
             stream = self.reduction_stream
         else:
             stream = torch.cuda.current_stream()
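
A brief reading of why the `t is not None` filter could be dropped from split_half_float_double (my inference from the hunks, not stated in the commit message): gradients are now asserted non-None before they are appended to grads_in_ipg_bucket, so every tensor that later reaches the dtype bucketing is guaranteed to exist. A small runnable sketch of that invariant follows; split_by_dtype is a hypothetical stand-in, and it uses CPU tensor type strings so it runs without a GPU.

import torch

def split_by_dtype(tensors):
    # Hypothetical stand-in for the patched helper: callers guarantee that no
    # tensor is None, so the dtype bucketing needs no None check.
    dtypes = ["torch.HalfTensor", "torch.FloatTensor", "torch.DoubleTensor"]
    buckets = []
    for dtype in dtypes:
        bucket = [t for t in tensors if t.type() == dtype]
        if bucket:
            buckets.append(bucket)
    return buckets

grads = [torch.ones(4), torch.ones(2, dtype=torch.float64)]
for g in grads:
    assert g is not None, "invalid to bucket a None gradient"  # upstream guarantee
print([len(b) for b in split_by_dtype(grads)])  # -> [1, 1]
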