"tests/vscode:/vscode.git/clone" did not exist on "ad15947f0ea9b34e15157dfad65b25f3a98e9ac8"
Commit 1c2ba890 authored by Thor Johnsen, committed by mcarilli

Add option to turn on/off allreduce in DDP (useful for gradient accumulation) (#356)

parent 47e3367f
@@ -216,6 +216,8 @@ class DistributedDataParallel(Module):
 
         self.module = module
 
+        self.disable_allreduce = False
+
         if self._backend == self.backend_enum_holder.NCCL:
             for param in self.module.parameters():
                 assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."
@@ -250,6 +252,12 @@ class DistributedDataParallel(Module):
         del attrs['self.reduction_event']
         return attrs
 
+    def turn_on_allreduce(self):
+        self.disable_allreduce = False
+
+    def turn_off_allreduce(self):
+        self.disable_allreduce = True
+
     # Broadcast rank 0's bucket structure across all processes, and have all processes
     # regenerate their bucket structures to match.
     def sync_bucket_structure(self):
@@ -327,6 +335,7 @@ class DistributedDataParallel(Module):
                 grad_acc = param_tmp.grad_fn.next_functions[0][0]
 
                 def allreduce_hook(*unused):
+                    if not self.disable_allreduce:
                         if self.delay_allreduce or self.needs_refresh:
                             # TODO: How do we want to handle multiple backward passes between
                             # each forward, e.g., backward passes with retain_graph=True?
@@ -470,6 +479,7 @@ class DistributedDataParallel(Module):
     def forward(self, *inputs, **kwargs):
         result = self.module(*inputs, **kwargs)
 
+        if not self.disable_allreduce:
             if not self.delay_allreduce:
                 param_list = [param for param in self.module.parameters() if param.requires_grad]
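
The commit message cites gradient accumulation as the motivation: with allreduce turned off, backward passes only accumulate local gradients, and the cross-process reduction can be deferred to the last micro-batch of each accumulation window. Below is a minimal, hypothetical sketch of that pattern. Only turn_on_allreduce() and turn_off_allreduce() come from this commit; the toy model, loss, accumulation_steps value, and the assumption of an env:// torch.distributed launch are illustrative placeholders, not part of the change.

# Hypothetical usage sketch (not part of this commit). All names below except
# turn_on_allreduce()/turn_off_allreduce() are illustrative placeholders.
import torch
import torch.distributed as dist
from apex.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl", init_method="env://")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = DDP(torch.nn.Linear(128, 10).cuda())            # toy model for illustration
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()
accumulation_steps = 4                                   # illustrative value

optimizer.zero_grad()
for step in range(16):                                   # stands in for a real data loader
    inputs = torch.randn(32, 128, device="cuda")
    targets = torch.randint(0, 10, (32,), device="cuda")

    last_micro_batch = (step + 1) % accumulation_steps == 0
    # Toggle before forward(): both forward() and the backward hook check
    # self.disable_allreduce, so accumulation-only steps skip the reduction.
    if last_micro_batch:
        model.turn_on_allreduce()
    else:
        model.turn_off_allreduce()

    loss = criterion(model(inputs), targets) / accumulation_steps
    loss.backward()

    if last_micro_batch:
        optimizer.step()
        optimizer.zero_grad()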