"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "a80f6892003e102f56bc956e9f8707b52c5d4487"
Commit b625d53d authored by Myle Ott, committed by Facebook Github Bot

Support DDP.no_sync context manager

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/671

Differential Revision: D15925248

fbshipit-source-id: 9eeea8a257929347e2458afdfc1def8dbb925a72
parent 6be5f07c
@@ -7,14 +7,16 @@
 
 """
 A modified version of the legacy DistributedDataParallel module that uses c10d
-communication primitives. This is necessary for models that have conditional
-computation (e.g., AdaptiveSoftmax) and which therefore do not work with the
-c10d version of DDP.
+communication primitives. This version is simpler than the latest PyTorch
+version and is useful for debugging. Notably it does not overlap gradient
+communication with the backward pass, which makes it slower but more robust
+than the PyTorch version.
 
-This version also supports the *accumulate_grads* feature, which allows faster
+This version also supports the *no_sync* context manager, which allows faster
 training with `--update-freq`.
 """
 
+from contextlib import contextmanager
 import copy
 
 import torch
@@ -74,6 +76,14 @@ class LegacyDistributedDataParallel(nn.Module):
         super().__setstate__(state)
         self._register_grad_hook()
 
+    @contextmanager
+    def no_sync(self):
+        """A context manager to disable gradient synchronization."""
+        old_accumulate_grads = self.accumulate_grads
+        self.accumulate_grads = True
+        yield
+        self.accumulate_grads = old_accumulate_grads
+
     def forward(self, *inputs, **kwargs):
         return self.module(*inputs, **kwargs)
...
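As a rough usage sketch (not part of this commit), the loop below shows how a caller is expected to use the new no_sync() context manager to accumulate gradients over several mini-batches, e.g. with `--update-freq=3`. The StubDDP wrapper, the toy linear model, and the random samples are invented for illustration; only the no_sync() contract of temporarily setting accumulate_grads mirrors the diff above.

from contextlib import contextmanager

import torch
import torch.nn as nn


class StubDDP(nn.Module):
    """Stand-in for LegacyDistributedDataParallel exposing the same no_sync API."""

    def __init__(self, module):
        super().__init__()
        self.module = module
        self.accumulate_grads = False

    @contextmanager
    def no_sync(self):
        # Mirror of the commit: flip the flag for the duration of the block.
        old_accumulate_grads = self.accumulate_grads
        self.accumulate_grads = True
        yield
        self.accumulate_grads = old_accumulate_grads

    def forward(self, x):
        return self.module(x)


model = StubDDP(nn.Linear(4, 1))
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
samples = [torch.randn(8, 4) for _ in range(3)]  # e.g. --update-freq=3

optimizer.zero_grad()
for i, x in enumerate(samples):
    loss = model(x).sum()
    if i < len(samples) - 1:
        # Accumulate gradients locally; a real DDP wrapper skips the
        # all-reduce inside this block.
        with model.no_sync():
            loss.backward()
    else:
        # Final backward pass; gradients would be all-reduced here.
        loss.backward()
optimizer.step()

In a real multi-process run, only the final backward pass (outside no_sync) triggers the gradient all-reduce.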
@@ -10,6 +10,7 @@ Train a network across multiple GPUs.
 """
 
 from collections import OrderedDict
+import contextlib
 from itertools import chain
 import math
 import os
@@ -242,23 +243,28 @@ class Trainer(object):
             else:
                 ignore_grad = False
 
+            def maybe_no_sync():
+                """
+                Whenever *samples* contains more than one mini-batch, we
+                want to accumulate gradients locally and only call
+                all-reduce in the last backwards pass.
+                """
+                if (
+                    self.args.distributed_world_size > 1
+                    and hasattr(self.model, 'no_sync')
+                    and i < len(samples) - 1
+                ):
+                    return self.model.no_sync()
+                else:
+                    return contextlib.ExitStack()  # dummy contextmanager
+
             try:
-                if self.args.distributed_world_size > 1:
-                    # Whenever *samples* contains more than one mini-batch, we
-                    # want to accumulate gradients locally and only call
-                    # all-reduce in the last backwards pass. Currently the
-                    # *accumulate_grads* flag is only supported by
-                    # LegacyDistributedDataParallel.
-                    if i < len(samples) - 1:
-                        self.model.accumulate_grads = True
-                    else:
-                        self.model.accumulate_grads = False
-
-                # forward and backward
-                loss, sample_size, logging_output = self.task.train_step(
-                    sample, self.model, self.criterion, self.optimizer,
-                    ignore_grad
-                )
+                with maybe_no_sync():
+                    # forward and backward
+                    loss, sample_size, logging_output = self.task.train_step(
+                        sample, self.model, self.criterion, self.optimizer,
+                        ignore_grad
+                    )
 
                 if not ignore_grad:
                     logging_outputs.append(logging_output)
...
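One detail worth calling out, illustrated below with a standalone sketch that is not part of the commit: an empty contextlib.ExitStack() behaves as a no-op context manager, which is why it works as the fallback when gradient synchronization should not be skipped. The FakeModel class and the simplified maybe_no_sync() signature are made up for this example.

import contextlib


class FakeModel:
    """Toy stand-in with a no_sync() context manager, for illustration only."""

    def __init__(self):
        self.sync = True

    @contextlib.contextmanager
    def no_sync(self):
        self.sync = False
        try:
            yield
        finally:
            self.sync = True


def maybe_no_sync(model, is_last_batch, world_size):
    # Same decision as in the trainer: skip gradient sync only on non-final
    # mini-batches of a multi-process run, and only if the wrapper supports it.
    if world_size > 1 and hasattr(model, 'no_sync') and not is_last_batch:
        return model.no_sync()
    return contextlib.ExitStack()  # empty ExitStack == no-op context manager


model = FakeModel()
with maybe_no_sync(model, is_last_batch=False, world_size=2):
    assert not model.sync  # sync disabled inside the block
with maybe_no_sync(model, is_last_batch=True, world_size=2):
    assert model.sync  # dummy context manager leaves sync enabled

Because both branches return a context manager, the same `with maybe_no_sync():` statement covers both cases and the forward/backward code is not duplicated.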