Merge pull request #30 from NVIDIA/checkpoint_fix

Handle set/get state for DDP, remove stream which can't be pickled.

Merge pull request #30 from NVIDIA/checkpoint_fix
Handle set/get state for DDP, remove stream which can't be pickled.
77ee4bcd · Christian Sarofeen · GitHub · 5c6144e6 · f1f97f9f · 77ee4bcd
Unverified Commit 77ee4bcd authored Jul 18, 2018 by Christian Sarofeen Committed by GitHub Jul 18, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 0 deletions

apex/parallel/distributed.py apex/parallel/distributed.py +11 -0

No files found.
--- a/apex/parallel/distributed.py
+++ b/apex/parallel/distributed.py
@@ -87,6 +87,17 @@ class DistributedDataParallel(Module):
        self.create_hooks()
        flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) )
+    def __setstate__(self, state):  # invoked with the saved state when an instance is unpickled
+        super(DistributedDataParallel, self).__setstate__(state)  # let the parent class restore the saved state
+        self.reduction_stream = torch.cuda.Stream()  # CUDA streams can't be pickled, so create a fresh one on load
+    def __getstate__(self, state):
+        attrs = copy.copy(self.__dict__)
+        if dist._backend != dist.dist_backend.NCCL:
+            del attrs['self.reduction_stream']
+            return attrs
    def create_hooks(self):
        #all reduce gradient hook