Unverified Commit eab1551a authored by Benjamin Lefaudeux, committed by GitHub

[OSS] Fix for torch dist broadcast randomly failing on dummy object (#323)

* fix for torch dist broadcast failing on dummy object
parent 1ece280a
@@ -327,6 +327,9 @@ class OSS(Optimizer):
             self.local_state_dict(), non_blocking=True, device=torch.device("cpu")
         )
+        # Tensor cannot be really empty, even if its size is meaningless
+        dummy_sync_tensor = torch.tensor([1], device=self._device)
         for rank in range(self.world_size):
             if rank == self.rank:
                 # Send the state to the reference replica
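The hunk above pre-allocates one real tensor on `self._device` and reuses it as the placeholder payload, instead of building a bare Python `[0]` at each call site. Below is a minimal sketch of the resulting pattern, assuming an already-initialized process group; `consolidate_sketch` and its arguments are illustrative stand-ins for `OSS.consolidate_state_dict`, not fairscale's actual code:

```python
import torch
import torch.distributed as dist

def consolidate_sketch(rank: int, world_size: int, recipient: int, state: dict, device: torch.device):
    # Allocated once, on the device backing the process group; the tensor
    # cannot be truly empty, but its contents are meaningless.
    dummy_sync_tensor = torch.tensor([1], device=device)
    gathered = []

    # NCCL does not support gather, so each rank's state is broadcast in
    # turn, and every rank joins every round to keep the collective matched.
    for src in range(world_size):
        payload = [state] if src == rank else [dummy_sync_tensor]
        dist.broadcast_object_list(payload, src=src)
        if rank == recipient:
            # Only the recipient keeps what was received; all other ranks
            # discard the overwritten placeholder.
            gathered.append(payload[0])
    return gathered if rank == recipient else None
```

Passing a device-resident tensor keeps every rank's call consistent round after round, which the ad-hoc bare `[0]` apparently could not guarantee, hence the intermittent failures the commit title describes.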
@@ -346,10 +349,10 @@
                 # Discard this tensor/rank, broadcast necessary for syncing and because NCCL does not support gather
                 if _torch_broadcast_object:
-                    dist.broadcast_object_list([0], src=global_rank, group=self.group)
+                    dist.broadcast_object_list([dummy_sync_tensor], src=global_rank, group=self.group)
                 else:
                     broadcast_object(
-                        torch.tensor([0], dtype=torch.uint8, device=self._device),
+                        torch.tensor([dummy_sync_tensor], dtype=torch.uint8, device=self._device),
                         src_rank=global_rank,
                         group=self.group,
                         dist_device=self._device,
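The `else` branch above is the fallback for torch versions without `dist.broadcast_object_list`; fairscale's `broadcast_object` helper ships the payload as a `uint8` tensor, as the diff shows. A hedged sketch of that general serialize-then-broadcast technique follows; it is not the actual helper, and `broadcast_object_sketch` is a made-up name:

```python
import io
import pickle

import torch
import torch.distributed as dist

def broadcast_object_sketch(obj, src_rank: int, group, dist_device: torch.device):
    # Pickle the payload into a uint8 tensor, broadcast its length first so
    # receivers can size their buffer, then broadcast the bytes themselves.
    if dist.get_rank(group=group) == src_rank:
        buf = io.BytesIO()
        pickle.dump(obj, buf)
        payload = torch.tensor(bytearray(buf.getvalue()), dtype=torch.uint8, device=dist_device)
        length = torch.tensor([payload.numel()], dtype=torch.long, device=dist_device)
        dist.broadcast(length, src=src_rank, group=group)
        dist.broadcast(payload, src=src_rank, group=group)
        return obj
    length = torch.zeros(1, dtype=torch.long, device=dist_device)
    dist.broadcast(length, src=src_rank, group=group)
    payload = torch.empty(int(length.item()), dtype=torch.uint8, device=dist_device)
    dist.broadcast(payload, src=src_rank, group=group)
    return pickle.loads(payload.cpu().numpy().tobytes())
```

Because raw `dist.broadcast` only moves tensors, `dist_device` must match the backend (a CUDA device for NCCL), which is why the diff threads `self._device` through both branches.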
@@ -28,4 +28,4 @@ use_parentheses = true
 skip_glob = ["build/*", "stubs/*"]
 # Don't split "import" and "from".
 force_sort_within_sections = true
-known_third_party = ["datasets", "golden_configs", "helpers", "models", "numpy", "pytest", "recommonmark", "setuptools", "torch", "torch_pg", "torchtext", "torchvision"]
+known_third_party = ["benchmark_dataset", "datasets", "golden_configs", "helpers", "models", "numpy", "pytest", "recommonmark", "setuptools", "torch", "torch_pg", "torchtext", "torchvision"]