"docs/vscode:/vscode.git/clone" did not exist on "38bed912e36e1725ede1f0e8c61a514f378697c3"
Unverified Commit 277d49a5 authored by Antoni Baum, committed by GitHub

Do not initialize `torch.distributed` process group if one is already initialized (#16487)

* Do not initialize torch process group twice

* Apply suggestions from code review
parent 2b483230
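Why the guard is needed: PyTorch raises a RuntimeError when `init_process_group` is called while the default process group already exists, so the diff below wraps each call in a `torch.distributed.is_initialized()` check. The following is a minimal single-process sketch of that pattern; the `gloo` backend, env-style rendezvous variables, and the `ensure_process_group` helper are illustrative assumptions, not part of the commit.

import os

import torch.distributed as dist


def ensure_process_group(backend: str = "gloo") -> None:
    # Single-process illustration: pretend a launcher exported these variables.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    os.environ.setdefault("RANK", "0")
    os.environ.setdefault("WORLD_SIZE", "1")
    # Same guard the patch adds: only create the default group if it is missing.
    if not dist.is_initialized():
        dist.init_process_group(backend=backend)


ensure_process_group()
ensure_process_group()  # second call is now a no-op instead of raising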
@@ -77,6 +77,11 @@ class SageMakerTrainingArguments(TrainingArguments):
     @cached_property
     def _setup_devices(self) -> "torch.device":
         logger.info("PyTorch: setting up devices")
+        if torch.distributed.is_initialized() and self.local_rank == -1:
+            logger.warning(
+                "torch.distributed process group is initialized, but local_rank == -1. "
+                "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
+            )
         if self.no_cuda:
             device = torch.device("cpu")
             self._n_gpu = 0
@@ -105,7 +110,8 @@ class SageMakerTrainingArguments(TrainingArguments):
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="nccl")
+            if not torch.distributed.is_initialized():
+                torch.distributed.init_process_group(backend="nccl")
             device = torch.device("cuda", self.local_rank)
             self._n_gpu = 1
@@ -1022,10 +1022,15 @@ class TrainingArguments:
     @torch_required
     def _setup_devices(self) -> "torch.device":
         logger.info("PyTorch: setting up devices")
+        if torch.distributed.is_initialized() and self.local_rank == -1:
+            logger.warning(
+                "torch.distributed process group is initialized, but local_rank == -1. "
+                "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
+            )
         if self.no_cuda:
             device = torch.device("cpu")
             self._n_gpu = 0
-            if self.local_rank != -1:
+            if self.local_rank != -1 and not torch.distributed.is_initialized():
                 # Initializes distributed backend for cpu
                 if self.xpu_backend not in ("mpi", "ccl"):
                     raise ValueError(
@@ -1076,7 +1081,8 @@ class TrainingArguments:
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="nccl")
+            if not torch.distributed.is_initialized():
+                torch.distributed.init_process_group(backend="nccl")
             device = torch.device("cuda", self.local_rank)
             self._n_gpu = 1
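For context, the scenario this unblocks is a script (or an orchestration layer) that sets up `torch.distributed` itself before constructing `TrainingArguments`; previously `_setup_devices` would call `init_process_group` a second time and fail. The sketch below is an illustrative assumption, not part of the commit: single process, CPU-only, `gloo` backend, exercising the warning path (`local_rank == -1`) added above.

import os

import torch.distributed as dist
from transformers import TrainingArguments

# Pretend an external launcher already created the default process group.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
if not dist.is_initialized():
    dist.init_process_group(backend="gloo")

# local_rank defaults to -1, so accessing .device should log the new warning
# ("torch.distributed process group is initialized, but local_rank == -1. ...")
# instead of crashing on a duplicate init_process_group call.
args = TrainingArguments(output_dir="out", no_cuda=True)
print(args.device)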