"...lm-evaluation-harness.git" did not exist on "afda6551e9e8d8021c1fdd35d2aad0fbe63f3919"
Unverified Commit e82c1cb7 authored by Wang, Yi's avatar Wang, Yi Committed by GitHub
Browse files

add gloo backend support for CPU DDP (#19555)


Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 0e0b7cb7
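
With this change, `xpu_backend` also accepts `"gloo"`, so CPU DDP can be requested directly through `TrainingArguments`. A minimal sketch of the new choice (the `output_dir` value is arbitrary; under a real distributed launch, `local_rank` is set by the launcher and the Trainer initializes the backend itself):

    from transformers import TrainingArguments

    # Sketch only: "gloo" is now an accepted choice next to "mpi" and "ccl".
    # With local_rank left at its default (-1) no process group is created;
    # under a real launcher, the Trainer initializes torch.distributed with
    # this backend for CPU DDP.
    args = TrainingArguments(output_dir="tmp_out", xpu_backend="gloo")
    print(args.xpu_backend)  # -> "gloo"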
@@ -295,7 +295,7 @@ class TrainingArguments:
         local_rank (`int`, *optional*, defaults to -1):
             Rank of the process during distributed training.
         xpu_backend (`str`, *optional*):
-            The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"`.
+            The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"` or `"gloo"`.
         tpu_num_cores (`int`, *optional*):
             When training on TPU, the number of TPU cores (automatically passed by launcher script).
         dataloader_drop_last (`bool`, *optional*, defaults to `False`):
@@ -721,7 +721,10 @@ class TrainingArguments:
     local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
     xpu_backend: Optional[str] = field(
         default=None,
-        metadata={"help": "The backend to be used for distributed training on Intel XPU.", "choices": ["mpi", "ccl"]},
+        metadata={
+            "help": "The backend to be used for distributed training on Intel XPU.",
+            "choices": ["mpi", "ccl", "gloo"],
+        },
     )
     tpu_num_cores: Optional[int] = field(
         default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
@@ -1333,10 +1336,10 @@ class TrainingArguments:
             )
         if self.local_rank != -1 and not torch.distributed.is_initialized():
             # Initializes distributed backend for cpu
-            if self.xpu_backend not in ("mpi", "ccl"):
+            if self.xpu_backend not in ("mpi", "ccl", "gloo"):
                 raise ValueError(
                     "CPU distributed training backend is not properly set. "
-                    "Please set '--xpu_backend' to either 'mpi' or 'ccl'."
+                    "Please set '--xpu_backend' to either 'mpi' or 'ccl' or 'gloo'."
                 )
             if self.xpu_backend == "ccl":
                 requires_backends(self, "oneccl_bind_pt")
...
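
For reference, a minimal sketch of the kind of CPU process-group initialization the gloo backend enables; the rendezvous address and port below are assumptions, since a launcher normally provides them:

    import os
    import torch.distributed as dist

    # Assumed rendezvous settings; torchrun / the distributed launcher
    # normally sets these environment variables.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    # gloo runs on plain CPU and needs no extra packages (unlike "ccl",
    # which requires oneccl_bind_pt).
    dist.init_process_group(backend="gloo", rank=0, world_size=1)
    print(dist.get_backend())  # -> "gloo"
    dist.destroy_process_group()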