Unverified commit e82c1cb7, authored by Wang, Yi; committed by GitHub
Browse files

add gloo backend support for CPU DDP (#19555)


Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 0e0b7cb7
......@@ -295,7 +295,7 @@ class TrainingArguments:
local_rank (`int`, *optional*, defaults to -1):
Rank of the process during distributed training.
xpu_backend (`str`, *optional*):
The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"`.
The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"` or `"gloo"`.
tpu_num_cores (`int`, *optional*):
When training on TPU, the number of TPU cores (automatically passed by launcher script).
dataloader_drop_last (`bool`, *optional*, defaults to `False`):
......@@ -721,7 +721,10 @@ class TrainingArguments:
local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
xpu_backend: Optional[str] = field(
default=None,
metadata={"help": "The backend to be used for distributed training on Intel XPU.", "choices": ["mpi", "ccl"]},
metadata={
"help": "The backend to be used for distributed training on Intel XPU.",
"choices": ["mpi", "ccl", "gloo"],
},
)
tpu_num_cores: Optional[int] = field(
default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
......@@ -1333,10 +1336,10 @@ class TrainingArguments:
)
if self.local_rank != -1 and not torch.distributed.is_initialized():
# Initializes distributed backend for cpu
if self.xpu_backend not in ("mpi", "ccl"):
if self.xpu_backend not in ("mpi", "ccl", "gloo"):
raise ValueError(
"CPU distributed training backend is not properly set. "
"Please set '--xpu_backend' to either 'mpi' or 'ccl'."
"Please set '--xpu_backend' to either 'mpi' or 'ccl' or 'gloo'."
)
if self.xpu_backend == "ccl":
requires_backends(self, "oneccl_bind_pt")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment