"...lm-evaluation-harness.git" did not exist on "afda6551e9e8d8021c1fdd35d2aad0fbe63f3919"
Unverified Commit e82c1cb7 authored by Wang, Yi's avatar Wang, Yi Committed by GitHub
Browse files

add gloo backend support for CPU DDP (#19555)


Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 0e0b7cb7
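
With this change, `xpu_backend` also accepts `"gloo"`, so CPU DDP can be requested directly through `TrainingArguments`. A minimal sketch of the new choice (the `output_dir` value is arbitrary; under a real distributed launch, `local_rank` is set by the launcher and the Trainer initializes the backend itself):

    from transformers import TrainingArguments

    # Sketch only: "gloo" is now an accepted choice next to "mpi" and "ccl".
    # With local_rank left at its default (-1) no process group is created;
    # under a real launcher, the Trainer initializes torch.distributed with
    # this backend for CPU DDP.
    args = TrainingArguments(output_dir="tmp_out", xpu_backend="gloo")
    print(args.xpu_backend)  # -> "gloo"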
@@ -295,7 +295,7 @@ class TrainingArguments:
         local_rank (`int`, *optional*, defaults to -1):
             Rank of the process during distributed training.
         xpu_backend (`str`, *optional*):
-            The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"`.
+            The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"` or `"gloo"`.
         tpu_num_cores (`int`, *optional*):
             When training on TPU, the number of TPU cores (automatically passed by launcher script).
         dataloader_drop_last (`bool`, *optional*, defaults to `False`):
@@ -721,7 +721,10 @@ class TrainingArguments:
     local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
     xpu_backend: Optional[str] = field(
         default=None,
-        metadata={"help": "The backend to be used for distributed training on Intel XPU.", "choices": ["mpi", "ccl"]},
+        metadata={
+            "help": "The backend to be used for distributed training on Intel XPU.",
+            "choices": ["mpi", "ccl", "gloo"],
+        },
     )
     tpu_num_cores: Optional[int] = field(
         default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
@@ -1333,10 +1336,10 @@ class TrainingArguments:
             )
         if self.local_rank != -1 and not torch.distributed.is_initialized():
             # Initializes distributed backend for cpu
-            if self.xpu_backend not in ("mpi", "ccl"):
+            if self.xpu_backend not in ("mpi", "ccl", "gloo"):
                 raise ValueError(
                     "CPU distributed training backend is not properly set. "
-                    "Please set '--xpu_backend' to either 'mpi' or 'ccl'."
+                    "Please set '--xpu_backend' to either 'mpi' or 'ccl' or 'gloo'."
                 )
             if self.xpu_backend == "ccl":
                 requires_backends(self, "oneccl_bind_pt")
...
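
For reference, a minimal sketch of the kind of CPU process-group initialization the gloo backend enables; the rendezvous address and port below are assumptions, since a launcher normally provides them:

    import os
    import torch.distributed as dist

    # Assumed rendezvous settings; torchrun / the distributed launcher
    # normally sets these environment variables.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    # gloo runs on plain CPU and needs no extra packages (unlike "ccl",
    # which requires oneccl_bind_pt).
    dist.init_process_group(backend="gloo", rank=0, world_size=1)
    print(dist.get_backend())  # -> "gloo"
    dist.destroy_process_group()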