Unverified Commit 4aabf9b5 authored by Wang, Yi, committed by GitHub

enable CPU distributed training using mpirun (#17570)



* enable CPU distributed training using mpirun

  Example command:
      mpirun -n 2 python3 run_qa.py --no_cuda --xpu_backend ccl xxxx
  MASTER_ADDR and MASTER_PORT should be set as environment variables:
      export MASTER_ADDR=127.0.0.1
      export MASTER_PORT=29500
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* fix according to the review comment
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* use Accelerate's logic for CPU distributed training to set the "RANK", "LOCAL_RANK" and "WORLD_SIZE" environment variables (a sketch of this resolution follows below)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 457d4a32
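A minimal standalone sketch of the environment resolution this change performs when a script is started under mpirun. The launcher-specific variable names (PMI_RANK, OMPI_COMM_WORLD_RANK, MV2_COMM_WORLD_RANK, ...) mirror the diff below; the fallback defaults and the final print are illustrative only and are not the exact code path in training_args.py.

import os

def get_int_from_env(env_keys, default):
    # Return the first env var in env_keys that parses to a non-negative int, else the default.
    for e in env_keys:
        val = int(os.environ.get(e, -1))
        if val >= 0:
            return val
    return default

# Intel MPI, OpenMPI and MVAPICH export their own rank variables under mpirun;
# torch.distributed expects RANK / WORLD_SIZE / LOCAL_RANK, so the values are re-exported.
rank = get_int_from_env(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
local_rank = get_int_from_env(
    ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
)
os.environ["RANK"], os.environ["WORLD_SIZE"], os.environ["LOCAL_RANK"] = str(rank), str(size), str(local_rank)
os.environ.setdefault("MASTER_PORT", "29500")      # same default the new code applies
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # the real code instead raises if this is unset for ccl or multi-node runs
print(f"rank={rank} local_rank={local_rank} world_size={size}")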
@@ -76,6 +76,15 @@ def default_logdir() -> str:
     return os.path.join("runs", current_time + "_" + socket.gethostname())
 
 
+def get_int_from_env(env_keys, default):
+    """Returns the first positive env value found in the `env_keys` list or the default."""
+    for e in env_keys:
+        val = int(os.environ.get(e, -1))
+        if val >= 0:
+            return val
+    return default
+
+
 class OptimizerNames(ExplicitEnum):
     """
     Stores the acceptable string identifiers for optimizers.
@@ -1246,6 +1255,10 @@ class TrainingArguments:
         if self.no_cuda:
             device = torch.device("cpu")
             self._n_gpu = 0
+            self.local_rank = get_int_from_env(
+                ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"],
+                self.local_rank,
+            )
             if self.local_rank != -1 and not torch.distributed.is_initialized():
                 # Initializes distributed backend for cpu
                 if self.xpu_backend not in ("mpi", "ccl"):
@@ -1253,7 +1266,30 @@ class TrainingArguments:
                         "CPU distributed training backend is not properly set. "
                         "Please set '--xpu_backend' to either 'mpi' or 'ccl'."
                     )
-                torch.distributed.init_process_group(backend=self.xpu_backend)
+                if self.xpu_backend == "ccl" and int(os.environ.get("CCL_WORKER_COUNT", 0)) < 1:
+                    raise ValueError(
+                        "CPU distributed training backend is ccl. but CCL_WORKER_COUNT is not correctly set. "
+                        "Please use like 'export CCL_WORKER_COUNT = 1' to set."
+                    )
+
+                # Try to get launch configuration from environment variables set by MPI launcher - works for Intel MPI, OpenMPI and MVAPICH
+                rank = get_int_from_env(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
+                size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
+                local_size = get_int_from_env(
+                    ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
+                )
+                os.environ["RANK"] = str(rank)
+                os.environ["WORLD_SIZE"] = str(size)
+                os.environ["LOCAL_RANK"] = str(self.local_rank)
+                if not os.environ.get("MASTER_PORT", None):
+                    os.environ["MASTER_PORT"] = "29500"
+                if not os.environ.get("MASTER_ADDR", None):
+                    if local_size != size or self.xpu_backend != "mpi":
+                        raise ValueError(
+                            "Looks like distributed multinode run but MASTER_ADDR env not set, "
+                            "please try exporting rank 0's hostname as MASTER_ADDR"
+                        )
+                torch.distributed.init_process_group(backend=self.xpu_backend, rank=rank, world_size=size)
         elif is_torch_tpu_available():
             device = xm.xla_device()
             self._n_gpu = 0
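For reference, a hedged end-to-end usage sketch under the setup described in the commit message. The script name check_dist.py and the output directory are hypothetical; it assumes a transformers build containing this change and that Intel's oneCCL bindings for PyTorch are installed so the "ccl" backend is available to torch.distributed.

# check_dist.py -- hypothetical smoke test, launched e.g. as:
#   export MASTER_ADDR=127.0.0.1 MASTER_PORT=29500 CCL_WORKER_COUNT=1
#   mpirun -n 2 python3 check_dist.py
import torch
from transformers import TrainingArguments

# no_cuda plus the MPI-provided local rank triggers the CPU distributed branch added above.
args = TrainingArguments(output_dir="out", no_cuda=True, xpu_backend="ccl")
_ = args.device  # accessing .device runs _setup_devices, which calls init_process_group
print(f"rank {torch.distributed.get_rank()} of {torch.distributed.get_world_size()} on {args.device}")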