Unverified Commit 4aabf9b5 authored by Wang, Yi, committed by GitHub

enable CPU distributed training using mpirun (#17570)



* enable CPU distributed training using mpirun

  Example command:
      mpirun -n 2 python3 run_qa.py --no_cuda --xpu_backend ccl xxxx
  MASTER_ADDR and MASTER_PORT should be set as environment variables:
      export MASTER_ADDR=127.0.0.1
      export MASTER_PORT=29500
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* fix according to the review comment
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* use Accelerate's logic for CPU distributed training to set the "RANK", "LOCAL_RANK" and "WORLD_SIZE" environment variables (a sketch of this resolution follows below)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 457d4a32
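A minimal standalone sketch of the environment resolution this change performs when a script is started under mpirun. The launcher-specific variable names (PMI_RANK, OMPI_COMM_WORLD_RANK, MV2_COMM_WORLD_RANK, ...) mirror the diff below; the fallback defaults and the final print are illustrative only and are not the exact code path in training_args.py.

import os

def get_int_from_env(env_keys, default):
    # Return the first env var in env_keys that parses to a non-negative int, else the default.
    for e in env_keys:
        val = int(os.environ.get(e, -1))
        if val >= 0:
            return val
    return default

# Intel MPI, OpenMPI and MVAPICH export their own rank variables under mpirun;
# torch.distributed expects RANK / WORLD_SIZE / LOCAL_RANK, so the values are re-exported.
rank = get_int_from_env(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
local_rank = get_int_from_env(
    ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
)
os.environ["RANK"], os.environ["WORLD_SIZE"], os.environ["LOCAL_RANK"] = str(rank), str(size), str(local_rank)
os.environ.setdefault("MASTER_PORT", "29500")      # same default the new code applies
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # the real code instead raises if this is unset for ccl or multi-node runs
print(f"rank={rank} local_rank={local_rank} world_size={size}")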
@@ -76,6 +76,15 @@ def default_logdir() -> str:
     return os.path.join("runs", current_time + "_" + socket.gethostname())
 
 
+def get_int_from_env(env_keys, default):
+    """Returns the first positive env value found in the `env_keys` list or the default."""
+    for e in env_keys:
+        val = int(os.environ.get(e, -1))
+        if val >= 0:
+            return val
+    return default
+
+
 class OptimizerNames(ExplicitEnum):
     """
     Stores the acceptable string identifiers for optimizers.
@@ -1246,6 +1255,10 @@ class TrainingArguments:
         if self.no_cuda:
             device = torch.device("cpu")
             self._n_gpu = 0
+            self.local_rank = get_int_from_env(
+                ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"],
+                self.local_rank,
+            )
             if self.local_rank != -1 and not torch.distributed.is_initialized():
                 # Initializes distributed backend for cpu
                 if self.xpu_backend not in ("mpi", "ccl"):
@@ -1253,7 +1266,30 @@ class TrainingArguments:
                         "CPU distributed training backend is not properly set. "
                         "Please set '--xpu_backend' to either 'mpi' or 'ccl'."
                     )
-                torch.distributed.init_process_group(backend=self.xpu_backend)
+                if self.xpu_backend == "ccl" and int(os.environ.get("CCL_WORKER_COUNT", 0)) < 1:
+                    raise ValueError(
+                        "CPU distributed training backend is ccl. but CCL_WORKER_COUNT is not correctly set. "
+                        "Please use like 'export CCL_WORKER_COUNT = 1' to set."
+                    )
+
+                # Try to get launch configuration from environment variables set by MPI launcher - works for Intel MPI, OpenMPI and MVAPICH
+                rank = get_int_from_env(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
+                size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
+                local_size = get_int_from_env(
+                    ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
+                )
+                os.environ["RANK"] = str(rank)
+                os.environ["WORLD_SIZE"] = str(size)
+                os.environ["LOCAL_RANK"] = str(self.local_rank)
+                if not os.environ.get("MASTER_PORT", None):
+                    os.environ["MASTER_PORT"] = "29500"
+                if not os.environ.get("MASTER_ADDR", None):
+                    if local_size != size or self.xpu_backend != "mpi":
+                        raise ValueError(
+                            "Looks like distributed multinode run but MASTER_ADDR env not set, "
+                            "please try exporting rank 0's hostname as MASTER_ADDR"
+                        )
+                torch.distributed.init_process_group(backend=self.xpu_backend, rank=rank, world_size=size)
         elif is_torch_tpu_available():
             device = xm.xla_device()
             self._n_gpu = 0
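For reference, a hedged end-to-end usage sketch under the setup described in the commit message. The script name check_dist.py and the output directory are hypothetical; it assumes a transformers build containing this change and that Intel's oneCCL bindings for PyTorch are installed so the "ccl" backend is available to torch.distributed.

# check_dist.py -- hypothetical smoke test, launched e.g. as:
#   export MASTER_ADDR=127.0.0.1 MASTER_PORT=29500 CCL_WORKER_COUNT=1
#   mpirun -n 2 python3 check_dist.py
import torch
from transformers import TrainingArguments

# no_cuda plus the MPI-provided local rank triggers the CPU distributed branch added above.
args = TrainingArguments(output_dir="out", no_cuda=True, xpu_backend="ccl")
_ = args.device  # accessing .device runs _setup_devices, which calls init_process_group
print(f"rank {torch.distributed.get_rank()} of {torch.distributed.get_world_size()} on {args.device}")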