Unverified Commit cdde85a0 authored by Wang, Yi, committed by GitHub

oob performance improvement for cpu DDP (#18595)



* oob performance improvement for cpu DDP
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* add is_psutil_available check
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent c3be98eb
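
The commit gates the new logic behind is_psutil_available, one of the optional-dependency checks transformers keeps in its utils module. A minimal sketch of such a check, assuming the common importlib.util.find_spec pattern (the actual helper in transformers.utils may be implemented differently):

import importlib.util

def is_psutil_available() -> bool:
    # True when the optional psutil package is importable in this environment.
    return importlib.util.find_spec("psutil") is not None
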
@@ -39,6 +39,7 @@ from .utils import (
ccl_version,
get_full_repo_name,
is_accelerate_available,
is_psutil_available,
is_sagemaker_dp_enabled,
is_sagemaker_mp_enabled,
is_torch_available,
@@ -1342,6 +1343,21 @@ class TrainingArguments:
"Looks like distributed multinode run but MASTER_ADDR env not set, "
"please try exporting rank 0's hostname as MASTER_ADDR"
)
if (
torch.get_num_threads() == 1
and get_int_from_env(["OMP_NUM_THREADS", "MKL_NUM_THREADS"], 0) == 0
and is_psutil_available()
):
import psutil
num_cpu_threads_per_process = int(psutil.cpu_count(logical=False) / local_size)
if num_cpu_threads_per_process == 0:
num_cpu_threads_per_process = 1
torch.set_num_threads(num_cpu_threads_per_process)
logger.info(
f"num_cpu_threads_per_process unset, we set it at {num_cpu_threads_per_process} to improve oob"
" performance."
)
torch.distributed.init_process_group(backend=self.xpu_backend, rank=rank, world_size=size)
elif is_torch_tpu_available():
device = xm.xla_device()
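
The added block addresses CPU DDP runs where torch reports a single intra-op thread and the user has set neither OMP_NUM_THREADS nor MKL_NUM_THREADS: in that case each worker gets an even share of the host's physical cores, which avoids both single-thread underutilization and oversubscription across ranks. Below is a standalone sketch of the heuristic, under the assumption that the local world size comes from a launcher-set variable such as LOCAL_WORLD_SIZE; the committed code instead reads it with get_int_from_env across several MPI/ccl variables:

import os
import torch

def set_cpu_threads_for_ddp(local_size: int) -> None:
    # Only intervene when threads were left at torch's minimum and
    # the user has not pinned them through the usual env vars.
    env_unset = (
        os.environ.get("OMP_NUM_THREADS") is None
        and os.environ.get("MKL_NUM_THREADS") is None
    )
    if torch.get_num_threads() != 1 or not env_unset:
        return
    try:
        import psutil
    except ImportError:
        return  # without psutil, keep torch's default thread count
    physical_cores = psutil.cpu_count(logical=False) or 1
    # Split the physical cores evenly across the DDP workers on this host,
    # keeping at least one thread per process.
    num_threads = max(physical_cores // max(local_size, 1), 1)
    torch.set_num_threads(num_threads)

if __name__ == "__main__":
    set_cpu_threads_for_ddp(int(os.environ.get("LOCAL_WORLD_SIZE", "1")))

On a host with 16 physical cores and two local workers, each process ends up with 8 intra-op threads instead of the default, which is where the out-of-box ("oob") gain comes from.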