Unverified commit 7c300d6d, authored by Sylvain Gugger and committed by GitHub
Browse files

Fix barrier for SM distributed (#12853)

parent 0c1c42c1
......@@ -1052,6 +1052,8 @@ class TrainingArguments:
logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}")
if is_torch_tpu_available():
xm.rendezvous(desc)
elif is_sagemaker_dp_enabled():
sm_dist.Barrier()
else:
torch.distributed.barrier()
yield
......@@ -1061,6 +1063,8 @@ class TrainingArguments:
logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas")
if is_torch_tpu_available():
xm.rendezvous(desc)
elif is_sagemaker_dp_enabled():
sm_dist.Barrier()
else:
torch.distributed.barrier()
else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment