"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "67ff1c314a61a2d5949b3bb48fa3ec7e9b697d7e"
Unverified commit 7c300d6d, authored by Sylvain Gugger and committed by GitHub
Browse files

Fix barrier for SM distributed (#12853)

parent 0c1c42c1
...@@ -1052,6 +1052,8 @@ class TrainingArguments: ...@@ -1052,6 +1052,8 @@ class TrainingArguments:
logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}") logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}")
if is_torch_tpu_available(): if is_torch_tpu_available():
xm.rendezvous(desc) xm.rendezvous(desc)
elif is_sagemaker_dp_enabled():
sm_dist.Barrier()
else: else:
torch.distributed.barrier() torch.distributed.barrier()
yield yield
...@@ -1061,6 +1063,8 @@ class TrainingArguments: ...@@ -1061,6 +1063,8 @@ class TrainingArguments:
logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas") logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas")
if is_torch_tpu_available(): if is_torch_tpu_available():
xm.rendezvous(desc) xm.rendezvous(desc)
elif is_sagemaker_dp_enabled():
sm_dist.Barrier()
else: else:
torch.distributed.barrier() torch.distributed.barrier()
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment