Unverified commit 6c5b20aa authored by Jin Young (Daniel) Sohn, committed by GitHub

Fix training_args.py barrier for torch_xla (#12464)

torch_xla currently has its own synchronization primitives, so on TPU use
xm.rendezvous(tag) instead of torch.distributed.barrier().
parent 2a501ac9
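For context, here is a minimal sketch (not part of this commit) of how xm.rendezvous acts as a cross-replica barrier under torch_xla's multiprocessing launcher. The `prepare_data` helper and the tag string are illustrative assumptions; `xm.rendezvous`, `xm.is_master_ordinal`, and `xmp.spawn` are real torch_xla APIs.

```python
# Illustrative sketch only (not from this commit): on TPU, xm.rendezvous(tag)
# blocks every replica until all of them have reached the same tag, playing
# the role that torch.distributed.barrier() plays in regular distributed
# training. prepare_data() is a hypothetical placeholder for one-time work.
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp


def prepare_data():
    # placeholder for one-time work, e.g. downloading and caching a dataset
    pass


def _mp_fn(index):
    if xm.is_master_ordinal():
        prepare_data()  # only the master ordinal performs the one-time work
    # every replica blocks here until all replicas have reached this tag
    xm.rendezvous("prepare_data")


if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=())
```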
@@ -1027,13 +1027,19 @@ class TrainingArguments:
                 if not is_main_process:
                     # tell all replicas to wait
                     logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}")
-                    torch.distributed.barrier()
+                    if is_torch_tpu_available():
+                        xm.rendezvous(desc)
+                    else:
+                        torch.distributed.barrier()
                 yield
             finally:
                 if is_main_process:
                     # the wait is over
                     logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas")
-                    torch.distributed.barrier()
+                    if is_torch_tpu_available():
+                        xm.rendezvous(desc)
+                    else:
+                        torch.distributed.barrier()
         else:
             yield
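The yield/finally structure shows the hunk patches a context manager (in the actual file this is TrainingArguments.main_process_first): non-main replicas wait at a barrier while the main process does the work, then the main process releases them on exit. Below is a minimal, self-contained sketch of that pattern under the stated assumption; the standalone function and its parameters are simplified stand-ins, not the transformers implementation.

```python
# Simplified sketch of the patched pattern, assuming the hunk belongs to a
# main_process_first-style context manager. On TPU the barrier is
# xm.rendezvous(desc); otherwise it is torch.distributed.barrier().
from contextlib import contextmanager

import torch


def _barrier(desc, is_tpu):
    # pick the synchronization primitive that matches the current backend
    if is_tpu:
        import torch_xla.core.xla_model as xm

        xm.rendezvous(desc)
    else:
        torch.distributed.barrier()


@contextmanager
def main_process_first(is_main_process, is_tpu=False, desc="work"):
    try:
        if not is_main_process:
            # replicas wait here until the main process has finished `desc`
            _barrier(desc, is_tpu)
        yield
    finally:
        if is_main_process:
            # the main process is done; release the waiting replicas
            _barrier(desc, is_tpu)
```

In the transformers example scripts the real context manager is typically entered as `with training_args.main_process_first(desc="dataset map pre-processing"): ...`, so that dataset preprocessing and caching happen once on the main process while the other replicas wait.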