"tests/models/vscode:/vscode.git/clone" did not exist on "fa9d2ad7ec0b41e6ac646cde507b747e7688a7c0"
Unverified commit 33fd83bc, authored by Sylvain Gugger, committed by GitHub
Browse files

Fix RNG saves in distributed mode. (#11620)



* Fix RNG saves in distributed mode.

* Update src/transformers/trainer.py
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
parent 619200cc
......@@ -1527,6 +1527,9 @@ class Trainer:
if is_torch_tpu_available():
rng_states["xla"] = xm.get_rng_state()
# A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
# not yet exist.
os.makedirs(output_dir, exist_ok=True)
local_rank = xm.get_local_ordinal() if is_torch_tpu_available() else self.args.local_rank
if local_rank == -1:
torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment