Unverified Commit 1c286be5 authored by dumpmemory's avatar dumpmemory Committed by GitHub
Browse files

Fix bug for checkpoint saving on multi-node training setting (#28078)

* add multi-node training setting

* fix style
parent dec84b32
@@ -2386,7 +2386,9 @@ class Trainer:
         self.args.distributed_state.wait_for_everyone()
         # Then go through the rewriting process starting on process 0
         if staging_output_dir != output_dir:
-            with self.args.main_process_first(desc="Renaming model checkpoint folder to true location"):
+            with self.args.main_process_first(
+                desc="Renaming model checkpoint folder to true location", local=self.args.save_on_each_node
+            ):
                 if os.path.exists(staging_output_dir):
                     os.rename(staging_output_dir, output_dir)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment