Unverified commit cd918492, authored by radcheb and committed by GitHub
Browse files

Fix race condition on cleaning checkpoints when save_total_limit set to 1 (#20989)



* Update trainer.py

* fix style
Co-authored-by: Radhwane Chebaane <rchebaane.external@epo.org>
parent cd245780
...@@ -1919,8 +1919,8 @@ class Trainer: ...@@ -1919,8 +1919,8 @@ class Trainer:
run_dir = self._get_output_dir(trial) run_dir = self._get_output_dir(trial)
checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
# Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint. # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
if self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
for checkpoint in checkpoints_sorted: for checkpoint in checkpoints_sorted:
if checkpoint != self.state.best_model_checkpoint: if checkpoint != self.state.best_model_checkpoint:
logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment