Unverified commit de9e3b59, authored by Pbihao and committed by GitHub
Browse files

Fix deletion of all checkpoints when save_total_limit is set to 1 (#25136)

parent a0042379
...@@ -1960,7 +1960,7 @@ class Trainer: ...@@ -1960,7 +1960,7 @@ class Trainer:
# Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
for checkpoint in checkpoints_sorted: for checkpoint in checkpoints_sorted:
if checkpoint != self.state.best_model_checkpoint: if not os.path.samefile(checkpoint, self.state.best_model_checkpoint):
logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
shutil.rmtree(checkpoint) shutil.rmtree(checkpoint)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment