Unverified commit e12d6f51, authored by Sylvain Gugger and committed by GitHub
Browse files

Distributed barrier before loading model (#10685)

parent 339fc51a
...@@ -1131,6 +1131,12 @@ class Trainer: ...@@ -1131,6 +1131,12 @@ class Trainer:
logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None: if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
# Wait for everyone to get here so we are sure the model has been saved by process 0.
if is_torch_tpu_available():
xm.rendezvous("load_best_model_at_end")
elif self.args.local_rank != -1:
dist.barrier()
logger.info( logger.info(
f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})."
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment