Unverified Commit a6ea244f authored by Yih-Dar's avatar Yih-Dar Committed by GitHub
Browse files

Fix: save checkpoint after each epoch and push checkpoint to the hub (#13872)


Co-authored-by: default avatarydshieh <ydshieh@users.noreply.github.com>
parent 7079a99e
......@@ -769,6 +769,14 @@ def main():
cur_step = epoch * (len(train_dataset) // train_batch_size)
write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step)
# save checkpoint after each epoch and push checkpoint to the hub
if jax.process_index() == 0:
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
model.save_pretrained(training_args.output_dir, params=params)
tokenizer.save_pretrained(training_args.output_dir)
if training_args.push_to_hub:
repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False)
# ======================== Prediction loop ==============================
if training_args.do_predict:
logger.info("*** Predict ***")
......@@ -808,14 +816,6 @@ def main():
desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})"
logger.info(desc)
# save checkpoint after each epoch and push checkpoint to the hub
if jax.process_index() == 0:
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
model.save_pretrained(training_args.output_dir, params=params)
tokenizer.save_pretrained(training_args.output_dir)
if training_args.push_to_hub:
repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment