".github/vscode:/vscode.git/clone" did not exist on "9b3399522d228c61a607701d638ac24e6a0d9eed"
Commit 13bde16f authored by Deepak Narayanan
Browse files

Checkpoint should be saved only after evaluation pass is run to make sure...

The checkpoint should be saved only after the evaluation pass has run, to make sure validation losses are identical after loading the checkpoint.
parent c84f0752
...@@ -857,15 +857,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler, ...@@ -857,15 +857,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
check_adlr_autoresume_termination(iteration, model, optimizer, check_adlr_autoresume_termination(iteration, model, optimizer,
lr_scheduler) lr_scheduler)
# Checkpointing
saved_checkpoint = False
if args.save and args.save_interval and \
iteration % args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer,
lr_scheduler)
saved_checkpoint = True
# Evaluation # Evaluation
if args.eval_interval and iteration % args.eval_interval == 0 and \ if args.eval_interval and iteration % args.eval_interval == 0 and \
args.do_valid: args.do_valid:
...@@ -874,6 +865,14 @@ def train(forward_step_func, model, optimizer, lr_scheduler, ...@@ -874,6 +865,14 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
valid_data_iterator, model, valid_data_iterator, model,
iteration, False) iteration, False)
# Checkpointing
saved_checkpoint = False
if args.save and args.save_interval and \
iteration % args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer,
lr_scheduler)
saved_checkpoint = True
# Exiting based on duration # Exiting based on duration
if args.exit_duration_in_mins: if args.exit_duration_in_mins:
train_time = (time.time() - _TRAIN_START_TIME) / 60.0 train_time = (time.time() - _TRAIN_START_TIME) / 60.0
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment