Commit f0141859 authored by Yeqing Li, committed by A. Unique TensorFlower

Add condition to avoid saving duplicate checkpoints.

PiperOrigin-RevId: 277156216
parent 803b2540
@@ -452,7 +452,6 @@ class DistributedExecutor(object):
       save_freq = FLAGS.save_checkpoint_freq
     else:
       save_freq = iterations_per_loop
-    last_save_checkpoint_step = 0
     params = self._params
     strategy = self._strategy
@@ -507,6 +506,7 @@ class DistributedExecutor(object):
     test_step = self._create_test_step(strategy, model, metric=eval_metric)

     logging.info('Training started')
+    last_save_checkpoint_step = current_step
     while current_step < total_steps:
       num_steps = _steps_to_run(current_step, total_steps, iterations_per_loop)
@@ -569,6 +569,7 @@ class DistributedExecutor(object):
       train_metric.reset_states()

     # Reaches the end of training and saves the last checkpoint.
+    if last_save_checkpoint_step < total_steps:
       _save_checkpoint(checkpoint, model_dir,
                        checkpoint_name.format(step=current_step))
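The idea of the change: initialize last_save_checkpoint_step from current_step rather than 0, and skip the final save when a checkpoint was already written at the last step, so a run that ends exactly on a save boundary (or resumes from a restored checkpoint) does not write a duplicate. Below is a minimal, self-contained sketch of that pattern; the function and variable names are assumptions for illustration, not the executor's actual code.

import os
import tensorflow as tf

def train_and_checkpoint(model, model_dir, total_steps, iterations_per_loop, save_freq):
  """Runs a training loop and saves checkpoints without duplicating the final one."""
  checkpoint = tf.train.Checkpoint(model=model)
  prefix = os.path.join(model_dir, 'ckpt')
  current_step = 0
  # Start tracking from the current step so a freshly restored run does not
  # immediately re-save the checkpoint it was restored from.
  last_save_checkpoint_step = current_step
  while current_step < total_steps:
    num_steps = min(iterations_per_loop, total_steps - current_step)
    # ... run `num_steps` training steps on `model` here ...
    current_step += num_steps
    if current_step - last_save_checkpoint_step >= save_freq:
      checkpoint.save(prefix)
      last_save_checkpoint_step = current_step
  # Save the final checkpoint only if the loop did not already save one at
  # `total_steps`; this guard is what avoids the duplicate save.
  if last_save_checkpoint_step < total_steps:
    checkpoint.save(prefix)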