Commit 8931d298 authored by Jeremiah Harmsen, committed by A. Unique TensorFlower

Remove extraneous evaluation & export which happen on the final epoch.

Preserves the current behavior of exporting duplicate submodel checkpoints (e.g., for 100 total steps, my_submodel_step_100.ckpt and my_submodel.ckpt are equivalent and both are written after the final epoch).

PiperOrigin-RevId: 306629202
parent 47d10833
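
For orientation before the diff, here is a minimal, self-contained sketch of the save/evaluate ordering the commit message describes. It is not the code under review: simulate_saves, its arguments, and the printed list are illustrative assumptions; only the 'ctl_step_*' and submodel naming patterns come from the diff and tests below.

# Hypothetical illustration only -- not run_customized_training_loop itself.
def simulate_saves(total_steps, steps_per_epoch, sub_model_export_name=None):
  """Lists the checkpoint names written at epoch ends and after training."""
  saved = []
  for step in range(steps_per_epoch, total_steps + 1, steps_per_epoch):
    # The step-suffixed submodel export happens at every epoch boundary,
    # including the final one.
    if sub_model_export_name:
      saved.append('%s_step_%d.ckpt' % (sub_model_export_name, step))
    # The full checkpoint (and the in-loop evaluation) is skipped on the
    # final epoch, because it is repeated once more after the loop.
    if step < total_steps:
      saved.append('ctl_step_%d.ckpt' % step)
  # Post-loop: the final submodel export, full checkpoint and evaluation.
  if sub_model_export_name:
    saved.append('%s.ckpt' % sub_model_export_name)
  saved.append('ctl_step_%d.ckpt' % total_steps)
  return saved

# For 100 total steps with 50 steps per epoch, the submodel is written as both
# my_submodel_step_100.ckpt and my_submodel.ckpt after the final epoch -- the
# duplicate behavior the commit message says is preserved.
print(simulate_saves(100, 50, 'my_submodel'))
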
@@ -443,31 +443,34 @@ def run_customized_training_loop(
         train_summary_writer.flush()
       logging.info(training_status)
 
       # Saves model checkpoints and run validation steps at every epoch end.
       if current_step % steps_per_epoch == 0:
-        # To avoid repeated model saving, we do not save after the last
-        # step of training.
+        # Save a submodel with the step in the file name after each epoch.
+        if sub_model_export_name:
+          _save_checkpoint(
+              strategy, sub_model_checkpoint, model_dir,
+              '%s_step_%d.ckpt' % (sub_model_export_name, current_step))
+
+        # Save model checkpoints and run validation steps after each epoch
+        # (with the exception of the final epoch which is handled after the
+        # training loop).
         if current_step < total_training_steps:
           _save_checkpoint(strategy, checkpoint, model_dir,
                            checkpoint_name.format(step=current_step))
-          if sub_model_export_name:
-            _save_checkpoint(
-                strategy, sub_model_checkpoint, model_dir,
-                '%s_step_%d.ckpt' % (sub_model_export_name, current_step))
-
-        if eval_input_fn:
-          logging.info('Running evaluation after step: %s.', current_step)
-          _run_evaluation(current_step,
-                          _get_input_iterator(eval_input_fn, strategy))
-          # Re-initialize evaluation metric.
-          for metric in eval_metrics + model.metrics:
-            metric.reset_states()
+          if eval_input_fn:
+            logging.info('Running evaluation after step: %s.', current_step)
+            _run_evaluation(current_step,
+                            _get_input_iterator(eval_input_fn, strategy))
+            # Re-initialize evaluation metric.
+            for metric in eval_metrics + model.metrics:
+              metric.reset_states()
 
-    _save_checkpoint(strategy, checkpoint, model_dir,
-                     checkpoint_name.format(step=current_step))
     if sub_model_export_name:
       _save_checkpoint(strategy, sub_model_checkpoint, model_dir,
                        '%s.ckpt' % sub_model_export_name)
+
+    _save_checkpoint(strategy, checkpoint, model_dir,
+                     checkpoint_name.format(step=current_step))
     if eval_input_fn:
       logging.info('Running final evaluation after training is complete.')
       _run_evaluation(current_step,
@@ -156,6 +156,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
         eval_input_fn=input_fn,
         eval_steps=10,
         init_checkpoint=None,
+        sub_model_export_name='my_submodel_name',
         metric_fn=metric_fn,
         custom_callbacks=None,
         run_eagerly=run_eagerly)
@@ -188,7 +189,20 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
         distribution, model_dir, steps_per_loop=10, run_eagerly=False)
 
     # Two checkpoints should be saved after two epochs.
-    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*')))
+    files = map(os.path.basename,
+                tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*index')))
+    self.assertCountEqual(['ctl_step_20.ckpt-1.index',
+                           'ctl_step_40.ckpt-2.index'], files)
+
+    # Three submodel checkpoints should be saved after two epochs (one after
+    # each epoch plus one final).
+    files = map(os.path.basename,
+                tf.io.gfile.glob(os.path.join(model_dir,
+                                              'my_submodel_name*index')))
+    self.assertCountEqual(['my_submodel_name.ckpt-3.index',
+                           'my_submodel_name_step_20.ckpt-1.index',
+                           'my_submodel_name_step_40.ckpt-2.index'], files)
+
     self.assertNotEmpty(
         tf.io.gfile.glob(
             os.path.join(model_dir, 'summaries/training_summary*')))
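
The "-1", "-2", "-3" suffixes expected above come from tf.train.Checkpoint.save(), which appends an incrementing save counter to whatever prefix it is given; the submodel checkpoint object appears to be saved three times (two per-epoch exports plus the final one), so the final my_submodel_name.ckpt lands at counter 3. A standalone sketch of that behavior, using an illustrative temporary directory rather than the test's model_dir:

import os
import tempfile

import tensorflow as tf

tmp = tempfile.mkdtemp()
ckpt = tf.train.Checkpoint(step=tf.Variable(0))

# One Checkpoint object keeps a single save counter, so three save() calls
# yield suffixes -1, -2 and -3 even though the prefixes differ.
ckpt.save(os.path.join(tmp, 'my_submodel_name_step_20.ckpt'))  # ...-1.index
ckpt.save(os.path.join(tmp, 'my_submodel_name_step_40.ckpt'))  # ...-2.index
ckpt.save(os.path.join(tmp, 'my_submodel_name.ckpt'))          # ...-3.index

print(sorted(os.path.basename(f) for f in
             tf.io.gfile.glob(os.path.join(tmp, 'my_submodel_name*index'))))
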