"vscode:/vscode.git/clone" did not exist on "5511c258cf00f2f247b6b346ba3c82321f10cf5c"
Commit 9a88e415 authored by Hongkun Yu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 267007907
parent bd211e3e
......@@ -26,7 +26,6 @@ import tensorflow as tf
from official.resnet.ctl import ctl_common
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import common
from official.vision.image_classification import resnet_imagenet_main
from official.vision.image_classification import resnet_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
......@@ -246,7 +245,7 @@ def run(flags_obj):
training_accuracy.reset_states()
for step in range(train_steps):
optimizer.lr = resnet_imagenet_main.learning_rate_schedule(
optimizer.lr = common.learning_rate_schedule(
epoch, step, train_steps, flags_obj.batch_size)
time_callback.on_batch_begin(step+epoch*train_steps)
......
......@@ -31,6 +31,41 @@ from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
BASE_LEARNING_RATE = 0.1 # This matches Jing's version.
TRAIN_TOP_1 = 'training_accuracy_top_1'
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
initial_lr = BASE_LEARNING_RATE * batch_size / 256
epoch = current_epoch + float(current_batch) / batches_per_epoch
warmup_lr_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
if epoch < warmup_end_epoch:
# Learning rate increases linearly per step.
return initial_lr * warmup_lr_multiplier * epoch / warmup_end_epoch
for mult, start_epoch in LR_SCHEDULE:
if epoch >= start_epoch:
learning_rate = initial_lr * mult
else:
break
return learning_rate
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
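For reference, a minimal sketch of what the schedule above produces, assuming batch_size=256 (so initial_lr works out to BASE_LEARNING_RATE = 0.1) and roughly 5000 batches per epoch; both values are illustrative and not part of this change:

# Illustrative evaluation of learning_rate_schedule at the start of a few epochs.
for epoch in (2, 10, 35, 65, 85):
    print(epoch, learning_rate_schedule(epoch, 0, 5000, 256))
# Expected: 0.04 (linear warmup), 0.1, 0.01, 0.001, 0.0001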
......@@ -172,12 +207,13 @@ def get_optimizer(learning_rate=0.1):
return gradient_descent_v2.SGD(learning_rate=learning_rate, momentum=0.9)
def get_callbacks(learning_rate_schedule_fn, num_images):
# TODO(hongkuny,haoyuzhang): make cifar model use_tensor_lr to clean up code.
def get_callbacks(learning_rate_schedule_fn=None, num_images=None):
"""Returns common callbacks."""
time_callback = keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)
callbacks = [time_callback]
if not FLAGS.use_tensor_lr:
if not FLAGS.use_tensor_lr and learning_rate_schedule_fn:
lr_callback = LearningRateBatchScheduler(
learning_rate_schedule_fn,
batch_size=FLAGS.batch_size,
......@@ -312,6 +348,9 @@ def define_keras_flags(dynamic_loss_scale=True):
flags.DEFINE_boolean(
name='enable_get_next_as_optional', default=False,
help='Enable get_next_as_optional behavior in DistributedIterator.')
flags.DEFINE_boolean(
name='enable_checkpoint_and_export', default=False,
help='Whether to enable a checkpoint callback and export the savedmodel.')
def get_synth_input_fn(height, width, num_channels, num_classes,
......@@ -346,7 +385,6 @@ def get_synth_input_fn(height, width, num_channels, num_classes,
mean=127,
stddev=60,
name='synthetic_inputs')
labels = tf.random.uniform([1],
minval=0,
maxval=num_classes - 1,
......
......@@ -18,6 +18,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
from absl import logging
......@@ -33,42 +35,6 @@ from official.vision.image_classification import common
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import resnet_model
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
initial_lr = common.BASE_LEARNING_RATE * batch_size / 256
epoch = current_epoch + float(current_batch) / batches_per_epoch
warmup_lr_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
if epoch < warmup_end_epoch:
# Learning rate increases linearly per step.
return initial_lr * warmup_lr_multiplier * epoch / warmup_end_epoch
for mult, start_epoch in LR_SCHEDULE:
if epoch >= start_epoch:
learning_rate = initial_lr * mult
else:
break
return learning_rate
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using native Keras APIs.
......@@ -94,7 +60,7 @@ def run(flags_obj):
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
if dtype == 'float16':
if dtype == tf.float16:
loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
'mixed_float16', loss_scale=loss_scale)
......@@ -175,9 +141,9 @@ def run(flags_obj):
lr_schedule = common.PiecewiseConstantDecayWithWarmup(
batch_size=flags_obj.batch_size,
epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
warmup_epochs=LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in LR_SCHEDULE),
warmup_epochs=common.LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
with strategy_scope:
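For clarity only: with common.LR_SCHEDULE as added above, the keyword arguments in this call resolve to the values below (a restatement of the code, not new behavior):

# common.LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)], so:
#   warmup_epochs -> 5
#   boundaries    -> [30, 60, 80]
#   multipliers   -> [1.0, 0.1, 0.01, 0.001]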
......@@ -218,8 +184,11 @@ def run(flags_obj):
run_eagerly=flags_obj.run_eagerly)
callbacks = common.get_callbacks(
learning_rate_schedule, imagenet_preprocessing.NUM_IMAGES['train'])
common.learning_rate_schedule, imagenet_preprocessing.NUM_IMAGES['train'])
if flags_obj.enable_checkpoint_and_export:
ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
save_weights_only=True))
train_steps = (
imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
train_epochs = flags_obj.train_epochs
......@@ -257,6 +226,10 @@ def run(flags_obj):
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
if flags_obj.enable_checkpoint_and_export:
# Keras model.save assumes a float32 input signature.
export_path = os.path.join(flags_obj.model_dir, 'saved_model')
model.save(export_path, include_optimizer=False)
eval_output = None
if not flags_obj.skip_eval:
......
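A minimal sketch of reloading the artifacts written by the new enable_checkpoint_and_export path; the model_dir value and the epoch number are placeholders, and the reload itself is illustrative rather than part of this change:

import os
import tensorflow as tf

model_dir = '/tmp/keras_imagenet'  # placeholder for whatever --model_dir was used

# Full float32 SavedModel written by model.save(..., include_optimizer=False).
restored = tf.keras.models.load_model(os.path.join(model_dir, 'saved_model'))

# Per-epoch weights written by the ModelCheckpoint callback; the
# 'model.ckpt-{epoch:04d}' pattern expands to e.g. 'model.ckpt-0001'.
restored.load_weights(os.path.join(model_dir, 'model.ckpt-0001'))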
......@@ -18,19 +18,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.platform import googletest
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import resnet_imagenet_main
class KerasImagenetTest(googletest.TestCase):
class KerasImagenetTest(tf.test.TestCase):
"""Unit tests for Keras ResNet with ImageNet."""
_extra_flags = [
......@@ -40,11 +37,6 @@ class KerasImagenetTest(googletest.TestCase):
]
_tempdir = None
def get_temp_dir(self):
if not self._tempdir:
self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
return self._tempdir
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
......@@ -65,7 +57,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-distribution_strategy", "off",
"-model_dir", "keras_imagenet_no_dist_strat",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
......@@ -81,7 +72,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-enable_eager", "false",
"-distribution_strategy", "off",
"-model_dir", "keras_imagenet_graph_no_dist_strat",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
......@@ -105,8 +95,8 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-num_gpus", "1",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_1_gpu",
"-data_format", "channels_last",
"-enable_checkpoint_and_export", "1",
]
extra_flags = extra_flags + self._extra_flags
......@@ -130,7 +120,6 @@ class KerasImagenetTest(googletest.TestCase):
"-num_gpus", "1",
"-dtype", "fp16",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_1_gpu",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
......@@ -141,27 +130,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags=extra_flags
)
def test_end_to_end_graph_1_gpu(self):
"""Test Keras model in legacy graph mode with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(1, context.num_gpus()))
extra_flags = [
"-num_gpus", "1",
"-enable_eager", "false",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_graph_1_gpu",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
def test_end_to_end_2_gpu(self):
"""Test Keras model with 2 GPUs."""
......@@ -176,7 +144,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-num_gpus", "2",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
......@@ -200,7 +167,6 @@ class KerasImagenetTest(googletest.TestCase):
"-num_gpus", "2",
"-enable_xla", "true",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_xla_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
......@@ -224,7 +190,6 @@ class KerasImagenetTest(googletest.TestCase):
"-num_gpus", "2",
"-dtype", "fp16",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_2_gpu_fp16",
]
extra_flags = extra_flags + self._extra_flags
......@@ -249,50 +214,6 @@ class KerasImagenetTest(googletest.TestCase):
"-dtype", "fp16",
"-enable_xla", "true",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_xla_2_gpu_fp16",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
def test_end_to_end_graph_2_gpu(self):
"""Test Keras model in legacy graph mode with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(2, context.num_gpus()))
extra_flags = [
"-num_gpus", "2",
"-enable_eager", "false",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_graph_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
def test_end_to_end_graph_xla_2_gpu(self):
"""Test Keras model in legacy graph mode with XLA and 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(2, context.num_gpus()))
extra_flags = [
"-num_gpus", "2",
"-enable_eager", "false",
"-enable_xla", "true",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_graph_xla_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
......@@ -305,4 +226,4 @@ class KerasImagenetTest(googletest.TestCase):
if __name__ == "__main__":
tf.compat.v1.enable_v2_behavior()
googletest.main()
tf.test.main()