Commit c923a420 authored by Shining Sun's avatar Shining Sun

bug fixes

parent 80dcd27c
......@@ -25,7 +25,7 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet import cifar10_main as cifar_main
from official.resnet import resnet_run_loop
from official.resnet.keras import keras_common
from official.resnet.keras import resnet56
from official.resnet.keras import resnet_cifar_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
......@@ -39,8 +39,8 @@ LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch, batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
The learning rate starts at base learning_rate, then after 91, 136 and
182 epochs, the learning rate is divided by 10.
Scale the learning rate at the epoch boundaries given in LR_SCHEDULE by the
corresponding scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
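For readers skimming the diff, a minimal sketch of how a list of (multiplier, epoch to start) tuples such as LR_SCHEDULE can be turned into a per-epoch learning rate. This is illustrative only; the BASE_LEARNING_RATE value and the 128 scaling divisor are assumptions, while the boundary epochs and 10x drops come from the old docstring above.

# Illustrative sketch only; BASE_LEARNING_RATE and the 128 divisor are assumed.
BASE_LEARNING_RATE = 0.1
CIFAR_LR_SCHEDULE = [(0.1, 91), (0.01, 136), (0.001, 182)]  # (multiplier, start epoch)

def lookup_learning_rate(current_epoch, batch_size, schedule=CIFAR_LR_SCHEDULE):
  """Returns the base rate scaled by the last multiplier whose start epoch has passed."""
  scaled_base = BASE_LEARNING_RATE * batch_size / 128.0  # linear scaling rule
  lr = scaled_base
  for multiplier, start_epoch in schedule:
    if current_epoch >= start_epoch:
      lr = scaled_base * multiplier
  return lr

# With this schedule the rate drops by 10x after epochs 91, 136 and 182,
# matching the behaviour described in the removed docstring.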
......@@ -65,7 +65,7 @@ def parse_record_keras(raw_record, is_training, dtype):
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
This method converts the label to onhot to fit the loss function.
This method converts the label to a one-hot vector to match the loss function.
Args:
raw_record: scalar Tensor tf.string containing a serialized
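As a hedged illustration of the one-hot conversion mentioned above (not the exact code in parse_record_keras), an integer CIFAR-10 label can be expanded with tf.one_hot so it matches a categorical_crossentropy loss:

import tensorflow as tf

NUM_CLASSES = 10  # CIFAR-10

def to_one_hot(label, num_classes=NUM_CLASSES):
  # e.g. label 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], the dense vector expected
  # by categorical_crossentropy.
  return tf.one_hot(tf.cast(label, tf.int32), num_classes)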
......@@ -127,9 +127,9 @@ def run(flags_obj):
optimizer = keras_common.get_optimizer()
strategy = distribution_utils.get_distribution_strategy(
flags_obj.num_gpus, flags_obj.use_one_device_strategy)
flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
model = resnet56.ResNet56(input_shape=(32, 32, 3),
model = resnet_cifar_model.resnet56(input_shape=(32, 32, 3),
classes=cifar_main.NUM_CLASSES)
model.compile(loss='categorical_crossentropy',
......@@ -150,6 +150,11 @@ def run(flags_obj):
num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
num_eval_steps = None
validation_data = None
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
......@@ -159,7 +164,7 @@ def run(flags_obj):
tensorboard_callback
],
validation_steps=num_eval_steps,
validation_data=eval_input_dataset,
validation_data=validation_data,
verbose=1)
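The skip_eval branch above works because Keras simply skips the validation pass when validation_data is None. A small, self-contained sketch of the same pattern; the model and data here are toy placeholders, not the ResNet pipeline:

import numpy as np
import tensorflow as tf

skip_eval = True  # stands in for flags_obj.skip_eval

x_train = np.random.rand(64, 8).astype("float32")
y_train = np.random.randint(0, 2, size=(64, 1)).astype("float32")

model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="sigmoid")])
model.compile(optimizer="sgd", loss="binary_crossentropy")

validation_data = (x_train[:16], y_train[:16])
if skip_eval:
  validation_data = None  # no validation pass is run when this is None

model.fit(x_train, y_train, epochs=1, validation_data=validation_data, verbose=1)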
if not flags_obj.skip_eval:
......@@ -167,10 +172,6 @@ def run(flags_obj):
steps=num_eval_steps,
verbose=1)
stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
return stats
def main(_):
with logger.benchmark_context(flags.FLAGS):
......
......@@ -42,28 +42,26 @@ class TimeHistory(tf.keras.callbacks.Callback):
"""
self._batch_size = batch_size
super(TimeHistory, self).__init__()
self.log_batch_size = 100
self.log_steps = 100
def on_train_begin(self, logs=None):
self.batch_times_secs = []
self.record_batch = True
def on_batch_begin(self, batch, logs=None):
if self.record_batch:
self.batch_time_start = time.time()
self.start_time= time.time()
self.record_batch = False
def on_batch_end(self, batch, logs=None):
if batch % self.log_batch_size == 0:
last_n_batches = time.time() - self.batch_time_start
examples_per_second = (self._batch_size * self.log_batch_size) / last_n_batches
self.batch_times_secs.append(last_n_batches)
if batch % self.log_steps == 0:
elapsed_time = time.time() - self.start_time
examples_per_second = (self._batch_size * self.log_steps) / elapsed_time
self.record_batch = True
# TODO(anjalisridhar): add timestamp as well.
if batch != 0:
tf.logging.info("BenchmarkMetric: {'num_batches':%d, 'time_taken': %f,"
"'images_per_second': %f}" %
(batch, last_n_batches, examples_per_second))
(batch, elapsed_time, examples_per_second))
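The images_per_second figure logged above is simply batch_size * log_steps divided by the wall-clock time of that window. A minimal, self-contained callback in the same spirit; names and logging format are illustrative, not the repo's exact class:

import time
import tensorflow as tf

class SimpleThroughputLogger(tf.keras.callbacks.Callback):
  """Logs examples/sec over a window of log_steps batches (illustrative)."""

  def __init__(self, batch_size, log_steps=100):
    super(SimpleThroughputLogger, self).__init__()
    self.batch_size = batch_size
    self.log_steps = log_steps
    self.start_time = None

  def on_batch_begin(self, batch, logs=None):
    if self.start_time is None:
      self.start_time = time.time()  # start timing a new window

  def on_batch_end(self, batch, logs=None):
    if batch != 0 and batch % self.log_steps == 0:
      elapsed_time = time.time() - self.start_time
      examples_per_second = (self.batch_size * self.log_steps) / elapsed_time
      print("num_batches=%d time_taken=%.3fs images_per_second=%.1f"
            % (batch, elapsed_time, examples_per_second))
      self.start_time = None  # reset for the next window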
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
"""Callback to update learning rate on every batch (not epoch boundaries).
......@@ -95,20 +93,14 @@ class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
raise ValueError('The output of the "schedule" function should be float.')
if lr != self.prev_lr:
self.model.optimizer.learning_rate = lr # lr should be a float here
# tf.keras.backend.set_value(self.model.optimizer.learning_rate, lr)
self.prev_lr = lr
tf.logging.debug('Epoch %05d Batch %05d: LearningRateBatchScheduler changing '
'learning rate to %s.', self.epochs, batch, lr)
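Putting the pieces above together, a hedged sketch of a per-batch learning-rate callback; the schedule argument is any function (epoch, batch) -> float, and the direct assignment to optimizer.learning_rate mirrors the line in this diff (tf.keras.backend.set_value is a common alternative):

import tensorflow as tf

class PerBatchLRScheduler(tf.keras.callbacks.Callback):
  """Illustrative per-batch scheduler; schedule(epoch, batch) returns a float."""

  def __init__(self, schedule):
    super(PerBatchLRScheduler, self).__init__()
    self.schedule = schedule
    self.epoch = 0
    self.prev_lr = None

  def on_epoch_begin(self, epoch, logs=None):
    self.epoch = epoch

  def on_batch_begin(self, batch, logs=None):
    lr = float(self.schedule(self.epoch, batch))
    if lr != self.prev_lr:
      # Assign directly, as in the diff above; set_value on the optimizer's
      # learning rate variable is an alternative depending on optimizer version.
      self.model.optimizer.learning_rate = lr
      self.prev_lr = lr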
def get_optimizer():
if FLAGS.use_tf_momentum_optimizer:
learning_rate = BASE_LEARNING_RATE * FLAGS.batch_size / 256
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
else:
# optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
optimizer = gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
return optimizer
# The learning rate set here is a placeholder and is not used. It will be overwritten
# at the beginning of each batch by the LearningRateBatchScheduler callback.
return gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
def get_callbacks(learning_rate_schedule_fn, num_images):
......@@ -124,25 +116,15 @@ def get_callbacks(learning_rate_schedule_fn, num_images):
return time_callback, tensorboard_callback, lr_callback
def analyze_fit_and_eval_result(history, eval_output):
stats = {}
stats['accuracy_top_1'] = eval_output[1]
stats['eval_loss'] = eval_output[0]
stats['training_loss'] = history.history['loss'][-1]
stats['training_accuracy_top_1'] = history.history['categorical_accuracy'][-1]
print('Test loss:{}'.format(stats['eval_loss']))
print('top_1 accuracy:{}'.format(stats['accuracy_top_1']))
print('top_1_training_accuracy:{}'.format(stats['training_accuracy_top_1']))
return stats
def define_keras_flags():
flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
flags.DEFINE_integer(
name="train_steps", default=None,
help="The number of steps to run for training")
help="The number of steps to run for training. If it is larger than "
"# batches per epoch, then use # bathes per epoch. When this flag is "
"set, only one epoch is going to run for training.")
def get_synth_input_fn(height, width, num_channels, num_classes,
......@@ -152,7 +134,7 @@ def get_synth_input_fn(height, width, num_channels, num_classes,
This input_fn returns a data set that iterates over a set of random data and
bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
copy is still included. This is used to find the upper throughput bound when
tunning the full input pipeline.
tuning the full input pipeline.
Args:
height: Integer height that will be used to create a fake image tensor.
......
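A hedged sketch of the synthetic-input idea described above (not the repo's get_synth_input_fn): a tf.data pipeline built from random tensors that skips JPEG decode and augmentation, so it approximates the input pipeline's upper throughput bound:

import tensorflow as tf

def synthetic_dataset(height=32, width=32, num_channels=3, num_classes=10,
                      batch_size=128):
  # One random batch repeated forever; the host-to-device copy still happens,
  # but all preprocessing is bypassed.
  images = tf.random.uniform([batch_size, height, width, num_channels])
  labels = tf.random.uniform([batch_size], maxval=num_classes, dtype=tf.int32)
  return tf.data.Dataset.from_tensors((images, labels)).repeat()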
......@@ -26,13 +26,11 @@ from official.resnet import imagenet_main
from official.resnet import imagenet_preprocessing
from official.resnet import resnet_run_loop
from official.resnet.keras import keras_common
from official.resnet.keras import resnet50
from official.resnet.keras import resnet_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
# import os
# os.environ['TF2_BEHAVIOR'] = 'enabled'
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
......@@ -42,12 +40,8 @@ LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch, batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
The learning rate starts at 0, then it increases linearly per step.
After 5 epochs we reach the base learning rate (scaled to account
for batch size).
After 30, 60 and 80 epochs the learning rate is divided by 10.
After 90 epochs training stops and the LR is set to 0. This ensures
that we train for exactly 90 epochs for reproducibility.
Scale the learning rate at the epoch boundaries given in LR_SCHEDULE by the
corresponding scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
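For reference, a hedged reconstruction of the warm-up-plus-decay behaviour that the removed docstring describes for ImageNet; BASE_LEARNING_RATE and the 256 scaling divisor are assumptions, while the LR_SCHEDULE tuples come from this file:

BASE_LEARNING_RATE = 0.1  # assumed
LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)]

def warmup_then_decay(current_epoch, current_batch, batches_per_epoch, batch_size):
  """Linear warmup over the first 5 epochs, then step decay at LR_SCHEDULE epochs."""
  scaled_lr = BASE_LEARNING_RATE * batch_size / 256.0  # assumed linear scaling rule
  warmup_end_epoch = LR_SCHEDULE[0][1]  # 5
  epoch = current_epoch + float(current_batch) / batches_per_epoch
  if epoch < warmup_end_epoch:
    return scaled_lr * epoch / warmup_end_epoch  # ramps up from 0 per step
  lr = scaled_lr
  for multiplier, start_epoch in LR_SCHEDULE:
    if epoch >= start_epoch:
      lr = scaled_lr * multiplier
  return lr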
......@@ -81,7 +75,7 @@ def parse_record_keras(raw_record, is_training, dtype):
return image, label
def run_imagenet_with_keras(flags_obj):
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using native Keras APIs.
Args:
......@@ -128,9 +122,9 @@ def run_imagenet_with_keras(flags_obj):
optimizer = keras_common.get_optimizer()
strategy = distribution_utils.get_distribution_strategy(
flags_obj.num_gpus, flags_obj.use_one_device_strategy)
flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
model = resnet50.ResNet50(num_classes=imagenet_main.NUM_CLASSES)
model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)
model.compile(loss='sparse_categorical_crossentropy',
optimizer=optimizer,
......@@ -140,10 +134,6 @@ def run_imagenet_with_keras(flags_obj):
time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
steps_per_epoch = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
flags_obj.batch_size)
train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
train_epochs = flags_obj.train_epochs
......@@ -151,6 +141,14 @@ def run_imagenet_with_keras(flags_obj):
train_steps = min(flags_obj.train_steps, train_steps)
train_epochs = 1
num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
num_eval_steps = None
validation_data = None
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
......@@ -160,7 +158,7 @@ def run_imagenet_with_keras(flags_obj):
tensorboard_callback
],
validation_steps=num_eval_steps,
validation_data=eval_input_dataset,
validation_data=validation_data,
verbose=1)
if not flags_obj.skip_eval:
......@@ -168,14 +166,10 @@ def run_imagenet_with_keras(flags_obj):
steps=num_eval_steps,
verbose=1)
stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
return stats
def main(_):
with logger.benchmark_context(flags.FLAGS):
run_imagenet_with_keras(flags.FLAGS)
run(flags.FLAGS)
if __name__ == '__main__':
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet50 model for Keras adapted from tf.keras.applications.ResNet50.
"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
# Reference:
- [Deep Residual Learning for Image Recognition](
......@@ -200,7 +200,7 @@ def conv_building_block(input_tensor,
return x
def ResNet56(input_shape=None, classes=1000):
def resnet56(input_shape=None, classes=1000):
"""Instantiates the ResNet56 architecture.
Arguments:
......
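A hedged usage sketch of the renamed constructor; the module path and call signature follow the import and call sites earlier in this diff, while the optimizer choice here is a placeholder:

from official.resnet.keras import resnet_cifar_model

model = resnet_cifar_model.resnet56(input_shape=(32, 32, 3), classes=10)
model.compile(loss='categorical_crossentropy', optimizer='sgd',
              metrics=['categorical_accuracy'])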
......@@ -15,6 +15,7 @@
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
......@@ -179,7 +180,7 @@ def conv_block(input_tensor,
return x
def ResNet50(num_classes):
def resnet50(num_classes):
"""Instantiates the ResNet50 architecture.
Args:
......
......@@ -629,12 +629,9 @@ def define_resnet_flags(resnet_size_choices=None):
'inference. Note, this flag only applies to ImageNet and cannot '
'be used for CIFAR.'))
flags.DEFINE_boolean(
name='use_one_device_strategy', default=True,
help=flags_core.help_wrap('Set to False to not use distribution '
name='turn_off_distribution_strategy', default=False,
help=flags_core.help_wrap('Set to True to not use distribution '
'strategies.'))
flags.DEFINE_boolean(name='use_tf_momentum_optimizer', default=False,
help='Use tf MomentumOptimizer.')
choice_kwargs = dict(
name='resnet_size', short_name='rs', default='50',
help=flags_core.help_wrap('The size of the ResNet model to use.'))
......
......@@ -22,7 +22,7 @@ import tensorflow as tf
def get_distribution_strategy(
num_gpus, all_reduce_alg=None, use_one_device_strategy=True):
num_gpus, all_reduce_alg=None, turn_off_distribution_strategy=False):
"""Return a DistributionStrategy for running the model.
Args:
......@@ -31,25 +31,30 @@ def get_distribution_strategy(
See tf.contrib.distribute.AllReduceCrossDeviceOps for available
algorithms. If None, DistributionStrategy will choose based on device
topology.
use_one_device_strategy: Should only be set to Truen when num_gpus is 1.
If True, then use OneDeviceStrategy; otherwise, do not use any
distribution strategy.
turn_off_distribution_strategy: When set to True, do not use any
distribution strategy. Note that when this is True and num_gpus is
larger than 1, a ValueError is raised.
Returns:
tf.contrib.distribute.DistributionStrategy object.
Raises:
ValueError: if turn_off_distribution_strategy is True and num_gpus is
larger than 1.
"""
if num_gpus == 0 and use_one_device_strategy:
return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
elif num_gpus == 0:
if num_gpus == 0:
if turn_off_distribution_strategy:
return None
elif num_gpus == 1 and use_one_device_strategy:
return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
else:
return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
elif num_gpus == 1:
if turn_off_distribution_strategy:
return None
elif use_one_device_strategy:
raise ValueError("When %d GPUs are specified, use_one_device_strategy"
else:
return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
elif turn_off_distribution_strategy:
raise ValueError("When %d GPUs are specified, turn_off_distribution_strategy"
" flag cannot be set to True.".format(num_gpus))
else: # num_gpus > 1 and not use_one_device_strategy
else: # num_gpus > 1 and not turn_off_distribution_strategy
if all_reduce_alg:
return tf.contrib.distribute.MirroredStrategy(
num_gpus=num_gpus,
......
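To summarize the branching above in one place, an illustrative table of what get_distribution_strategy is expected to return, followed by a usage sketch that matches the signature shown in this diff:

#   num_gpus   turn_off_distribution_strategy   result
#   --------   ------------------------------   ----------------------------------
#   0          True                              None (no strategy)
#   0          False                             OneDeviceStrategy("device:CPU:0")
#   1          True                              None (no strategy)
#   1          False                             OneDeviceStrategy("device:GPU:0")
#   >1         True                              ValueError
#   >1         False                             MirroredStrategy(num_gpus=num_gpus)

from official.utils.misc import distribution_utils

strategy = distribution_utils.get_distribution_strategy(
    num_gpus=1, turn_off_distribution_strategy=False)  # OneDeviceStrategy on GPU:0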