Commit 6f881f77 authored by Shining Sun

bug fixes and clean ups

parent b1b4c805
@@ -39,7 +39,7 @@ NUM_CLASSES = 10
 _NUM_DATA_FILES = 5
 # TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
-_NUM_IMAGES = {
+NUM_IMAGES = {
     'train': 50000,
     'validation': 10000,
 }
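Note: dropping the leading underscore makes `NUM_IMAGES` part of the module's public surface, which the Keras runners below depend on via `cifar_main.NUM_IMAGES`.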
@@ -134,7 +134,7 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
       dataset=dataset,
       is_training=is_training,
       batch_size=batch_size,
-      shuffle_buffer=_NUM_IMAGES['train'],
+      shuffle_buffer=NUM_IMAGES['train'],
       parse_record_fn=parse_record_fn,
       num_epochs=num_epochs,
       dtype=dtype,
@@ -200,7 +200,7 @@ def cifar10_model_fn(features, labels, mode, params):
   # Learning rate schedule follows arXiv:1512.03385 for ResNet-56 and under.
   learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
       batch_size=params['batch_size'], batch_denom=128,
-      num_images=_NUM_IMAGES['train'], boundary_epochs=[91, 136, 182],
+      num_images=NUM_IMAGES['train'], boundary_epochs=[91, 136, 182],
       decay_rates=[1, 0.1, 0.01, 0.001])
   # Weight decay of 2e-4 diverges from 1e-4 decay used in the ResNet paper
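For context on the hunk above: `learning_rate_with_decay` builds a piecewise-constant schedule in which the base rate (scaled by `batch_size / batch_denom`) is multiplied by successive `decay_rates` entries as training crosses each `boundary_epochs` entry. A minimal illustrative sketch of that behavior (`piecewise_lr` is a made-up name; the real helper computes this per global step, not per epoch):

def piecewise_lr(epoch, base_lr=0.1, batch_size=128, batch_denom=128,
                 boundary_epochs=(91, 136, 182),
                 decay_rates=(1, 0.1, 0.01, 0.001)):
  """Return the learning rate for a given epoch under the schedule above."""
  initial_lr = base_lr * batch_size / batch_denom
  rate = decay_rates[0]
  # Each boundary crossed switches to the next decay rate.
  for boundary, decay in zip(boundary_epochs, decay_rates[1:]):
    if epoch >= boundary:
      rate = decay
  return initial_lr * rate

print(piecewise_lr(50))   # 0.1     (before epoch 91)
print(piecewise_lr(100))  # ~0.01   (epochs 91-135)
print(piecewise_lr(200))  # ~0.0001 (epoch 182 onwards)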
@@ -152,18 +152,24 @@ def run(flags_obj):
   model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['categorical_accuracy'],
-                strategy=strategy)
+                distribute=strategy)

   time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
       learning_rate_schedule, cifar_main.NUM_IMAGES['train'])

-  steps_per_epoch = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
+  train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
+  train_epochs = flags_obj.train_epochs
+
+  if flags_obj.train_steps:
+    train_steps = min(flags_obj.train_steps, train_steps)
+    train_epochs = 1
+
   num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                     flags_obj.batch_size)

   history = model.fit(train_input_dataset,
-                      epochs=flags_obj.train_epochs,
-                      steps_per_epoch=steps_per_epoch,
+                      epochs=train_epochs,
+                      steps_per_epoch=train_steps,
                       callbacks=[
                           time_callback,
                           lr_callback,
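The renaming from `steps_per_epoch` to `train_steps` goes with the new `--train_steps` flag (defined in `keras_common` further down): when set, it caps the step count and collapses training to a single epoch, which is handy for quick benchmarking. A sketch with made-up numbers of how the two flags interact:

NUM_TRAIN_IMAGES = 50000  # CIFAR-10 training-set size
batch_size = 128
flag_train_steps = 100    # e.g. --train_steps=100
flag_train_epochs = 10    # e.g. --train_epochs=10

train_steps = NUM_TRAIN_IMAGES // batch_size  # 390 steps in a full epoch
train_epochs = flag_train_epochs

if flag_train_steps:
  train_steps = min(flag_train_steps, train_steps)  # capped at 100
  train_epochs = 1                                  # one short epoch

print(train_steps, train_epochs)  # 100 1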
@@ -190,4 +196,5 @@ def main(_):
 if __name__ == '__main__':
   tf.logging.set_verbosity(tf.logging.INFO)
   cifar_main.define_cifar_flags()
+  keras_common.define_keras_flags()
   absl_app.run(main)
@@ -56,8 +56,7 @@ class TimeHistory(tf.keras.callbacks.Callback):
   def on_batch_end(self, batch, logs=None):
     if batch % self.log_batch_size == 0:
       last_n_batches = time.time() - self.batch_time_start
-      examples_per_second =
-          (self._batch_size * self.log_batch_size) / last_n_batches
+      examples_per_second = (self._batch_size * self.log_batch_size) / last_n_batches
       self.batch_times_secs.append(last_n_batches)
       self.record_batch = True
       # TODO(anjalisridhar): add timestamp as well.
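The removed form was a hard `SyntaxError`: Python will not accept a line ending in a bare `=` without an explicit continuation. Joining the lines fixes it; if line length is a concern, an equally valid style is to open the parentheses on the assignment line:

examples_per_second = (
    self._batch_size * self.log_batch_size) / last_n_batches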
@@ -131,8 +130,14 @@ def analyze_fit_and_eval_result(history, eval_output):
   stats['training_loss'] = history.history['loss'][-1]
   stats['training_accuracy_top_1'] = history.history['categorical_accuracy'][-1]

-  print('Test loss:{}'.format(stats['']))
+  print('Test loss:{}'.format(stats['eval_loss']))
   print('top_1 accuracy:{}'.format(stats['accuracy_top_1']))
   print('top_1_training_accuracy:{}'.format(stats['training_accuracy_top_1']))
   return stats
+
+
+def define_keras_flags():
+  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
+  flags.DEFINE_integer(
+      name="train_steps", default=None,
+      help="The number of steps to run for training")
@@ -153,9 +153,16 @@ def run_imagenet_with_keras(flags_obj):
   num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
                     flags_obj.batch_size)

   train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
+  train_epochs = flags_obj.train_epochs
+
+  if flags_obj.train_steps:
+    train_steps = min(flags_obj.train_steps, train_steps)
+    train_epochs = 1
+
   history = model.fit(train_input_dataset,
-                      epochs=flags_obj.train_epochs,
-                      steps_per_epoch=steps_per_epoch,
+                      epochs=train_epochs,
+                      steps_per_epoch=train_steps,
                       callbacks=[
                           time_callback,
                           lr_callback,
@@ -182,4 +189,5 @@ def main(_):
 if __name__ == '__main__':
   tf.logging.set_verbosity(tf.logging.INFO)
   imagenet_main.define_imagenet_flags()
+  keras_common.define_keras_flags()
   absl_app.run(main)
@@ -632,7 +632,6 @@ def define_resnet_flags(resnet_size_choices=None):
       name='use_one_device_strategy', default=True,
       help=flags_core.help_wrap('Set to False to not use distribution '
                                 'strategies.'))
-  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
   flags.DEFINE_boolean(name='use_tf_momentum_optimizer', default=False,
                        help='Use tf MomentumOptimizer.')
@@ -22,7 +22,7 @@ import tensorflow as tf


 def get_distribution_strategy(
-    num_gpus, all_reduce_alg=None, use_one_device_strategy):
+    num_gpus, all_reduce_alg=None, use_one_device_strategy=True):
   """Return a DistributionStrategy for running the model.

   Args:
@@ -47,7 +47,7 @@ def get_distribution_strategy(
   elif num_gpus == 1:
     return None
   elif use_one_device_strategy:
-    rase ValueError("When %d GPUs are specified, use_one_device_strategy"
+    raise ValueError("When %d GPUs are specified, use_one_device_strategy"
                     " flag cannot be set to True.".format(num_gpus))
   else:  # num_gpus > 1 and not use_one_device_strategy
     if all_reduce_alg:
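Two clean-ups worth noting here: the old signature placed the non-default `use_one_device_strategy` after the defaulted `all_reduce_alg`, which is itself a `SyntaxError`, and `rase` becomes `raise`. One bug survives the fix, though: the message mixes a `%d` placeholder with `str.format`, so `num_gpus` is never interpolated. A corrected form would look like this (a suggestion, not part of the commit):

raise ValueError(
    'When {} GPUs are specified, the use_one_device_strategy '
    'flag cannot be set to True.'.format(num_gpus))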