Unverified Commit cf3c2407 authored by Haoyu Zhang, committed by GitHub

Improve performance of Keras ResNet models when not using distribution strategy (#7055)

* Do not set learning phase when skipping eval

* Do not set learning phase in no dist strat case

* Added device placement, tweaked benchmarks

* Added tweaked benchmarks for Cifar

* Fix device scope

* Fix lint

* Add explicit GPU placement flag

* Also run accuracy test with explicit GPU placement

* Added doc string
parent e0e6d981
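
In short, the change introduces two opt-in tweaks for the single-GPU, no-distribution-strategy path: skip the global Keras learning-phase override when evaluation is skipped, and pin the Keras training loop to the GPU with an explicit device scope. A minimal standalone sketch of the placement tweak (not the PR's code; it assumes TF 2.x eager execution, at least one visible GPU, and uses a throwaway model and data):

import tensorflow as tf

# Sketch only: without a distribution strategy, Keras does not open a device
# scope on its own, so the training loop is pinned to GPU:0 explicitly. This
# is the behavior the new explicit_gpu_placement flag enables in the diff below.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='sgd', loss='mse')
x, y = tf.ones((32, 4)), tf.ones((32, 1))

with tf.device('/device:GPU:0'):
    model.fit(x, y, epochs=1, verbose=0)
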
@@ -83,6 +83,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
    """Test keras based model with eager and no dist strat."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.explicit_gpu_placement = True
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 128
    FLAGS.train_epochs = 182
@@ -189,6 +190,19 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_tweaked(self):
    """Test no distribution strategy with manual config."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.enable_eager = True
    FLAGS.explicit_gpu_placement = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.set_learning_phase_to_train = False
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_1_gpu_no_dist_strat_tweaked')
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_graph_1_gpu_no_dist_strat(self):
    self._setup()
    FLAGS.num_gpus = 1
...
...@@ -168,10 +168,19 @@ def run(flags_obj): ...@@ -168,10 +168,19 @@ def run(flags_obj):
validation_data = eval_input_dataset validation_data = eval_input_dataset
if flags_obj.skip_eval: if flags_obj.skip_eval:
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1) tf.keras.backend.set_learning_phase(1)
num_eval_steps = None num_eval_steps = None
validation_data = None validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
# TODO(b/135607227): Add device scope automatically in Keras training loop
# when not using distribition strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset, history = model.fit(train_input_dataset,
epochs=train_epochs, epochs=train_epochs,
steps_per_epoch=train_steps, steps_per_epoch=train_steps,
...@@ -185,6 +194,10 @@ def run(flags_obj): ...@@ -185,6 +194,10 @@ def run(flags_obj):
eval_output = model.evaluate(eval_input_dataset, eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps, steps=num_eval_steps,
verbose=2) verbose=2)
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = keras_common.build_stats(history, eval_output, callbacks) stats = keras_common.build_stats(history, eval_output, callbacks)
return stats return stats
......
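
The device scope above is entered and exited manually, presumably to avoid reindenting the stretch of run() between fit() and evaluate(). A hypothetical equivalent using contextlib.ExitStack keeps the scope open across both calls without invoking dunder methods directly (throwaway model and data; a local boolean stands in for flags_obj.explicit_gpu_placement, and a visible GPU is assumed):

import contextlib

import tensorflow as tf

# Sketch only: ExitStack conditionally holds the device scope open for the
# duration of the with block, covering both fit() and evaluate().
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='sgd', loss='mse')
x, y = tf.ones((32, 4)), tf.ones((32, 1))

explicit_gpu_placement = True  # stand-in for flags_obj.explicit_gpu_placement
with contextlib.ExitStack() as stack:
    if explicit_gpu_placement:
        stack.enter_context(tf.device('/device:GPU:0'))
    history = model.fit(x, y, epochs=1, verbose=0)
    eval_output = model.evaluate(x, y, verbose=0)

Note that the diff calls __exit__() with no arguments; this appears to rely on TF's eager device context accepting a variadic signature, whereas the standard context-manager protocol passes three arguments (exception type, value, traceback).
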
@@ -256,6 +256,15 @@ def define_keras_flags():
      name='run_eagerly', default=False,
      help='Run the model op by op without building a model function.')
  flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
  # TODO(b/135607288): Remove this flag once we understand the root cause of
  # the slowdown when setting the learning phase in the Keras backend.
  flags.DEFINE_boolean(
      name='set_learning_phase_to_train', default=True,
      help='If skipping eval, also set the Keras learning phase to 1 '
           '(training).')
  flags.DEFINE_boolean(
      name='explicit_gpu_placement', default=False,
      help='If not using a distribution strategy, explicitly set the device '
           'scope for the Keras training loop.')
  flags.DEFINE_boolean(name='use_trivial_model', default=False,
                       help='Whether to use a trivial Keras model.')
  flags.DEFINE_boolean(name='report_accuracy_metrics', default=True,
...
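
With these flags in place, the tweaked benchmark configuration can also be reproduced from the command line. For example (the script name keras_cifar_main.py is illustrative; substitute whichever Keras ResNet entry point defines these flags):

python keras_cifar_main.py \
    --num_gpus=1 \
    --enable_eager=true \
    --distribution_strategy=off \
    --set_learning_phase_to_train=false \
    --explicit_gpu_placement=true \
    --batch_size=128
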
@@ -234,6 +234,20 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_tweaked(self):
    """Test with 1 GPU, no distribution strategy, and manual tuning."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.explicit_gpu_placement = True
    FLAGS.enable_eager = True
    FLAGS.distribution_strategy = 'off'
    FLAGS.set_learning_phase_to_train = False
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_1_gpu_no_dist_strat_tweaked')
    FLAGS.batch_size = 128
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
    self._setup()
...
...@@ -226,10 +226,19 @@ def run(flags_obj): ...@@ -226,10 +226,19 @@ def run(flags_obj):
# Only build the training graph. This reduces memory usage introduced by # Only build the training graph. This reduces memory usage introduced by
# control flow ops in layers that have different implementations for # control flow ops in layers that have different implementations for
# training and inference (e.g., batch norm). # training and inference (e.g., batch norm).
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1) tf.keras.backend.set_learning_phase(1)
num_eval_steps = None num_eval_steps = None
validation_data = None validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
# TODO(b/135607227): Add device scope automatically in Keras training loop
# when not using distribition strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset, history = model.fit(train_input_dataset,
epochs=train_epochs, epochs=train_epochs,
steps_per_epoch=train_steps, steps_per_epoch=train_steps,
...@@ -244,6 +253,10 @@ def run(flags_obj): ...@@ -244,6 +253,10 @@ def run(flags_obj):
eval_output = model.evaluate(eval_input_dataset, eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps, steps=num_eval_steps,
verbose=2) verbose=2)
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = keras_common.build_stats(history, eval_output, callbacks) stats = keras_common.build_stats(history, eval_output, callbacks)
return stats return stats
......
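
For reference, the call gated by set_learning_phase_to_train has global effect despite being a one-liner. A minimal sketch of what pinning the phase does (Keras backend API of the TF 1.x/early-2.x era):

import tensorflow as tf

# Pinning the global learning phase to 1 (training) makes layers with distinct
# train/inference behavior, such as BatchNormalization and Dropout, always
# take their training branch, so only the training graph is built.
tf.keras.backend.set_learning_phase(1)
assert tf.keras.backend.learning_phase() == 1

As the TODOs above note, this unexpectedly slows down the no-distribution-strategy path, which is why the flag defaults to True but the tweaked benchmarks turn it off.
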