Unverified commit 9d8c9aa4, authored by Toby Boyd, committed by GitHub

Single execution path tests for ResNet50, ResNet56, NCF, and Shakespeare LSTM. (#7276)

* Add force_run_distributed tests.

* Added enable_eager

* Rename force_run_distributed to force_v2_in_keras_compile.

* Adding force_v2 tests and FLAGs.

* Rename method to avoid conflict.

* Add cpu force_v2 tests.

* fix lint, wrap line.

* change to force_v2_in_keras_compile

* Update method name.

* Lower mlperf target to 0.736.
parent 8390b362
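In outline, the change defines one new boolean flag and forwards it to each model's Keras `compile` call. A minimal, hypothetical sketch of that wiring (the model is a placeholder; `run_distributed` was an experimental `compile` argument in the TF 2.0-era builds this commit targets, later renamed and then removed):

# Hypothetical, trimmed wiring; the real flag lives in
# official/utils/flags/_performance.py and the real compile calls are in the
# per-model mains changed below.
from absl import app
from absl import flags
import tensorflow as tf

flags.DEFINE_boolean(
    name='force_v2_in_keras_compile', default=False,
    help='Force the single (v2) execution path in Keras compile even when '
         'no tf.distribute strategy is in use.')

FLAGS = flags.FLAGS


def main(_):
  # Placeholder model; the real benchmarks build ResNet50/56, NCF, or an LSTM.
  model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
  # run_distributed was an experimental compile() kwarg in TF 2.0-era builds;
  # current TF releases no longer accept it.
  model.compile(optimizer='sgd', loss='mse',
                run_distributed=FLAGS.force_v2_in_keras_compile)


if __name__ == '__main__':
  app.run(main)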
@@ -163,7 +163,8 @@ def define_ncf_flags():
       max_train_steps=False,
       dtype=False,
       all_reduce_alg=False,
-      enable_xla=True
+      enable_xla=True,
+      force_v2_in_keras_compile=True
   )
   flags_core.define_device(tpu=True)
   flags_core.define_benchmark()
@@ -122,12 +122,25 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.early_stopping = True
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_force_v2_early_stop(self):
+    self._setup()
+    FLAGS.early_stopping = True
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_no_dist_strat_early_stop(self):
     self._setup()
     FLAGS.distribution_strategy = 'off'
     FLAGS.early_stopping = True
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_dist_strat_force_v2_early_stop(self):
+    self._setup()
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.early_stopping = True
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
     self._setup()
     FLAGS.distribution_strategy = 'off'
@@ -141,6 +154,13 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
+  def benchmark_xla_1_gpu_force_v2_early_stop(self):
+    self._setup()
+    FLAGS.early_stopping = True
+    FLAGS.enable_xla = True
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_ctl_early_stop(self):
     self._setup()
     FLAGS.keras_use_ctl = True
@@ -184,6 +204,14 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_dist_strat_force_v2_mlperf_like(self):
+    """1 GPU using compile/fit without dist_strat."""
+    self._setup()
+    FLAGS.train_epochs = 7
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
     """1 GPU using compile/fit without dist_strat."""
     self._setup()
@@ -413,7 +413,8 @@ def run_ncf(_):
   with distribution_utils.get_strategy_scope(strategy):
     keras_model.compile(optimizer=optimizer,
-                        run_eagerly=FLAGS.run_eagerly)
+                        run_eagerly=FLAGS.run_eagerly,
+                        run_distributed=FLAGS.force_v2_in_keras_compile)
 
     history = keras_model.fit(train_input_dataset,
                               epochs=FLAGS.train_epochs,
@@ -25,7 +25,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.resnet.keras import keras_benchmark
 from official.resnet.keras import keras_cifar_main
 
-MIN_TOP_1_ACCURACY = 0.925
+MIN_TOP_1_ACCURACY = 0.929
 MAX_TOP_1_ACCURACY = 0.938
 
 FLAGS = flags.FLAGS
@@ -75,6 +75,19 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.enable_eager = True
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_force_v2(self):
+    """Test keras based model with eager, dist_strat, and force_v2 path."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128
+    FLAGS.train_epochs = 182
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_force_v2')
+    FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_cpu(self):
     """Test keras based model on CPU."""
     self._setup()
@@ -102,6 +115,22 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.data_format = 'channels_last'
     self._run_and_report_benchmark()
 
+  def benchmark_cpu_no_dist_strat_force_v2(self):
+    """Keras on CPU without dist_strat but with force_v2 in keras compile."""
+    self._setup()
+    FLAGS.num_gpus = 0
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128
+    FLAGS.train_epochs = 182
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_cpu_no_dist_strat_force_v2')
+    FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.data_format = 'channels_last'
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_cpu_no_dist_strat_run_eagerly(self):
     """Test keras based model on CPU w/forced eager and no dist_strat."""
     self._setup()
@@ -147,38 +176,69 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.distribution_strategy = 'off'
     self._run_and_report_benchmark()
 
-  def benchmark_2_gpu(self):
-    """Test keras based model with eager and distribution strategies."""
+  def benchmark_graph_1_gpu_no_dist_strat(self):
+    """Test keras based model with Keras fit but no distribution strategies."""
     self._setup()
-    FLAGS.num_gpus = 2
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.num_gpus = 1
     FLAGS.data_dir = self.data_dir
     FLAGS.batch_size = 128
     FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
     FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_dist_strat_force_v2(self):
+    """No dist strat but forced v2 execution path."""
+    self._setup()
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.num_gpus = 1
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128
+    FLAGS.train_epochs = 182
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_force_v2')
+    FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
+  def benchmark_1_gpu_force_v2_run_eagerly(self):
+    """No dist strat but forced v2 path in keras compile and forced eager."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128
+    FLAGS.train_epochs = 182
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_force_v2_run_eagerly')
+    FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
-  def benchmark_graph_2_gpu(self):
-    """Test keras based model with Keras fit and distribution strategies."""
+  def benchmark_2_gpu(self):
+    """Test keras based model with eager and distribution strategies."""
     self._setup()
     FLAGS.num_gpus = 2
     FLAGS.data_dir = self.data_dir
     FLAGS.batch_size = 128
     FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
+    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
     FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
     self._run_and_report_benchmark()
 
-  def benchmark_graph_1_gpu_no_dist_strat(self):
-    """Test keras based model with Keras fit but no distribution strategies."""
+  def benchmark_graph_2_gpu(self):
+    """Test keras based model with Keras fit and distribution strategies."""
     self._setup()
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.num_gpus = 1
+    FLAGS.num_gpus = 2
     FLAGS.data_dir = self.data_dir
     FLAGS.batch_size = 128
     FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
     FLAGS.dtype = 'fp32'
     self._run_and_report_benchmark()
@@ -228,6 +288,17 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_force_v2(self):
+    """Test 1 gpu using the forced v2 execution path."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_force_v2')
+    FLAGS.batch_size = 128
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_graph_1_gpu(self):
     """Test 1 gpu graph."""
     self._setup()
@@ -271,6 +342,33 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.distribution_strategy = 'off'
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_dist_strat_force_v2(self):
+    """No dist strat but forced v2 execution path."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = 128
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_force_v2')
+    FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
+  def benchmark_1_gpu_no_dist_strat_force_v2_run_eagerly(self):
+    """Forced v2 execution path and forced eager."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = 128
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_force_v2_run_eagerly')
+    FLAGS.dtype = 'fp32'
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_2_gpu(self):
     """Test 2 gpu."""
     self._setup()
@@ -335,6 +433,19 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.data_format = 'channels_last'
     self._run_and_report_benchmark()
 
+  def benchmark_cpu_no_dist_strat_force_v2(self):
+    """Test cpu without dist strat and with force_v2 in model.compile."""
+    self._setup()
+    FLAGS.num_gpus = 0
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_cpu_no_dist_strat_force_v2')
+    FLAGS.batch_size = 128
+    FLAGS.data_format = 'channels_last'
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_graph_cpu_no_dist_strat(self):
     """Test cpu graph mode without distribution strategies."""
     self._setup()
@@ -181,7 +181,8 @@ def run(flags_obj):
       optimizer=optimizer,
       metrics=(['categorical_accuracy']
                if flags_obj.report_accuracy_metrics else None),
-      run_eagerly=flags_obj.run_eagerly)
+      run_eagerly=flags_obj.run_eagerly,
+      run_distributed=flags_obj.force_v2_in_keras_compile)
 
   callbacks = keras_common.get_callbacks(
       learning_rate_schedule, cifar_main.NUM_IMAGES['train'])
@@ -258,7 +258,8 @@ def define_keras_flags(dynamic_loss_scale=True):
       dynamic_loss_scale=dynamic_loss_scale,
       loss_scale=True,
       tf_data_experimental_slack=True,
-      enable_xla=True)
+      enable_xla=True,
+      force_v2_in_keras_compile=True)
 
   flags_core.define_image()
   flags_core.define_benchmark()
   flags.adopt_module_key_flags(flags_core)
@@ -136,7 +136,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.enable_xla = True
     FLAGS.use_tensor_lr = True
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    self._run_and_report_benchmark()
+    self._run_and_report_benchmark(top_1_min=0.736)
 
   def benchmark_8_gpu_mlperf_like(self):
     """Test similar to the rules for MLPerf 0.5.

@@ -160,7 +160,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.dtype = 'fp16'
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
+    self._run_and_report_benchmark(top_1_min=0.736)
 
   def benchmark_xla_8_gpu_fp16_dynamic(self):
     """Test Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""

@@ -178,9 +178,11 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     # Thread tuning to improve performance.
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
     FLAGS.use_tensor_lr = True
-    self._run_and_report_benchmark()
+    self._run_and_report_benchmark(top_1_min=0.736)
 
-  def _run_and_report_benchmark(self):
+  def _run_and_report_benchmark(self,
+                                top_1_min=MIN_TOP_1_ACCURACY,
+                                top_1_max=MAX_TOP_1_ACCURACY):
     start_time_sec = time.time()
     stats = keras_imagenet_main.run(flags.FLAGS)
     wall_time_sec = time.time() - start_time_sec

@@ -188,8 +190,8 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     super(Resnet50KerasAccuracy, self)._report_benchmark(
         stats,
         wall_time_sec,
-        top_1_min=MIN_TOP_1_ACCURACY,
-        top_1_max=MAX_TOP_1_ACCURACY,
+        top_1_min=top_1_min,
+        top_1_max=top_1_max,
         total_batch_size=FLAGS.batch_size,
         log_steps=100)
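Seen together, these hunks parameterize the accuracy bounds so the MLPerf-like benchmarks can lower only the top-1 floor to 0.736 while every other caller keeps the module-level constants. A runnable toy sketch of that default-argument pattern (all numbers here are placeholders, not the repo's real constants):

# Placeholder bounds; the real values live at module scope in
# keras_imagenet_benchmark.py.
MIN_TOP_1_ACCURACY = 0.760
MAX_TOP_1_ACCURACY = 0.770


def run_and_report_benchmark(top_1_min=MIN_TOP_1_ACCURACY,
                             top_1_max=MAX_TOP_1_ACCURACY):
  measured = 0.741  # stand-in for stats returned by keras_imagenet_main.run()
  ok = top_1_min <= measured <= top_1_max
  print('top_1=%.3f in [%.3f, %.3f]: %s' % (measured, top_1_min, top_1_max, ok))


run_and_report_benchmark()                 # strict default bounds: fails
run_and_report_benchmark(top_1_min=0.736)  # lowered MLPerf-like floor: passes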
@@ -261,6 +263,33 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 64
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_force_dist_strat_run_eagerly(self):
+    """No dist strat but forced dist_strat path in keras compile and eager."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_force_dist_strat_run_eagerly')
+    FLAGS.batch_size = 64
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
+  def benchmark_1_gpu_force_dist_strat(self):
+    """No dist strat but forced dist_strat path in keras compile."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_force_dist_strat')
+    FLAGS.batch_size = 128
+    FLAGS.force_v2_in_keras_compile = True
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
     """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
     self._setup()
@@ -205,7 +205,8 @@ def run(flags_obj):
       optimizer=optimizer,
       metrics=(['sparse_categorical_accuracy']
                if flags_obj.report_accuracy_metrics else None),
-      run_eagerly=flags_obj.run_eagerly)
+      run_eagerly=flags_obj.run_eagerly,
+      run_distributed=flags_obj.force_v2_in_keras_compile)
 
   callbacks = keras_common.get_callbacks(
       learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
@@ -148,7 +148,7 @@ class ShakespeareAccuracy(ShakespeareBenchmarkBase):
     self._run_and_report_benchmark()
 
   def benchmark_1_gpu_no_ds_run_eagerly(self):
-    """Benchmark 1 gpu."""
+    """Benchmark 1 gpu without distribution strategies."""
     self._setup()
     FLAGS.num_gpus = 1
     FLAGS.training_data = self.train_data
@@ -160,6 +160,19 @@ class ShakespeareAccuracy(ShakespeareBenchmarkBase):
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_ds_force_v2(self):
+    """Benchmark 1 gpu, no dist strat, with force_v2 in keras compile."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.training_data = self.train_data
+    FLAGS.batch_size = 64
+    FLAGS.train_epochs = 43
+    FLAGS.model_dir = ''
+    FLAGS.force_v2_in_keras_compile = True
+    FLAGS.distribution_strategy = 'off'
+    self._run_and_report_benchmark()
+
   def benchmark_xla_1_gpu(self):
     """Benchmark 1 gpu w/xla."""
     self._setup()
@@ -55,7 +55,8 @@ def define_flags():
       synthetic_data=False,
       max_train_steps=False,
       dtype=False,
-      enable_xla=True)
+      enable_xla=True,
+      force_v2_in_keras_compile=True)
 
   flags_core.set_defaults(train_epochs=43,
                           batch_size=64)

@@ -166,7 +167,8 @@ def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
       metrics=[
           tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
           tf.keras.metrics.Recall(top_k=5, name='RecallAt5')],
-      run_eagerly=flags_obj.run_eagerly)
+      run_eagerly=flags_obj.run_eagerly,
+      run_distributed=flags_obj.force_v2_in_keras_compile)
 
   callbacks = []
   if checkpoint_dir:
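One note on the metrics compiled in above: with `top_k` set, `tf.keras.metrics.Recall` counts a true label as recovered only when that class ranks among the k highest-scoring classes for the example. A tiny self-contained check (toy values, not from the benchmark):

import tensorflow as tf

# RecallAt1: a true label counts as a hit only if it is the top-ranked class.
m = tf.keras.metrics.Recall(top_k=1, name='RecallAt1')
m.update_state([[0, 0, 1], [0, 1, 0]],              # one-hot labels
               [[0.1, 0.2, 0.7], [0.5, 0.3, 0.2]])  # predicted scores
print(m.result().numpy())  # 0.5: only one of the two true labels ranks first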
@@ -71,7 +71,8 @@ def define_transformer_flags():
       dtype=True,
       loss_scale=True,
       all_reduce_alg=True,
-      enable_xla=True
+      enable_xla=True,
+      force_v2_in_keras_compile=True
   )
 
   # Additional performance flags
@@ -61,7 +61,8 @@ def define_performance(num_parallel_calls=True, inter_op=True, intra_op=True,
                        datasets_num_parallel_batches=False,
                        dynamic_loss_scale=False, fp16_implementation=False,
                        loss_scale=False,
-                       tf_data_experimental_slack=False, enable_xla=False):
+                       tf_data_experimental_slack=False, enable_xla=False,
+                       force_v2_in_keras_compile=False):
   """Register flags for specifying performance tuning arguments.
 
   Args:
@@ -87,6 +88,9 @@ def define_performance(num_parallel_calls=True, inter_op=True, intra_op=True,
     tf_data_experimental_slack: Determines whether to enable tf.data's
       `experimental_slack` option.
     enable_xla: Determines if XLA (auto clustering) is turned on.
+    force_v2_in_keras_compile: Forces the use of the run_distributed path
+      even if not using a `strategy`. This is not the same as
+      `tf.distribute.OneDeviceStrategy`.
 
   Returns:
     A list of flags for core.py to mark as key flags.
@@ -276,4 +280,11 @@ def define_performance(num_parallel_calls=True, inter_op=True, intra_op=True,
         name="enable_xla", default=False,
         help="Whether to enable XLA auto jit compilation")
 
+  if force_v2_in_keras_compile:
+    flags.DEFINE_boolean(
+        name="force_v2_in_keras_compile", default=False,
+        help="Forces the use of the run_distributed path even if not "
+             "using a `strategy`. This is not the same as "
+             "`tf.distribute.OneDeviceStrategy`.")
+
   return key_flags
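For readers unfamiliar with the repo's flag helpers, a trimmed, hypothetical stand-in for the opt-in registration pattern used above (`define_performance` returns a `key_flags` list for core.py; only models that pass `force_v2_in_keras_compile=True` ever register the flag):

# Minimal stand-in for the opt-in flag registration pattern; runnable with
# absl-py installed. The real helper registers many more flags.
from absl import flags


def define_performance(enable_xla=False, force_v2_in_keras_compile=False):
  """Registers only the performance flags a given model opts into."""
  key_flags = []

  if enable_xla:
    flags.DEFINE_boolean(
        name='enable_xla', default=False,
        help='Whether to enable XLA auto jit compilation.')
    key_flags.append('enable_xla')

  if force_v2_in_keras_compile:
    flags.DEFINE_boolean(
        name='force_v2_in_keras_compile', default=False,
        help='Forces the use of the run_distributed path even if not '
             'using a `strategy`. This is not the same as '
             '`tf.distribute.OneDeviceStrategy`.')
    key_flags.append('force_v2_in_keras_compile')

  return key_flags


# Each model's define_flags() opts in, so unrelated binaries never see the flag.
define_performance(enable_xla=True, force_v2_in_keras_compile=True)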