Unverified commit acc6f6d7, authored by Toby Boyd, committed by GitHub

Combined imagenet and cifar-10 estimator tests (#6672)

* Combined imagenet and cifar-10 benchmarks

* Comments and epochs_between_evals.

* Added tuned tests and cleaned up benchmark flags

* Fix names.

* Return results and add images/sec hook.

* Updated doc strings for return values.

* Changed batch size from 128 to 256 for the FP16 test.

* Added more doc strings to fix lint.
parent 67c403fc
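For context, the combined benchmark classes below are ordinary tf.test.Benchmark subclasses, so a harness such as PerfZero can construct one and invoke any benchmark_* method directly. A minimal sketch, assuming the module is importable as official.resnet.estimator_benchmark and that the paths shown exist (both assumptions for illustration, not part of this commit):

from official.resnet.estimator_benchmark import Resnet56EstimatorAccuracy

# Hypothetical paths; root_data_dir must contain cifar-10-batches-bin/.
benchmark = Resnet56EstimatorAccuracy(output_dir='/tmp/benchmark_logs',
                                      root_data_dir='/data')
benchmark.benchmark_graph_1_gpu()  # trains, evaluates, and reports metrics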
@@ -13,7 +13,6 @@
# limitations under the License.
# ==============================================================================
"""Executes Estimator benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -26,14 +25,271 @@ from absl.testing import flagsaver
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet import cifar10_main as cifar_main
from official.resnet import imagenet_main
from official.utils.logs import hooks
IMAGENET_DATA_DIR_NAME = 'imagenet'
CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
FLAGS = flags.FLAGS
class EstimatorBenchmark(tf.test.Benchmark):
"""Base class to hold methods common to test classes in the module.
Code under test for Estimator models (ResNet50 and 56) reports mostly the
same data and requires the same FLAG setup.
"""
local_flags = None
def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
if not output_dir:
output_dir = '/tmp'
self.output_dir = output_dir
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or []
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def _setup(self):
"""Sets up and resets flags before each test."""
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
if EstimatorBenchmark.local_flags is None:
for flag_method in self.flag_methods:
flag_method()
# Loads flags to get defaults to then override. List cannot be empty.
flags.FLAGS(['foo'])
# Overrides flag values with defaults for the class of tests.
for k, v in self.default_flags.items():
setattr(FLAGS, k, v)
saved_flag_values = flagsaver.save_flag_values()
EstimatorBenchmark.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(EstimatorBenchmark.local_flags)
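The branch above is the standard absl flag-isolation pattern: on first use, define the flags, parse a dummy argv so defaults are loaded, apply the class-wide overrides, and snapshot the result; every later test restores that snapshot so flag mutations cannot leak between tests. A stripped-down sketch of the same pattern with a hypothetical flag:

from absl import flags
from absl.testing import flagsaver

flags.DEFINE_integer('batch_size', 32, 'Hypothetical flag for illustration.')

flags.FLAGS(['prog'])                    # parse once so defaults are loaded
flags.FLAGS.batch_size = 128             # class-level override
snapshot = flagsaver.save_flag_values()  # freeze the overridden state

flags.FLAGS.batch_size = 1024            # a test mutates the flag...
flagsaver.restore_flag_values(snapshot)  # ...and restoring undoes it
assert flags.FLAGS.batch_size == 128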
def _report_benchmark(self,
stats,
wall_time_sec,
top_1_max=None,
top_1_min=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from estimator models with known entries.
wall_time_sec: the duration of the benchmark execution, in seconds.
top_1_max: highest passing level for top_1 accuracy.
top_1_min: lowest passing level for top_1 accuracy.
"""
examples_per_sec_hook = None
for hook in stats['train_hooks']:
if isinstance(hook, hooks.ExamplesPerSecondHook):
examples_per_sec_hook = hook
break
eval_results = stats['eval_results']
metrics = []
if 'accuracy' in eval_results:
metrics.append({'name': 'accuracy_top_1',
'value': eval_results['accuracy'].item(),
'min_value': top_1_min,
'max_value': top_1_max})
if 'accuracy_top_5' in eval_results:
metrics.append({'name': 'accuracy_top_5',
'value': eval_results['accuracy_top_5'].item()})
if examples_per_sec_hook:
exp_per_second_list = examples_per_sec_hook.current_examples_per_sec_list
# ExamplesPerSecondHook skips the first 10 steps.
exp_per_sec = sum(exp_per_second_list) / len(exp_per_second_list)
metrics.append({'name': 'exp_per_second',
'value': exp_per_sec})
self.report_benchmark(
iters=eval_results['global_step'],
wall_time=wall_time_sec,
metrics=metrics)
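For reference, _report_benchmark only reads a few known entries of stats. A hypothetical minimal dict with the shape it consumes (values invented; numpy scalars because the code calls .item() on the accuracies):

import numpy as np

stats = {
    'eval_results': {
        'accuracy': np.float32(0.93),         # top-1, checked against min/max
        'accuracy_top_5': np.float32(0.997),
        'global_step': 64000,                 # reported as `iters`
    },
    'train_hooks': [],  # may contain an ExamplesPerSecondHook instance
}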
class Resnet50EstimatorAccuracy(EstimatorBenchmark):
"""Benchmark accuracy tests for ResNet50 w/ Estimator."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for ResNet50 w/ Estimator.
Args:
output_dir: directory where output such as log files is written.
root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [imagenet_main.define_imagenet_flags]
self.data_dir = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
super(Resnet50EstimatorAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def benchmark_graph_8_gpu(self):
"""Test 8 GPUs graph mode."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_fp16_8_gpu(self):
"""Test FP16 8 GPUs graph mode."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 256 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
FLAGS.dtype = 'fp16'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = imagenet_main.run_imagenet(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
self._report_benchmark(stats,
wall_time_sec,
top_1_min=0.762,
top_1_max=0.766)
class Resnet50EstimatorBenchmark(EstimatorBenchmark):
"""Benchmarks for ResNet50 using Estimator."""
local_flags = None
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [imagenet_main.define_imagenet_flags]
super(Resnet50EstimatorBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods)
def benchmark_graph_fp16_1_gpu(self):
"""Benchmarks graph fp16 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
FLAGS.batch_size = 128
FLAGS.dtype = 'fp16'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_fp16_1_gpu_tweaked(self):
"""Benchmarks graph fp16 1 gpu tweaked."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.intra_op_parallelism_threads = 1
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu_tweaked')
FLAGS.batch_size = 256
FLAGS.dtype = 'fp16'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_1_gpu(self):
"""Benchmarks graph 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
FLAGS.dtype = 'fp32'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_8_gpu(self):
"""Benchmarks graph 8 gpus."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.batch_size = 128 * 8
FLAGS.dtype = 'fp32'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_fp16_8_gpu(self):
"""Benchmarks graph fp16 8 gpus."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
FLAGS.batch_size = 256 * 8
FLAGS.dtype = 'fp16'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_fp16_8_gpu_tweaked(self):
"""Benchmarks graph fp16 8 gpus tweaked."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.intra_op_parallelism_threads = 1
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu_tweaked')
FLAGS.batch_size = 256 * 8
FLAGS.dtype = 'fp16'
FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = imagenet_main.run_imagenet(FLAGS)
wall_time_sec = time.time() - start_time_sec
print(stats)
# Remove values to skip triggering accuracy check.
del stats['eval_results']['accuracy']
del stats['eval_results']['accuracy_top_5']
self._report_benchmark(stats,
wall_time_sec)
class Resnet50EstimatorBenchmarkSynth(Resnet50EstimatorBenchmark):
"""Resnet50 synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['use_synthetic_data'] = True
def_flags['max_train_steps'] = 110
def_flags['train_epochs'] = 1
super(Resnet50EstimatorBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags)
class Resnet50EstimatorBenchmarkReal(Resnet50EstimatorBenchmark):
"""Resnet50 real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['data_dir'] = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
def_flags['max_train_steps'] = 110
def_flags['train_epochs'] = 1
super(Resnet50EstimatorBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags)
class Resnet56EstimatorAccuracy(EstimatorBenchmark):
"""Accuracy tests for Estimator ResNet56."""
local_flags = None
@@ -47,57 +303,59 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [cifar_main.define_cifar_flags]
self.output_dir = output_dir
self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
super(Resnet56EstimatorAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def benchmark_graph_1_gpu(self):
"""Test layers model with Estimator and distribution strategies."""
self._setup()
flags.FLAGS.num_gpus = 1
flags.FLAGS.data_dir = self.data_dir
flags.FLAGS.batch_size = 128
flags.FLAGS.train_epochs = 182
flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
flags.FLAGS.resnet_size = 56
flags.FLAGS.dtype = 'fp32'
flags.FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_fp16_1_gpu(self):
"""Test layers FP16 model with Estimator and distribution strategies."""
self._setup()
flags.FLAGS.num_gpus = 1
flags.FLAGS.data_dir = self.data_dir
flags.FLAGS.batch_size = 128
flags.FLAGS.train_epochs = 182
flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
flags.FLAGS.resnet_size = 56
flags.FLAGS.dtype = 'fp16'
flags.FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test layers model with Estimator and dist_strat. 2 GPUs."""
self._setup()
flags.FLAGS.num_gpus = 2
flags.FLAGS.data_dir = self.data_dir
flags.FLAGS.batch_size = 128
flags.FLAGS.train_epochs = 182
flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
flags.FLAGS.resnet_size = 56
flags.FLAGS.dtype = 'fp32'
flags.FLAGS.hooks = ['ExamplesPerSecondHook']
self._run_and_report_benchmark()
def benchmark_graph_fp16_2_gpu(self):
"""Test layers FP16 model with Estimator and dist_strat. 2 GPUs."""
self._setup()
flags.FLAGS.num_gpus = 2
flags.FLAGS.data_dir = self.data_dir
flags.FLAGS.batch_size = 128
flags.FLAGS.train_epochs = 182
flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
flags.FLAGS.resnet_size = 56
flags.FLAGS.dtype = 'fp16'
flags.FLAGS.hooks = ['ExamplesPerSecondHook']
@@ -110,7 +368,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
flags.FLAGS.data_dir = self.data_dir
flags.FLAGS.batch_size = 128
flags.FLAGS.train_epochs = 1
flags.FLAGS.model_dir = self._get_model_dir('unit_test')
flags.FLAGS.resnet_size = 8
flags.FLAGS.dtype = 'fp32'
flags.FLAGS.hooks = ['ExamplesPerSecondHook']
@@ -122,42 +380,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
stats = cifar_main.run_cifar(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
self._report_benchmark(stats,
wall_time_sec,
top_1_min=0.926,
top_1_max=0.938)
@@ -360,15 +360,23 @@ def run_imagenet(flags_obj):
Args:
flags_obj: An object containing parsed flag values.
Returns:
Dict of results of the run. Contains the keys `eval_results` and
`train_hooks`. `eval_results` contains accuracy (top_1) and
accuracy_top_5. `train_hooks` is a list of the hook instances used during
training.
"""
input_function = (flags_obj.use_synthetic_data and
get_synth_input_fn(flags_core.get_tf_dtype(flags_obj)) or
input_fn)
result = resnet_run_loop.resnet_main(
flags_obj, imagenet_model_fn, input_function, DATASET_NAME,
shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])
return result
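With resnet_main's results now propagated, a caller can read metrics straight off the returned dict. A small illustrative sketch (not part of the commit), matching the docstring above:

stats = run_imagenet(flags.FLAGS)
print('top-1 accuracy:', stats['eval_results']['accuracy'])
print('hooks used:', [type(h).__name__ for h in stats['train_hooks']])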
def main(_):
with logger.benchmark_context(flags.FLAGS):
@@ -525,8 +525,9 @@ def resnet_main(
shape: list of ints representing the shape of the images used for training.
This is only used if flags_obj.export_dir is passed.
Returns:
Dict of results of the run. Contains the keys `eval_results` and
`train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
`train_hooks` is a list of the hook instances used during training.
"""
model_helpers.apply_clean(flags.FLAGS)