Unverified Commit e0a2b8c3 authored by Toby Boyd, committed by GitHub

Refactor and add benchmarks as well as accuracy tests for GPU and CPU (#7248)

* Added benchmarks and common flags.

* Add cpu tests.

* Add tracking epoch times.

* fix transformer.

* Add examples_per_second.

* fix pylint
parent 63605b95
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Shakespeare (LSTM) benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time

from absl import flags

from official.staging.shakespeare import shakespeare_main
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark

SHAKESPEARE_TRAIN_DATA = 'shakespeare/shakespeare.txt'

FLAGS = flags.FLAGS


class ShakespeareBenchmarkBase(PerfZeroBenchmark):
  """Base class for Shakespeare (LSTM) benchmark and accuracy tests."""

  def __init__(self, output_dir=None, default_flags=None, root_data_dir=None):
    super(ShakespeareBenchmarkBase, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
        flag_methods=[shakespeare_main.define_flags])

  def _run_and_report_benchmark(self,
                                top_1_train_min=0.923,
                                top_1_train_max=0.93,
                                warmup=1,
                                log_steps=100):
    """Report benchmark results by writing to a local protobuf file.

    The average epoch time is computed after skipping the first epoch; it only
    covers the time between each epoch's begin and end callbacks, so time spent
    between epochs is excluded. To skip the accuracy check, set
    `top_1_train_min=None`.

    Args:
      top_1_train_min: lowest passing value.
      top_1_train_max: highest passing value.
      warmup: number of entries in `timestamp_log` to ignore.
      log_steps: interval (in steps) at which entries were added to
        `timestamp_log`.
    """
    total_batch_size = FLAGS.batch_size
    metrics = []
    start_time_sec = time.time()
    stats = shakespeare_main.run(FLAGS)
    wall_time_sec = time.time() - start_time_sec

    if top_1_train_min:
      metrics.append({'name': 'accuracy_top_1_train',
                      'value': stats['history']['RecallAt1'][-1],
                      'min_value': top_1_train_min,
                      'max_value': top_1_train_max})

    # Look for the TimeHistory callback that was used during keras.fit.
    for callback in stats['callbacks']:
      if isinstance(callback, keras_utils.TimeHistory):
        epoch_timings = callback.epoch_runtime_log
        average_time = sum(epoch_timings[1:]) / len(epoch_timings[1:])
        metrics.append({'name': 'avg_epoch_time',
                        'value': average_time})

        # First entry in timestamp_log is the start of step 1. The rest of the
        # entries are the end of each recorded step.
        time_log = callback.timestamp_log
        elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
        num_examples = (
            total_batch_size * log_steps * (len(time_log) - warmup - 1))
        examples_per_sec = num_examples / elapsed
        metrics.append({'name': 'exp_per_second',
                        'value': examples_per_sec})

    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(iters=-1, wall_time=wall_time_sec,
                          metrics=metrics,
                          extras={'flags': flags_str})
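To make the metric computation above concrete, here is the same arithmetic on made-up numbers; in a real run the values come from the `keras_utils.TimeHistory` callback attached during `model.fit` (its log stores objects with a `.timestamp` field, so plain floats stand in here purely for illustration):

total_batch_size = 64
log_steps = 100
warmup = 1

# Start of step 1, then the end of every 100th step (made-up timestamps).
timestamps = [0.0, 20.0, 40.0, 60.0]

elapsed = timestamps[-1] - timestamps[warmup]                  # 40.0 seconds
num_examples = total_batch_size * log_steps * (len(timestamps) - warmup - 1)
examples_per_sec = num_examples / elapsed                      # 12800 / 40 = 320

# Per-epoch runtimes in seconds; the first (warm-up) epoch is skipped.
epoch_timings = [95.0, 61.0, 60.0, 62.0]
avg_epoch_time = sum(epoch_timings[1:]) / len(epoch_timings[1:])  # 61.0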


class ShakespeareAccuracy(ShakespeareBenchmarkBase):
  """Shakespeare accuracy tests.

  This is not an ideal test. The best available accuracy check is to validate
  top_1 on the training set. At batch size 64 the top_1 training accuracy
  stabilizes at ~0.92 around epochs 40-45.
  """

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Shakespeare accuracy tests.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
    super(ShakespeareAccuracy, self).__init__(
        output_dir=output_dir, root_data_dir=root_data_dir)

  def benchmark_cpu(self):
    """Benchmark cpu."""
    self._setup()
    FLAGS.num_gpus = 0
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    self._run_and_report_benchmark()

  def benchmark_cpu_no_ds_run_eagerly(self):
    """Benchmark cpu without distribution strategy, running eagerly."""
    self._setup()
    FLAGS.num_gpus = 0
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    FLAGS.run_eagerly = True
    FLAGS.distribution_strategy = 'off'
    self._run_and_report_benchmark()

  def benchmark_1_gpu(self):
    """Benchmark 1 gpu."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_ds_run_eagerly(self):
    """Benchmark 1 gpu without distribution strategy, running eagerly."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    FLAGS.run_eagerly = True
    FLAGS.distribution_strategy = 'off'
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu(self):
    """Benchmark 1 gpu w/xla."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    FLAGS.enable_xla = True
    self._run_and_report_benchmark()

  def benchmark_8_gpu(self):
    """Benchmark 8 gpu.

    This test is for accuracy, not scaling. The batch size is not scaled to
    the number of gpus.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    self._run_and_report_benchmark()

  def benchmark_xla_8_gpu(self):
    """Benchmark 8 gpu w/xla.

    This test is for accuracy, not scaling. The batch size is not scaled to
    the number of gpus.
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.training_data = self.train_data
    FLAGS.batch_size = 64
    FLAGS.train_epochs = 43
    FLAGS.model_dir = ''
    FLAGS.enable_xla = True
    self._run_and_report_benchmark()


class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
  """Shakespeare performance benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Benchmark tests w/Keras.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)

    def_flags = {}
    def_flags['training_data'] = self.train_data
    def_flags['model_dir'] = ''
    def_flags['train_epochs'] = 4

    super(ShakespeareKerasBenchmarkReal, self).__init__(
        output_dir=output_dir,
        root_data_dir=root_data_dir,
        default_flags=def_flags)

  def benchmark_cpu(self):
    """Benchmark cpu."""
    self._setup()
    FLAGS.num_gpus = 0
    FLAGS.batch_size = 64
    self._run_and_report_benchmark()

  def benchmark_cpu_no_ds_run_eagerly(self):
    """Benchmark cpu without distribution strategy, running eagerly."""
    self._setup()
    FLAGS.num_gpus = 0
    FLAGS.batch_size = 64
    FLAGS.distribution_strategy = 'off'
    FLAGS.run_eagerly = True
    self._run_and_report_benchmark()

  def benchmark_cpu_no_ds(self):
    """Benchmark cpu without distribution strategy."""
    self._setup()
    FLAGS.num_gpus = 0
    FLAGS.batch_size = 64
    FLAGS.distribution_strategy = 'off'
    self._run_and_report_benchmark()

  def benchmark_1_gpu(self):
    """Benchmark 1 gpu."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = 64
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_ds(self):
    """Benchmark 1 gpu without distribution strategy."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = 64
    FLAGS.distribution_strategy = 'off'
    self._run_and_report_benchmark()

  def benchmark_1_gpu_no_ds_run_eagerly(self):
    """Benchmark 1 gpu without distribution strategy, running eagerly."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = 64
    FLAGS.run_eagerly = True
    FLAGS.distribution_strategy = 'off'
    self._run_and_report_benchmark()

  def benchmark_xla_1_gpu(self):
    """Benchmark 1 gpu w/xla."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = 64
    FLAGS.enable_xla = True
    self._run_and_report_benchmark()

  def benchmark_8_gpu(self):
    """Benchmark 8 gpu."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = 64 * 8
    self._run_and_report_benchmark()

  def benchmark_xla_8_gpu(self):
    """Benchmark 8 gpu w/xla."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = 64 * 8
    FLAGS.enable_xla = True
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self):
    """Run and report benchmark; the accuracy check is skipped."""
    super(ShakespeareKerasBenchmarkReal, self)._run_and_report_benchmark(
        top_1_train_min=None)
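These classes are normally driven by the PerfZero harness, but a single benchmark method can also be exercised directly once the official models repository is on PYTHONPATH and the Shakespeare text file is in place. A rough sketch under those assumptions; the paths below are placeholders rather than values from this change, and even the 4-epoch performance run takes several minutes:

# Hypothetical local paths; PerfZero normally supplies these.
root_data_dir = '/tmp/data'          # must contain shakespeare/shakespeare.txt
output_dir = '/tmp/benchmark_logs'

benchmark = ShakespeareKerasBenchmarkReal(output_dir=output_dir,
                                          root_data_dir=root_data_dir)
benchmark.benchmark_1_gpu_no_ds()    # sets FLAGS, trains, reports metrics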
official/staging/shakespeare/shakespeare_main.py
@@ -19,26 +19,48 @@ from __future__ import division
 from __future__ import print_function

 import os

-import numpy as np
+# pylint: disable=wrong-import-order
 from absl import app as absl_app
 from absl import flags
+import numpy as np
 import tensorflow as tf
+# pylint: enable=wrong-import-order

+from official.utils.flags import core as flags_core
+from official.utils.misc import distribution_utils
+from official.utils.misc import keras_utils

-BATCH_SIZE = 64
-EPOCHS = 10
 EMBEDDING_DIM = 256
 RNN_UNITS = 1024
 SEQ_LENGTH = 100
+# Calculated by running batch_size=1
+BATCHES_PER_EPOCH = 11043


 def define_flags():
   """Define the flags for the Shakespeare character LSTM."""
-  flags.DEFINE_string(
-      name='model_dir', default=None,
-      help='Directory for model check points.')
+  flags_core.define_base(data_dir=False,
+                         clean=False,
+                         train_epochs=True,
+                         epochs_between_evals=False,
+                         stop_threshold=False,
+                         hooks=False,
+                         export_dir=False,
+                         run_eagerly=True)
+  flags_core.define_performance(num_parallel_calls=False,
+                                inter_op=False,
+                                intra_op=False,
+                                synthetic_data=False,
+                                max_train_steps=False,
+                                dtype=False,
+                                enable_xla=True)
+  flags_core.set_defaults(train_epochs=43,
+                          batch_size=64)
+
+  flags.DEFINE_boolean(name='enable_eager', default=True, help='Enable eager?')
   flags.DEFINE_boolean(
       name='train', default=True,
       help='If true trains the model.')
@@ -53,18 +75,20 @@ def define_flags():
       help='Path to file containing the training data.')


-def get_dataset(path_to_file, seq_length=SEQ_LENGTH):
+def get_dataset(path_to_file, batch_size=None, seq_length=SEQ_LENGTH):
   """Creates a dataset from a given text file.

   Args:
     path_to_file: The path to the training data.
+    batch_size: Batch size to use.
     seq_length: The length of the LSTM sequence.

   Returns:
     A tuple, consisting of the Dataset and the class to character mapping
     and character to class mapping.
   """
-  text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
+  with open(path_to_file, 'rb') as train_data:
+    text = train_data.read().decode(encoding='utf-8')

   # Create vocab
   vocab = sorted(set(text))
@@ -80,9 +104,9 @@ def get_dataset(path_to_file, seq_length=SEQ_LENGTH):
     input_text = chunk[:-1]
     target_text = chunk[1:]
     return input_text, tf.one_hot(target_text, len(vocab))

   dataset = sequences.map(split_input_target)
-  dataset = dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)
+  dataset = dataset.shuffle(10000).repeat()
+  dataset = dataset.batch(batch_size, drop_remainder=True)

   return dataset, idx2char, char2idx
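Because the dataset now repeats indefinitely, epoch boundaries come from the explicit steps_per_epoch passed to model.fit in train_model below rather than from dataset exhaustion. A self-contained toy illustration of the same shuffle/repeat/batch pattern; the data here is made up and not part of this change:

import tensorflow as tf

# Toy stand-in for the character dataset: 10 "sequences" of length 4.
toy = tf.data.Dataset.from_tensor_slices(tf.reshape(tf.range(40), (10, 4)))

batch_size = 4
dataset = toy.shuffle(10).repeat()                     # infinite stream
dataset = dataset.batch(batch_size, drop_remainder=True)

steps_per_epoch = 10 // batch_size                     # 2 full batches per "epoch"
for step, batch in enumerate(dataset.take(steps_per_epoch)):
  print(step, batch.shape)                             # (4, 4) each step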
@@ -90,7 +114,7 @@ def get_dataset(path_to_file, seq_length=SEQ_LENGTH):
 def build_model(vocab_size,
                 embedding_dim=EMBEDDING_DIM,
                 rnn_units=RNN_UNITS,
-                batch_size=BATCH_SIZE,
+                batch_size=None,
                 stateful=False):
   """Builds the Shakespeare model.
@@ -115,26 +139,30 @@ def build_model(vocab_size,
       tf.keras.layers.Dense(vocab_size, activation='softmax')])


-def train_model(dataset, vocab_size, checkpoint_dir=None):
+def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
   """Trains a Shakespeare model.

   Args:
+    flags_obj: An object containing parsed flag values.
     dataset: the training data set.
     vocab_size: the number of unique character classes.
+    strategy: distribution strategy to use.
     checkpoint_dir: if not None, the directory in which to make checkpoints.

   Returns:
-    The training history.
+    The training history and callbacks.
   """
-  strategy = tf.distribute.MirroredStrategy()
+  train_steps = BATCHES_PER_EPOCH // flags_obj.batch_size
+  strategy_scope = distribution_utils.get_strategy_scope(strategy)

-  with strategy.scope():
-    model = build_model(vocab_size=vocab_size)
+  with strategy_scope:
+    model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size)
     model.compile(optimizer=tf.keras.optimizers.Adam(),
                   loss=tf.keras.losses.CategoricalCrossentropy(),
                   metrics=[
                       tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
-                      tf.keras.metrics.Recall(top_k=5, name='RecallAt5')])
+                      tf.keras.metrics.Recall(top_k=5, name='RecallAt5')],
+                  run_eagerly=flags_obj.run_eagerly)

   callbacks = []
   if checkpoint_dir:
@@ -143,8 +171,14 @@ def train_model(dataset, vocab_size, checkpoint_dir=None):
         filepath=checkpoint_prefix,
         save_weights_only=True)
     callbacks.append(checkpoint_callback)
+  time_callback = keras_utils.TimeHistory(flags_obj.batch_size, 100)
+  callbacks.append(time_callback)

-  return model.fit(dataset, epochs=EPOCHS, callbacks=callbacks)
+  history = model.fit(dataset,
+                      epochs=flags_obj.train_epochs,
+                      steps_per_epoch=train_steps,
+                      callbacks=callbacks,
+                      verbose=2)
+  return history, callbacks


 def make_prediction(checkpoint_dir, length, context, idx2char, char2idx):
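As a quick check of the steps_per_epoch arithmetic above, using the defaults introduced in this change (BATCHES_PER_EPOCH = 11043, batch_size = 64):

BATCHES_PER_EPOCH = 11043   # sequences per epoch, measured at batch_size=1
batch_size = 64             # default set via flags_core.set_defaults

train_steps = BATCHES_PER_EPOCH // batch_size   # 172 steps per epoch
examples_per_epoch = train_steps * batch_size   # 11008; the last 35 sequences are dropped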
@@ -188,18 +222,39 @@ def make_prediction(checkpoint_dir, length, context, idx2char, char2idx):
   return context + ''.join(text_generated)


-def main(_):
-  flags_obj = flags.FLAGS
+def run(flags_obj):
+  """Run Shakespeare training and predict.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+
+  Returns:
+    Dictionary with status from the run.
+  """
   if not flags_obj.training_data:
     raise ValueError(
         'Must set the path to a training data file. e.g download the following '
         'https://storage.googleapis.com/download.tensorflow.org/data/'
         'shakespeare.txt')

-  dataset, idx2char, char2idx = get_dataset(flags_obj.training_data)
+  keras_utils.set_session_config(
+      enable_eager=flags_obj.enable_eager,
+      enable_xla=flags_obj.enable_xla)
+
+  strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus)
+
+  dataset, idx2char, char2idx = get_dataset(flags_obj.training_data,
+                                            batch_size=flags_obj.batch_size)
+  stats = {}
   if flags_obj.train:
-    train_model(dataset, len(idx2char), flags_obj.model_dir)
+    history, callbacks = train_model(flags_obj, dataset,
+                                     len(idx2char), strategy,
+                                     checkpoint_dir=flags_obj.model_dir)
+    stats['history'] = history.history
+    stats['callbacks'] = callbacks

   if flags_obj.predict_context:
     if not flags_obj.model_dir:
@@ -210,6 +265,13 @@ def main(_):
             idx2char,
             char2idx))

+  return stats
+
+
+def main(_):
+  flags_obj = flags.FLAGS
+  run(flags_obj)
+

 if __name__ == '__main__':
   define_flags()
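Splitting the old main into run() plus a thin main() wrapper is what lets the benchmark classes above call the trainer programmatically and read back the returned stats. A minimal sketch, assuming the official package is importable and flags have not yet been parsed; the flag values below are placeholders:

from absl import flags

from official.staging.shakespeare import shakespeare_main

shakespeare_main.define_flags()
flags.FLAGS(['shakespeare_main',
             '--training_data=/tmp/data/shakespeare/shakespeare.txt',
             '--train_epochs=1',
             '--model_dir='])

stats = shakespeare_main.run(flags.FLAGS)
print(stats['history']['RecallAt1'][-1])   # top-1 recall after the last epoch
print(stats['callbacks'])                  # includes the TimeHistory instance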
official/utils/misc/keras_utils.py
@@ -41,11 +41,11 @@ class TimeHistory(tf.keras.callbacks.Callback):
   """Callback for Keras models."""

   def __init__(self, batch_size, log_steps):
-    """Callback for logging performance (# examples/second).
+    """Callback for logging performance.

     Args:
       batch_size: Total batch size.
-      log_steps: Interval of time history logs.
+      log_steps: Interval of steps between logging of batch level stats.
     """
     self.batch_size = batch_size
     super(TimeHistory, self).__init__()
@@ -55,9 +55,15 @@ class TimeHistory(tf.keras.callbacks.Callback):
     # Logs start of step 1 then end of each step based on log_steps interval.
     self.timestamp_log = []

+    # Records the time each epoch takes to run from start to finish of epoch.
+    self.epoch_runtime_log = []
+
   def on_train_end(self, logs=None):
     self.train_finish_time = time.time()

+  def on_epoch_begin(self, epoch, logs=None):
+    self.epoch_start = time.time()
+
   def on_batch_begin(self, batch, logs=None):
     self.global_steps += 1
     if self.global_steps == 1:
@@ -78,6 +84,13 @@ class TimeHistory(tf.keras.callbacks.Callback):
           (self.global_steps, elapsed_time, examples_per_second))
       self.start_time = timestamp

+  def on_epoch_end(self, epoch, logs=None):
+    epoch_run_time = time.time() - self.epoch_start
+    self.epoch_runtime_log.append(epoch_run_time)
+    tf.compat.v1.logging.info(
+        "BenchmarkMetric: {'epoch':%d, 'time_taken': %f}" %
+        (epoch, epoch_run_time))
+

 def get_profiler_callback(model_dir, profile_steps, enable_tensorboard):
   """Validate profile_steps flag value and return profiler callback."""