Unverified Commit 4d93d894 authored by Taylor Robie, committed by GitHub

Add non-CuDNN LSTM option and PerfZero benchmarks. (#7391)

* add non-CuDNN LSTM

* condition eager on v2 for no cudnn benchmark

* remove implementation specification

* add comments and adjust benchmarks

* re-add implementation=2

* update docstrings
parent 8384b05d
@@ -289,6 +289,15 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.batch_size = 64
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cudnn(self):
+    """Benchmark 1 gpu with CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = 64
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_no_ds(self):
     """Benchmark 1 gpu without distribution strategies."""
     self._setup()
@@ -323,6 +332,16 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
+  def benchmark_xla_1_gpu_no_cudnn(self):
+    """Benchmark 1 gpu w/xla and CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = 64
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark()
+
   def benchmark_8_gpu(self):
     """Benchmark 8 gpu."""
     self._setup()
@@ -331,6 +350,15 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.log_steps = 10
     self._run_and_report_benchmark()
 
+  def benchmark_8_gpu_no_cudnn(self):
+    """Benchmark 8 gpu with CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.batch_size = 64 * 8
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    self._run_and_report_benchmark()
+
   def benchmark_xla_8_gpu(self):
     """Benchmark 8 gpu w/xla."""
     self._setup()
@@ -340,6 +368,16 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
+  def benchmark_xla_8_gpu_no_cudnn(self):
+    """Benchmark 8 gpu w/xla and CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.batch_size = 64 * 8
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark()
+
   def _run_and_report_benchmark(self):
     """Run and report benchmark."""
     super(ShakespeareKerasBenchmarkReal, self)._run_and_report_benchmark(
...
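All four new benchmarks gate eager execution on `keras_utils.is_v2_0()`, since eager is the default in TF 2.x but opt-in in 1.x. That helper is outside this diff; a plausible minimal sketch of it, assuming it simply checks whether 2.x behavior is enabled:

```python
# Hypothetical sketch of keras_utils.is_v2_0(); the real helper lives in the
# repo's shared utilities and is not part of this diff.
from tensorflow.python import tf2


def is_v2_0():
  """Returns True when TensorFlow 2.x behavior is enabled."""
  return tf2.enabled()
```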
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
 # pylint: disable=wrong-import-order
@@ -78,6 +79,7 @@ def define_flags():
   flags.DEFINE_string(
       name='training_data', default=None,
       help='Path to file containing the training data.')
+  flags.DEFINE_boolean(name='cudnn', default=True, help='Use CuDNN LSTM.')
 
 
 def get_dataset(path_to_file, batch_size=None, seq_length=SEQ_LENGTH):
@@ -120,7 +122,8 @@ def build_model(vocab_size,
                 embedding_dim=EMBEDDING_DIM,
                 rnn_units=RNN_UNITS,
                 batch_size=None,
-                stateful=False):
+                stateful=False,
+                use_cudnn=True):
   """Builds the Shakespeare model.
 
   Args:
@@ -133,14 +136,31 @@
   Returns:
     A Keras Model.
   """
+  # In V1 there is a separate class for CuDNN. In V2 the LSTM class will use
+  # CuDNN automatically if applicable.
+  if use_cudnn and not keras_utils.is_v2_0():
+    LSTM = tf.compat.v1.keras.layers.CuDNNLSTM
+  else:
+    # The LSTM call was rewritten to be more efficient in 2.0. However, because
+    # we want to compare the performance of the two runtimes, we force both
+    # V1 and V2 to use the more efficient implementation.
+    LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)
+
+  # By indirecting the activation through a lambda layer, the logic to dispatch
+  # to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN
+  # mode.
+  lstm_activation = ('tanh' if use_cudnn else
+                     lambda x: tf.math.tanh(x))
+
   batch_shape = [batch_size if stateful else None, None]
   return tf.keras.Sequential([
       tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                 batch_input_shape=batch_shape),
-      tf.keras.layers.LSTM(rnn_units,
-                           return_sequences=True,
-                           stateful=stateful,
-                           recurrent_initializer='glorot_uniform'),
+      LSTM(rnn_units,
+           activation=lstm_activation,
+           return_sequences=True,
+           stateful=stateful,
+           recurrent_initializer='glorot_uniform'),
       tf.keras.layers.Dense(vocab_size, activation='softmax')])
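The lambda trick above leans on how TF 2.x selects the LSTM kernel: `tf.keras.layers.LSTM` only dispatches to the fused CuDNN kernel when its arguments match the CuDNN-compatible defaults, including `activation='tanh'`. A minimal sketch of that behavior against the public TF 2.x API (the variable names are illustrative):

```python
import tensorflow as tf

# With default arguments, TF 2.x may run this layer on the fused CuDNN kernel
# when a GPU is available.
fast_lstm = tf.keras.layers.LSTM(256, implementation=2)

# Wrapping tanh in a lambda computes the same activation, but the kernel
# selection logic no longer recognizes it as the CuDNN-compatible default,
# so the layer falls back to the generic (non-CuDNN) implementation.
slow_lstm = tf.keras.layers.LSTM(256,
                                 activation=lambda x: tf.math.tanh(x),
                                 implementation=2)
```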
@@ -161,7 +181,8 @@ def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
   strategy_scope = distribution_utils.get_strategy_scope(strategy)
 
   with strategy_scope:
-    model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size)
+    model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size,
+                        use_cudnn=flags_obj.cudnn)
     model.compile(
         optimizer=tf.keras.optimizers.Adam(),
         loss=tf.keras.losses.CategoricalCrossentropy(),