Unverified Commit 4d93d894 authored by Taylor Robie, committed by GitHub

Add non-CuDNN LSTM option and PerfZero benchmarks. (#7391)

* add non-CuDNN LSTM

* condition eager on v2 for no cudnn benchmark

* remove implementation specification

* add comments and adjust benchmarks

* re-add implementation=2

* update docstrings
parent 8384b05d
@@ -289,6 +289,15 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.batch_size = 64
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cudnn(self):
+    """Benchmark 1 gpu with CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = 64
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    self._run_and_report_benchmark()
+
   def benchmark_1_gpu_no_ds(self):
     """Benchmark 1 gpu without distribution strategies."""
     self._setup()
@@ -323,6 +332,16 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
+  def benchmark_xla_1_gpu_no_cudnn(self):
+    """Benchmark 1 gpu w/xla and CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = 64
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark()
+
   def benchmark_8_gpu(self):
     """Benchmark 8 gpu."""
     self._setup()
@@ -331,6 +350,15 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.log_steps = 10
     self._run_and_report_benchmark()
 
+  def benchmark_8_gpu_no_cudnn(self):
+    """Benchmark 8 gpu with CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.batch_size = 64 * 8
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    self._run_and_report_benchmark()
+
   def benchmark_xla_8_gpu(self):
     """Benchmark 8 gpu w/xla."""
     self._setup()
@@ -340,6 +368,16 @@ class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
+  def benchmark_xla_8_gpu_no_cudnn(self):
+    """Benchmark 8 gpu w/xla and CuDNN disabled."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.batch_size = 64 * 8
+    FLAGS.cudnn = False
+    FLAGS.enable_eager = keras_utils.is_v2_0()
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark()
+
   def _run_and_report_benchmark(self):
     """Run and report benchmark."""
     super(ShakespeareKerasBenchmarkReal, self)._run_and_report_benchmark(
...
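All four new benchmarks gate eager execution on `keras_utils.is_v2_0()`, since eager is the default in TF 2.x but opt-in in 1.x. That helper is outside this diff; a plausible minimal sketch of it, assuming it simply checks whether 2.x behavior is enabled:

```python
# Hypothetical sketch of keras_utils.is_v2_0(); the real helper lives in the
# repo's shared utilities and is not part of this diff.
from tensorflow.python import tf2


def is_v2_0():
  """Returns True when TensorFlow 2.x behavior is enabled."""
  return tf2.enabled()
```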
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
 # pylint: disable=wrong-import-order
@@ -78,6 +79,7 @@ def define_flags():
   flags.DEFINE_string(
       name='training_data', default=None,
       help='Path to file containing the training data.')
+  flags.DEFINE_boolean(name='cudnn', default=True, help='Use CuDNN LSTM.')
 
 
 def get_dataset(path_to_file, batch_size=None, seq_length=SEQ_LENGTH):
@@ -120,7 +122,8 @@ def build_model(vocab_size,
                 embedding_dim=EMBEDDING_DIM,
                 rnn_units=RNN_UNITS,
                 batch_size=None,
-                stateful=False):
+                stateful=False,
+                use_cudnn=True):
   """Builds the Shakespeare model.
 
   Args:
@@ -133,14 +136,31 @@
   Returns:
     A Keras Model.
   """
+  # In V1 there is a separate class for CuDNN. In V2 the LSTM class will use
+  # CuDNN automatically if applicable.
+  if use_cudnn and not keras_utils.is_v2_0():
+    LSTM = tf.compat.v1.keras.layers.CuDNNLSTM
+  else:
+    # The LSTM call was rewritten to be more efficient in 2.0. However, because
+    # we want to compare the performance of the two runtimes, we force both
+    # V1 and V2 to use the more efficient implementation.
+    LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)
+
+  # By indirecting the activation through a lambda layer, the logic to dispatch
+  # to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN
+  # mode.
+  lstm_activation = ('tanh' if use_cudnn else
+                     lambda x: tf.math.tanh(x))
+
   batch_shape = [batch_size if stateful else None, None]
   return tf.keras.Sequential([
       tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                 batch_input_shape=batch_shape),
-      tf.keras.layers.LSTM(rnn_units,
-                           return_sequences=True,
-                           stateful=stateful,
-                           recurrent_initializer='glorot_uniform'),
+      LSTM(rnn_units,
+           activation=lstm_activation,
+           return_sequences=True,
+           stateful=stateful,
+           recurrent_initializer='glorot_uniform'),
       tf.keras.layers.Dense(vocab_size, activation='softmax')])
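The lambda trick above leans on how TF 2.x selects the LSTM kernel: `tf.keras.layers.LSTM` only dispatches to the fused CuDNN kernel when its arguments match the CuDNN-compatible defaults, including `activation='tanh'`. A minimal sketch of that behavior against the public TF 2.x API (the variable names are illustrative):

```python
import tensorflow as tf

# With default arguments, TF 2.x may run this layer on the fused CuDNN kernel
# when a GPU is available.
fast_lstm = tf.keras.layers.LSTM(256, implementation=2)

# Wrapping tanh in a lambda computes the same activation, but the kernel
# selection logic no longer recognizes it as the CuDNN-compatible default,
# so the layer falls back to the generic (non-CuDNN) implementation.
slow_lstm = tf.keras.layers.LSTM(256,
                                 activation=lambda x: tf.math.tanh(x),
                                 implementation=2)
```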
@@ -161,7 +181,8 @@ def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
   strategy_scope = distribution_utils.get_strategy_scope(strategy)
 
   with strategy_scope:
-    model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size)
+    model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size,
+                        use_cudnn=flags_obj.cudnn)
     model.compile(
         optimizer=tf.keras.optimizers.Adam(),
         loss=tf.keras.losses.CategoricalCrossentropy(),