Unverified Commit c5e107ff authored by Hongkun Yu, committed by GitHub

Remove benchmark folder from the master branch. They are stale. (#9085)

parent 266c7f43
# Shakespeare character LSTM model
This is an implementation of a simple character LSTM used to generate text.
## Instructions
First download the source data:
```
wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
```
Note that files other than shakespeare.txt can also be used to train the model to generate other text.
Then train the model:
```
python3 shakespeare_main.py --training_data shakespeare.txt \
--model_dir /tmp/shakespeare
```
This will place model checkpoints in `/tmp/shakespeare`, so that we can use them to make predictions.
Then generate predictions:
```
python3 shakespeare_main.py --training_data shakespeare.txt \
--model_dir /tmp/shakespeare --notrain --predict_context=ROMEO:
```
Change `--predict_context` and `--predict_length` to suit your needs.
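For example, to generate 500 characters primed on a different speaker (the context string and length below are arbitrary):
```
python3 shakespeare_main.py --training_data shakespeare.txt \
--model_dir /tmp/shakespeare --notrain \
--predict_context=JULIET: --predict_length=500
```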
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a character LSTM model trained on Shakespeare."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
# pylint: disable=wrong-import-order
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
# pylint: enable=wrong-import-order
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
EMBEDDING_DIM = 256
RNN_UNITS = 1024
SEQ_LENGTH = 100
# Calculated by running batch_size=1
BATCHES_PER_EPOCH = 11043
def define_flags():
"""Define the flags for the Shakespeare character LSTM."""
flags_core.define_base(data_dir=False,
clean=False,
train_epochs=True,
epochs_between_evals=False,
stop_threshold=False,
num_gpu=True,
export_dir=False,
run_eagerly=True,
distribution_strategy=True)
flags_core.define_performance(num_parallel_calls=False,
inter_op=False,
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=True,
loss_scale=True,
enable_xla=True)
flags_core.set_defaults(train_epochs=43,
batch_size=64)
flags.DEFINE_boolean(name='enable_eager', default=True, help='Enable eager?')
flags.DEFINE_boolean(
name='train', default=True,
help='If true trains the model.')
flags.DEFINE_string(
name='predict_context', default=None,
help='If set, makes a prediction with the given context.')
flags.DEFINE_integer(
name='predict_length', default=1000,
help='Length of the predicted text including the context.')
flags.DEFINE_integer(name='train_steps', default=None,
help='Overrides train_steps per epoch if not None.')
flags.DEFINE_integer(
name='log_steps', default=100,
help='For every log_steps, we log the timing information such as '
'examples per second.')
flags.DEFINE_string(
name='training_data', default=None,
help='Path to file containing the training data.')
flags.DEFINE_boolean(name='cudnn', default=True, help='Use CuDNN LSTM.')
def get_dataset(path_to_file, batch_size=None, seq_length=SEQ_LENGTH):
"""Creates a dataset from a given text file.
Args:
path_to_file: The path to the training data.
batch_size: Batch size to use.
seq_length: The length of the LSTM sequence.
Returns:
A tuple consisting of the Dataset, the class-to-character mapping, and the
character-to-class mapping.
"""
with tf.io.gfile.GFile(path_to_file, 'rb') as train_data:
text = train_data.read().decode(encoding='utf-8')
# Create vocab
vocab = sorted(set(text))
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
# Split the text into chunks of seq_length + 1 characters to create examples.
text_as_int = np.array([char2idx[c] for c in text])
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
def split_input_target(chunk):
input_text = chunk[:-1]
target_text = chunk[1:]
return input_text, tf.one_hot(target_text, len(vocab))
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(10000).repeat()
dataset = dataset.batch(batch_size, drop_remainder=True)
return dataset, idx2char, char2idx
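# Example usage (a sketch; the file path and batch size below are illustrative):
#   dataset, idx2char, char2idx = get_dataset('shakespeare.txt', batch_size=64)
#   # Each dataset element is a tuple of int32 inputs with shape
#   # [64, SEQ_LENGTH] and one-hot targets with shape [64, SEQ_LENGTH, vocab].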
def build_model(vocab_size,
embedding_dim=EMBEDDING_DIM,
rnn_units=RNN_UNITS,
batch_size=None,
stateful=False,
use_cudnn=True):
"""Builds the Shakespeare model.
Args:
vocab_size: The number of character classes in the input.
embedding_dim: The dimension of the embedding space for each class.
rnn_units: The number of RNN units in the layer.
batch_size: When predicting, the batch size of the predictions.
stateful: If true, the LSTM is stateful.
use_cudnn: Whether the LSTM layer may use the CuDNN-compatible implementation.
Returns:
A Keras Model.
"""
LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)
# By indirecting the activation through a lambda layer, the logic to dispatch
# to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN
# mode.
lstm_activation = ('tanh' if use_cudnn else
lambda x: tf.math.tanh(x))
batch_shape = [batch_size if stateful else None, None]
return tf.keras.Sequential([
tf.keras.layers.Embedding(vocab_size, embedding_dim,
batch_input_shape=batch_shape),
LSTM(rnn_units,
activation=lstm_activation,
return_sequences=True,
stateful=stateful,
recurrent_initializer='glorot_uniform'),
tf.keras.layers.Dense(vocab_size),
tf.keras.layers.Softmax(dtype=tf.float32)])
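# The returned model maps integer character ids of shape [batch, time] to
# per-character class probabilities of shape [batch, time, vocab_size]. The
# final Softmax layer is pinned to float32 so predictions remain float32 even
# when the rest of the model runs under a mixed-precision policy.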
def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
"""Trains a Shakespeare model.
Args:
flags_obj: An object containing parsed flag values.
dataset: the training data set.
vocab_size: the number of unique character classes.
strategy: distribution strategy to use.
checkpoint_dir: if not None, the directory in which to make checkpoints.
Returns:
The training history and callbacks.
"""
if flags_obj.train_steps:
train_steps = flags_obj.train_steps
else:
train_steps = BATCHES_PER_EPOCH // flags_obj.batch_size
strategy_scope = distribution_utils.get_strategy_scope(strategy)
with strategy_scope:
model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size,
use_cudnn=flags_obj.cudnn)
# When keras_use_ctl is False, Model.fit() automatically applies
# loss scaling so we don't need to create a LossScaleOptimizer.
model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.CategoricalCrossentropy(),
metrics=[tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
tf.keras.metrics.Recall(top_k=5, name='RecallAt5')],
run_eagerly=flags_obj.run_eagerly)
callbacks = []
if checkpoint_dir:
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_prefix,
save_weights_only=True)
callbacks.append(checkpoint_callback)
time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
flags_obj.log_steps)
callbacks.append(time_callback)
history = model.fit(dataset,
epochs=flags_obj.train_epochs,
steps_per_epoch=train_steps,
callbacks=callbacks,
verbose=2)
return history, callbacks
def make_prediction(checkpoint_dir, length, context, idx2char, char2idx):
"""Make predictions from a Shakespeare model.
Args:
checkpoint_dir: the directory from which to load checkpoints
length: the total length of the generated text (including the context).
context: the initial text with which the LSTM is primed.
idx2char: the character class to character mapping.
char2idx: the character to character class mapping.
Returns:
A generated string of text of the given length.
"""
prediction_model = build_model(
vocab_size=len(idx2char), batch_size=1, stateful=True)
prediction_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
prediction_model.build(tf.TensorShape([1, None]))
input_eval = [char2idx[s] for s in context]
input_eval = tf.expand_dims(input_eval, 0)
text_generated = []
prediction_model.reset_states()
for _ in range(length - len(context)):
predictions = prediction_model(input_eval)
predictions = tf.squeeze(predictions, 0)
# We applied a softmax to the output of the model so that
# tf.keras.metrics.Recall would work. We need logits for
# tf.random.categorical, so we convert the probabilities back to log odds
predictions = tf.math.log(predictions / (1 - predictions))
random_output = tf.random.categorical(predictions, num_samples=1)
selected_id = random_output[-1, 0].numpy()
input_eval = tf.expand_dims([selected_id], 0)
text_generated.append(idx2char[selected_id])
return context + ''.join(text_generated)
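# Example usage (a sketch; the checkpoint directory and context are
# illustrative, and idx2char/char2idx come from get_dataset above):
#   text = make_prediction('/tmp/shakespeare', 1000, 'ROMEO:', idx2char, char2idx)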
def run(flags_obj):
"""Run Shakespeare training and predict.
Args:
flags_obj: An object containing parsed flag values.
Returns:
Dictionary with status from the run.
"""
if not flags_obj.training_data:
raise ValueError(
'Must set the path to a training data file, e.g. download the following: '
'https://storage.googleapis.com/download.tensorflow.org/data/'
'shakespeare.txt')
if flags_obj.dtype == 'fp16':
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_float16',
loss_scale=flags_core.get_loss_scale(flags_obj,
default_for_fp16='dynamic'))
tf.keras.mixed_precision.experimental.set_policy(policy)
keras_utils.set_session_config(
enable_xla=flags_obj.enable_xla)
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus)
dataset, idx2char, char2idx = get_dataset(flags_obj.training_data,
batch_size=flags_obj.batch_size)
stats = {}
if flags_obj.train:
history, callbacks = train_model(flags_obj, dataset,
len(idx2char), strategy,
checkpoint_dir=flags_obj.model_dir)
stats['history'] = history.history
stats['callbacks'] = callbacks
if flags_obj.predict_context:
if not flags_obj.model_dir:
raise ValueError('Must set model_dir to get predictions.')
print(make_prediction(flags_obj.model_dir,
flags_obj.predict_length,
flags_obj.predict_context,
idx2char,
char2idx))
return stats
def main(_):
flags_obj = flags.FLAGS
run(flags_obj)
if __name__ == '__main__':
define_flags()
app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions to generate data directly on devices."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import string
from absl import logging
import tensorflow as tf
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
"""A dataset that generates synthetic data on each device."""
def __init__(self, dataset, split_by=1):
# dataset.take(1) doesn't have a GPU kernel.
with tf.device('device:CPU:0'):
tensor = tf.data.experimental.get_single_element(dataset.take(1))
flat_tensor = tf.nest.flatten(tensor)
variable_data = []
initializers = []
for t in flat_tensor:
rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
v = tf.compat.v1.get_local_variable(self._random_name(),
initializer=rebatched_t)
variable_data.append(v)
initializers.append(v.initializer)
input_data = tf.nest.pack_sequence_as(tensor, variable_data)
self._iterator = SyntheticIterator(input_data, initializers)
def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
return ''.join(random.choice(chars) for _ in range(size))
def __iter__(self):
return self._iterator
def make_one_shot_iterator(self):
return self._iterator
def make_initializable_iterator(self):
return self._iterator
class SyntheticIterator(object):
"""A dataset that generates synthetic data on each device."""
def __init__(self, input_data, initializers):
self._input_data = input_data
self._initializers = initializers
def get_next(self):
return self._input_data
def next(self):
return self.__next__()
def __next__(self):
try:
return self.get_next()
except tf.errors.OutOfRangeError:
raise StopIteration
def initialize(self):
if tf.executing_eagerly():
return tf.no_op()
else:
return self._initializers
def _monkey_patch_dataset_method(strategy):
"""Monkey-patch `strategy`'s `make_dataset_iterator` method."""
def make_dataset(self, dataset):
logging.info('Using pure synthetic data.')
with self.scope():
if self.extended._global_batch_size: # pylint: disable=protected-access
return SyntheticDataset(dataset, self.num_replicas_in_sync)
else:
return SyntheticDataset(dataset)
def make_iterator(self, dataset):
dist_dataset = make_dataset(self, dataset)
return iter(dist_dataset)
strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
strategy.make_dataset_iterator = make_iterator
strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
strategy.experimental_distribute_dataset = make_dataset
def _undo_monkey_patch_dataset_method(strategy):
if hasattr(strategy, 'orig_make_dataset_iterator'):
strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
if hasattr(strategy, 'orig_distribute_dataset'):
strategy.experimental_distribute_dataset = strategy.orig_distribute_dataset
def set_up_synthetic_data():
_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
_monkey_patch_dataset_method(
tf.distribute.experimental.MultiWorkerMirroredStrategy)
def undo_set_up_synthetic_data():
_undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
_undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
_undo_monkey_patch_dataset_method(
tf.distribute.experimental.MultiWorkerMirroredStrategy)
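# Example usage (a sketch; the surrounding training script is assumed):
#   set_up_synthetic_data()
#   strategy = tf.distribute.MirroredStrategy()
#   # ... build and fit the Keras model; distributed input pipelines now yield
#   # synthetic data generated directly on each device ...
#   undo_set_up_synthetic_data()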
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
from absl import logging
from absl.testing import flagsaver
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import owner_utils
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.recommendation import ncf_common
from official.recommendation import ncf_keras_main
from official.utils.flags import core
FLAGS = flags.FLAGS
NCF_DATA_DIR_NAME = 'movielens_data'
NCF_TF_REGRESSION_DATA_DIR_NAME = 'gs://tf-regression/ncf/data'
class NCFKerasBenchmarkBase(PerfZeroBenchmark):
"""Base class for NCF model benchmark."""
def __init__(self, output_dir=None, default_flags=None, **kwargs):
super(NCFKerasBenchmarkBase, self).__init__(output_dir, default_flags,
**kwargs)
# Run all benchmarks with ml_perf flag.
self.default_flags['ml_perf'] = True
def _setup(self):
"""Sets up and resets flags before each test."""
logging.set_verbosity(logging.INFO)
if NCFKerasBenchmarkBase.local_flags is None:
ncf_common.define_ncf_flags()
# Load flags to obtain the defaults that are then overridden. The list
# cannot be empty.
flags.FLAGS(['foo'])
core.set_defaults(**self.default_flags)
saved_flag_values = flagsaver.save_flag_values()
NCFKerasBenchmarkBase.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(NCFKerasBenchmarkBase.local_flags)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, hr_at_10_min=0, hr_at_10_max=0):
start_time_sec = time.time()
stats = ncf_keras_main.run_ncf(FLAGS)
wall_time_sec = time.time() - start_time_sec
metrics = []
metrics.append({'name': 'exp_per_second',
'value': stats['avg_exp_per_second']})
if hr_at_10_min > 0:
metrics.append({'name': 'hr_at_10',
'value': stats['eval_hit_rate'],
'min_value': hr_at_10_min,
'max_value': hr_at_10_max})
metrics.append({'name': 'train_loss',
'value': stats['loss']})
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
class NCFKerasAccuracy(NCFKerasBenchmarkBase):
"""Benchmark NCF model using real data."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
root_data_dir = root_data_dir if root_data_dir else ''
default_flags = {}
default_flags['dataset'] = 'ml-20m'
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 10
default_flags['clean'] = True
default_flags['batch_size'] = 99000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
default_flags['epsilon'] = 1.45439e-07
default_flags['layers'] = [256, 256, 128, 64]
default_flags['num_factors'] = 64
default_flags['hr_threshold'] = 0.635
default_flags['ml_perf'] = True
default_flags['use_synthetic_data'] = False
default_flags['data_dir'] = os.path.join(root_data_dir, NCF_DATA_DIR_NAME)
super(NCFKerasAccuracy, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
**kwargs)
def _run_and_report_benchmark_mlperf_like(self):
"""Run test and report results.
Note: MLPerf-like tests are not tuned to hit a specific hr@10 value, but
we want it recorded.
"""
self._run_and_report_benchmark(hr_at_10_min=0.61)
def _run_and_report_benchmark(self, hr_at_10_min=0.630, hr_at_10_max=0.645):
"""Run test and report results.
Note: Target is 0.635, but some runs are below that level. Until we have
multi-run tests, we have to accept a lower target.
Args:
hr_at_10_min: Minimum acceptable hr@10 value.
hr_at_10_max: Maximum acceptable hr@10 value.
"""
super(NCFKerasAccuracy, self)._run_and_report_benchmark(
hr_at_10_min=hr_at_10_min,
hr_at_10_max=hr_at_10_max)
def _set_8_gpu_defaults(self):
FLAGS.num_gpus = 8
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
FLAGS.train_epochs = 14
FLAGS.batch_size = 99000
FLAGS.eval_batch_size = 160000
FLAGS.train_dataset_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
'training_cycle_*/*')
FLAGS.eval_dataset_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
'eval_data/*')
FLAGS.input_meta_data_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
'metadata')
FLAGS.data_dir = NCF_TF_REGRESSION_DATA_DIR_NAME
def benchmark_1_gpu_early_stop(self):
self._setup()
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_2_gpus_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
def benchmark_2_gpus_ctl_early_stop(self):
"""NCF with custom training loop. Works only in TF 2.0."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
#############################################
# Tests below with mlperf in the test name are of two types:
# 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
# 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyperparameters.
#
# The purpose of both is to get a number to compare to existing results. To do
# this, the number of epochs is held constant rather than racing to a given
# accuracy. The accuracy validation is done by the "early_stop" tests.
#############################################
def benchmark_1_gpu_mlperf_like(self):
"""1 GPU using keras fit/compile."""
self._setup()
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
"""1 GPU using compile/fit without dist_strat."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_no_dist_strat_run_eagerly_mlperf_like(self):
self._setup()
FLAGS.train_epochs = 7
FLAGS.distribution_strategy = 'off'
FLAGS.run_eagerly = True
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_mlperf_like(self):
"""1 GPU using compile/fit with XLA."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.enable_xla = True
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_fp16_mlperf_like(self):
"""1 GPU using CTL and FP16."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_fp16_mlperf_like(self):
"""1 GPU using FP16."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_fp16_graph_rewrite_mlperf_like(self):
"""1 GPU using CTL and FP16 graph rewrite."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_fp16_graph_rewrite_mlperf_like(self):
"""1 GPU using FP16 graph rewrite."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
"""1 GPU using CTL with eager and distribution strategy."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.run_eagerly = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL with XLA."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.enable_xla = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_fp16_mlperf_like(self):
"""1 GPU using with XLA and FP16."""
self._setup()
FLAGS.enable_xla = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_ctl_fp16_mlperf_like(self):
"""1 GPU using CTL with XLA and FP16."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.enable_xla = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_mlperf_like(self):
"""8 GPU using keras fit/compile."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.keras_use_ctl = True
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_fp16_mlperf_like(self):
"""8 GPU FP16."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_ctl_fp16_mlperf_like(self):
"""8 GPU FP16 using CTL."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.keras_use_ctl = True
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_ctl_fp16_graph_rewrite_mlperf_like(self):
"""8 GPU FP16 graph rewrite using CTL."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.keras_use_ctl = True
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
class NCFKerasBenchmarkReal(NCFKerasBenchmarkBase):
"""NCF Keras throughput benchmarks."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
root_data_dir = root_data_dir if root_data_dir else ''
default_flags = {}
default_flags['dataset'] = 'ml-20m'
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 14
default_flags['clean'] = True
default_flags['batch_size'] = 99000
default_flags['eval_batch_size'] = 160000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
default_flags['epsilon'] = 1.45439e-07
default_flags['layers'] = [256, 256, 128, 64]
default_flags['num_factors'] = 64
default_flags['hr_threshold'] = 0.635
default_flags['ml_perf'] = True
default_flags['use_synthetic_data'] = False
default_flags['train_dataset_path'] = os.path.join(
NCF_TF_REGRESSION_DATA_DIR_NAME, 'training_cycle_*/*')
default_flags['eval_dataset_path'] = os.path.join(
NCF_TF_REGRESSION_DATA_DIR_NAME, 'eval_data/*')
default_flags['input_meta_data_path'] = os.path.join(
NCF_TF_REGRESSION_DATA_DIR_NAME, 'metadata')
default_flags['data_dir'] = NCF_TF_REGRESSION_DATA_DIR_NAME
super(NCFKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=default_flags, **kwargs)
def benchmark_2x2_tpu(self):
"""2x2 TPU using CTL with distribution strategy."""
self._setup()
FLAGS.distribution_strategy = 'tpu'
FLAGS.keras_use_ctl = True
FLAGS.num_gpus = 0
FLAGS.train_epochs = 1
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_2x2_tpu_mlir(self):
"""2x2 TPU using CTL with distribution strategy using the MLIR bridge."""
self._setup()
FLAGS.distribution_strategy = 'tpu'
FLAGS.keras_use_ctl = True
FLAGS.num_gpus = 0
FLAGS.train_epochs = 1
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark()
class NCFKerasSynth(NCFKerasBenchmarkBase):
"""Benchmark NCF model using synthetic data."""
def __init__(self,
output_dir=None,
default_flags=None,
**kwargs):
default_flags = {}
default_flags['dataset'] = 'ml-20m'
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 8
default_flags['batch_size'] = 99000
default_flags['eval_batch_size'] = 160000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
default_flags['epsilon'] = 1.45439e-07
default_flags['layers'] = [256, 256, 128, 64]
default_flags['num_factors'] = 64
default_flags['hr_threshold'] = 0.635
default_flags['use_synthetic_data'] = True
super(NCFKerasSynth, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
**kwargs)
def benchmark_1_gpu(self):
self._setup()
self._run_and_report_benchmark()
def benchmark_2_gpus(self):
self._setup()
FLAGS.num_gpus = 2
self._run_and_report_benchmark()
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes benchmark testing for bert pretraining."""
# pylint: disable=line-too-long
from __future__ import print_function
import time
from typing import Optional
from absl import flags
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import owner_utils
from official.benchmark import perfzero_benchmark
from official.nlp.nhnet import trainer
from official.utils.flags import core as flags_core
MIN_LOSS = 0.40
MAX_LOSS = 0.55
NHNET_DATA = 'gs://tf-perfzero-data/nhnet/v1/processed/train.tfrecord*'
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_model.ckpt'
FLAGS = flags.FLAGS
class NHNetBenchmark(perfzero_benchmark.PerfZeroBenchmark):
"""Base benchmark class for NHNet."""
def __init__(self, output_dir=None, default_flags=None, tpu=None, **kwargs):
self.default_flags = default_flags or {}
flag_methods = trainer.define_flags()
super(NHNetBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods,
tpu=tpu,
**kwargs)
def _report_benchmark(self,
stats,
wall_time_sec,
max_value=None,
min_value=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds.
max_value: highest passing level.
min_value: lowest passing level.
"""
metrics = []
metrics.append({
'name': 'training_loss',
'value': stats['training_loss'],
'min_value': min_value,
'max_value': max_value
})
# These metrics are placeholders to avoid PerfZero failure.
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
metrics.append({
'name': 'startup_time',
'value': 9999.,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class NHNetAccuracyBenchmark(NHNetBenchmark):
"""Benchmark accuracy tests for NHNet."""
def __init__(self,
output_dir: Optional[str] = None,
tpu: Optional[str] = None,
**kwargs):
default_flags = dict(
mode='train',
train_file_pattern=NHNET_DATA,
train_batch_size=1024,
model_type='nhnet',
len_title=15,
len_passage=200,
num_encoder_layers=12,
num_decoder_layers=12,
num_nhnet_articles=5,
steps_per_loop=1000,
params_override='init_from_bert2bert=false')
super(NHNetAccuracyBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags, tpu=tpu, **kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, max_value=MAX_LOSS, min_value=MIN_LOSS):
"""Runs and reports the benchmark given the provided configuration."""
start_time_sec = time.time()
stats = trainer.run()
wall_time_sec = time.time() - start_time_sec
self._report_benchmark(
stats, wall_time_sec, max_value=max_value, min_value=min_value)
@owner_utils.Owner('tf-model-garden')
def benchmark_accuracy_4x4_tpu_f32_50k_steps(self):
"""Test bert pretraining with 4x4 TPU for 50k steps."""
# This is used for accuracy test.
self._setup()
FLAGS.train_steps = 50000
FLAGS.checkpoint_interval = FLAGS.train_steps
FLAGS.distribution_strategy = 'tpu'
FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_4x4_tpu_bf32_50k_steps')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-model-garden')
def benchmark_accuracy_4x4_tpu_f32_1k_steps(self):
"""Test bert pretraining with 4x4 TPU for 1k steps."""
self._setup()
FLAGS.train_steps = 1000
FLAGS.checkpoint_interval = FLAGS.train_steps
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_4x4_tpu_bf32_1k_steps')
self._run_and_report_benchmark()
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils to set Owner annotations on benchmarks.
@owner_utils.Owner('owner_team/user') can be set at the benchmark class level,
at the benchmark method level, or at both.
Runner frameworks can use owner_utils.GetOwner(benchmark_method) to get the
actual owner. Python inheritance for the owner attribute is respected (e.g. a
method-level owner takes precedence over a class-level one).
See owner_utils_test for associated tests and more examples.
The decorator can be applied both at the method level and at the class level.
Simple example:
===============
class MLBenchmark:
@Owner('example_id')
def benchmark_method_1_gpu(self):
return True
"""
def Owner(owner_name):
"""Sets the owner attribute on a decorated method or class."""
def _Wrapper(func_or_class):
"""Sets the benchmark owner attribute."""
func_or_class.__benchmark__owner__ = owner_name
return func_or_class
return _Wrapper
def GetOwner(benchmark_method_or_class):
"""Gets the inherited owner attribute for this benchmark.
Checks for existence of __benchmark__owner__. If it's not present, looks for
it in the parent class's attribute list.
Args:
benchmark_method_or_class: A benchmark method or class.
Returns:
The associated owner string if present, otherwise None.
"""
if hasattr(benchmark_method_or_class, '__benchmark__owner__'):
return benchmark_method_or_class.__benchmark__owner__
elif hasattr(benchmark_method_or_class, '__self__'):
if hasattr(benchmark_method_or_class.__self__, '__benchmark__owner__'):
return benchmark_method_or_class.__self__.__benchmark__owner__
return None
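# Example usage (a sketch, reusing the MLBenchmark example from the module
# docstring):
#   GetOwner(MLBenchmark().benchmark_method_1_gpu)  # -> 'example_id'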
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.benchmark.owner_utils."""
from absl.testing import absltest
from official.benchmark import owner_utils
@owner_utils.Owner('static_owner')
def static_function(foo=5):
return foo
def static_function_without_owner(foo=5):
return foo
class BenchmarkClassWithoutOwner:
def method_without_owner(self):
return 100
@owner_utils.Owner('method_owner')
def method_with_owner(self):
return 200
@owner_utils.Owner('class_owner')
class SomeBenchmarkClass:
def method_inherited_owner(self):
return 123
@owner_utils.Owner('method_owner')
def method_override_owner(self):
return 345
@owner_utils.Owner('new_class_owner')
class InheritedClass(SomeBenchmarkClass):
def method_inherited_owner(self):
return 456
@owner_utils.Owner('new_method_owner')
def method_override_owner(self):
return 567
class OwnerUtilsTest(absltest.TestCase):
"""Tests to assert for owner decorator functionality."""
def test_owner_tag_missing(self):
self.assertEqual(None, owner_utils.GetOwner(static_function_without_owner))
benchmark_class = BenchmarkClassWithoutOwner()
self.assertEqual(None,
owner_utils.GetOwner(benchmark_class.method_without_owner))
self.assertEqual(100, benchmark_class.method_without_owner())
self.assertEqual('method_owner',
owner_utils.GetOwner(benchmark_class.method_with_owner))
self.assertEqual(200, benchmark_class.method_with_owner())
def test_owner_attributes_static(self):
self.assertEqual('static_owner', owner_utils.GetOwner(static_function))
self.assertEqual(5, static_function(5))
def test_owner_attributes_per_class(self):
level1 = SomeBenchmarkClass()
self.assertEqual('class_owner',
owner_utils.GetOwner(level1.method_inherited_owner))
self.assertEqual(123, level1.method_inherited_owner())
self.assertEqual('method_owner',
owner_utils.GetOwner(level1.method_override_owner))
self.assertEqual(345, level1.method_override_owner())
def test_owner_attributes_inherited_class(self):
level2 = InheritedClass()
self.assertEqual('new_class_owner',
owner_utils.GetOwner(level2.method_inherited_owner))
self.assertEqual(456, level2.method_inherited_owner())
self.assertEqual('new_method_owner',
owner_utils.GetOwner(level2.method_override_owner))
self.assertEqual(567, level2.method_override_owner())
if __name__ == '__main__':
absltest.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for creating PerfZero benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
from absl import logging
from absl.testing import flagsaver
import tensorflow as tf
FLAGS = flags.FLAGS
class PerfZeroBenchmark(tf.test.Benchmark):
"""Common methods used in PerfZero Benchmarks.
Handles resetting flags between tests, loading default_flags, and overriding
defaults. PerfZero (OSS) runs each test in a separate process, which reduces
the need to reset flags between tests.
"""
local_flags = None
def __init__(self,
output_dir=None,
default_flags=None,
root_data_dir=None,
flag_methods=None,
tpu=None):
"""Initialize class.
Args:
output_dir: Base directory to store all output for the test.
default_flags: Set of flags to pass to model.
root_data_dir: Optional param used by child classes to look for the
dataset.
flag_methods: Set of flag methods to run during setup.
tpu: (optional) TPU name to use in a TPU benchmark.
"""
if os.getenv('BENCHMARK_OUTPUT_DIR'):
self.output_dir = os.getenv('BENCHMARK_OUTPUT_DIR')
elif output_dir:
self.output_dir = output_dir
else:
self.output_dir = '/tmp'
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or {}
if os.getenv('BENCHMARK_TPU'):
resolved_tpu = os.getenv('BENCHMARK_TPU')
elif tpu:
resolved_tpu = tpu
else:
resolved_tpu = None
if resolved_tpu:
# TPU models are expected to accept a --tpu=name flag. PerfZero creates
# the TPU at runtime and passes the TPU's name to this flag.
self.default_flags['tpu'] = resolved_tpu
logging.info('root_data_dir: %s', root_data_dir)
@property
def tpu(self):
return self.default_flags.get('tpu', None)
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def _setup(self):
"""Sets up and resets flags before each test."""
logging.set_verbosity(logging.INFO)
if PerfZeroBenchmark.local_flags is None:
for flag_method in self.flag_methods:
flag_method()
# Load flags to obtain the defaults that are then overridden. The list
# cannot be empty.
flags.FLAGS(['foo'])
# Overrides flag values with defaults for the class of tests.
for k, v in self.default_flags.items():
setattr(FLAGS, k, v)
saved_flag_values = flagsaver.save_flag_values()
PerfZeroBenchmark.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(PerfZeroBenchmark.local_flags)
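# Example subclass (a sketch; the flag method and benchmark body are
# illustrative):
#   class MyModelBenchmark(PerfZeroBenchmark):
#     def __init__(self, output_dir=None, **kwargs):
#       super(MyModelBenchmark, self).__init__(
#           output_dir=output_dir, flag_methods=[define_my_flags], **kwargs)
#
#     def benchmark_1_gpu(self):
#       self._setup()  # Resets flags, then applies default_flags.
#       # ... set FLAGS overrides, run the model, and call report_benchmark ...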
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes CTL benchmarks and accuracy tests."""
# pylint: disable=line-too-long,g-bad-import-order
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf
from official.benchmark import owner_utils
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import resnet_ctl_imagenet_main
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.benchmark import benchmark_wrappers
from official.utils.flags import core as flags_core
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77
FLAGS = flags.FLAGS
class CtlBenchmark(PerfZeroBenchmark):
"""Base benchmark class with methods to simplify testing."""
def __init__(self,
output_dir=None,
default_flags=None,
flag_methods=None,
**kwargs):
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or {}
super(CtlBenchmark, self).__init__(
output_dir=output_dir,
default_flags=self.default_flags,
flag_methods=self.flag_methods,
**kwargs)
def _report_benchmark(self,
stats,
wall_time_sec,
top_1_max=None,
top_1_min=None,
total_batch_size=None,
log_steps=None,
warmup=1,
start_time_sec=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds.
top_1_max: highest passing level for top_1 accuracy.
top_1_min: lowest passing level for top_1 accuracy.
total_batch_size: Global batch-size.
log_steps: How often the log was created for stats['step_timestamp_log'].
warmup: number of entries in stats['step_timestamp_log'] to ignore.
start_time_sec: the start time of the program in seconds since epoch.
"""
metrics = []
if 'eval_acc' in stats:
metrics.append({
'name': 'accuracy_top_1',
'value': stats['eval_acc'],
'min_value': top_1_min,
'max_value': top_1_max
})
metrics.append({'name': 'eval_loss', 'value': stats['eval_loss']})
metrics.append({
'name': 'top_1_train_accuracy',
'value': stats['train_acc']
})
metrics.append({'name': 'train_loss', 'value': stats['train_loss']})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup + 1):
# The first entry in the time_log is the start of step 0; the remaining
# entries mark the end of each recorded step.
time_log = stats['step_timestamp_log']
steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
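# Illustrative arithmetic: with a global batch size of 1024 and 100 steps
# elapsed over 20 seconds, exp_per_second = 1024 * (100 / 20) = 5120.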
if 'avg_exp_per_second' in stats:
metrics.append({
'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']
})
if start_time_sec and 'step_timestamp_log' in stats:
time_log = stats['step_timestamp_log']
# time_log[0] is recorded at the beginning of the first step.
startup_time = time_log[0].timestamp - start_time_sec
metrics.append({'name': 'startup_time', 'value': startup_time})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class Resnet50CtlAccuracy(CtlBenchmark):
"""Benchmark accuracy tests for ResNet50 in CTL."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
flag_methods = [common.define_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50CtlAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Test Keras model with eager, 8 GPUs with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 256 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
def benchmark_8_gpu_amp(self):
"""Test Keras model with 8 GPUs and mixed precision via graph rewrite."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 256 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp')
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_ctl_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50CtlAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY,
total_batch_size=FLAGS.batch_size,
log_steps=100,
start_time_sec=start_time_sec)
class Resnet50CtlBenchmarkBase(CtlBenchmark):
"""Resnet50 benchmarks."""
def __init__(self, output_dir=None, default_flags=None, **kwargs):
flag_methods = [common.define_keras_flags]
super(Resnet50CtlBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
**kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_ctl_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
# Warmup is the number of logged step-time entries excluded from the
# performance report. By default the first FLAGS.log_steps interval is excluded.
super(Resnet50CtlBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
warmup=1,
start_time_sec=start_time_sec)
def benchmark_1_gpu_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test Keras model with 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
"""Test Keras model with 1 GPU with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.batch_size = 256
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
def benchmark_1_gpu_amp(self):
"""Test Keras model with 1 GPU with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
FLAGS.batch_size = 256
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_amp(self):
"""Test Keras model with XLA and 1 GPU with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
FLAGS.batch_size = 256
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_1_gpu_eager(self):
"""Test Keras model with 1 GPU in pure eager mode."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_eager')
FLAGS.batch_size = 120
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.single_l2_loss_op = True
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_eager(self):
"""Test Keras model with 1 GPU with fp16 and pure eager mode."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_eager')
FLAGS.batch_size = 240
FLAGS.dtype = 'fp16'
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.single_l2_loss_op = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Test Keras model with 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Test Keras model with 8 GPUs with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
def benchmark_8_gpu_eager(self):
"""Test Keras model with 8 GPUs, eager, fp32."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_8_gpu_eager_fp16(self):
"""Test Keras model with 8 GPUs, eager, fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager_fp16')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_8_gpu_amp(self):
"""Test Keras model with 8 GPUs with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_amp(self):
"""Test Keras model with XLA and 8 GPUs with automatic mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_amp')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def _set_df_common(self):
FLAGS.steps_per_loop = 500
FLAGS.train_epochs = 2
FLAGS.train_steps = None
FLAGS.skip_eval = True
FLAGS.enable_eager = True
FLAGS.enable_tensorboard = False
FLAGS.distribution_strategy = 'tpu'
FLAGS.report_accuracy_metrics = False
FLAGS.log_steps = 50
FLAGS.single_l2_loss_op = True
FLAGS.use_tf_function = True
FLAGS.enable_checkpoint_and_export = False
FLAGS.data_dir = 'gs://mlcompass-data/imagenet/imagenet-2012-tfrecord'
def benchmark_2x2_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 1024
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_2x2_tpu_bf16_mlir(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 1024
FLAGS.dtype = 'bf16'
tf.config.experimental.enable_mlir_bridge()
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16_mlir')
self._run_and_report_benchmark()
def benchmark_4x4_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 4096
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_4x4_tpu_bf16_mlir(self):
"""Run resnet model on 4x4 with the MLIR Bridge enabled."""
self._setup()
self._set_df_common()
FLAGS.batch_size = 4096
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16_mlir')
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark()
def benchmark_8x16_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.dtype = 'bf16'
self._run_and_report_benchmark()
def fill_report_object(self, stats):
super(Resnet50CtlBenchmarkBase, self).fill_report_object(
stats, total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
class Resnet50CtlBenchmarkSynth(Resnet50CtlBenchmarkBase):
"""Resnet50 synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['use_synthetic_data'] = True
def_flags['train_steps'] = 110
def_flags['steps_per_loop'] = 10
def_flags['log_steps'] = 10
super(Resnet50CtlBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags, **kwargs)
class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
"""Resnet50 real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 110
def_flags['steps_per_loop'] = 10
def_flags['log_steps'] = 10
super(Resnet50CtlBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags, **kwargs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes RetinaNet benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=g-bad-import-order
import json
import time
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import perfzero_benchmark
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.vision.detection import main as detection
from official.vision.detection.configs import base_config
FLAGS = flags.FLAGS
# pylint: disable=line-too-long
COCO_TRAIN_DATA = 'gs://tf-perfzero-data/coco/train*'
COCO_EVAL_DATA = 'gs://tf-perfzero-data/coco/val*'
COCO_EVAL_JSON = 'gs://tf-perfzero-data/coco/instances_val2017.json'
RESNET_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07'
# pylint: enable=line-too-long
class BenchmarkBase(perfzero_benchmark.PerfZeroBenchmark):
"""Base class to hold methods common to test classes."""
def __init__(self, **kwargs):
super(BenchmarkBase, self).__init__(**kwargs)
self.timer_callback = None
def _report_benchmark(self, stats, start_time_sec, wall_time_sec, min_ap,
max_ap, warmup):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from Detection models with known entries.
start_time_sec: the start of the benchmark execution in seconds
wall_time_sec: the duration of the benchmark execution in seconds
min_ap: Minimum detection AP constraint to verify correctness of the
model.
max_ap: Maximum detection AP accuracy constraint to verify correctness of
the model.
warmup: Number of time log entries to ignore when computing examples/sec.
"""
metrics = [{
'name': 'total_loss',
'value': stats['total_loss'],
}]
if self.timer_callback:
metrics.append({
'name': 'exp_per_second',
'value': self.timer_callback.get_examples_per_sec(warmup)
})
metrics.append({
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(start_time_sec)
})
else:
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
if 'eval_metrics' in stats:
metrics.append({
'name': 'AP',
'value': stats['AP'],
'min_value': min_ap,
'max_value': max_ap,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=stats['total_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class DetectionBenchmarkBase(BenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, **kwargs):
self.train_data_path = COCO_TRAIN_DATA
self.eval_data_path = COCO_EVAL_DATA
self.eval_json_path = COCO_EVAL_JSON
self.resnet_checkpoint_path = RESNET_CHECKPOINT_PATH
super(DetectionBenchmarkBase, self).__init__(**kwargs)
def _run_detection_main(self):
"""Starts detection job."""
if self.timer_callback:
FLAGS.log_steps = 0 # prevent detection.run from adding the same callback
return detection.run(callbacks=[self.timer_callback])
else:
return detection.run()
class DetectionAccuracy(DetectionBenchmarkBase):
"""Accuracy test for RetinaNet model.
Tests RetinaNet detection task model accuracy. The naming
convention of the test cases below follows the
`benchmark_(number of gpus)_gpu_(dataset type)` format,
e.g. `benchmark_8_gpu_coco`.
"""
def __init__(self, model, **kwargs):
self.model = model
super(DetectionAccuracy, self).__init__(**kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
params,
min_ap=0.325,
max_ap=0.35,
do_eval=True,
warmup=1):
"""Starts Detection accuracy benchmark test."""
FLAGS.params_override = json.dumps(params)
# Need timer callback to measure performance
self.timer_callback = keras_utils.TimeHistory(
batch_size=params['train']['batch_size'],
log_steps=FLAGS.log_steps,
)
start_time_sec = time.time()
FLAGS.mode = 'train'
summary, _ = self._run_detection_main()
wall_time_sec = time.time() - start_time_sec
if do_eval:
FLAGS.mode = 'eval'
eval_metrics = self._run_detection_main()
summary.update(eval_metrics)
summary['total_steps'] = params['train']['total_steps']
self._report_benchmark(summary, start_time_sec, wall_time_sec, min_ap,
max_ap, warmup)
def _setup(self):
super(DetectionAccuracy, self)._setup()
FLAGS.model = self.model
def _params(self):
return {
'architecture': {
'use_bfloat16': True,
},
'train': {
'batch_size': 64,
'iterations_per_loop': 100,
'total_steps': 22500,
'train_file_pattern': self.train_data_path,
'checkpoint': {
'path': self.resnet_checkpoint_path,
'prefix': 'resnet50/'
},
# Speed up ResNet training when loading from the checkpoint.
'frozen_variable_prefix': base_config.RESNET_FROZEN_VAR_PREFIX,
},
'eval': {
'batch_size': 8,
'eval_samples': 5000,
'val_json_file': self.eval_json_path,
'eval_file_pattern': self.eval_data_path,
},
}
@flagsaver.flagsaver
def benchmark_8_gpu_coco(self):
"""Run RetinaNet model accuracy test with 8 GPUs."""
self._setup()
params = self._params()
FLAGS.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_coco')
FLAGS.strategy_type = 'mirrored'
self._run_and_report_benchmark(params)
class DetectionBenchmarkReal(DetectionAccuracy):
"""Short benchmark performance tests for a detection model.
Tests detection performance in different accelerator configurations.
The naming convention of the test cases below follows the
`benchmark_(number of gpus)_gpu` format.
"""
def _setup(self):
super(DetectionBenchmarkReal, self)._setup()
# Use negative value to avoid saving checkpoints.
FLAGS.save_checkpoint_freq = -1
@flagsaver.flagsaver
def benchmark_8_gpu_coco(self):
"""Run detection model accuracy test with 8 GPUs."""
self._setup()
params = self._params()
params['architecture']['use_bfloat16'] = False
params['train']['total_steps'] = 1875 # One epoch.
# The iterations_per_loop must be one, otherwise the examples-per-second
# number would be wrong. The callback can currently only be invoked per batch
# when each loop runs exactly one batch, i.e. the host loop executes one step
# at a time. Performance in this configuration may be lower than with
# iterations_per_loop > 1.
# Related bug: b/135933080
params['train']['iterations_per_loop'] = 1
params['eval']['eval_samples'] = 8
FLAGS.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('real_benchmark_8_gpu_coco')
FLAGS.strategy_type = 'mirrored'
self._run_and_report_benchmark(params)
@flagsaver.flagsaver
def benchmark_1_gpu_coco(self):
"""Run detection model accuracy test with 1 GPU."""
self._setup()
params = self._params()
params['architecture']['use_bfloat16'] = False
params['train']['batch_size'] = 8
params['train']['total_steps'] = 200
params['train']['iterations_per_loop'] = 1
params['eval']['eval_samples'] = 8
FLAGS.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('real_benchmark_1_gpu_coco')
FLAGS.strategy_type = 'one_device'
self._run_and_report_benchmark(params)
@flagsaver.flagsaver
def benchmark_xla_1_gpu_coco(self):
"""Run detection model accuracy test with 1 GPU and XLA enabled."""
self._setup()
params = self._params()
params['architecture']['use_bfloat16'] = False
params['train']['batch_size'] = 8
params['train']['total_steps'] = 200
params['train']['iterations_per_loop'] = 1
params['eval']['eval_samples'] = 8
FLAGS.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('real_benchmark_xla_1_gpu_coco')
FLAGS.strategy_type = 'one_device'
FLAGS.enable_xla = True
self._run_and_report_benchmark(params)
@flagsaver.flagsaver
def benchmark_2x2_tpu_coco(self):
"""Run detection model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 64
params['train']['total_steps'] = 1875 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_2x2_tpu_coco')
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_4x4_tpu_coco(self):
"""Run detection model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 256
params['train']['total_steps'] = 469 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_4x4_tpu_coco')
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_2x2_tpu_coco_mlir(self):
"""Run detection model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 64
params['train']['total_steps'] = 1875 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_2x2_tpu_coco_mlir')
FLAGS.strategy_type = 'tpu'
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_4x4_tpu_coco_mlir(self):
"""Run RetinaNet model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 256
params['train']['total_steps'] = 469 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_4x4_tpu_coco_mlir')
FLAGS.strategy_type = 'tpu'
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_2x2_tpu_spinenet_coco(self):
"""Run detection model with SpineNet backbone accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['architecture']['backbone'] = 'spinenet'
params['architecture']['multilevel_features'] = 'identity'
params['architecture']['use_bfloat16'] = False
params['train']['batch_size'] = 64
params['train']['total_steps'] = 1875 # One epoch.
params['train']['iterations_per_loop'] = 500
params['train']['checkpoint']['path'] = ''
FLAGS.model_dir = self._get_model_dir(
'real_benchmark_2x2_tpu_spinenet_coco')
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
class RetinanetBenchmarkReal(DetectionBenchmarkReal):
"""Short benchmark performance tests for Retinanet model."""
def __init__(self, **kwargs):
super(RetinanetBenchmarkReal, self).__init__(
model='retinanet',
**kwargs)
class MaskRCNNBenchmarkReal(DetectionBenchmarkReal):
"""Short benchmark performance tests for Mask RCNN model."""
def __init__(self, **kwargs):
super(MaskRCNNBenchmarkReal, self).__init__(
model='mask_rcnn',
**kwargs)
class ShapeMaskBenchmarkReal(DetectionBenchmarkReal):
"""Short benchmark performance tests for ShapeMask model."""
def __init__(self, **kwargs):
super(ShapeMaskBenchmarkReal, self).__init__(
model='shapemask',
**kwargs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Shakespeare (LSTM) benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark.models.shakespeare import shakespeare_main
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.benchmark import benchmark_wrappers
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
SHAKESPEARE_TRAIN_DATA = 'shakespeare/shakespeare.txt'
TMP_DIR = os.getenv('TMPDIR')
FLAGS = flags.FLAGS
class ShakespeareBenchmarkBase(PerfZeroBenchmark):
"""Base class for Shakespeare (LSTM) benchmark and accuracy tests."""
def __init__(self, output_dir=None, default_flags=None, root_data_dir=None):
super(ShakespeareBenchmarkBase, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=[shakespeare_main.define_flags])
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_train_min=0.91,
top_1_train_max=0.94,
warmup=1,
log_steps=100):
"""Report benchmark results by writing to local protobuf file.
Average epoch time is calculated by skipping the first epoch. This average
ignores time spent between epochs; each epoch's time is measured from its
begin and end callbacks. To skip the accuracy check, set
`top_1_train_min=None`.
Args:
top_1_train_min: lowest passing value.
top_1_train_max: highest passing value.
warmup: number of entries in `timestamp_log` to ignore.
log_steps: How often the log was created for `timestamp_log`.
"""
total_batch_size = FLAGS.batch_size
metrics = []
start_time_sec = time.time()
stats = shakespeare_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
if top_1_train_min:
metrics.append({'name': 'accuracy_top_1_train',
'value': stats['history']['RecallAt1'][-1],
'min_value': top_1_train_min,
'max_value': top_1_train_max})
# Look for the time history callback which was used during keras.fit
for callback in stats['callbacks']:
if isinstance(callback, keras_utils.TimeHistory):
epoch_timings = callback.epoch_runtime_log
if len(epoch_timings) > 1:
average_time = sum(epoch_timings[1:]) / len(epoch_timings[1:])
metrics.append({'name': 'avg_epoch_time',
'value': average_time})
# First entry in timestamp_log is the start of step 1. The rest of the
# entries are the end of each step recorded.
time_log = callback.timestamp_log
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
if elapsed > 0:
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second',
'value': examples_per_sec})
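# Worked example with purely hypothetical numbers (an illustration, not
# measured data): with total_batch_size=64, log_steps=100, warmup=1 and a
# timestamp_log of 5 entries whose last timestamp is 40 seconds after the
# warmup entry, num_examples = 64 * 100 * (5 - 1 - 1) = 19200 and
# exp_per_second = 19200 / 40 = 480.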
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(iters=-1, wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class ShakespeareAccuracy(ShakespeareBenchmarkBase):
"""Shakespeare accuracy tests.
This is not an ideal test. The best available accuracy check is to validate
top_1 on the training set. At batch size 64 the training top_1 stabilizes
to ~0.92 around 40-45 epochs.
"""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Shakespeare accuracy tests.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
super(ShakespeareAccuracy, self).__init__(
output_dir=output_dir, root_data_dir=root_data_dir)
def benchmark_cpu(self):
"""Benchmark cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
self._run_and_report_benchmark()
def benchmark_cpu_no_ds_run_eagerly(self):
"""Benchmark cpu without distribution strategies and run eagerly."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds(self):
"""Benchmark 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds_run_eagerly(self):
"""Benchmark 1 gpu without distribution strategies and run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Benchmark 1 gpu w/xla."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Benchmark 8 gpu.
This test is for accuracy, not scaling. The batch size is not scaled to
the number of gpus.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
self._run_and_report_benchmark()
class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
"""Benchmark accuracy tests."""
def __init__(self, output_dir=None, root_data_dir=TMP_DIR, **kwargs):
"""Benchmark tests w/Keras.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
def_flags = {}
def_flags['training_data'] = self.train_data
def_flags['model_dir'] = ''
def_flags['train_epochs'] = 4
def_flags['log_steps'] = 50
super(ShakespeareKerasBenchmarkReal, self).__init__(
output_dir=output_dir,
root_data_dir=root_data_dir,
default_flags=def_flags)
def benchmark_cpu(self):
"""Benchmark cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_cpu_no_ds_run_eagerly(self):
"""Benchmark cpu without distribution strategy and run eagerly."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_cpu_no_ds(self):
"""Benchmark cpu without distribution strategy."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_cpu_no_ds_force_v2(self):
"""Benchmark cpu no ds, and force v2."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_cudnn(self):
"""Benchmark 1 gpu with CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.cudnn = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds(self):
"""Benchmark 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds_run_eagerly(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_no_cudnn(self):
"""Benchmark 1 gpu w/xla and CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.cudnn = False
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Benchmark 8 gpu."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
self._run_and_report_benchmark()
def benchmark_8_gpu_no_cudnn(self):
"""Benchmark 8 gpu with CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
FLAGS.cudnn = False
self._run_and_report_benchmark()
def benchmark_xla_8_gpu(self):
"""Benchmark 8 gpu w/xla."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_no_cudnn(self):
"""Benchmark 8 gpu w/xla and CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
FLAGS.cudnn = False
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def _run_and_report_benchmark(self):
"""Run and report benchmark."""
super(ShakespeareKerasBenchmarkReal, self)._run_and_report_benchmark(
top_1_train_min=None, log_steps=FLAGS.log_steps)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a memory usage benchmark for a Tensorflow Hub model.
Loads a SavedModel and records memory usage.
"""
import functools
import time
from absl import flags
import tensorflow as tf
import tensorflow_hub as hub
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
FLAGS = flags.FLAGS
class TfHubMemoryUsageBenchmark(PerfZeroBenchmark):
"""A benchmark measuring memory usage for a given TF Hub SavedModel."""
def __init__(self,
hub_model_handle_list=None,
output_dir=None,
default_flags=None,
root_data_dir=None,
**kwargs):
super(TfHubMemoryUsageBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags, **kwargs)
if hub_model_handle_list:
for hub_model_handle in hub_model_handle_list.split(';'):
# Converts a model handle of the form
# https://tfhub.dev/google/nnlm-en-dim128/1 to a valid Python method name
# like google_nnlm_en_dim128_1.
hub_model_method_name = hub_model_handle.replace(
'https://tfhub.dev',
'').replace('/', '_').replace('-', '_').strip('_')
setattr(
self, 'benchmark_' + hub_model_method_name,
functools.partial(self.benchmark_memory_usage, hub_model_handle))
def benchmark_memory_usage(
self, hub_model_handle='https://tfhub.dev/google/nnlm-en-dim128/1'):
start_time_sec = time.time()
self.load_model(hub_model_handle)
wall_time_sec = time.time() - start_time_sec
metrics = []
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
def load_model(self, hub_model_handle):
"""Loads a TF Hub module."""
hub.load(hub_model_handle)
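# Illustrative usage sketch (assumed invocation, not part of the original
# file): constructing the benchmark with a semicolon-separated handle list
# creates one benchmark method per model, e.g.
#   bench = TfHubMemoryUsageBenchmark(
#       hub_model_handle_list='https://tfhub.dev/google/nnlm-en-dim128/1')
#   bench.benchmark_google_nnlm_en_dim128_1()  # loads the model, reports wall time
# Memory usage itself is assumed to be captured by the surrounding PerfZero
# harness rather than by this class.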
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Transformer w/Keras benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import owner_utils
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.nlp.transformer import misc
from official.nlp.transformer import transformer_main as transformer_main
from official.utils.flags import core as flags_core
TPU_DATA_DIR = 'gs://mlcompass-data/transformer'
GPU_DATA_DIR = os.getenv('TMPDIR')
TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
FLAGS = flags.FLAGS
TMP_DIR = os.getenv('TMPDIR')
class TransformerBenchmark(PerfZeroBenchmark):
"""Methods common to executing transformer w/keras tests.
The code under test for the Transformer Keras models reports the same data
and requires the same FLAG setup.
"""
def __init__(self, output_dir=None, default_flags=None, root_data_dir=None,
flag_methods=None, tpu=None):
self._set_data_files(root_data_dir=root_data_dir)
if default_flags is None:
default_flags = {}
default_flags['data_dir'] = self.train_data_dir
default_flags['vocab_file'] = self.vocab_file
super(TransformerBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods,
tpu=tpu)
def _set_data_files(self, root_data_dir=None, tpu_run=False):
"""Sets train_data_dir, vocab_file, bleu_source and bleu_ref."""
# Use remote storage for TPU runs; use GPU_DATA_DIR for GPU runs if it is
# defined; otherwise fall back to the caller-provided root_data_dir.
if tpu_run:
root_data_dir = TPU_DATA_DIR
elif GPU_DATA_DIR is not None:
root_data_dir = GPU_DATA_DIR
root_data_dir = root_data_dir if root_data_dir else ''
self.train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
self.vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
self.bleu_source = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.en')
self.bleu_ref = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.de')
def _set_data_file_flags(self):
"""Sets the FLAGS for the data files."""
FLAGS.data_dir = self.train_data_dir
FLAGS.vocab_file = self.vocab_file
# Sets values directly to avoid validation check.
FLAGS['bleu_source'].value = self.bleu_source
FLAGS['bleu_ref'].value = self.bleu_ref
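# Assumed absl.flags behavior: assigning via FLAGS.<name> re-runs any
# registered validators for that flag, while writing FLAGS['<name>'].value
# (as above) sets the value directly and bypasses them, e.g.
#   FLAGS['bleu_source'].value = self.bleu_source  # no validator triggered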
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
bleu_max=None,
bleu_min=None,
log_steps=None,
total_batch_size=None,
warmup=1):
"""Report benchmark results by writing to local protobuf file.
Args:
bleu_max: highest passing level for bleu score.
bleu_min: lowest passing level for bleu score.
log_steps: How often the log was created for stats['step_timestamp_log'].
total_batch_size: Global batch-size.
warmup: number of entries in stats['step_timestamp_log'] to ignore.
"""
start_time_sec = time.time()
task = transformer_main.TransformerTask(FLAGS)
stats = task.train()
wall_time_sec = time.time() - start_time_sec
metrics = []
if 'bleu_uncased' in stats:
if 'bleu_uncased_history' in stats:
bleu_uncased_best = max(stats['bleu_uncased_history'],
key=lambda x: x[1])
metrics.append({'name': 'bleu_uncased',
'value': bleu_uncased_best[1],
'min_value': bleu_min,
'max_value': bleu_max})
metrics.append({'name': 'bleu_best_score_iteration',
'value': bleu_uncased_best[0]})
metrics.append({'name': 'bleu_uncased_last',
'value': stats['bleu_uncased']})
else:
metrics.append({'name': 'bleu_uncased',
'value': stats['bleu_uncased'],
'min_value': bleu_min,
'max_value': bleu_max})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup + 1):
# first entry in the time_log is start of step 1. The rest of the
# entries are the end of each step recorded
time_log = stats['step_timestamp_log']
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second',
'value': examples_per_sec})
if 'avg_exp_per_second' in stats:
metrics.append({'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']})
if 'step_timestamp_log' in stats:
time_log = stats['step_timestamp_log']
metrics.append({'name': 'startup_time',
'value': time_log[0].timestamp - start_time_sec})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics,
extras={'flags': flags_str})
class TransformerBaseKerasAccuracy(TransformerBenchmark):
"""Benchmark accuracy tests for Transformer Base model w/ Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for Transformer Base model w/ Keras.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [misc.define_transformer_flags]
super(TransformerBaseKerasAccuracy, self).__init__(
output_dir=output_dir, root_data_dir=root_data_dir,
flag_methods=flag_methods)
def benchmark_1_gpu(self):
"""Benchmark 1 gpu.
The paper uses 8 GPUs and a much larger effective batch size, so this will
not converge to the 27.3 BLEU (uncased) SOTA.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 1
FLAGS.param_set = 'base'
FLAGS.batch_size = 2048
FLAGS.train_steps = 1000
FLAGS.steps_between_evals = 500
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
# These BLEU scores are based on test runs at this limited number of steps
# and batch size, after verifying SOTA at 8xV100s.
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=25.3,
bleu_max=26)
def benchmark_1_gpu_static_batch(self):
"""Benchmark 1 gpu with static_batch.
The paper uses 8 GPUs and a much larger effective batch size, so this will
not converge to the 27.3 BLEU (uncased) SOTA.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 1
FLAGS.param_set = 'base'
FLAGS.batch_size = 4096
FLAGS.train_steps = 100000
FLAGS.steps_between_evals = 5000
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
# These BLEU scores are based on test runs at this limited number of steps
# and batch size, after verifying SOTA at 8xV100s.
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=25.3,
bleu_max=26)
def benchmark_8_gpu(self):
"""Benchmark 8 gpu.
Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.param_set = 'base'
FLAGS.batch_size = 4096*8
FLAGS.train_steps = 100000
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=27,
bleu_max=28)
def benchmark_8_gpu_static_batch(self):
"""Benchmark 8 gpu.
Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.param_set = 'base'
FLAGS.batch_size = 4096*8
FLAGS.train_steps = 100000
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=27,
bleu_max=28)
class TransformerBigKerasAccuracy(TransformerBenchmark):
"""Benchmark accuracy tests for Transformer Big model w/ Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for Transformer Big model w/ Keras.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [misc.define_transformer_flags]
super(TransformerBigKerasAccuracy, self).__init__(
output_dir=output_dir, root_data_dir=root_data_dir,
flag_methods=flag_methods)
def benchmark_8_gpu(self):
"""Benchmark 8 gpu.
Over 6 runs with eval every 20K steps, the average highest value was 28.195
(BLEU uncased). 28.424 was the highest and 27.96 the lowest. Each value is
the highest seen during a run and occurred at a median of iteration 9.
Iterations are not epochs; an iteration is the number of steps between evals.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072*8
FLAGS.train_steps = 20000 * 12
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=27.9,
bleu_max=29.2)
def benchmark_8_gpu_static_batch(self):
"""Benchmark 8 gpu.
Should converge to 28.4 BLEU (uncased). This has not been verified yet.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072*8
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.train_steps = 20000 * 12
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=28,
bleu_max=29.2)
def benchmark_8_gpu_fp16(self):
"""Benchmark 8 gpu with dynamic batch and fp16.
Over 6 runs with eval every 20K steps, the average highest value was 28.247
(BLEU uncased). 28.424 was the highest and 28.09 the lowest. Each value is
the highest seen during a run and occurred at a median of iteration 11.
While this could be interpreted as worse than FP32, looking at the first
iteration at which 28 is passed, FP16 performs equally well and possibly
better. Although not part of the initial test runs, the highest value
recorded with the arguments below was 28.9 at iteration 12. Iterations are
not epochs; an iteration is the number of steps between evals.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072*8
FLAGS.train_steps = 20000 * 12
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=28,
bleu_max=29.2)
def benchmark_8_gpu_fp16_amp(self):
"""Benchmark 8 gpu with dynamic batch and fp16 with automatic mixed precision.
Should converge to 28.4 BLEU (uncased). This has not been verified yet.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072*8
FLAGS.train_steps = 20000 * 12
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_amp')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=28,
bleu_max=29)
def benchmark_8_gpu_static_batch_fp16(self):
"""Benchmark 8 gpu with static batch and fp16.
Should converge to 28.4 BLEU (uncased). This has not been verified yet.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072*8
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.train_steps = 400000
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch_fp16')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=28,
bleu_max=29.2)
def benchmark_xla_8_gpu_static_batch_fp16(self):
"""Benchmark 8 gpu with static batch, XLA, and FP16.
Should converge to 28.4 BLEU (uncased). This has not been verified yet.
"""
self._setup()
self._set_data_file_flags()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_xla = True
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072*8
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.train_steps = 400000
FLAGS.steps_between_evals = 20000
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_static_batch_fp16')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=28,
bleu_max=29.2)
class TransformerKerasBenchmark(TransformerBenchmark):
"""Benchmarks for Transformer (Base and Big) using Keras."""
def __init__(self, output_dir=None, default_flags=None,
root_data_dir=None, batch_per_gpu=4096, tpu=None):
"""Initialize.
Args:
output_dir: Base directory for saving artifacts, e.g. checkpoints.
default_flags: default flags to use for all tests.
root_data_dir: root directory for data, e.g. training.
batch_per_gpu: batch size to use per gpu.
tpu: Target TPU to use.
"""
flag_methods = [misc.define_transformer_flags]
self.batch_per_gpu = batch_per_gpu
super(TransformerKerasBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
root_data_dir=root_data_dir,
flag_methods=flag_methods,
tpu=tpu)
def benchmark_1_gpu_no_dist_strat(self):
"""Benchmark 1 gpu without distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'off'
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu_no_dist_strat_static_batch(self):
"""Benchmark 1 gpu without distribution strategy with static batch."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'off'
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_ds_sb')
FLAGS.static_batch = True
FLAGS.max_length = 64
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu_fp16(self):
"""Benchmark 1 gpu FP16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_1_gpu(self):
"""Benchmark 1 gpu w/xla."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
FLAGS.enable_xla = True
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_1_gpu_fp16(self):
"""Benchmark 1 gpu w/xla and FP16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu_static_batch(self):
"""Benchmark 1 gpu with static batch."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
FLAGS.static_batch = True
FLAGS.max_length = 64
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_1_gpu_static_batch(self):
"""Benchmark 1 gpu with static batch w/xla."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_static_batch')
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.enable_xla = True
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu_static_batch_fp16(self):
"""Benchmark 1 gpu with static batch FP16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_static_batch_fp16')
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_1_gpu_static_batch_fp16(self):
"""Benchmark 1 gpu with static batch w/xla and FP16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_1_gpu_static_batch_fp16')
FLAGS.static_batch = True
FLAGS.max_length = 64
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_8_gpu(self):
"""Benchmark 8 gpu."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_8_gpu_fp16(self):
"""Benchmark 8 gpu FP16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_8_gpu(self):
"""Benchmark 8 gpu w/xla."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_xla = True
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_8_gpu_fp16(self):
"""Benchmark 8 gpu w/xla and FP16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_8_gpu_static_batch(self):
"""Benchmark 8 gpu with static batch."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
FLAGS.static_batch = True
FLAGS.max_length = 64
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_8_gpu_static_batch_fp16(self):
"""Benchmark 8 gpu with static batch FP16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir(
'benchmark_8_gpu_static_batch_fp16')
FLAGS.static_batch = True
FLAGS.max_length = 64
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_8_gpu_static_batch(self):
"""Benchmark 8 gpu with static batch w/xla."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_xla = True
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_static_batch')
FLAGS.static_batch = True
FLAGS.max_length = 64
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_xla_8_gpu_static_batch_fp16(self):
"""Benchmark 8 gpu with static batch w/xla and FP16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_static_batch_fp16')
FLAGS.static_batch = True
FLAGS.max_length = 64
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR, **kwargs):
def_flags = {}
def_flags['param_set'] = 'base'
def_flags['train_steps'] = 50
def_flags['log_steps'] = 10
super(TransformerBaseKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags,
root_data_dir=root_data_dir, batch_per_gpu=4096)
class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR,
tpu=None, **kwargs):
def_flags = {}
def_flags['param_set'] = 'big'
def_flags['train_steps'] = 50
def_flags['log_steps'] = 10
super(TransformerBigKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags,
root_data_dir=root_data_dir, batch_per_gpu=3072,
tpu=tpu)
def _set_df_common(self):
self._set_data_files(tpu_run=True)
FLAGS.data_dir = self.train_data_dir
FLAGS.vocab_file = self.vocab_file
FLAGS.distribution_strategy = 'tpu'
FLAGS.padded_decode = True
FLAGS.train_steps = 300
FLAGS.log_steps = 150
FLAGS.steps_between_evals = 150
FLAGS.static_batch = True
FLAGS.use_ctl = True
FLAGS.enable_checkpointing = False
FLAGS.max_length = 64
FLAGS.decode_batch_size = 32
FLAGS.decode_max_length = 97
def benchmark_2x2_tpu(self):
"""Port of former snaggletooth transformer_big model on 2x2."""
self._setup()
self._set_df_common()
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
FLAGS.batch_size = 6144
self._run_and_report_benchmark(
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
@owner_utils.Owner('tf-graph-compiler')
def benchmark_2x2_tpu_mlir(self):
"""Run transformer_big model on 2x2 with the MLIR Bridge enabled."""
self._setup()
self._set_df_common()
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mlir')
FLAGS.batch_size = 6144
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_4x4_tpu(self):
"""Port of former GCP transformer_big model on 4x4."""
self._setup()
self._set_df_common()
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu')
FLAGS.batch_size = 24576
self._run_and_report_benchmark(
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
@owner_utils.Owner('tf-graph-compiler')
def benchmark_4x4_tpu_mlir(self):
"""Run transformer_big model on 4x4 with the MLIR Bridge enabled."""
self._setup()
self._set_df_common()
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_mlir')
FLAGS.batch_size = 24576
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes benchmark testing for 3D Unet model."""
# pylint: disable=line-too-long
from __future__ import print_function
import functools
import os
import time
from typing import Optional
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import keras_benchmark
from official.benchmark import owner_utils
from official.vision.segmentation import unet_main as unet_training_lib
from official.vision.segmentation import unet_model as unet_model_lib
UNET3D_MIN_ACCURACY = 0.90
UNET3D_MAX_ACCURACY = 0.98
UNET_TRAINING_FILES = 'gs://mlcompass-data/unet3d/train_data/*'
UNET_EVAL_FILES = 'gs://mlcompass-data/unet3d/eval_data/*'
UNET_MODEL_CONFIG_FILE = 'gs://mlcompass-data/unet3d/config/unet_config.yaml'
FLAGS = flags.FLAGS
class Unet3DAccuracyBenchmark(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for UNet3D model in Keras."""
def __init__(self,
output_dir: Optional[str] = None,
root_data_dir: Optional[str] = None,
**kwargs):
"""A benchmark class.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
flag_methods = [unet_training_lib.define_unet3d_flags]
self.training_file_pattern = UNET_TRAINING_FILES
self.eval_file_pattern = UNET_EVAL_FILES
# TODO(hongjunchoi): Create and use shared config file instead.
self.config_file = UNET_MODEL_CONFIG_FILE
super(Unet3DAccuracyBenchmark, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def _set_benchmark_parameters(self, experiment_name):
"""Overrides training parameters for benchmark tests."""
FLAGS.model_dir = self._get_model_dir(experiment_name)
FLAGS.mode = 'train'
FLAGS.training_file_pattern = self.training_file_pattern
FLAGS.eval_file_pattern = self.eval_file_pattern
FLAGS.config_file = self.config_file
FLAGS.lr_init_value = 0.00005
FLAGS.lr_decay_rate = 0.5
FLAGS.epochs = 3
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
experiment_name: str,
min_accuracy: float = UNET3D_MIN_ACCURACY,
max_accuracy: float = UNET3D_MAX_ACCURACY,
distribution_strategy: str = 'tpu',
epochs: int = 10,
steps: int = 0,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False):
"""Runs and reports the benchmark given the provided configuration."""
params = unet_training_lib.extract_params(FLAGS)
strategy = unet_training_lib.create_distribution_strategy(params)
input_dtype = params.dtype
if input_dtype == 'float16' or input_dtype == 'bfloat16':
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_bfloat16' if input_dtype == 'bfloat16' else 'mixed_float16')
tf.keras.mixed_precision.experimental.set_policy(policy)
stats = {}
start_time_sec = time.time()
with strategy.scope():
unet_model = unet_model_lib.build_unet_model(params)
history = unet_training_lib.train(
params, strategy, unet_model,
functools.partial(unet_training_lib.get_train_dataset, params),
functools.partial(unet_training_lib.get_eval_dataset, params))
stats['accuracy_top_1'] = history.history['val_metric_accuracy'][-1]
stats['training_accuracy_top_1'] = history.history['metric_accuracy'][-1]
wall_time_sec = time.time() - start_time_sec
super(Unet3DAccuracyBenchmark, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=min_accuracy,
top_1_max=max_accuracy,
total_batch_size=params.train_batch_size)
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
@owner_utils.Owner('tf-model-garden')
def benchmark_4x4_tpu_bf16(self):
"""Test Keras model with 4x4 TPU, fp16."""
experiment_name = 'benchmark_4x4_tpu_fp16'
self._setup()
self._set_benchmark_parameters(experiment_name)
self._run_and_report_benchmark(
experiment_name=experiment_name,
dtype='bfloat16',
distribution_strategy='tpu')
@owner_utils.Owner('tf-graph-compiler')
def benchmark_4x4_tpu_bf16_mlir(self):
"""Test Keras model with 4x4 TPU, fp16 and MLIR enabled."""
experiment_name = 'benchmark_4x4_tpu_fp16_mlir'
tf.config.experimental.enable_mlir_bridge()
self._setup()
self._set_benchmark_parameters(experiment_name)
self._run_and_report_benchmark(
experiment_name=experiment_name,
dtype='bfloat16',
distribution_strategy='tpu')
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes XLNet benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import time
# pylint: disable=g-bad-import-order
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.benchmark import owner_utils
from official.nlp.xlnet import run_classifier
from official.nlp.xlnet import run_squad
from official.benchmark import benchmark_wrappers
# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/xlnet/large/xlnet_model-1'
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.dev.eval.tf_record'
SQUAD_DATA_PATH = 'gs://tf-perfzero-data/xlnet/squadv2_cased/'
# pylint: enable=line-too-long
FLAGS = flags.FLAGS
class XLNetBenchmarkBase(benchmark_utils.BertBenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, output_dir=None, tpu=None):
super(XLNetBenchmarkBase, self).__init__(output_dir=output_dir, tpu=tpu)
self.num_epochs = None
self.num_steps_per_epoch = None
@flagsaver.flagsaver
def _run_xlnet_classifier(self):
"""Starts XLNet classification task."""
run_classifier.main(unused_argv=None)
@flagsaver.flagsaver
def _run_xlnet_squad(self):
"""Starts XLNet classification task."""
run_squad.main(unused_argv=None)
class XLNetClassifyAccuracy(XLNetBenchmarkBase):
"""Short accuracy test for XLNet classifier model.
Tests XLNet classification task model accuracy. The naming
convention of the test cases below follows the
`benchmark_(number of gpus)_gpu_(dataset type)` format.
"""
def __init__(self, output_dir=None, tpu=None, **kwargs):
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
super(XLNetClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0.95,
max_accuracy=0.97):
"""Starts XLNet accuracy benchmark test."""
start_time_sec = time.time()
self._run_xlnet_classifier()
wall_time_sec = time.time() - start_time_sec
with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
summary = json.loads(reader.read().decode('utf-8'))
super(XLNetClassifyAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def _setup(self):
super(XLNetClassifyAccuracy, self)._setup()
FLAGS.test_data_size = 25024
FLAGS.train_batch_size = 16
FLAGS.seq_len = 512
FLAGS.mem_len = 0
FLAGS.n_layer = 24
FLAGS.d_model = 1024
FLAGS.d_embed = 1024
FLAGS.n_head = 16
FLAGS.d_head = 64
FLAGS.d_inner = 4096
FLAGS.untie_r = True
FLAGS.n_class = 2
FLAGS.ff_activation = 'gelu'
FLAGS.strategy_type = 'mirror'
FLAGS.learning_rate = 2e-5
FLAGS.train_steps = 4000
FLAGS.warmup_steps = 500
FLAGS.iterations = 200
FLAGS.bi_data = False
FLAGS.init_checkpoint = self.pretrained_checkpoint_path
FLAGS.train_tfrecord_path = self.train_data_path
FLAGS.test_tfrecord_path = self.eval_data_path
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_imdb(self):
"""Run XLNet model accuracy test with 8 GPUs."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_imdb')
# Sets timer_callback to None as we do not use it now.
self.timer_callback = None
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_imdb(self):
"""Run XLNet model accuracy test on 2x2 tpu."""
self._setup()
FLAGS.strategy_type = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_imdb')
# Sets timer_callback to None as we do not use it now.
self.timer_callback = None
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
class XLNetSquadAccuracy(XLNetBenchmarkBase):
"""Short accuracy test for XLNet squad model.
Tests XLNet squad task model accuracy. The naming
convention of below test cases follow
`benchmark_(number of gpus)_gpu_(dataset type)` format.
"""
def __init__(self, output_dir=None, tpu=None, **kwargs):
self.train_data_path = SQUAD_DATA_PATH
self.predict_file = os.path.join(SQUAD_DATA_PATH, "dev-v2.0.json")
self.test_data_path = os.path.join(SQUAD_DATA_PATH, "12048.eval.tf_record")
self.spiece_model_file = os.path.join(SQUAD_DATA_PATH, "spiece.cased.model")
self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
super(XLNetSquadAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=87.0,
max_accuracy=89.0):
"""Starts XLNet accuracy benchmark test."""
start_time_sec = time.time()
self._run_xlnet_squad()
wall_time_sec = time.time() - start_time_sec
with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
summary = json.loads(reader.read().decode('utf-8'))
super(XLNetSquadAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def _setup(self):
super(XLNetSquadAccuracy, self)._setup()
FLAGS.train_batch_size = 16
FLAGS.seq_len = 512
FLAGS.mem_len = 0
FLAGS.n_layer = 24
FLAGS.d_model = 1024
FLAGS.d_embed = 1024
FLAGS.n_head = 16
FLAGS.d_head = 64
FLAGS.d_inner = 4096
FLAGS.untie_r = True
FLAGS.ff_activation = 'gelu'
FLAGS.strategy_type = 'mirror'
FLAGS.learning_rate = 3e-5
FLAGS.train_steps = 8000
FLAGS.warmup_steps = 1000
FLAGS.iterations = 1000
FLAGS.bi_data = False
FLAGS.init_checkpoint = self.pretrained_checkpoint_path
FLAGS.train_tfrecord_path = self.train_data_path
FLAGS.test_tfrecord_path = self.test_data_path
FLAGS.spiece_model_file = self.spiece_model_file
FLAGS.predict_file = self.predict_file
FLAGS.adam_epsilon = 1e-6
FLAGS.lr_layer_decay_rate = 0.75
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_squadv2(self):
"""Run XLNet model squad v2 accuracy test with 8 GPUs."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squadv2')
FLAGS.predict_dir = FLAGS.model_dir
# Sets timer_callback to None as we do not use it now.
self.timer_callback = None
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_squadv2(self):
"""Run XLNet model squad v2 accuracy test on 2x2 tpu."""
self._setup()
FLAGS.strategy_type = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_squadv2')
FLAGS.predict_dir = FLAGS.model_dir
# Sets timer_callback to None as we do not use it now.
self.timer_callback = None
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
if __name__ == '__main__':
tf.test.main()