Commit 9485aa1d authored by qianyj

Update code to v2.8.0

parents 89cfa348 f5fc733a
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with Cifar data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.platform import googletest
from official.benchmark.models import cifar_preprocessing
from official.benchmark.models import resnet_cifar_main
from official.utils.testing import integration
class KerasCifarTest(googletest.TestCase):
"""Unit tests for Keras ResNet with Cifar."""
_extra_flags = [
"-batch_size", "4", "-train_steps", "1", "-use_synthetic_data", "true"
]
_tempdir = None
def get_temp_dir(self):
if not self._tempdir:
self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
return self._tempdir
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasCifarTest, cls).setUpClass()
resnet_cifar_main.define_cifar_flags()
def setUp(self):
super(KerasCifarTest, self).setUp()
cifar_preprocessing.NUM_IMAGES["validation"] = 4
def tearDown(self):
super(KerasCifarTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
def test_end_to_end_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
extra_flags = [
"-distribution_strategy",
"off",
"-model_dir",
"keras_cifar_no_dist_strat",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_no_dist_strat(self):
"""Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
extra_flags = [
"-enable_eager",
"false",
"-distribution_strategy",
"off",
"-model_dir",
"keras_cifar_graph_no_dist_strat",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_1_gpu(self):
"""Test Keras model with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_1_gpu",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_1_gpu(self):
"""Test Keras model in legacy graph mode with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-noenable_eager",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_graph_1_gpu",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_2_gpu(self):
"""Test Keras model with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_2_gpu(self):
"""Test Keras model in legacy graph mode with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-enable_eager",
"false",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_graph_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
if __name__ == "__main__":
googletest.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset."""
import os
# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from official.common import distribute_utils
from official.legacy.image_classification import test_utils
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.legacy.image_classification.resnet import resnet_model
from official.modeling import performance
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers
def _cluster_last_three_conv2d_layers(model):
"""Helper method to cluster last three conv2d layers."""
import tensorflow_model_optimization as tfmot # pylint: disable=g-import-not-at-top
last_three_conv2d_layers = [
layer for layer in model.layers
if isinstance(layer, tf.keras.layers.Conv2D)
][-3:]
cluster_weights = tfmot.clustering.keras.cluster_weights
centroid_initialization = tfmot.clustering.keras.CentroidInitialization
def cluster_fn(layer):
if layer not in last_three_conv2d_layers:
return layer
    # The first two of the selected layers are clustered with 256 clusters;
    # the last one with 32.
    if layer in last_three_conv2d_layers[:2]:
      number_of_clusters = 256
    else:
      number_of_clusters = 32
    clustered = cluster_weights(
        layer,
        number_of_clusters=number_of_clusters,
        cluster_centroids_init=centroid_initialization.LINEAR)
    print('Clustered {} with {} clusters'.format(layer.name,
                                                 number_of_clusters))
    return clustered
return tf.keras.models.clone_model(model, clone_function=cluster_fn)
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
NotImplementedError: If some features are not currently supported.
Returns:
Dictionary of training and eval stats.
"""
keras_utils.set_session_config(
enable_xla=flags_obj.enable_xla)
# Execute flag override logic for better model performance
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
performance.set_mixed_precision_policy(
flags_core.get_tf_dtype(flags_obj))
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
# Configures cluster spec for distribution strategy.
_ = distribute_utils.configure_cluster(flags_obj.worker_hosts,
flags_obj.task_index)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu)
if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable the
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
strategy.extended.experimental_enable_get_next_as_optional = (
flags_obj.enable_get_next_as_optional
)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
# pylint: disable=protected-access
if flags_obj.use_synthetic_data:
input_fn = common.get_synth_input_fn(
height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
num_channels=imagenet_preprocessing.NUM_CHANNELS,
num_classes=imagenet_preprocessing.NUM_CLASSES,
dtype=dtype,
drop_remainder=True)
else:
input_fn = imagenet_preprocessing.input_fn
# When `enable_xla` is True, we always drop the remainder of the batches
# in the dataset, as XLA-GPU doesn't support dynamic shapes.
drop_remainder = flags_obj.enable_xla
  # The current resnet_model.resnet50 input format is always channels-last,
  # whereas the keras_applications MobileNet model's input format depends on
  # the Keras backend image data format. The use_keras_image_data_format flag
  # indicates whether the image preprocessor output format should match the
  # Keras backend image data format or stay channels-last.
  use_keras_image_data_format = flags_obj.model in ('mobilenet',
                                                    'mobilenet_pretrained')
train_input_dataset = input_fn(
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=imagenet_preprocessing.get_parse_record_fn(
use_keras_image_data_format=use_keras_image_data_format),
datasets_num_private_threads=flags_obj.datasets_num_private_threads,
dtype=dtype,
drop_remainder=drop_remainder,
tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
training_dataset_cache=flags_obj.training_dataset_cache,
)
eval_input_dataset = None
if not flags_obj.skip_eval:
eval_input_dataset = input_fn(
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=imagenet_preprocessing.get_parse_record_fn(
use_keras_image_data_format=use_keras_image_data_format),
dtype=dtype,
drop_remainder=drop_remainder)
lr_schedule = common.PiecewiseConstantDecayWithWarmup(
batch_size=flags_obj.batch_size,
epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
warmup_epochs=common.LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
steps_per_epoch = (
imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
with strategy_scope:
if flags_obj.optimizer == 'resnet50_default':
optimizer = common.get_optimizer(lr_schedule)
    elif flags_obj.optimizer in ('mobilenet_default', 'mobilenet_fine_tune'):
      initial_learning_rate = (
          flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size)
      if flags_obj.optimizer == 'mobilenet_fine_tune':
        initial_learning_rate = 1e-5
optimizer = tf.keras.optimizers.SGD(
learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate,
decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay,
decay_rate=flags_obj.lr_decay_factor,
staircase=True),
momentum=0.9)
optimizer = performance.configure_optimizer(
optimizer,
use_float16=flags_core.get_tf_dtype(flags_obj) == tf.float16,
loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128),)
# TODO(hongkuny): Remove trivial model usage and move it to benchmark.
if flags_obj.use_trivial_model:
model = test_utils.trivial_model(imagenet_preprocessing.NUM_CLASSES)
elif flags_obj.model == 'resnet50_v1.5':
model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES)
    elif flags_obj.model in ('mobilenet', 'mobilenet_pretrained'):
# TODO(kimjaehong): Remove layers attribute when minimum TF version
# support 2.0 layers by default.
if flags_obj.model == 'mobilenet_pretrained':
classes_labels = 1000
initial_weights = 'imagenet'
else:
classes_labels = imagenet_preprocessing.NUM_CLASSES
initial_weights = None
model = tf.keras.applications.mobilenet.MobileNet(
weights=initial_weights,
classes=classes_labels,
layers=tf.keras.layers)
if flags_obj.pretrained_filepath:
model.load_weights(flags_obj.pretrained_filepath)
if flags_obj.pruning_method == 'polynomial_decay':
import tensorflow_model_optimization as tfmot # pylint: disable=g-import-not-at-top
if dtype != tf.float32:
raise NotImplementedError(
'Pruning is currently only supported on dtype=tf.float32.')
pruning_params = {
'pruning_schedule':
tfmot.sparsity.keras.PolynomialDecay(
initial_sparsity=flags_obj.pruning_initial_sparsity,
final_sparsity=flags_obj.pruning_final_sparsity,
begin_step=flags_obj.pruning_begin_step,
end_step=flags_obj.pruning_end_step,
frequency=flags_obj.pruning_frequency),
}
model = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)
elif flags_obj.pruning_method:
raise NotImplementedError('Only polynomial_decay is currently supported.')
if flags_obj.clustering_method == 'selective_clustering':
import tensorflow_model_optimization as tfmot # pylint: disable=g-import-not-at-top
if dtype != tf.float32:
raise NotImplementedError(
'Clustering is currently only supported on dtype=tf.float32.')
model = _cluster_last_three_conv2d_layers(model)
elif flags_obj.clustering_method:
raise NotImplementedError(
'Only selective_clustering is implemented.')
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=optimizer,
metrics=(['sparse_categorical_accuracy']
if flags_obj.report_accuracy_metrics else None),
run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
callbacks = common.get_callbacks(
pruning_method=flags_obj.pruning_method,
enable_checkpoint_and_export=flags_obj.enable_checkpoint_and_export,
model_dir=flags_obj.model_dir)
  # If running multiple epochs, ignore the train_steps flag.
if train_epochs <= 1 and flags_obj.train_steps:
steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
train_epochs = 1
num_eval_steps = (
imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
# Only build the training graph. This reduces memory usage introduced by
# control flow ops in layers that have different implementations for
# training and inference (e.g., batch norm).
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1)
num_eval_steps = None
validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
# TODO(b/135607227): Add device scope automatically in Keras training loop
# when not using distribution strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=steps_per_epoch,
callbacks=callbacks,
validation_steps=num_eval_steps,
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
eval_output = None
if not flags_obj.skip_eval:
eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps,
verbose=2)
if flags_obj.pruning_method:
model = tfmot.sparsity.keras.strip_pruning(model)
if flags_obj.clustering_method:
model = tfmot.clustering.keras.strip_clustering(model)
if flags_obj.enable_checkpoint_and_export:
if dtype == tf.bfloat16:
logging.warning('Keras model.save does not support bfloat16 dtype.')
else:
      # Keras model.save assumes a float32 input signature.
export_path = os.path.join(flags_obj.model_dir, 'saved_model')
model.save(export_path, include_optimizer=False)
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = common.build_stats(history, eval_output, callbacks)
return stats
def define_imagenet_keras_flags():
common.define_keras_flags(
model=True,
optimizer=True,
pretrained_filepath=True)
common.define_pruning_flags()
common.define_clustering_flags()
flags_core.set_defaults()
flags.adopt_module_key_flags(common)
def main(_):
model_helpers.apply_clean(flags.FLAGS)
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_imagenet_keras_flags()
app.run(main)
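# Example invocation (illustrative only; the module path is inferred from the
# test imports elsewhere in this change, and the flag values are arbitrary):
#
#   python3 -m official.benchmark.models.resnet_imagenet_main \
#       --model=mobilenet_pretrained --optimizer=mobilenet_fine_tune \
#       --clustering_method=selective_clustering \
#       --use_synthetic_data=true --batch_size=32 --train_epochs=1 \
#       --model_dir=/tmp/mobilenet_clustering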
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with ImageNet data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.eager import context
from official.benchmark.models import resnet_imagenet_main
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.utils.testing import integration
@parameterized.parameters(
"resnet",
# "resnet_polynomial_decay", b/151854314
"mobilenet",
# "mobilenet_polynomial_decay", b/151854314
"mobilenet_selective_clustering",
)
class KerasImagenetTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for Keras Models with ImageNet."""
_default_flags_dict = [
"-batch_size",
"4",
"-train_steps",
"1",
"-use_synthetic_data",
"true",
"-data_format",
"channels_last",
]
_extra_flags_dict = {
"resnet": [
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
],
"resnet_polynomial_decay": [
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
"-pruning_method",
"polynomial_decay",
],
"mobilenet": [
"-model",
"mobilenet",
"-optimizer",
"mobilenet_default",
],
"mobilenet_polynomial_decay": [
"-model",
"mobilenet",
"-optimizer",
"mobilenet_default",
"-pruning_method",
"polynomial_decay",
],
"mobilenet_selective_clustering": [
"-model", "mobilenet_pretrained",
"-optimizer", "mobilenet_fine_tune",
"-clustering_method", "selective_clustering",
]
}
_tempdir = None
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
resnet_imagenet_main.define_imagenet_keras_flags()
def setUp(self):
super(KerasImagenetTest, self).setUp()
imagenet_preprocessing.NUM_IMAGES["validation"] = 4
self.policy = tf.keras.mixed_precision.global_policy()
def tearDown(self):
super(KerasImagenetTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
tf.keras.mixed_precision.set_global_policy(self.policy)
def get_extra_flags_dict(self, flags_key):
return self._extra_flags_dict[flags_key] + self._default_flags_dict
def test_end_to_end_no_dist_strat(self, flags_key):
"""Test Keras model with 1 GPU, no distribution strategy."""
extra_flags = [
"-distribution_strategy",
"off",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_no_dist_strat(self, flags_key):
"""Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
extra_flags = [
"-enable_eager",
"false",
"-distribution_strategy",
"off",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_1_gpu(self, flags_key):
"""Test Keras model with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-distribution_strategy",
"mirrored",
"-enable_checkpoint_and_export",
"1",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_1_gpu_fp16(self, flags_key):
"""Test Keras model with 1 GPU and fp16."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-dtype",
"fp16",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
if "polynomial_decay" in extra_flags:
self.skipTest("Pruning with fp16 is currently not supported.")
if "selective_clustering" in extra_flags:
self.skipTest("Clustering with fp16 is currently not supported.")
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_2_gpu(self, flags_key):
"""Test Keras model with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_xla_2_gpu(self, flags_key):
"""Test Keras model with XLA and 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-enable_xla",
"true",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_2_gpu_fp16(self, flags_key):
"""Test Keras model with 2 GPUs and fp16."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-dtype",
"fp16",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
if "polynomial_decay" in extra_flags:
self.skipTest("Pruning with fp16 is currently not supported.")
if "selective_clustering" in extra_flags:
self.skipTest("Clustering with fp16 is currently not supported.")
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_xla_2_gpu_fp16(self, flags_key):
"""Test Keras model with XLA, 2 GPUs and fp16."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-dtype",
"fp16",
"-enable_xla",
"true",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
if "polynomial_decay" in extra_flags:
self.skipTest("Pruning with fp16 is currently not supported.")
if "selective_clustering" in extra_flags:
self.skipTest("Clustering with fp16 is currently not supported.")
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
if __name__ == "__main__":
tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with ImageNet data on TPU."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from official.benchmark.models import resnet_imagenet_main
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.utils.testing import integration
class KerasImagenetTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for Keras Models with ImageNet."""
_extra_flags_dict = {
"resnet": [
"-batch_size",
"4",
"-train_steps",
"1",
"-use_synthetic_data",
"true"
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
],
"resnet_polynomial_decay": [
"-batch_size",
"4",
"-train_steps",
"1",
"-use_synthetic_data",
"true",
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
"-pruning_method",
"polynomial_decay",
],
}
_tempdir = None
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
resnet_imagenet_main.define_imagenet_keras_flags()
def setUp(self):
super(KerasImagenetTest, self).setUp()
imagenet_preprocessing.NUM_IMAGES["validation"] = 4
self.policy = tf.keras.mixed_precision.global_policy()
def tearDown(self):
super(KerasImagenetTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
tf.keras.mixed_precision.set_global_policy(self.policy)
@parameterized.parameters([
"resnet",
# "resnet_polynomial_decay" b/151854314
])
def test_end_to_end_tpu(self, flags_key):
"""Test Keras model with TPU distribution strategy."""
extra_flags = [
"-distribution_strategy",
"tpu",
"-data_format",
"channels_last",
"-enable_checkpoint_and_export",
"1",
]
extra_flags = extra_flags + self._extra_flags_dict[flags_key]
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
@parameterized.parameters(["resnet"])
def test_end_to_end_tpu_bf16(self, flags_key):
"""Test Keras model with TPU and bfloat16 activation."""
extra_flags = [
"-distribution_strategy",
"tpu",
"-data_format",
"channels_last",
"-dtype",
"bf16",
]
extra_flags = extra_flags + self._extra_flags_dict[flags_key]
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
if __name__ == "__main__":
tf.test.main()
# Shakespeare character LSTM model
This is an implementation of a simple character LSTM used to generate text.
## Instructions
First download the source data:
```
wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
```
Note that files other than shakespeare.txt can also be used to train the model to generate other text.
Then train the model:
```
python3 shakespeare_main.py --training_data shakespeare.txt \
--model_dir /tmp/shakespeare
```
This will place model checkpoints in `/tmp/shakespeare`, so that we can use them to make predictions.
Then generate predictions:
```
python3 shakespeare_main.py --training_data shakespeare.txt \
--model_dir /tmp/shakespeare --notrain --predict_context=ROMEO:
```
Change `--predict_context` and `--predict_length` to suit your needs.
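For example, to generate a longer passage primed with a different context (the flag values below are only illustrative):
```
python3 shakespeare_main.py --training_data shakespeare.txt \
    --model_dir /tmp/shakespeare --notrain \
    --predict_context=JULIET: --predict_length=2000
```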
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a character LSTM model trained on Shakespeare."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
# pylint: disable=wrong-import-order
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
from official.common import distribute_utils
# pylint: enable=wrong-import-order
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
EMBEDDING_DIM = 256
RNN_UNITS = 1024
SEQ_LENGTH = 100
# Calculated by running with batch_size=1; this is roughly the number of
# (SEQ_LENGTH + 1)-character chunks in shakespeare.txt.
BATCHES_PER_EPOCH = 11043
def define_flags():
"""Define the flags for the Shakespeare character LSTM."""
flags_core.define_base(data_dir=False,
clean=False,
train_epochs=True,
epochs_between_evals=False,
stop_threshold=False,
num_gpu=True,
export_dir=False,
run_eagerly=True,
distribution_strategy=True)
flags_core.define_performance(num_parallel_calls=False,
inter_op=False,
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=True,
enable_xla=True)
flags_core.set_defaults(train_epochs=43,
batch_size=64)
flags.DEFINE_boolean(name='enable_eager', default=True, help='Enable eager?')
flags.DEFINE_boolean(
name='train', default=True,
help='If true trains the model.')
flags.DEFINE_string(
name='predict_context', default=None,
help='If set, makes a prediction with the given context.')
flags.DEFINE_integer(
name='predict_length', default=1000,
help='Length of the predicted text including the context.')
flags.DEFINE_integer(name='train_steps', default=None,
help='Overrides train_steps per epoch if not None.')
flags.DEFINE_integer(
name='log_steps', default=100,
help='For every log_steps, we log the timing information such as '
'examples per second.')
flags.DEFINE_string(
name='training_data', default=None,
help='Path to file containing the training data.')
flags.DEFINE_boolean(name='cudnn', default=True, help='Use CuDNN LSTM.')
def get_dataset(path_to_file, batch_size=None, seq_length=SEQ_LENGTH):
"""Creates a dataset from a given text file.
Args:
path_to_file: The path to the training data.
batch_size: Batch size to use.
seq_length: The length of the LSTM sequence.
Returns:
A tuple, consisting of the Dataset and the class to character mapping
and character to class mapping.
"""
with tf.io.gfile.GFile(path_to_file, 'rb') as train_data:
text = train_data.read().decode(encoding='utf-8')
# Create vocab
vocab = sorted(set(text))
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
  # Split text into (seq_length + 1)-character chunks to create examples.
text_as_int = np.array([char2idx[c] for c in text])
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(
seq_length + 1, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE)
def split_input_target(chunk):
input_text = chunk[:-1]
target_text = chunk[1:]
return input_text, tf.one_hot(target_text, len(vocab))
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(10000).repeat()
dataset = dataset.batch(
batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE)
return dataset, idx2char, char2idx
def build_model(vocab_size,
embedding_dim=EMBEDDING_DIM,
rnn_units=RNN_UNITS,
batch_size=None,
stateful=False,
use_cudnn=True):
"""Builds the Shakespeare model.
Args:
vocab_size: The number of character classes in the input.
embedding_dim: The dimension of the embedding space for each class.
rnn_units: The number of RNN units in the layer.
batch_size: When predicting, the batch size of the predictions.
    stateful: If true, the LSTM is stateful.
    use_cudnn: Whether the layer configuration should allow the CuDNN kernel
      to be used.
Returns:
A Keras Model.
"""
LSTM = functools.partial(tf.keras.layers.LSTM, implementation=2)
# By indirecting the activation through a lambda layer, the logic to dispatch
# to CuDNN in V2 doesn't trigger and we force the LSTM to run in non-CuDNN
# mode.
lstm_activation = ('tanh' if use_cudnn else
lambda x: tf.math.tanh(x))
batch_shape = [batch_size if stateful else None, None]
return tf.keras.Sequential([
tf.keras.layers.Embedding(vocab_size, embedding_dim,
batch_input_shape=batch_shape),
LSTM(rnn_units,
activation=lstm_activation,
return_sequences=True,
stateful=stateful,
recurrent_initializer='glorot_uniform'),
tf.keras.layers.Dense(vocab_size),
tf.keras.layers.Softmax(dtype=tf.float32)])
def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
"""Trains a Shakespeare model.
Args:
    flags_obj: An object containing parsed flag values.
dataset: the training data set.
vocab_size: the number of unique character classes.
strategy: distribution strategy to use.
checkpoint_dir: if not None, the directory in which to make checkpoints.
Returns:
The training history and callbacks.
"""
if flags_obj.train_steps:
train_steps = flags_obj.train_steps
else:
train_steps = BATCHES_PER_EPOCH // flags_obj.batch_size
strategy_scope = distribute_utils.get_strategy_scope(strategy)
with strategy_scope:
model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size,
use_cudnn=flags_obj.cudnn)
# Model.fit() automatically applies loss scaling so we don't need to create
# a LossScaleOptimizer.
model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.CategoricalCrossentropy(),
metrics=[tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
tf.keras.metrics.Recall(top_k=5, name='RecallAt5')],
run_eagerly=flags_obj.run_eagerly)
callbacks = []
if checkpoint_dir:
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_prefix,
save_weights_only=True)
callbacks.append(checkpoint_callback)
time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
flags_obj.log_steps)
callbacks.append(time_callback)
history = model.fit(dataset,
epochs=flags_obj.train_epochs,
steps_per_epoch=train_steps,
callbacks=callbacks,
verbose=2)
return history, callbacks
def make_prediction(checkpoint_dir, length, context, idx2char, char2idx):
"""Make predictions from a Shakespeare model.
Args:
checkpoint_dir: the directory from which to load checkpoints
length: the total length of the generated text (including the context).
context: the initial text with which the LSTM is primed.
idx2char: the character class to character mapping.
char2idx: the character to character class mapping.
Returns:
A generated string of text of the given length.
"""
prediction_model = build_model(
vocab_size=len(idx2char), batch_size=1, stateful=True)
prediction_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
prediction_model.build(tf.TensorShape([1, None]))
input_eval = [char2idx[s] for s in context]
input_eval = tf.expand_dims(input_eval, 0)
text_generated = []
prediction_model.reset_states()
for _ in range(length - len(context)):
predictions = prediction_model(input_eval)
predictions = tf.squeeze(predictions, 0)
# We applied a softmax to the output of the model so that
# tf.keras.metrics.Recall would work. We need logits for
# tf.random.categorical, so we convert the probabilities back to log odds
predictions = tf.math.log(predictions / (1 - predictions))
random_output = tf.random.categorical(predictions, num_samples=1)
selected_id = random_output[-1, 0].numpy()
input_eval = tf.expand_dims([selected_id], 0)
text_generated.append(idx2char[selected_id])
return context + ''.join(text_generated)
def run(flags_obj):
"""Run Shakespeare training and predict.
Args:
flags_obj: An object containing parsed flag values.
Returns:
Dictionary with status from the run.
"""
if not flags_obj.training_data:
raise ValueError(
        'Must set the path to a training data file. E.g., download the following '
'https://storage.googleapis.com/download.tensorflow.org/data/'
'shakespeare.txt')
if flags_obj.dtype == 'fp16':
tf.keras.mixed_precision.set_global_policy('mixed_float16')
keras_utils.set_session_config(
enable_xla=flags_obj.enable_xla)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus)
dataset, idx2char, char2idx = get_dataset(flags_obj.training_data,
batch_size=flags_obj.batch_size)
stats = {}
if flags_obj.train:
history, callbacks = train_model(flags_obj, dataset,
len(idx2char), strategy,
checkpoint_dir=flags_obj.model_dir)
stats['history'] = history.history
stats['callbacks'] = callbacks
if flags_obj.predict_context:
if not flags_obj.model_dir:
raise ValueError('Must set model_dir to get predictions.')
print(make_prediction(flags_obj.model_dir,
flags_obj.predict_length,
flags_obj.predict_context,
idx2char,
char2idx))
return stats
def main(_):
flags_obj = flags.FLAGS
run(flags_obj)
if __name__ == '__main__':
define_flags()
app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions to generate data directly on devices."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import string
from absl import logging
import tensorflow as tf
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
"""A dataset that generates synthetic data on each device."""
def __init__(self, dataset, split_by=1):
    # dataset.take(1) doesn't have a GPU kernel.
with tf.device('device:CPU:0'):
tensor = tf.data.experimental.get_single_element(dataset.take(1))
flat_tensor = tf.nest.flatten(tensor)
variable_data = []
initializers = []
for t in flat_tensor:
rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
v = tf.compat.v1.get_local_variable(
self._random_name(), initializer=rebatched_t)
variable_data.append(v)
initializers.append(v.initializer)
input_data = tf.nest.pack_sequence_as(tensor, variable_data)
self._iterator = SyntheticIterator(input_data, initializers)
def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
return ''.join(random.choice(chars) for _ in range(size))
def __iter__(self):
return self._iterator
def make_one_shot_iterator(self):
return self._iterator
def make_initializable_iterator(self):
return self._iterator
class SyntheticIterator(object):
"""A dataset that generates synthetic data on each device."""
def __init__(self, input_data, initializers):
self._input_data = input_data
self._initializers = initializers
def get_next(self):
return self._input_data
def next(self):
return self.__next__()
def __next__(self):
try:
return self.get_next()
except tf.errors.OutOfRangeError:
raise StopIteration
def initialize(self):
if tf.executing_eagerly():
return tf.no_op()
else:
return self._initializers
def _monkey_patch_dataset_method(strategy):
"""Monkey-patch `strategy`'s `make_dataset_iterator` method."""
def make_dataset(self, dataset):
logging.info('Using pure synthetic data.')
with self.scope():
if self.extended._global_batch_size: # pylint: disable=protected-access
return SyntheticDataset(dataset, self.num_replicas_in_sync)
else:
return SyntheticDataset(dataset)
def make_iterator(self, dataset):
dist_dataset = make_dataset(self, dataset)
return iter(dist_dataset)
strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
strategy.make_dataset_iterator = make_iterator
strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
strategy.experimental_distribute_dataset = make_dataset
def _undo_monkey_patch_dataset_method(strategy):
if hasattr(strategy, 'orig_make_dataset_iterator'):
strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
if hasattr(strategy, 'orig_distribute_dataset'):
    strategy.experimental_distribute_dataset = (
        strategy.orig_distribute_dataset)
def set_up_synthetic_data():
_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
_monkey_patch_dataset_method(
tf.distribute.experimental.MultiWorkerMirroredStrategy)
def undo_set_up_synthetic_data():
_undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
_undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
_undo_monkey_patch_dataset_method(
tf.distribute.experimental.MultiWorkerMirroredStrategy)
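# Usage sketch (not part of the original module): once `set_up_synthetic_data`
# has patched the strategy classes, `experimental_distribute_dataset` on any
# MirroredStrategy instance returns a `SyntheticDataset` that replays a single
# cached batch on every step.
#
#   set_up_synthetic_data()
#   strategy = tf.distribute.MirroredStrategy()
#   synthetic_data = strategy.experimental_distribute_dataset(real_dataset)
#   ...  # run the benchmark against `synthetic_data`
#   undo_set_up_synthetic_data()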
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
from absl import logging
from absl.testing import flagsaver
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import owner_utils
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.recommendation import ncf_common
from official.recommendation import ncf_keras_main
from official.utils.flags import core
FLAGS = flags.FLAGS
NCF_DATA_DIR_NAME = 'movielens_data'
NCF_TF_REGRESSION_DATA_DIR_NAME = 'gs://tf-regression/ncf/data'
class NCFKerasBenchmarkBase(PerfZeroBenchmark):
"""Base class for NCF model benchmark."""
def __init__(self, output_dir=None, default_flags=None, **kwargs):
super(NCFKerasBenchmarkBase, self).__init__(output_dir, default_flags,
**kwargs)
# Run all benchmarks with ml_perf flag.
self.default_flags['ml_perf'] = True
def _setup(self):
"""Sets up and resets flags before each test."""
logging.set_verbosity(logging.INFO)
if NCFKerasBenchmarkBase.local_flags is None:
ncf_common.define_ncf_flags()
# Loads flags to get defaults to then override. List cannot be empty.
flags.FLAGS(['foo'])
core.set_defaults(**self.default_flags)
saved_flag_values = flagsaver.save_flag_values()
NCFKerasBenchmarkBase.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(NCFKerasBenchmarkBase.local_flags)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, hr_at_10_min=0, hr_at_10_max=0):
start_time_sec = time.time()
stats = ncf_keras_main.run_ncf(FLAGS)
wall_time_sec = time.time() - start_time_sec
metrics = []
metrics.append({
'name': 'exp_per_second',
'value': stats['avg_exp_per_second']
})
if hr_at_10_min > 0:
metrics.append({
'name': 'hr_at_10',
'value': stats['eval_hit_rate'],
'min_value': hr_at_10_min,
'max_value': hr_at_10_max
})
metrics.append({'name': 'train_loss', 'value': stats['loss']})
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
class NCFKerasAccuracy(NCFKerasBenchmarkBase):
"""Benchmark NCF model using real data."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
root_data_dir = root_data_dir if root_data_dir else ''
default_flags = {}
default_flags['dataset'] = 'ml-20m'
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 10
default_flags['clean'] = True
default_flags['batch_size'] = 99000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
default_flags['epsilon'] = 1.45439e-07
default_flags['layers'] = [256, 256, 128, 64]
default_flags['num_factors'] = 64
default_flags['hr_threshold'] = 0.635
default_flags['ml_perf'] = True
default_flags['use_synthetic_data'] = False
default_flags['data_dir'] = os.path.join(root_data_dir, NCF_DATA_DIR_NAME)
super(NCFKerasAccuracy, self).__init__(
output_dir=output_dir, default_flags=default_flags, **kwargs)
def _run_and_report_benchmark_mlperf_like(self):
"""Run test and report results.
Note: MLPerf like tests are not tuned to hit a specific hr@10 value, but
we want it recorded.
"""
self._run_and_report_benchmark(hr_at_10_min=0.61)
def _run_and_report_benchmark(self, hr_at_10_min=0.630, hr_at_10_max=0.645):
"""Run test and report results.
Note: Target is 0.635, but some runs are below that level. Until we have
multi-run tests, we have to accept a lower target.
Args:
hr_at_10_min: Minimum acceptable hr@10 value.
hr_at_10_max: Maximum acceptable hr@10 value.
"""
super(NCFKerasAccuracy, self)._run_and_report_benchmark(
hr_at_10_min=hr_at_10_min, hr_at_10_max=hr_at_10_max)
def _set_8_gpu_defaults(self):
FLAGS.num_gpus = 8
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
FLAGS.train_epochs = 14
FLAGS.batch_size = 99000
FLAGS.eval_batch_size = 160000
FLAGS.train_dataset_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
'training_cycle_*/*')
FLAGS.eval_dataset_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
'eval_data/*')
FLAGS.input_meta_data_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
'metadata')
FLAGS.data_dir = NCF_TF_REGRESSION_DATA_DIR_NAME
def benchmark_1_gpu_early_stop(self):
self._setup()
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_2_gpus_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
def benchmark_2_gpus_ctl_early_stop(self):
"""NCF with custom training loop. Works only in TF 2.0."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
#############################################
# Tests below with mlperf in the test name are of two types:
# 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
# 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
#
  # The purpose of both is to get a number to compare to existing results. To
  # do this, the number of epochs is held constant rather than racing to a
  # given accuracy. The accuracy validation is done by the "early_stop" tests.
#############################################
def benchmark_1_gpu_mlperf_like(self):
"""1 GPU using keras fit/compile."""
self._setup()
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
"""1 GPU using compile/fit without dist_strat."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_no_dist_strat_run_eagerly_mlperf_like(self):
self._setup()
FLAGS.train_epochs = 7
FLAGS.distribution_strategy = 'off'
FLAGS.run_eagerly = True
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_mlperf_like(self):
"""1 GPU using compile/fit with XLA."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.enable_xla = True
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_fp16_mlperf_like(self):
"""1 GPU using CTL and FP16."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_fp16_mlperf_like(self):
"""1 GPU using FP16."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
"""1 GPU using CTL with eager and distribution strategy."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.run_eagerly = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL with XLA."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.enable_xla = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_fp16_mlperf_like(self):
"""1 GPU using with XLA and FP16."""
self._setup()
FLAGS.enable_xla = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_ctl_fp16_mlperf_like(self):
"""1 GPU using CTL with XLA and FP16."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.enable_xla = True
FLAGS.train_epochs = 7
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_mlperf_like(self):
"""8 GPU using keras fit/compile."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.keras_use_ctl = True
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_fp16_mlperf_like(self):
"""8 GPU FP16."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_tf_data_ctl_fp16_mlperf_like(self):
"""8 GPU FP16 using CTL."""
self._setup()
self._set_8_gpu_defaults()
FLAGS.keras_use_ctl = True
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 8192
self._run_and_report_benchmark_mlperf_like()
class NCFKerasBenchmarkReal(NCFKerasBenchmarkBase):
"""NCF Keras throughput benchmarks."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
root_data_dir = root_data_dir if root_data_dir else ''
default_flags = {}
default_flags['dataset'] = 'ml-20m'
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 14
default_flags['clean'] = True
default_flags['batch_size'] = 99000
default_flags['eval_batch_size'] = 160000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
default_flags['epsilon'] = 1.45439e-07
default_flags['layers'] = [256, 256, 128, 64]
default_flags['num_factors'] = 64
default_flags['hr_threshold'] = 0.635
default_flags['ml_perf'] = True
default_flags['use_synthetic_data'] = False
default_flags['train_dataset_path'] = os.path.join(
NCF_TF_REGRESSION_DATA_DIR_NAME, 'training_cycle_*/*')
default_flags['eval_dataset_path'] = os.path.join(
NCF_TF_REGRESSION_DATA_DIR_NAME, 'eval_data/*')
default_flags['input_meta_data_path'] = os.path.join(
NCF_TF_REGRESSION_DATA_DIR_NAME, 'metadata')
default_flags['data_dir'] = NCF_TF_REGRESSION_DATA_DIR_NAME
super(NCFKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=default_flags, **kwargs)
def benchmark_2x2_tpu(self):
"""2x2 TPU using CTL with distribution strategy."""
self._setup()
FLAGS.distribution_strategy = 'tpu'
FLAGS.keras_use_ctl = True
FLAGS.num_gpus = 0
FLAGS.train_epochs = 1
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_2x2_tpu_mlir(self):
"""2x2 TPU using CTL with distribution strategy using the MLIR bridge."""
self._setup()
FLAGS.distribution_strategy = 'tpu'
FLAGS.keras_use_ctl = True
FLAGS.num_gpus = 0
FLAGS.train_epochs = 1
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark()
class NCFKerasSynth(NCFKerasBenchmarkBase):
"""Benchmark NCF model using synthetic data."""
def __init__(self, output_dir=None, default_flags=None, **kwargs):
default_flags = {}
default_flags['dataset'] = 'ml-20m'
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 8
default_flags['batch_size'] = 99000
default_flags['eval_batch_size'] = 160000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
default_flags['epsilon'] = 1.45439e-07
default_flags['layers'] = [256, 256, 128, 64]
default_flags['num_factors'] = 64
default_flags['hr_threshold'] = 0.635
default_flags['use_synthetic_data'] = True
super(NCFKerasSynth, self).__init__(
output_dir=output_dir, default_flags=default_flags, **kwargs)
def benchmark_1_gpu(self):
self._setup()
self._run_and_report_benchmark()
def benchmark_2_gpus(self):
self._setup()
FLAGS.num_gpus = 2
self._run_and_report_benchmark()
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes benchmark testing for bert pretraining."""
# pylint: disable=line-too-long
from __future__ import print_function
import time
from typing import Optional
from absl import flags
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import owner_utils
from official.benchmark import perfzero_benchmark
from official.projects.nhnet import trainer
from official.utils.flags import core as flags_core
MIN_LOSS = 0.40
MAX_LOSS = 0.55
NHNET_DATA = 'gs://tf-perfzero-data/nhnet/v1/processed/train.tfrecord*'
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_model.ckpt'
FLAGS = flags.FLAGS
class NHNetBenchmark(perfzero_benchmark.PerfZeroBenchmark):
"""Base benchmark class for NHNet."""
def __init__(self, output_dir=None, default_flags=None, tpu=None, **kwargs):
self.default_flags = default_flags or {}
flag_methods = trainer.define_flags()
super(NHNetBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods,
tpu=tpu,
**kwargs)
def _report_benchmark(self,
stats,
wall_time_sec,
max_value=None,
min_value=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds.
max_value: highest passing level.
min_value: lowest passing level.
"""
metrics = []
metrics.append({
'name': 'training_loss',
'value': stats['training_loss'],
'min_value': min_value,
'max_value': max_value
})
# These metrics are placeholders to avoid PerfZero failure.
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
metrics.append({
'name': 'startup_time',
'value': 9999.,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class NHNetAccuracyBenchmark(NHNetBenchmark):
"""Benchmark accuracy tests for NHNet."""
def __init__(self,
output_dir: Optional[str] = None,
tpu: Optional[str] = None,
**kwargs):
default_flags = dict(
mode='train',
train_file_pattern=NHNET_DATA,
train_batch_size=1024,
model_type='nhnet',
len_title=15,
len_passage=200,
num_encoder_layers=12,
num_decoder_layers=12,
num_nhnet_articles=5,
steps_per_loop=1000,
params_override='init_from_bert2bert=false')
super(NHNetAccuracyBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags, tpu=tpu, **kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, max_value=MAX_LOSS, min_value=MIN_LOSS):
"""Runs and reports the benchmark given the provided configuration."""
start_time_sec = time.time()
stats = trainer.run()
wall_time_sec = time.time() - start_time_sec
self._report_benchmark(
stats, wall_time_sec, max_value=max_value, min_value=min_value)
@owner_utils.Owner('tf-model-garden')
def benchmark_accuracy_4x4_tpu_f32_50k_steps(self):
"""Test bert pretraining with 4x4 TPU for 50k steps."""
# This is used for accuracy test.
self._setup()
FLAGS.train_steps = 50000
FLAGS.checkpoint_interval = FLAGS.train_steps
FLAGS.distribution_strategy = 'tpu'
FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_4x4_tpu_bf32_50k_steps')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-model-garden')
def benchmark_accuracy_4x4_tpu_f32_1k_steps(self):
"""Test bert pretraining with 4x4 TPU for 1k steps."""
self._setup()
FLAGS.train_steps = 1000
FLAGS.checkpoint_interval = FLAGS.train_steps
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_4x4_tpu_bf32_1k_steps')
self._run_and_report_benchmark()
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils to set Owner annotations on benchmarks.
@owner_utils.Owner('owner_team/user') can be set at the benchmark class level,
at the benchmark method level, or both.
Runner frameworks can use owner_utils.GetOwner(benchmark_method) to get the
actual owner. Python inheritance for the owner attribute is respected (e.g. a
method-level owner takes precedence over a class-level one).
See owner_utils_test for associated tests and more examples.
Simple example:
===============
class MLBenchmark:
@Owner('example_id')
def benchmark_method_1_gpu(self):
return True
"""
def Owner(owner_name):
"""Sets the owner attribute on a decorated method or class."""
def _Wrapper(func_or_class):
"""Sets the benchmark owner attribute."""
func_or_class.__benchmark__owner__ = owner_name
return func_or_class
return _Wrapper
def GetOwner(benchmark_method_or_class):
"""Gets the inherited owner attribute for this benchmark.
Checks for the existence of __benchmark__owner__ on the method or class. If
it's not present on a bound method, looks for it on the method's bound
instance, which inherits any class-level owner.
Args:
benchmark_method_or_class: A benchmark method or class.
Returns:
The associated owner string if present, otherwise None.
"""
if hasattr(benchmark_method_or_class, '__benchmark__owner__'):
return benchmark_method_or_class.__benchmark__owner__
elif hasattr(benchmark_method_or_class, '__self__'):
if hasattr(benchmark_method_or_class.__self__, '__benchmark__owner__'):
return benchmark_method_or_class.__self__.__benchmark__owner__
return None
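# A minimal usage sketch (illustrative only; the class and owner names are
# hypothetical), mirroring the docstring example above:
#
#   @Owner('team_alpha')
#   class MLBenchmark:
#
#     @Owner('user_bravo')
#     def benchmark_method_1_gpu(self):
#       return True
#
#   GetOwner(MLBenchmark)                            # -> 'team_alpha'
#   GetOwner(MLBenchmark().benchmark_method_1_gpu)   # -> 'user_bravo'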
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.benchmark.owner_utils."""
from absl.testing import absltest
from official.benchmark import owner_utils
@owner_utils.Owner('static_owner')
def static_function(foo=5):
return foo
def static_function_without_owner(foo=5):
return foo
class BenchmarkClassWithoutOwner:
def method_without_owner(self):
return 100
@owner_utils.Owner('method_owner')
def method_with_owner(self):
return 200
@owner_utils.Owner('class_owner')
class SomeBenchmarkClass:
def method_inherited_owner(self):
return 123
@owner_utils.Owner('method_owner')
def method_override_owner(self):
return 345
@owner_utils.Owner('new_class_owner')
class InheritedClass(SomeBenchmarkClass):
def method_inherited_owner(self):
return 456
@owner_utils.Owner('new_method_owner')
def method_override_owner(self):
return 567
class OwnerUtilsTest(absltest.TestCase):
"""Tests to assert for owner decorator functionality."""
def test_owner_tag_missing(self):
self.assertEqual(None, owner_utils.GetOwner(static_function_without_owner))
benchmark_class = BenchmarkClassWithoutOwner()
self.assertEqual(None,
owner_utils.GetOwner(benchmark_class.method_without_owner))
self.assertEqual(100, benchmark_class.method_without_owner())
self.assertEqual('method_owner',
owner_utils.GetOwner(benchmark_class.method_with_owner))
self.assertEqual(200, benchmark_class.method_with_owner())
def test_owner_attributes_static(self):
self.assertEqual('static_owner', owner_utils.GetOwner(static_function))
self.assertEqual(5, static_function(5))
def test_owner_attributes_per_class(self):
level1 = SomeBenchmarkClass()
self.assertEqual('class_owner',
owner_utils.GetOwner(level1.method_inherited_owner))
self.assertEqual(123, level1.method_inherited_owner())
self.assertEqual('method_owner',
owner_utils.GetOwner(level1.method_override_owner))
self.assertEqual(345, level1.method_override_owner())
def test_owner_attributes_inherited_class(self):
level2 = InheritedClass()
self.assertEqual('new_class_owner',
owner_utils.GetOwner(level2.method_inherited_owner))
self.assertEqual(456, level2.method_inherited_owner())
self.assertEqual('new_method_owner',
owner_utils.GetOwner(level2.method_override_owner))
self.assertEqual(567, level2.method_override_owner())
if __name__ == '__main__':
absltest.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for creating PerfZero benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
from absl import logging
from absl.testing import flagsaver
import tensorflow as tf
FLAGS = flags.FLAGS
class PerfZeroBenchmark(tf.test.Benchmark):
"""Common methods used in PerfZero Benchmarks.
Handles the resetting of flags between tests, loading of default_flags,
overriding of defaults. PerfZero (OSS) runs each test in a separate
process reducing some need to reset the flags.
"""
local_flags = None
def __init__(self,
output_dir=None,
default_flags=None,
root_data_dir=None,
flag_methods=None,
tpu=None):
"""Initialize class.
Args:
output_dir: Base directory to store all output for the test.
default_flags: Set of flags to pass to model.
root_data_dir: Optional param used by child classes to look for the
dataset.
flag_methods: Set of flag methods to run during setup.
tpu: (optional) TPU name to use in a TPU benchmark.
"""
if os.getenv('BENCHMARK_OUTPUT_DIR'):
self.output_dir = os.getenv('BENCHMARK_OUTPUT_DIR')
elif output_dir:
self.output_dir = output_dir
else:
self.output_dir = '/tmp'
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or []
if os.getenv('BENCHMARK_TPU'):
resolved_tpu = os.getenv('BENCHMARK_TPU')
elif tpu:
resolved_tpu = tpu
else:
resolved_tpu = None
if resolved_tpu:
# TPU models are expected to accept a --tpu=name flag. PerfZero creates
# the TPU at runtime and passes the TPU's name to this flag.
self.default_flags['tpu'] = resolved_tpu
logging.info('root_data_dir: %s', root_data_dir)
@property
def tpu(self):
return self.default_flags.get('tpu', None)
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def _setup(self):
"""Sets up and resets flags before each test."""
logging.set_verbosity(logging.INFO)
if PerfZeroBenchmark.local_flags is None:
for flag_method in self.flag_methods:
flag_method()
# Parse flags once so their defaults are loaded and can then be overridden
# below; the argv list cannot be empty.
flags.FLAGS(['foo'])
# Overrides flag values with defaults for the class of tests.
for k, v in self.default_flags.items():
setattr(FLAGS, k, v)
saved_flag_values = flagsaver.save_flag_values()
PerfZeroBenchmark.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(PerfZeroBenchmark.local_flags)
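# A minimal subclassing sketch (illustrative only; `my_model`, its flag
# definitions, and the returned stats dict are hypothetical). The runner
# invokes `benchmark_*` methods, and `_setup()` re-applies `default_flags`
# before each one:
#
#   class MyModelBenchmark(PerfZeroBenchmark):
#
#     def __init__(self, output_dir=None, **kwargs):
#       super().__init__(
#           output_dir=output_dir,
#           default_flags={'batch_size': 64},
#           flag_methods=[my_model.define_flags],
#           **kwargs)
#
#     def benchmark_1_gpu(self):
#       self._setup()
#       FLAGS.num_gpus = 1
#       stats = my_model.run(FLAGS)
#       self.report_benchmark(iters=-1, wall_time=stats['wall_time_sec'])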
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnet50 Keras core benchmark."""
import tempfile
import time
import tensorflow as tf
import tensorflow_datasets as tfds
from official.benchmark import perfzero_benchmark
def _decode_and_center_crop(image_bytes):
"""Crops to center of image with padding then scales image_size."""
shape = tf.image.extract_jpeg_shape(image_bytes)
image_height, image_width, image_size = shape[0], shape[1], 224
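# Standard ResNet-style eval crop: the crop covers image_size / (image_size +
# 32) of the shorter side (224 / 256 = 87.5% here), centered, before resizing
# to image_size x image_size.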
padded_center_crop_size = tf.cast(
((image_size / (image_size + 32)) *
tf.cast(tf.minimum(image_height, image_width), tf.float32)),
tf.int32,
)
offset_height = ((image_height - padded_center_crop_size) + 1) // 2
offset_width = ((image_width - padded_center_crop_size) + 1) // 2
crop_window = tf.stack([
offset_height, offset_width, padded_center_crop_size,
padded_center_crop_size
])
image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
return tf.image.resize(image, [image_size, image_size], method="bicubic")
def _preprocessing(data):
return (
tf.cast(_decode_and_center_crop(data["image"]), tf.float32),
data["label"],
)
def _run_benchmark():
"""Runs a resnet50 compile/fit() call and returns the wall time."""
tmp_dir = tempfile.mkdtemp()
start_time = time.time()
batch_size = 64
dataset = tfds.load(
"imagenette",
decoders={"image": tfds.decode.SkipDecoding()},
split="train",
)
dataset = (
dataset.cache().repeat(
2
) # Artificially increase time per epoch to make it easier to measure
.map(_preprocessing,
num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(
batch_size).prefetch(1))
with tf.distribute.MirroredStrategy().scope():
model = tf.keras.applications.ResNet50(weights=None)
model.compile(
optimizer=tf.compat.v1.mixed_precision
.enable_mixed_precision_graph_rewrite(
tf.keras.optimizers.Adam(), loss_scale="dynamic"),
loss="sparse_categorical_crossentropy",
)
tb_cbk = tf.keras.callbacks.TensorBoard(
f"{tmp_dir}/{tf.__version__}", profile_batch=300)
model.fit(dataset, verbose=2, epochs=3, callbacks=[tb_cbk])
end_time = time.time()
return end_time - start_time
class Resnet50KerasCoreBenchmark(perfzero_benchmark.PerfZeroBenchmark):
"""Resnet50 Keras core benchmarks."""
def benchmark_1_gpu(self):
wall_time = _run_benchmark()
self.report_benchmark(iters=-1, wall_time=wall_time)
def benchmark_1_gpu_avg_3(self):
num_trials = 3
wall_times = []
for _ in range(num_trials):
wall_times.append(_run_benchmark())
avg_wall_time = sum(wall_times) / float(len(wall_times))
self.report_benchmark(iters=-1, wall_time=avg_wall_time)
def benchmark_1_gpu_max_3(self):
num_trials = 3
wall_times = []
for _ in range(num_trials):
wall_times.append(_run_benchmark())
max_wall_time = max(wall_times)
self.report_benchmark(iters=-1, wall_time=max_wall_time)
if __name__ == "__main__":
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes CTL benchmarks and accuracy tests."""
# pylint: disable=line-too-long,g-bad-import-order
from __future__ import print_function
import os # pylint: disable=unused-import
import time
from absl import flags
import tensorflow as tf
from official.benchmark import owner_utils
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import resnet_ctl_imagenet_main
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.benchmark import benchmark_wrappers
from official.utils.flags import core as flags_core
IMAGENET_DEFAULT_DATA_PATH = 'gs://mlcompass-data/imagenet/imagenet-2012-tfrecord'
# TODO(emizan) Remove comment once you make sure that dataset caching has
# similar or better performance than the uncached local SSD dataset below.
# IMAGENET_EXP_DATA_PATH = 'gs://mlcompass-data/imagenet/imagenet-2012-tfrecord'
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77
FLAGS = flags.FLAGS
class CtlBenchmark(PerfZeroBenchmark):
"""Base benchmark class with methods to simplify testing."""
def __init__(self,
output_dir=None,
default_flags=None,
flag_methods=None,
**kwargs):
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or {}
super(CtlBenchmark, self).__init__(
output_dir=output_dir,
default_flags=self.default_flags,
flag_methods=self.flag_methods,
**kwargs)
def _report_benchmark(self,
stats,
wall_time_sec,
top_1_max=None,
top_1_min=None,
total_batch_size=None,
log_steps=None,
warmup=1,
start_time_sec=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds.
top_1_max: highest passing level for top_1 accuracy.
top_1_min: lowest passing level for top_1 accuracy.
total_batch_size: Global batch-size.
log_steps: How often the log was created for stats['step_timestamp_log'].
warmup: number of entries in stats['step_timestamp_log'] to ignore.
start_time_sec: the start time of the program in seconds since epoch.
"""
metrics = []
if 'eval_acc' in stats:
metrics.append({
'name': 'accuracy_top_1',
'value': stats['eval_acc'],
'min_value': top_1_min,
'max_value': top_1_max
})
metrics.append({'name': 'eval_loss', 'value': stats['eval_loss']})
metrics.append({
'name': 'top_1_train_accuracy',
'value': stats['train_acc']
})
metrics.append({'name': 'train_loss', 'value': stats['train_loss']})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup + 1):
# The first entry in the time_log marks the start of step 0; the remaining
# entries mark the end of each recorded step.
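# For example, with a global batch size of 1024 and 100 steps elapsed over
# 20 seconds, this reports 1024 * (100 / 20) = 5120 examples per second.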
time_log = stats['step_timestamp_log']
steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
if 'avg_exp_per_second' in stats:
metrics.append({
'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']
})
if start_time_sec and 'step_timestamp_log' in stats:
time_log = stats['step_timestamp_log']
# time_log[0] is recorded at the beginning of the first step.
startup_time = time_log[0].timestamp - start_time_sec
metrics.append({'name': 'startup_time', 'value': startup_time})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class Resnet50CtlAccuracy(CtlBenchmark):
"""Benchmark accuracy tests for ResNet50 in CTL."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
flag_methods = [common.define_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50CtlAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Test Keras model with eager, 8 GPUs with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 256 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_ctl_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50CtlAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY,
total_batch_size=FLAGS.batch_size,
log_steps=100,
start_time_sec=start_time_sec)
class Resnet50CtlBenchmarkBase(CtlBenchmark):
"""Resnet50 benchmarks."""
def __init__(self, output_dir=None, default_flags=None, **kwargs):
flag_methods = [common.define_keras_flags]
super(Resnet50CtlBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
**kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_ctl_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
# `warmup` is the number of logged step-time entries excluded from the
# performance report; by default the first FLAGS.log_steps interval is
# excluded.
super(Resnet50CtlBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
warmup=1,
start_time_sec=start_time_sec)
def benchmark_1_gpu_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test Keras model with 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
"""Test Keras model with 1 GPU with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.batch_size = 256
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
def benchmark_1_gpu_eager(self):
"""Test Keras model with 1 GPU in pure eager mode."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_eager')
FLAGS.batch_size = 120
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.single_l2_loss_op = True
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_eager(self):
"""Test Keras model with 1 GPU with fp16 and pure eager mode."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_eager')
FLAGS.batch_size = 232
FLAGS.dtype = 'fp16'
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.single_l2_loss_op = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Test Keras model with 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp32_no_tf32(self):
"""Test Keras model with 8 GPUs.Runs in FP32 by disabling TF32 execution."""
self._setup()
tf.config.experimental.enable_tensor_float_32_execution(False)
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp32_no_tf32')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Test Keras model with 8 GPUs with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16(self):
"""Test Keras model with 8 GPUs with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.dtype = 'fp16'
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_8_gpu_eager(self):
"""Test Keras model with 8 GPUs, eager, fp32."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_8_gpu_eager_fp16(self):
"""Test Keras model with 8 GPUs, eager, fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager_fp16')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def _set_df_common(self):
FLAGS.steps_per_loop = 500
FLAGS.train_epochs = 2
FLAGS.train_steps = None
FLAGS.skip_eval = True
FLAGS.enable_eager = True
FLAGS.enable_tensorboard = False
FLAGS.distribution_strategy = 'tpu'
FLAGS.report_accuracy_metrics = False
FLAGS.log_steps = 50
FLAGS.single_l2_loss_op = True
FLAGS.use_tf_function = True
FLAGS.enable_checkpoint_and_export = False
FLAGS.data_dir = IMAGENET_DEFAULT_DATA_PATH
def benchmark_2x2_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 1024
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_2x2_tpu_bf16_mlir(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 1024
FLAGS.dtype = 'bf16'
tf.config.experimental.enable_mlir_bridge()
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16_mlir')
self._run_and_report_benchmark()
def benchmark_4x4_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.train_epochs = 4
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16')
FLAGS.data_dir = IMAGENET_DEFAULT_DATA_PATH
FLAGS.training_dataset_cache = True
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_4x4_tpu_bf16_mlir(self):
"""Run resnet model on 4x4 with the MLIR Bridge enabled."""
self._setup()
self._set_df_common()
FLAGS.batch_size = 4096
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16_mlir')
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark()
def benchmark_8x8_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_8x8_tpu_bf16_mlir(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16_mlir')
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark()
def benchmark_8x8_tpu(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu')
self._run_and_report_benchmark()
@owner_utils.Owner('tf-graph-compiler')
def benchmark_8x8_tpu_mlir(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_mlir')
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark()
def benchmark_8x16_tpu_bf16(self):
self._setup()
self._set_df_common()
FLAGS.batch_size = 8192
FLAGS.dtype = 'bf16'
FLAGS.model_dir = self._get_model_dir('benchmark_8x16_tpu_bf16')
self._run_and_report_benchmark()
def fill_report_object(self, stats):
super(Resnet50CtlBenchmarkBase, self).fill_report_object(
stats, total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
class Resnet50CtlBenchmarkSynth(Resnet50CtlBenchmarkBase):
"""Resnet50 synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['use_synthetic_data'] = True
def_flags['train_steps'] = 110
def_flags['steps_per_loop'] = 10
def_flags['log_steps'] = 10
super(Resnet50CtlBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags, **kwargs)
class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
"""Resnet50 real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags[
'data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 110
def_flags['steps_per_loop'] = 10
def_flags['log_steps'] = 10
super(Resnet50CtlBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags, **kwargs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes RetinaNet benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=g-bad-import-order
import json
import time
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import perfzero_benchmark
from official.legacy.detection import main as detection
from official.legacy.detection.configs import base_config
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
# pylint: disable=line-too-long
COCO_TRAIN_DATA = 'gs://tf-perfzero-data/coco/train*'
COCO_EVAL_DATA = 'gs://tf-perfzero-data/coco/val*'
COCO_EVAL_JSON = 'gs://tf-perfzero-data/coco/instances_val2017.json'
RESNET_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07'
# pylint: enable=line-too-long
class BenchmarkBase(perfzero_benchmark.PerfZeroBenchmark):
"""Base class to hold methods common to test classes."""
def __init__(self, **kwargs):
super(BenchmarkBase, self).__init__(**kwargs)
self.timer_callback = None
def _report_benchmark(self, stats, start_time_sec, wall_time_sec, min_ap,
max_ap, warmup):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from Detection models with known entries.
start_time_sec: the start of the benchmark execution in seconds
wall_time_sec: the duration of the benchmark execution in seconds
min_ap: Minimum detection AP constraint to verify correctness of the
model.
max_ap: Maximum detection AP accuracy constraint to verify correctness of
the model.
warmup: Number of time log entries to ignore when computing examples/sec.
"""
metrics = [{
'name': 'total_loss',
'value': stats['total_loss'],
}]
if self.timer_callback:
metrics.append({
'name': 'exp_per_second',
'value': self.timer_callback.get_examples_per_sec(warmup)
})
metrics.append({
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(start_time_sec)
})
else:
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
if 'eval_metrics' in stats:
metrics.append({
'name': 'AP',
'value': stats['AP'],
'min_value': min_ap,
'max_value': max_ap,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=stats['total_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class DetectionBenchmarkBase(BenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, **kwargs):
self.train_data_path = COCO_TRAIN_DATA
self.eval_data_path = COCO_EVAL_DATA
self.eval_json_path = COCO_EVAL_JSON
self.resnet_checkpoint_path = RESNET_CHECKPOINT_PATH
super(DetectionBenchmarkBase, self).__init__(**kwargs)
def _run_detection_main(self):
"""Starts detection job."""
if self.timer_callback:
FLAGS.log_steps = 0 # prevent detection.run from adding the same callback
return detection.run(callbacks=[self.timer_callback])
else:
return detection.run()
class DetectionAccuracy(DetectionBenchmarkBase):
"""Accuracy test for RetinaNet model.
Tests RetinaNet detection task model accuracy. The naming
convention of the test cases below follows the
`benchmark_(number of gpus)_gpu_(dataset type)` format.
"""
def __init__(self, model, per_gpu_batch_size=8, **kwargs):
self.model = model
self.per_gpu_batch_size = per_gpu_batch_size
super(DetectionAccuracy, self).__init__(**kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
params,
min_ap=0.325,
max_ap=0.35,
do_eval=True,
warmup=1):
"""Starts Detection accuracy benchmark test."""
FLAGS.params_override = json.dumps(params)
# Need timer callback to measure performance
self.timer_callback = keras_utils.TimeHistory(
batch_size=params['train']['batch_size'],
log_steps=FLAGS.log_steps,
)
start_time_sec = time.time()
FLAGS.mode = 'train'
summary, _ = self._run_detection_main()
wall_time_sec = time.time() - start_time_sec
if do_eval:
FLAGS.mode = 'eval'
eval_metrics = self._run_detection_main()
summary.update(eval_metrics)
summary['total_steps'] = params['train']['total_steps']
self._report_benchmark(summary, start_time_sec, wall_time_sec, min_ap,
max_ap, warmup)
def _setup(self):
super(DetectionAccuracy, self)._setup()
FLAGS.model = self.model
def _params(self):
return {
'architecture': {
'use_bfloat16': True,
},
'train': {
'batch_size': 64,
'iterations_per_loop': 100,
'total_steps': 22500,
'train_file_pattern': self.train_data_path,
'checkpoint': {
'path': self.resnet_checkpoint_path,
'prefix': 'resnet50/'
},
# Speed up ResNet training when loading from the checkpoint.
'frozen_variable_prefix': base_config.RESNET_FROZEN_VAR_PREFIX,
},
'eval': {
'batch_size': 8,
'eval_samples': 5000,
'val_json_file': self.eval_json_path,
'eval_file_pattern': self.eval_data_path,
},
}
@flagsaver.flagsaver
def benchmark_8_gpu_coco(self):
"""Run RetinaNet model accuracy test with 8 GPUs."""
self._setup()
params = self._params()
FLAGS.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_coco')
FLAGS.strategy_type = 'mirrored'
self._run_and_report_benchmark(params)
class DetectionBenchmarkReal(DetectionAccuracy):
"""Short benchmark performance tests for a detection model.
Tests detection performance in different accelerator configurations.
The naming convention of the test cases below follows the
`benchmark_(number of gpus)_gpu` format.
"""
def _setup(self):
super(DetectionBenchmarkReal, self)._setup()
# Use negative value to avoid saving checkpoints.
FLAGS.save_checkpoint_freq = -1
@flagsaver.flagsaver
def benchmark_8_gpu_coco(self):
"""Run detection model accuracy test with 8 GPUs."""
self._setup()
params = self._params()
params['architecture']['use_bfloat16'] = False
params['train']['total_steps'] = 1875 # One epoch.
params['train']['batch_size'] = 8 * self.per_gpu_batch_size
# iterations_per_loop must be one, otherwise the reported number of examples
# per second would be wrong. Callbacks are currently only invoked per batch
# when each host loop runs a single step, and performance in this mode may be
# lower than with iterations_per_loop > 1.
# Related bug: b/135933080
params['train']['iterations_per_loop'] = 1
params['eval']['eval_samples'] = 8
FLAGS.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('real_benchmark_8_gpu_coco')
FLAGS.strategy_type = 'mirrored'
self._run_and_report_benchmark(params)
@flagsaver.flagsaver
def benchmark_1_gpu_coco(self):
"""Run detection model accuracy test with 1 GPU."""
self._setup()
params = self._params()
params['architecture']['use_bfloat16'] = False
params['train']['batch_size'] = 1 * self.per_gpu_batch_size
params['train']['total_steps'] = 200
params['train']['iterations_per_loop'] = 1
params['eval']['eval_samples'] = 8
FLAGS.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('real_benchmark_1_gpu_coco')
FLAGS.strategy_type = 'one_device'
self._run_and_report_benchmark(params)
@flagsaver.flagsaver
def benchmark_xla_1_gpu_coco(self):
"""Run detection model accuracy test with 1 GPU and XLA enabled."""
self._setup()
params = self._params()
params['architecture']['use_bfloat16'] = False
params['train']['batch_size'] = 1 * self.per_gpu_batch_size
params['train']['total_steps'] = 200
params['train']['iterations_per_loop'] = 1
params['eval']['eval_samples'] = 8
FLAGS.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('real_benchmark_xla_1_gpu_coco')
FLAGS.strategy_type = 'one_device'
FLAGS.enable_xla = True
self._run_and_report_benchmark(params)
@flagsaver.flagsaver
def benchmark_2x2_tpu_coco(self):
"""Run detection model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 64
params['train']['total_steps'] = 1875 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_2x2_tpu_coco')
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_4x4_tpu_coco(self):
"""Run detection model accuracy test with 4x4 TPU."""
self._setup()
params = self._params()
params['train']['batch_size'] = 256
params['train']['total_steps'] = 10 * 469 # 10 epochs.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_4x4_tpu_coco')
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=1)
@flagsaver.flagsaver
def benchmark_2x2_tpu_coco_mlir(self):
"""Run detection model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 64
params['train']['total_steps'] = 1875 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_2x2_tpu_coco_mlir')
FLAGS.strategy_type = 'tpu'
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_4x4_tpu_coco_mlir(self):
"""Run RetinaNet model accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['train']['batch_size'] = 256
params['train']['total_steps'] = 469 # One epoch.
params['train']['iterations_per_loop'] = 500
FLAGS.model_dir = self._get_model_dir('real_benchmark_4x4_tpu_coco_mlir')
FLAGS.strategy_type = 'tpu'
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
@flagsaver.flagsaver
def benchmark_2x2_tpu_spinenet_coco(self):
"""Run detection model with SpineNet backbone accuracy test with 4 TPUs."""
self._setup()
params = self._params()
params['architecture']['backbone'] = 'spinenet'
params['architecture']['multilevel_features'] = 'identity'
params['architecture']['use_bfloat16'] = False
params['train']['batch_size'] = 64
params['train']['total_steps'] = 1875 # One epoch.
params['train']['iterations_per_loop'] = 500
params['train']['checkpoint']['path'] = ''
FLAGS.model_dir = self._get_model_dir(
'real_benchmark_2x2_tpu_spinenet_coco')
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
class RetinanetBenchmarkReal(DetectionBenchmarkReal):
"""Short benchmark performance tests for Retinanet model."""
def __init__(self, **kwargs):
super(RetinanetBenchmarkReal, self).__init__(model='retinanet',
per_gpu_batch_size=8,
**kwargs)
class MaskRCNNBenchmarkReal(DetectionBenchmarkReal):
"""Short benchmark performance tests for Mask RCNN model."""
def __init__(self, **kwargs):
super(MaskRCNNBenchmarkReal, self).__init__(model='mask_rcnn',
per_gpu_batch_size=4,
**kwargs)
class ShapeMaskBenchmarkReal(DetectionBenchmarkReal):
"""Short benchmark performance tests for ShapeMask model."""
def __init__(self, **kwargs):
super(ShapeMaskBenchmarkReal, self).__init__(model='shapemask',
per_gpu_batch_size=4,
**kwargs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Shakespeare (LSTM) benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark.models.shakespeare import shakespeare_main
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.benchmark import benchmark_wrappers
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
SHAKESPEARE_TRAIN_DATA = 'shakespeare/shakespeare.txt'
TMP_DIR = os.getenv('TMPDIR')
FLAGS = flags.FLAGS
class ShakespeareBenchmarkBase(PerfZeroBenchmark):
"""Base class for Shakespeare (LSTM) benchmark and accuracy tests."""
def __init__(self, output_dir=None, default_flags=None, root_data_dir=None):
super(ShakespeareBenchmarkBase, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=[shakespeare_main.define_flags])
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_train_min=0.91,
top_1_train_max=0.94,
warmup=1,
log_steps=100):
"""Report benchmark results by writing to local protobuf file.
Average epoch time is calculated by skipping the first epoch. This average
ignores time spent between epochs, since it is computed from the begin and
end times of each epoch. To skip the accuracy check, set
`top_1_train_min=None`.
Args:
top_1_train_min: lowest passing value.
top_1_train_max: highest passing value.
warmup: number of entries in `timestamp_log` to ignore.
log_steps: How often the log was created for `timestamp_log`.
"""
total_batch_size = FLAGS.batch_size
metrics = []
start_time_sec = time.time()
stats = shakespeare_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
if top_1_train_min:
metrics.append({
'name': 'accuracy_top_1_train',
'value': stats['history']['RecallAt1'][-1],
'min_value': top_1_train_min,
'max_value': top_1_train_max
})
# Look for the time history callback which was used during keras.fit
for callback in stats['callbacks']:
if isinstance(callback, keras_utils.TimeHistory):
epoch_timings = callback.epoch_runtime_log
if len(epoch_timings) > 1:
average_time = sum(epoch_timings[1:]) / len(epoch_timings[1:])
metrics.append({'name': 'avg_epoch_time', 'value': average_time})
# First entry in timestamp_log is the start of step 1. The rest of the
# entries are the end of each step recorded.
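# For example, with batch_size 64, log_steps 100, 11 logged timestamps and
# warmup 1, num_examples = 64 * 100 * (11 - 1 - 1) = 57600.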
time_log = callback.timestamp_log
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
if elapsed > 0:
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
class ShakespeareAccuracy(ShakespeareBenchmarkBase):
"""Shakespeare accuracy tests.
This is not an ideal test. The best we can use for the accuracy check is to
validate top_1 of the training set. At batch size 64 the top_1 training
stabilizes to ~0.92 around 40-45 epochs.
"""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Shakespeare accuracy tests.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
super(ShakespeareAccuracy, self).__init__(
output_dir=output_dir, root_data_dir=root_data_dir)
def benchmark_cpu(self):
"""Benchmark cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
self._run_and_report_benchmark()
def benchmark_cpu_no_ds_run_eagerly(self):
"""Benchmark cpu without distribution strategies and run eagerly."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds(self):
"""Benchmark 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds_run_eagerly(self):
"""Benchmark 1 gpu without distribution strategies and run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Benchmark 1 gpu w/xla."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Benchmark 8 gpu.
This test is for accuracy, not scaling. The batch size is not scaled to
the number of GPUs.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.training_data = self.train_data
FLAGS.batch_size = 64
FLAGS.train_epochs = 43
FLAGS.model_dir = ''
self._run_and_report_benchmark()
class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
"""Benchmark accuracy tests."""
def __init__(self, output_dir=None, root_data_dir=TMP_DIR, **kwargs):
"""Benchmark tests w/Keras.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
def_flags = {}
def_flags['training_data'] = self.train_data
def_flags['model_dir'] = ''
def_flags['train_epochs'] = 4
def_flags['log_steps'] = 50
super(ShakespeareKerasBenchmarkReal, self).__init__(
output_dir=output_dir,
root_data_dir=root_data_dir,
default_flags=def_flags)
def benchmark_cpu(self):
"""Benchmark cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_cpu_no_ds_run_eagerly(self):
"""Benchmark cpu without distribution strategy and run eagerly."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_cpu_no_ds(self):
"""Benchmark cpu without distribution strategy."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_cpu_no_ds_force_v2(self):
"""Benchmark cpu no ds, and force v2."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_cudnn(self):
"""Benchmark 1 gpu with CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.cudnn = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds(self):
"""Benchmark 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_ds_run_eagerly(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_no_cudnn(self):
"""Benchmark 1 gpu w/xla and CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 64
FLAGS.cudnn = False
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Benchmark 8 gpu."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
self._run_and_report_benchmark()
def benchmark_8_gpu_no_cudnn(self):
"""Benchmark 8 gpu with CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
FLAGS.cudnn = False
self._run_and_report_benchmark()
def benchmark_xla_8_gpu(self):
"""Benchmark 8 gpu w/xla."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_no_cudnn(self):
"""Benchmark 8 gpu w/xla and CuDNN disabled."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 64 * 8
FLAGS.log_steps = 10
FLAGS.cudnn = False
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def _run_and_report_benchmark(self):
"""Run and report benchmark."""
super(ShakespeareKerasBenchmarkReal, self)._run_and_report_benchmark(
top_1_train_min=None, log_steps=FLAGS.log_steps)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to setup a tf scan e2e benchmark."""
import time
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from official.benchmark import perfzero_benchmark
# pylint: disable=invalid-name
# pylint: disable=no-value-for-parameter
# pylint: disable=unused-variable
def gen_batches(num_batches, batch_size, units):
for _ in range(num_batches):
x = np.random.random((batch_size, 20, units))
y = np.random.randint(1, units, size=(batch_size, 20))
yield x, y
class MyModel(tf.keras.models.Model):
"""Test model."""
def __init__(self, units):
super().__init__()
self._tf_layers = {}
self.units = units
self.transition_param = self.add_weight(
name="transition_param", shape=(units, units))
self.optimizer = tf.keras.optimizers.Adam()
self._training = False
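# The method below resembles a linear-chain CRF forward pass: tf.scan carries
# the running log-alphas across timesteps, and each step adds the emission
# scores to a log-sum-exp over the transition scores.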
def _loss_fn_with_scan(self, inputs, transition_params):
first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1])
first_input = tf.squeeze(first_input, [1])
rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])
rest_of_input = tf.transpose(rest_of_input, [1, 0, 2])
transition_params = tf.expand_dims(transition_params, 0)
def _scan_fn(_state, _inputs):
_state = tf.expand_dims(_state, 2)
transition_scores = _state + transition_params
new_alphas = _inputs + tf.reduce_logsumexp(transition_scores, [1])
return new_alphas
all_alphas = tf.transpose(
tf.scan(_scan_fn, rest_of_input, first_input), [1, 0, 2])
# add first state for sequences of length 1
all_alphas = tf.concat([tf.expand_dims(first_input, 1), all_alphas], 1)
return all_alphas
def _loss(self, x, y):
logits = tf.cast(x, dtype=tf.float32)
loss = self._loss_fn_with_scan(logits, self.transition_param)
return tf.reduce_mean(loss)
@tf.function
def train_on_batch(self, *args):
with tf.GradientTape(persistent=True) as tape:
loss = self._loss(*args)
grads = tape.gradient(loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
return loss
def train(self, epochs, batch_size, num_batches):
data_generator_iter = gen_batches(num_batches, batch_size, self.units)
sample_x, sample_y = next(data_generator_iter)
self.train_on_batch(sample_x, sample_y)
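# Note: gen_batches returns a single finite generator, so the warm-up call
# above plus the first epoch below consume all of its batches; later epochs
# iterate over an exhausted iterator.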
self._training = True
progress_bar = tqdm(range(epochs), desc="Epochs")
for epoch in progress_bar:
for batch_x, batch_y in data_generator_iter:
loss = self.train_on_batch(batch_x, batch_y)
progress_bar.update(1)
progress_bar.set_postfix({"loss": f"{loss.numpy():.3f}"})
def _run_benchmark(model):
"""Runs the benchmark."""
np.random.seed(123)
num_batches = 5000
batch_size = 32
epochs = 100
start_time = time.time()
model.train(epochs, batch_size, num_batches)
end_time = time.time()
wall_time = end_time - start_time
return wall_time
class TfScanE2EBenchmark(perfzero_benchmark.PerfZeroBenchmark):
"""Scan E2E benchmark."""
def benchmark_cpu(self):
units = 64
model = MyModel(units)
wall_time = _run_benchmark(model)
self.report_benchmark(iters=-1, wall_time=wall_time)
def benchmark_cpu_avg_4(self):
units = 64
model = MyModel(units)
num_trials = 4
wall_times = []
for _ in range(num_trials):
wall_times.append(_run_benchmark(model))
avg_wall_time = sum(wall_times) / float(len(wall_times))
self.report_benchmark(iters=-1, wall_time=avg_wall_time)
if __name__ == "__main__":
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmark TF-vision saved models on a TFRecord dataset."""
import time
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
FLAGS = flags.FLAGS
flags.DEFINE_string('saved_model_path', None, 'Path to saved model.')
flags.DEFINE_string('tf_examples_path', None, 'Path to TF examples.')
flags.DEFINE_integer('num_samples', 100, 'Number of samples.')
flags.DEFINE_integer('num_ignore_samples', 5,
('Number of initial samples to ignore. '
'The first few samples (usually 1) are used by '
'tensorflow to optimize the tf.function call'))
flags.mark_flag_as_required('saved_model_path')
flags.mark_flag_as_required('tf_examples_path')
flags.mark_flag_as_required('num_samples')
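# Example invocation (the script name and paths are illustrative
# placeholders; the flags are the ones defined above):
#   python3 saved_model_benchmark.py \
#     --saved_model_path=/path/to/saved_model \
#     --tf_examples_path='/path/to/examples*.tfrecord' \
#     --num_samples=100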
def main(_) -> None:
files = tf.data.Dataset.list_files(FLAGS.tf_examples_path)
logging.info('Found %d files.', len(files))
dataset = tf.data.TFRecordDataset(files)
model = tf.saved_model.load(FLAGS.saved_model_path)
detect_fn = model.signatures['serving_default']
time_taken = 0.0
for (i, sample) in enumerate(dataset.take(FLAGS.num_samples)):
example = tf.train.Example()
example.ParseFromString(sample.numpy())
image_encoded = example.features.feature['image/encoded']
image = tf.io.decode_image(image_encoded.bytes_list.value[0])
image = image[tf.newaxis]
start_time = time.time()
_ = detect_fn(image)
sample_time = time.time() - start_time
if (i % 10) == 0:
logging.info('Finished sample %d %.2f ms', i, sample_time * 1000.0)
if i < FLAGS.num_ignore_samples:
continue
time_taken += sample_time
num_benchmark_samples = FLAGS.num_samples - FLAGS.num_ignore_samples
logging.info('Per-sample time for {} samples = {:.2f}ms'.format(
num_benchmark_samples, 1000.0 * time_taken / num_benchmark_samples))
if __name__ == '__main__':
app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a memory usage benchmark for a Tensorflow Hub model.
Loads a SavedModel and records memory usage.
"""
import functools
import time
from absl import flags
import tensorflow as tf
import tensorflow_hub as hub
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
FLAGS = flags.FLAGS
class TfHubMemoryUsageBenchmark(PerfZeroBenchmark):
"""A benchmark measuring memory usage for a given TF Hub SavedModel."""
def __init__(self,
hub_model_handle_list=None,
output_dir=None,
default_flags=None,
root_data_dir=None,
**kwargs):
super(TfHubMemoryUsageBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags, **kwargs)
if hub_model_handle_list:
for hub_model_handle in hub_model_handle_list.split(';'):
# Converts a model handle of the form
# https://tfhub.dev/google/nnlm-en-dim128/1 to valid python method name
# like google_nnlm_en_dim128_1.
hub_model_method_name = hub_model_handle.replace(
'https://tfhub.dev', '').replace('/', '_').replace('-',
'_').strip('_')
setattr(
self, 'benchmark_' + hub_model_method_name,
functools.partial(self.benchmark_memory_usage, hub_model_handle))
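# For example (illustrative only), constructing the benchmark with
# hub_model_handle_list='https://tfhub.dev/google/nnlm-en-dim128/1'
# dynamically adds a `benchmark_google_nnlm_en_dim128_1` method that loads
# that handle and reports the load wall time.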
def benchmark_memory_usage(
self, hub_model_handle='https://tfhub.dev/google/nnlm-en-dim128/1'):
start_time_sec = time.time()
self.load_model(hub_model_handle)
wall_time_sec = time.time() - start_time_sec
metrics = []
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
def load_model(self, hub_model_handle):
"""Loads a TF Hub module."""
hub.load(hub_model_handle)
if __name__ == '__main__':
tf.test.main()