Merge pull request #1 from tensorflow/master

new pull

Merge pull request #1 from tensorflow/master
new pull
f16a7b5b · vedanshu · GitHub · 8e9296ff · 8f58f396 · 8e9296ff
Unverified Commit f16a7b5b authored May 04, 2021 by vedanshu Committed by GitHub May 04, 2021
20 changed files
--- a/official/benchmark/models/cifar_preprocessing.py
+++ b/official/benchmark/models/cifar_preprocessing.py
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Provides utilities to Cifar-10 dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-from absl import logging
-import tensorflow as tf
-
-from official.vision.image_classification.resnet import imagenet_preprocessing
-
-HEIGHT = 32
-WIDTH = 32
-NUM_CHANNELS = 3
-_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS
-# The record is the image plus a one-byte label
-_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1
-
-# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
-NUM_IMAGES = {
-    'train': 50000,
-    'validation': 10000,
-}
-_NUM_DATA_FILES = 5
-NUM_CLASSES = 10
-
-
-def parse_record(raw_record, is_training, dtype):
-  """Parses a record containing a training example of an image.
-
-  The input record is parsed into a label and image, and the image is passed
-  through preprocessing steps (cropping, flipping, and so on).
-
-  The label is returned as an int32 class index; it is not one-hot encoded.
-
-  Args:
-    raw_record: scalar Tensor tf.string containing one raw fixed-length
-      record: a one-byte label followed by the image bytes.
-    is_training: A boolean denoting whether the input is for training.
-    dtype: Data type to use for input images.
-
-  Returns:
-    Tuple with processed image tensor and int32 label tensor.
-  """
-  # Convert bytes to a vector of uint8 that is record_bytes long.
-  record_vector = tf.io.decode_raw(raw_record, tf.uint8)
-
-  # The first byte represents the label, which we convert from uint8 to
-  # int32. It is kept as a class index and is not one-hot encoded.
-  label = tf.cast(record_vector[0], tf.int32)
-
-  # The remaining bytes after the label represent the image, which we reshape
-  # from [depth * height * width] to [depth, height, width].
-  depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
-                           [NUM_CHANNELS, HEIGHT, WIDTH])
-
-  # Convert from [depth, height, width] to [height, width, depth], and cast as
-  # float32.
-  image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32)
-
-  image = preprocess_image(image, is_training)
-  image = tf.cast(image, dtype)
-
-  return image, label
-
-
-def preprocess_image(image, is_training):
-  """Preprocess a single image of layout [height, width, depth]."""
-  if is_training:
-    # Resize the image to add four extra pixels on each side.
-    image = tf.image.resize_with_crop_or_pad(
-        image, HEIGHT + 8, WIDTH + 8)
-
-    # Randomly crop a [HEIGHT, WIDTH] section of the image.
-    image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS])
-
-    # Randomly flip the image horizontally.
-    image = tf.image.random_flip_left_right(image)
-
-  # Subtract off the mean and divide by the standard deviation of the pixels.
-  image = tf.image.per_image_standardization(image)
-  return image
-
-
-def get_filenames(is_training, data_dir):
-  """Returns a list of filenames."""
-  assert tf.io.gfile.exists(data_dir), (
-      'Run cifar10_download_and_extract.py first to download and extract the '
-      'CIFAR-10 data.')
-
-  if is_training:
-    return [
-        os.path.join(data_dir, 'data_batch_%d.bin' % i)
-        for i in range(1, _NUM_DATA_FILES + 1)
-    ]
-  else:
-    return [os.path.join(data_dir, 'test_batch.bin')]
-
-
-def input_fn(is_training,
-             data_dir,
-             batch_size,
-             dtype=tf.float32,
-             datasets_num_private_threads=None,
-             parse_record_fn=parse_record,
-             input_context=None,
-             drop_remainder=False):
-  """Input function which provides batches for train or eval.
-
-  Args:
-    is_training: A boolean denoting whether the input is for training.
-    data_dir: The directory containing the input data.
-    batch_size: The number of samples per batch.
-    dtype: Data type to use for images/features
-    datasets_num_private_threads: Number of private threads for tf.data.
-    parse_record_fn: Function to use for parsing the records.
-    input_context: A `tf.distribute.InputContext` object passed in by
-      `tf.distribute.Strategy`.
-    drop_remainder: A boolean indicates whether to drop the remainder of the
-      batches. If True, the batch dimension will be static.
-
-  Returns:
-    A dataset that can be used for iteration.
-  """
-  filenames = get_filenames(is_training, data_dir)
-  dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
-
-  if input_context:
-    logging.info(
-        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
-        input_context.input_pipeline_id, input_context.num_input_pipelines)
-    dataset = dataset.shard(input_context.num_input_pipelines,
-                            input_context.input_pipeline_id)
-
-  return imagenet_preprocessing.process_record_dataset(
-      dataset=dataset,
-      is_training=is_training,
-      batch_size=batch_size,
-      shuffle_buffer=NUM_IMAGES['train'],
-      parse_record_fn=parse_record_fn,
-      dtype=dtype,
-      datasets_num_private_threads=datasets_num_private_threads,
-      drop_remainder=drop_remainder
-  )
--- a/official/benchmark/models/resnet_cifar_main.py
+++ b/official/benchmark/models/resnet_cifar_main.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a ResNet model on the Cifar-10 dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl import app
-from absl import flags
-from absl import logging
-import numpy as np
-import tensorflow as tf
-from official.benchmark.models import cifar_preprocessing
-from official.benchmark.models import resnet_cifar_model
-from official.benchmark.models import synthetic_util
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-from official.utils.misc import keras_utils
-from official.vision.image_classification.resnet import common
-
-
-LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
-    (0.1, 91), (0.01, 136), (0.001, 182)
-]
-
-
-def learning_rate_schedule(current_epoch,
-                           current_batch,
-                           batches_per_epoch,
-                           batch_size):
-  """Handles linear scaling rule and LR decay.
-
-  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
-  provided scaling factor.
-
-  Args:
-    current_epoch: integer, current epoch indexed from 0.
-    current_batch: integer, current batch in the current epoch, indexed from 0.
-    batches_per_epoch: integer, number of steps in an epoch.
-    batch_size: integer, total batch size.
-
-  Returns:
-    Adjusted learning rate.
-  """
-  del current_batch, batches_per_epoch  # not used
-  initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128
-  learning_rate = initial_learning_rate
-  for mult, start_epoch in LR_SCHEDULE:
-    if current_epoch >= start_epoch:
-      learning_rate = initial_learning_rate * mult
-    else:
-      break
-  return learning_rate
-
-
-class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
-  """Callback to update learning rate on every batch (not epoch boundaries).
-
-  N.B. Only supports Keras optimizers, not TF optimizers.
-
-  Attributes:
-      schedule: a function that takes an epoch index and a batch index (both
-          integer, indexed from 0), the number of steps per epoch and the
-          batch size as input, and returns a new learning rate as output
-          (float).
-  """
-
-  def __init__(self, schedule, batch_size, steps_per_epoch):
-    super(LearningRateBatchScheduler, self).__init__()
-    self.schedule = schedule
-    self.steps_per_epoch = steps_per_epoch
-    self.batch_size = batch_size
-    self.epochs = -1
-    self.prev_lr = -1
-
-  def on_epoch_begin(self, epoch, logs=None):
-    if not hasattr(self.model.optimizer, 'learning_rate'):
-      raise ValueError('Optimizer must have a "learning_rate" attribute.')
-    self.epochs += 1
-
-  def on_batch_begin(self, batch, logs=None):
-    """Executes before step begins."""
-    lr = self.schedule(self.epochs,
-                       batch,
-                       self.steps_per_epoch,
-                       self.batch_size)
-    if not isinstance(lr, (float, np.float32, np.float64)):
-      raise ValueError('The output of the "schedule" function should be float.')
-    if lr != self.prev_lr:
-      self.model.optimizer.learning_rate = lr  # lr should be a float here
-      self.prev_lr = lr
-      logging.debug(
-          'Epoch %05d Batch %05d: LearningRateBatchScheduler '
-          'change learning rate to %s.', self.epochs, batch, lr)
-
-
-def run(flags_obj):
-  """Run ResNet Cifar-10 training and eval loop using native Keras APIs.
-
-  Args:
-    flags_obj: An object containing parsed flag values.
-
-  Raises:
-    ValueError: If fp16 is passed as it is not currently supported.
-
-  Returns:
-    Dictionary of training and eval stats.
-  """
-  keras_utils.set_session_config(
-      enable_xla=flags_obj.enable_xla)
-
-  # Execute flag override logic for better model performance
-  if flags_obj.tf_gpu_thread_mode:
-    keras_utils.set_gpu_thread_mode_and_count(
-        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
-        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
-        num_gpus=flags_obj.num_gpus,
-        datasets_num_private_threads=flags_obj.datasets_num_private_threads)
-  common.set_cudnn_batchnorm_mode()
-
-  dtype = flags_core.get_tf_dtype(flags_obj)
-  if dtype == 'fp16':
-    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
-                     'value(fp32).')
-
-  data_format = flags_obj.data_format
-  if data_format is None:
-    data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
-                   else 'channels_last')
-  tf.keras.backend.set_image_data_format(data_format)
-
-  strategy = distribution_utils.get_distribution_strategy(
-      distribution_strategy=flags_obj.distribution_strategy,
-      num_gpus=flags_obj.num_gpus,
-      all_reduce_alg=flags_obj.all_reduce_alg,
-      num_packs=flags_obj.num_packs)
-
-  if strategy:
-    # flags_obj.enable_get_next_as_optional controls whether enabling
-    # get_next_as_optional behavior in DistributedIterator. If true, last
-    # partial batch can be supported.
-    strategy.extended.experimental_enable_get_next_as_optional = (
-        flags_obj.enable_get_next_as_optional
-    )
-
-  strategy_scope = distribution_utils.get_strategy_scope(strategy)
-
-  if flags_obj.use_synthetic_data:
-    synthetic_util.set_up_synthetic_data()
-    input_fn = common.get_synth_input_fn(
-        height=cifar_preprocessing.HEIGHT,
-        width=cifar_preprocessing.WIDTH,
-        num_channels=cifar_preprocessing.NUM_CHANNELS,
-        num_classes=cifar_preprocessing.NUM_CLASSES,
-        dtype=flags_core.get_tf_dtype(flags_obj),
-        drop_remainder=True)
-  else:
-    synthetic_util.undo_set_up_synthetic_data()
-    input_fn = cifar_preprocessing.input_fn
-
-  train_input_dataset = input_fn(
-      is_training=True,
-      data_dir=flags_obj.data_dir,
-      batch_size=flags_obj.batch_size,
-      parse_record_fn=cifar_preprocessing.parse_record,
-      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
-      dtype=dtype,
-      # Setting drop_remainder to avoid the partial batch logic in normalization
-      # layer, which triggers tf.where and leads to extra memory copy of input
-      # sizes between host and GPU.
-      drop_remainder=(not flags_obj.enable_get_next_as_optional))
-
-  eval_input_dataset = None
-  if not flags_obj.skip_eval:
-    eval_input_dataset = input_fn(
-        is_training=False,
-        data_dir=flags_obj.data_dir,
-        batch_size=flags_obj.batch_size,
-        parse_record_fn=cifar_preprocessing.parse_record)
-
-  steps_per_epoch = (
-      cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
-  lr_schedule = 0.1
-  if flags_obj.use_tensor_lr:
-    initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
-    lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
-        boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
-        values=[initial_learning_rate] +
-        list(p[0] * initial_learning_rate for p in LR_SCHEDULE))
-
-  with strategy_scope:
-    optimizer = common.get_optimizer(lr_schedule)
-    model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
-    model.compile(
-        loss='sparse_categorical_crossentropy',
-        optimizer=optimizer,
-        metrics=(['sparse_categorical_accuracy']
-                 if flags_obj.report_accuracy_metrics else None),
-        run_eagerly=flags_obj.run_eagerly)
-
-  train_epochs = flags_obj.train_epochs
-
-  callbacks = common.get_callbacks()
-
-  if not flags_obj.use_tensor_lr:
-    lr_callback = LearningRateBatchScheduler(
-        schedule=learning_rate_schedule,
-        batch_size=flags_obj.batch_size,
-        steps_per_epoch=steps_per_epoch)
-    callbacks.append(lr_callback)
-
-  # If multiple epochs, ignore the train_steps flag.
-  if train_epochs <= 1 and flags_obj.train_steps:
-    steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
-    train_epochs = 1
-
-  num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
-                    flags_obj.batch_size)
-
-  validation_data = eval_input_dataset
-  if flags_obj.skip_eval:
-    if flags_obj.set_learning_phase_to_train:
-      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
-      # not using distribution strategy.
-      tf.keras.backend.set_learning_phase(1)
-    num_eval_steps = None
-    validation_data = None
-
-  if not strategy and flags_obj.explicit_gpu_placement:
-    # TODO(b/135607227): Add device scope automatically in Keras training loop
-    # when not using distribution strategy.
-    no_dist_strat_device = tf.device('/device:GPU:0')
-    no_dist_strat_device.__enter__()
-
-  history = model.fit(train_input_dataset,
-                      epochs=train_epochs,
-                      steps_per_epoch=steps_per_epoch,
-                      callbacks=callbacks,
-                      validation_steps=num_eval_steps,
-                      validation_data=validation_data,
-                      validation_freq=flags_obj.epochs_between_evals,
-                      verbose=2)
-  eval_output = None
-  if not flags_obj.skip_eval:
-    eval_output = model.evaluate(eval_input_dataset,
-                                 steps=num_eval_steps,
-                                 verbose=2)
-
-  if not strategy and flags_obj.explicit_gpu_placement:
-    no_dist_strat_device.__exit__()
-
-  stats = common.build_stats(history, eval_output, callbacks)
-  return stats
-
-
-def define_cifar_flags():
-  common.define_keras_flags(dynamic_loss_scale=False)
-
-  flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
-                          model_dir='/tmp/cifar10_model',
-                          epochs_between_evals=10,
-                          batch_size=128)
-
-
-def main(_):
-  return run(flags.FLAGS)
-
-
-if __name__ == '__main__':
-  logging.set_verbosity(logging.INFO)
-  define_cifar_flags()
-  app.run(main)
--- a/official/benchmark/models/resnet_cifar_model.py
+++ b/official/benchmark/models/resnet_cifar_model.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
-
-# Reference:
- [Deep Residual Learning for Image Recognition](
-    https://arxiv.org/abs/1512.03385)
-Adapted from code contributed by BigMoyan.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import tensorflow as tf
-from tensorflow.python.keras import backend
-from tensorflow.python.keras  import initializers
-from tensorflow.python.keras import layers
-from tensorflow.python.keras import regularizers
-
-
-BATCH_NORM_DECAY = 0.997
-BATCH_NORM_EPSILON = 1e-5
-L2_WEIGHT_DECAY = 2e-4
-
-
-def identity_building_block(input_tensor,
-                            kernel_size,
-                            filters,
-                            stage,
-                            block,
-                            training=None):
-  """The identity block is the block that has no conv layer at shortcut.
-
-  Arguments:
-    input_tensor: input tensor
-    kernel_size: the kernel size of the conv layers at main path
-    filters: list of two integers, the filters of the 2 conv layers at main
-        path
-    stage: integer, current stage label, used for generating layer names
-    block: current block label, used for generating layer names
-    training: Only used if training keras model with Estimator.  In other
-      scenarios it is handled automatically.
-
-  Returns:
-    Output tensor for the block.
-  """
-  filters1, filters2 = filters
-  if backend.image_data_format() == 'channels_last':
-    bn_axis = 3
-  else:
-    bn_axis = 1
-  conv_name_base = 'res' + str(stage) + block + '_branch'
-  bn_name_base = 'bn' + str(stage) + block + '_branch'
-
-  x = layers.Conv2D(filters1, kernel_size,
-                    padding='same', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2a')(input_tensor)
-  x = layers.BatchNormalization(
-      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
-      name=bn_name_base + '2a')(x, training=training)
-  x = layers.Activation('relu')(x)
-
-  x = layers.Conv2D(filters2, kernel_size,
-                    padding='same', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2b')(x)
-  x = layers.BatchNormalization(
-      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
-      name=bn_name_base + '2b')(x, training=training)
-
-  x = layers.add([x, input_tensor])
-  x = layers.Activation('relu')(x)
-  return x
-
-
-def conv_building_block(input_tensor,
-                        kernel_size,
-                        filters,
-                        stage,
-                        block,
-                        strides=(2, 2),
-                        training=None):
-  """A block that has a conv layer at shortcut.
-
-  Arguments:
-    input_tensor: input tensor
-    kernel_size: the kernel size of the conv layers at main path
-    filters: list of two integers, the filters of the 2 conv layers at main
-        path
-    stage: integer, current stage label, used for generating layer names
-    block: current block label, used for generating layer names
-    strides: Strides for the first conv layer in the block.
-    training: Only used if training keras model with Estimator.  In other
-      scenarios it is handled automatically.
-
-  Returns:
-    Output tensor for the block.
-
-  Note that from stage 3, the first conv layer at main path has
-  strides=(2, 2), and the shortcut should have strides=(2, 2) as well.
-  """
-  filters1, filters2 = filters
-  if tf.keras.backend.image_data_format() == 'channels_last':
-    bn_axis = 3
-  else:
-    bn_axis = 1
-  conv_name_base = 'res' + str(stage) + block + '_branch'
-  bn_name_base = 'bn' + str(stage) + block + '_branch'
-
-  x = layers.Conv2D(filters1, kernel_size, strides=strides,
-                    padding='same', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2a')(input_tensor)
-  x = layers.BatchNormalization(
-      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
-      name=bn_name_base + '2a')(x, training=training)
-  x = layers.Activation('relu')(x)
-
-  x = layers.Conv2D(filters2, kernel_size, padding='same', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2b')(x)
-  x = layers.BatchNormalization(
-      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
-      name=bn_name_base + '2b')(x, training=training)
-
-  shortcut = layers.Conv2D(filters2, (1, 1), strides=strides, use_bias=False,
-                           kernel_initializer='he_normal',
-                           kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                           name=conv_name_base + '1')(input_tensor)
-  shortcut = layers.BatchNormalization(
-      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
-      name=bn_name_base + '1')(shortcut, training=training)
-
-  x = layers.add([x, shortcut])
-  x = layers.Activation('relu')(x)
-  return x
-
-
-def resnet_block(input_tensor,
-                 size,
-                 kernel_size,
-                 filters,
-                 stage,
-                 conv_strides=(2, 2),
-                 training=None):
-  """A block which applies conv followed by multiple identity blocks.
-
-  Arguments:
-    input_tensor: input tensor
-    size: integer, number of constituent conv/identity building blocks.
-    A conv block is applied once, followed by (size - 1) identity blocks.
-    kernel_size: the kernel size of the conv layers at main path
-    filters: list of two integers, the filters of the 2 conv layers at main
-        path
-    stage: integer, current stage label, used for generating layer names
-    conv_strides: Strides for the first conv layer in the block.
-    training: Only used if training keras model with Estimator.  In other
-      scenarios it is handled automatically.
-
-  Returns:
-    Output tensor after applying conv and identity blocks.
-  """
-
-  x = conv_building_block(input_tensor, kernel_size, filters, stage=stage,
-                          strides=conv_strides, block='block_0',
-                          training=training)
-  for i in range(size - 1):
-    x = identity_building_block(x, kernel_size, filters, stage=stage,
-                                block='block_%d' % (i + 1), training=training)
-  return x
-
-
-def resnet(num_blocks, classes=10, training=None):
-  """Instantiates the ResNet architecture.
-
-  Arguments:
-    num_blocks: integer, the number of conv/identity blocks in each block.
-      The ResNet contains 3 blocks with each block containing one conv block
-      followed by (num_blocks - 1) identity blocks. Each conv/identity block
-      has 2 convolutional layers. With the input convolutional layer and the
-      pooling layer towards the end, this brings the total size of the
-      network to (6*num_blocks + 2).
-    classes: optional number of classes to classify images into
-    training: Only used if training keras model with Estimator.  In other
-    scenarios it is handled automatically.
-
-  Returns:
-    A Keras model instance.
-  """
-
-  input_shape = (32, 32, 3)
-  img_input = layers.Input(shape=input_shape)
-
-  if backend.image_data_format() == 'channels_first':
-    x = layers.Lambda(lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
-                      name='transpose')(img_input)
-    bn_axis = 1
-  else:  # channel_last
-    x = img_input
-    bn_axis = 3
-
-  x = layers.ZeroPadding2D(padding=(1, 1), name='conv1_pad')(x)
-  x = layers.Conv2D(16, (3, 3),
-                    strides=(1, 1),
-                    padding='valid', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name='conv1')(x)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name='bn_conv1',)(x, training=training)
-  x = layers.Activation('relu')(x)
-
-  x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[16, 16],
-                   stage=2, conv_strides=(1, 1), training=training)
-
-  x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[32, 32],
-                   stage=3, conv_strides=(2, 2), training=training)
-
-  x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[64, 64],
-                   stage=4, conv_strides=(2, 2), training=training)
-
-  rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
-  x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
-  x = layers.Dense(classes,
-                   activation='softmax',
-                   kernel_initializer=initializers.RandomNormal(stddev=0.01),
-                   kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                   bias_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                   name='fc10')(x)
-
-  inputs = img_input
-  # Create model.
-  model = tf.keras.models.Model(inputs, x, name='resnet56')
-
-  return model
-
-
-resnet20 = functools.partial(resnet, num_blocks=3)
-resnet32 = functools.partial(resnet, num_blocks=5)
-resnet56 = functools.partial(resnet, num_blocks=9)
-resnet10 = functools.partial(resnet, num_blocks=110)
--- a/official/benchmark/models/resnet_cifar_test.py
+++ b/official/benchmark/models/resnet_cifar_test.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test the keras ResNet model with Cifar data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tempfile
-
-import tensorflow as tf
-
-from tensorflow.python.eager import context
-from tensorflow.python.platform import googletest
-from official.benchmark.models import cifar_preprocessing
-from official.benchmark.models import resnet_cifar_main
-from official.utils.testing import integration
-
-
-class KerasCifarTest(googletest.TestCase):
-  """Unit tests for Keras ResNet with Cifar."""
-
-  _extra_flags = [
-      "-batch_size", "4",
-      "-train_steps", "1",
-      "-use_synthetic_data", "true"
-  ]
-  _tempdir = None
-
-  def get_temp_dir(self):
-    if not self._tempdir:
-      self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
-    return self._tempdir
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    super(KerasCifarTest, cls).setUpClass()
-    resnet_cifar_main.define_cifar_flags()
-
-  def setUp(self):
-    super(KerasCifarTest, self).setUp()
-    cifar_preprocessing.NUM_IMAGES["validation"] = 4
-
-  def tearDown(self):
-    super(KerasCifarTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-
-  def test_end_to_end_no_dist_strat(self):
-    """Test Keras model with 1 GPU, no distribution strategy."""
-
-    extra_flags = [
-        "-distribution_strategy", "off",
-        "-model_dir", "keras_cifar_no_dist_strat",
-        "-data_format", "channels_last",
-    ]
-    extra_flags = extra_flags + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_cifar_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_graph_no_dist_strat(self):
-    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
-    extra_flags = [
-        "-enable_eager", "false",
-        "-distribution_strategy", "off",
-        "-model_dir", "keras_cifar_graph_no_dist_strat",
-        "-data_format", "channels_last",
-    ]
-    extra_flags = extra_flags + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_cifar_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_1_gpu(self):
-    """Test Keras model with 1 GPU."""
-
-    if context.num_gpus() < 1:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(1, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "1",
-        "-distribution_strategy", "mirrored",
-        "-model_dir", "keras_cifar_1_gpu",
-        "-data_format", "channels_last",
-    ]
-    extra_flags = extra_flags + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_cifar_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_graph_1_gpu(self):
-    """Runs one synthetic pass in legacy graph mode on one GPU."""
-    required_gpus = 1
-    available_gpus = context.num_gpus()
-    if available_gpus < required_gpus:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available"
-          .format(required_gpus, available_gpus))
-
-    # Scenario-specific flags go first; the flags held in self._extra_flags
-    # are appended after them.
-    all_flags = [
-        "-num_gpus", "1",
-        "-noenable_eager",
-        "-distribution_strategy", "mirrored",
-        "-model_dir", "keras_cifar_graph_1_gpu",
-        "-data_format", "channels_last",
-    ] + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_cifar_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=all_flags)
-
-  def test_end_to_end_2_gpu(self):
-    """Runs one synthetic pass with the mirrored strategy on two GPUs."""
-    required_gpus = 2
-    available_gpus = context.num_gpus()
-    if available_gpus < required_gpus:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available"
-          .format(required_gpus, available_gpus))
-
-    # Scenario-specific flags go first; the flags held in self._extra_flags
-    # are appended after them.
-    all_flags = [
-        "-num_gpus", "2",
-        "-distribution_strategy", "mirrored",
-        "-model_dir", "keras_cifar_2_gpu",
-    ] + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_cifar_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=all_flags)
-
-  def test_end_to_end_graph_2_gpu(self):
-    """Runs one synthetic pass in legacy graph mode on two GPUs."""
-    required_gpus = 2
-    available_gpus = context.num_gpus()
-    if available_gpus < required_gpus:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available"
-          .format(required_gpus, available_gpus))
-
-    # Scenario-specific flags go first; the flags held in self._extra_flags
-    # are appended after them.
-    all_flags = [
-        "-num_gpus", "2",
-        "-enable_eager", "false",
-        "-distribution_strategy", "mirrored",
-        "-model_dir", "keras_cifar_graph_2_gpu",
-    ] + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_cifar_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=all_flags)
-
-
-# Run the tests only when this file is executed directly as a script.
-if __name__ == "__main__":
-  googletest.main()
--- a/official/benchmark/models/resnet_imagenet_main.py
+++ b/official/benchmark/models/resnet_imagenet_main.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a ResNet model on the ImageNet dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl import app
-from absl import flags
-from absl import logging
-import tensorflow as tf
-
-import tensorflow_model_optimization as tfmot
-from official.modeling import performance
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-from official.utils.misc import keras_utils
-from official.utils.misc import model_helpers
-from official.vision.image_classification import test_utils
-from official.vision.image_classification.resnet import common
-from official.vision.image_classification.resnet import imagenet_preprocessing
-from official.vision.image_classification.resnet import resnet_model
-
-
-def run(flags_obj):
-  """Run ResNet ImageNet training and eval loop using native Keras APIs.
-
-  Builds the input datasets (synthetic or ImageNet), the optimizer and the
-  model selected by the flags, trains with `model.fit`, optionally evaluates
-  and exports a SavedModel, and returns the collected stats.
-
-  Args:
-    flags_obj: An object containing parsed flag values.
-
-  Raises:
-    NotImplementedError: If pruning is requested with a dtype other than
-      tf.float32, or with a pruning method other than 'polynomial_decay'.
-
-  Returns:
-    Dictionary of training and eval stats.
-  """
-  keras_utils.set_session_config(
-      enable_xla=flags_obj.enable_xla)
-
-  # Execute flag override logic for better model performance
-  if flags_obj.tf_gpu_thread_mode:
-    keras_utils.set_gpu_thread_mode_and_count(
-        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
-        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
-        num_gpus=flags_obj.num_gpus,
-        datasets_num_private_threads=flags_obj.datasets_num_private_threads)
-  common.set_cudnn_batchnorm_mode()
-
-  dtype = flags_core.get_tf_dtype(flags_obj)
-  performance.set_mixed_precision_policy(
-      flags_core.get_tf_dtype(flags_obj),
-      flags_core.get_loss_scale(flags_obj, default_for_fp16=128))
-
-  # When no data format is requested, pick one based on whether a GPU is
-  # visible to TensorFlow.
-  data_format = flags_obj.data_format
-  if data_format is None:
-    data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
-                   else 'channels_last')
-  tf.keras.backend.set_image_data_format(data_format)
-
-  # Configures cluster spec for distribution strategy.
-  _ = distribution_utils.configure_cluster(flags_obj.worker_hosts,
-                                           flags_obj.task_index)
-
-  strategy = distribution_utils.get_distribution_strategy(
-      distribution_strategy=flags_obj.distribution_strategy,
-      num_gpus=flags_obj.num_gpus,
-      all_reduce_alg=flags_obj.all_reduce_alg,
-      num_packs=flags_obj.num_packs,
-      tpu_address=flags_obj.tpu)
-
-  if strategy:
-    # flags_obj.enable_get_next_as_optional controls whether enabling
-    # get_next_as_optional behavior in DistributedIterator. If true, last
-    # partial batch can be supported.
-    strategy.extended.experimental_enable_get_next_as_optional = (
-        flags_obj.enable_get_next_as_optional
-    )
-
-  strategy_scope = distribution_utils.get_strategy_scope(strategy)
-
-  # pylint: disable=protected-access
-  if flags_obj.use_synthetic_data:
-    input_fn = common.get_synth_input_fn(
-        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
-        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
-        num_channels=imagenet_preprocessing.NUM_CHANNELS,
-        num_classes=imagenet_preprocessing.NUM_CLASSES,
-        dtype=dtype,
-        drop_remainder=True)
-  else:
-    input_fn = imagenet_preprocessing.input_fn
-
-  # When `enable_xla` is True, we always drop the remainder of the batches
-  # in the dataset, as XLA-GPU doesn't support dynamic shapes.
-  drop_remainder = flags_obj.enable_xla
-
-  # Current resnet_model.resnet50 input format is always channel-last.
-  # We use the keras_application mobilenet model, whose input format depends
-  # on the keras backend image data format.
-  # This use_keras_image_data_format flag indicates whether the image
-  # preprocessor output format should be the same as the keras backend image
-  # data format or just channel-last format.
-  use_keras_image_data_format = (flags_obj.model == 'mobilenet')
-  train_input_dataset = input_fn(
-      is_training=True,
-      data_dir=flags_obj.data_dir,
-      batch_size=flags_obj.batch_size,
-      parse_record_fn=imagenet_preprocessing.get_parse_record_fn(
-          use_keras_image_data_format=use_keras_image_data_format),
-      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
-      dtype=dtype,
-      drop_remainder=drop_remainder,
-      tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
-      training_dataset_cache=flags_obj.training_dataset_cache,
-  )
-
-  eval_input_dataset = None
-  if not flags_obj.skip_eval:
-    eval_input_dataset = input_fn(
-        is_training=False,
-        data_dir=flags_obj.data_dir,
-        batch_size=flags_obj.batch_size,
-        parse_record_fn=imagenet_preprocessing.get_parse_record_fn(
-            use_keras_image_data_format=use_keras_image_data_format),
-        dtype=dtype,
-        drop_remainder=drop_remainder)
-
-  lr_schedule = common.PiecewiseConstantDecayWithWarmup(
-      batch_size=flags_obj.batch_size,
-      epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
-      warmup_epochs=common.LR_SCHEDULE[0][1],
-      boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
-      multipliers=list(p[0] for p in common.LR_SCHEDULE),
-      compute_lr_on_cpu=True)
-  steps_per_epoch = (
-      imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
-
-  with strategy_scope:
-    if flags_obj.optimizer == 'resnet50_default':
-      optimizer = common.get_optimizer(lr_schedule)
-    elif flags_obj.optimizer == 'mobilenet_default':
-      initial_learning_rate = \
-          flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size
-      optimizer = tf.keras.optimizers.SGD(
-          learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
-              initial_learning_rate,
-              decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay,
-              decay_rate=flags_obj.lr_decay_factor,
-              staircase=True),
-          momentum=0.9)
-    if flags_obj.fp16_implementation == 'graph_rewrite':
-      # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
-      # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
-      # which will ensure tf.compat.v2.keras.mixed_precision and
-      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
-      # up.
-      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-          optimizer)
-
-    # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
-    if flags_obj.use_trivial_model:
-      model = test_utils.trivial_model(imagenet_preprocessing.NUM_CLASSES)
-    elif flags_obj.model == 'resnet50_v1.5':
-      model = resnet_model.resnet50(
-          num_classes=imagenet_preprocessing.NUM_CLASSES)
-    elif flags_obj.model == 'mobilenet':
-      # TODO(kimjaehong): Remove layers attribute when minimum TF version
-      # support 2.0 layers by default.
-      model = tf.keras.applications.mobilenet.MobileNet(
-          weights=None,
-          classes=imagenet_preprocessing.NUM_CLASSES,
-          layers=tf.keras.layers)
-    if flags_obj.pretrained_filepath:
-      model.load_weights(flags_obj.pretrained_filepath)
-
-    if flags_obj.pruning_method == 'polynomial_decay':
-      if dtype != tf.float32:
-        raise NotImplementedError(
-            'Pruning is currently only supported on dtype=tf.float32.')
-      pruning_params = {
-          'pruning_schedule':
-              tfmot.sparsity.keras.PolynomialDecay(
-                  initial_sparsity=flags_obj.pruning_initial_sparsity,
-                  final_sparsity=flags_obj.pruning_final_sparsity,
-                  begin_step=flags_obj.pruning_begin_step,
-                  end_step=flags_obj.pruning_end_step,
-                  frequency=flags_obj.pruning_frequency),
-      }
-      model = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)
-    elif flags_obj.pruning_method:
-      raise NotImplementedError(
-          'Only polynomial_decay is currently supported.')
-
-    model.compile(
-        loss='sparse_categorical_crossentropy',
-        optimizer=optimizer,
-        metrics=(['sparse_categorical_accuracy']
-                 if flags_obj.report_accuracy_metrics else None),
-        run_eagerly=flags_obj.run_eagerly)
-
-  train_epochs = flags_obj.train_epochs
-
-  callbacks = common.get_callbacks(
-      pruning_method=flags_obj.pruning_method,
-      enable_checkpoint_and_export=flags_obj.enable_checkpoint_and_export,
-      model_dir=flags_obj.model_dir)
-
-  # If multiple epochs, ignore the train_steps flag.
-  if train_epochs <= 1 and flags_obj.train_steps:
-    steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
-    train_epochs = 1
-
-  num_eval_steps = (
-      imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size)
-
-  validation_data = eval_input_dataset
-  if flags_obj.skip_eval:
-    # Only build the training graph. This reduces memory usage introduced by
-    # control flow ops in layers that have different implementations for
-    # training and inference (e.g., batch norm).
-    if flags_obj.set_learning_phase_to_train:
-      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
-      # not using distribution strategy.
-      tf.keras.backend.set_learning_phase(1)
-    num_eval_steps = None
-    validation_data = None
-
-  if not strategy and flags_obj.explicit_gpu_placement:
-    # TODO(b/135607227): Add device scope automatically in Keras training loop
-    # when not using distribution strategy.
-    # The scope is entered by hand here and left by hand after the export
-    # step below, so it covers fit, evaluate and save.
-    no_dist_strat_device = tf.device('/device:GPU:0')
-    no_dist_strat_device.__enter__()
-
-  history = model.fit(train_input_dataset,
-                      epochs=train_epochs,
-                      steps_per_epoch=steps_per_epoch,
-                      callbacks=callbacks,
-                      validation_steps=num_eval_steps,
-                      validation_data=validation_data,
-                      validation_freq=flags_obj.epochs_between_evals,
-                      verbose=2)
-
-  eval_output = None
-  if not flags_obj.skip_eval:
-    eval_output = model.evaluate(eval_input_dataset,
-                                 steps=num_eval_steps,
-                                 verbose=2)
-
-  if flags_obj.pruning_method:
-    model = tfmot.sparsity.keras.strip_pruning(model)
-  if flags_obj.enable_checkpoint_and_export:
-    if dtype == tf.bfloat16:
-      logging.warning('Keras model.save does not support bfloat16 dtype.')
-    else:
-      # Keras model.save assumes a float32 input signature.
-      export_path = os.path.join(flags_obj.model_dir, 'saved_model')
-      model.save(export_path, include_optimizer=False)
-
-  if not strategy and flags_obj.explicit_gpu_placement:
-    # The context-manager protocol calls __exit__ with (exc_type, exc_value,
-    # traceback); pass None for all three to signal a normal exit. Calling
-    # __exit__() with no arguments raises TypeError on managers that declare
-    # the three parameters explicitly (e.g. contextlib-based ones).
-    no_dist_strat_device.__exit__(None, None, None)
-
-  stats = common.build_stats(history, eval_output, callbacks)
-  return stats
-
-
-def define_imagenet_keras_flags():
-  """Registers the command-line flags used by the Keras ImageNet trainer."""
-  # Keras flags, with the model / optimizer / pretrained-weights groups on.
-  common.define_keras_flags(
-      model=True, optimizer=True, pretrained_filepath=True)
-  common.define_pruning_flags()
-  flags_core.set_defaults()
-  # Adopt the key flags declared by `common` as key flags of this module.
-  flags.adopt_module_key_flags(common)
-
-
-def main(_):
-  """Applies `apply_clean` to the parsed flags, trains, and logs the stats."""
-  parsed_flags = flags.FLAGS
-  model_helpers.apply_clean(parsed_flags)
-  logging.info('Run stats:\n%s', run(parsed_flags))
-
-
-# Script entry point: raise log verbosity to INFO, register the flags, then
-# hand control to app.run, which invokes main().
-if __name__ == '__main__':
-  logging.set_verbosity(logging.INFO)
-  define_imagenet_keras_flags()
-  app.run(main)
--- a/official/benchmark/models/resnet_imagenet_test.py
+++ b/official/benchmark/models/resnet_imagenet_test.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test the keras ResNet model with ImageNet data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import tensorflow as tf
-
-from tensorflow.python.eager import context
-from official.benchmark.models import resnet_imagenet_main
-from official.utils.testing import integration
-from official.vision.image_classification.resnet import imagenet_preprocessing
-
-
-@parameterized.parameters(
-    "resnet",
-    # "resnet_polynomial_decay",  b/151854314
-    "mobilenet",
-    # "mobilenet_polynomial_decay"  b/151854314
-)
-class KerasImagenetTest(tf.test.TestCase):
-  """End-to-end smoke tests for the Keras ImageNet models on synthetic data.
-
-  Every test method receives a `flags_key` from the class-level
-  `parameterized.parameters` decorator; the key selects the model-specific
-  entry of `_extra_flags_dict`.
-  """
-
-  # Flags appended to every scenario (despite the name, this is a list).
-  _default_flags_dict = [
-      "-batch_size", "4",
-      "-train_steps", "1",
-      "-use_synthetic_data", "true",
-      "-data_format", "channels_last",
-  ]
-  # Model-specific flags, keyed by the parameterized `flags_key`.
-  _extra_flags_dict = {
-      "resnet": [
-          "-model", "resnet50_v1.5",
-          "-optimizer", "resnet50_default",
-      ],
-      "resnet_polynomial_decay": [
-          "-model", "resnet50_v1.5",
-          "-optimizer", "resnet50_default",
-          "-pruning_method", "polynomial_decay",
-      ],
-      "mobilenet": [
-          "-model", "mobilenet",
-          "-optimizer", "mobilenet_default",
-      ],
-      "mobilenet_polynomial_decay": [
-          "-model", "mobilenet",
-          "-optimizer", "mobilenet_default",
-          "-pruning_method", "polynomial_decay",
-      ],
-  }
-  _tempdir = None
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    """Registers the trainer's flags once for the whole class."""
-    super(KerasImagenetTest, cls).setUpClass()
-    resnet_imagenet_main.define_imagenet_keras_flags()
-
-  def setUp(self):
-    """Shrinks the validation set and remembers the global precision policy."""
-    super(KerasImagenetTest, self).setUp()
-    imagenet_preprocessing.NUM_IMAGES["validation"] = 4
-    # Saved so tearDown can restore it after the test.
-    self.policy = (
-        tf.keras.mixed_precision.experimental.global_policy())
-
-  def tearDown(self):
-    """Removes the temp dir and restores the saved precision policy."""
-    super(KerasImagenetTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    tf.keras.mixed_precision.experimental.set_policy(self.policy)
-
-  def get_extra_flags_dict(self, flags_key):
-    """Returns the flags for `flags_key` followed by the default flags."""
-    model_flags = self._extra_flags_dict[flags_key]
-    return model_flags + self._default_flags_dict
-
-  def _skip_unless_gpus(self, required):
-    """Skips the running test when fewer than `required` GPUs are reported."""
-    available = context.num_gpus()
-    if available < required:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available"
-          .format(required, available))
-
-  def _run_synthetic(self, flags_key, scenario_flags, skip_if_pruning=False):
-    """Runs the trainer with `scenario_flags` plus the flags for `flags_key`.
-
-    When `skip_if_pruning` is true, the test is skipped if the combined flag
-    list contains "polynomial_decay".
-    """
-    all_flags = scenario_flags + self.get_extra_flags_dict(flags_key)
-    if skip_if_pruning and "polynomial_decay" in all_flags:
-      self.skipTest("Pruning with fp16 is not currently supported.")
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=all_flags)
-
-  def test_end_to_end_no_dist_strat(self, flags_key):
-    """Trains with the distribution strategy switched off."""
-    self._run_synthetic(flags_key, [
-        "-distribution_strategy", "off",
-    ])
-
-  def test_end_to_end_graph_no_dist_strat(self, flags_key):
-    """Trains in legacy graph mode with the distribution strategy off."""
-    self._run_synthetic(flags_key, [
-        "-enable_eager", "false",
-        "-distribution_strategy", "off",
-    ])
-
-  def test_end_to_end_1_gpu(self, flags_key):
-    """Trains on one GPU with the mirrored strategy and export enabled."""
-    self._skip_unless_gpus(1)
-    self._run_synthetic(flags_key, [
-        "-num_gpus", "1",
-        "-distribution_strategy", "mirrored",
-        "-enable_checkpoint_and_export", "1",
-    ])
-
-  def test_end_to_end_1_gpu_fp16(self, flags_key):
-    """Trains on one GPU with the mirrored strategy and fp16."""
-    self._skip_unless_gpus(1)
-    self._run_synthetic(flags_key, [
-        "-num_gpus", "1",
-        "-dtype", "fp16",
-        "-distribution_strategy", "mirrored",
-    ], skip_if_pruning=True)
-
-  def test_end_to_end_2_gpu(self, flags_key):
-    """Trains on two GPUs with the mirrored strategy."""
-    self._skip_unless_gpus(2)
-    self._run_synthetic(flags_key, [
-        "-num_gpus", "2",
-        "-distribution_strategy", "mirrored",
-    ])
-
-  def test_end_to_end_xla_2_gpu(self, flags_key):
-    """Trains on two GPUs with the mirrored strategy and XLA."""
-    self._skip_unless_gpus(2)
-    self._run_synthetic(flags_key, [
-        "-num_gpus", "2",
-        "-enable_xla", "true",
-        "-distribution_strategy", "mirrored",
-    ])
-
-  def test_end_to_end_2_gpu_fp16(self, flags_key):
-    """Trains on two GPUs with the mirrored strategy and fp16."""
-    self._skip_unless_gpus(2)
-    self._run_synthetic(flags_key, [
-        "-num_gpus", "2",
-        "-dtype", "fp16",
-        "-distribution_strategy", "mirrored",
-    ], skip_if_pruning=True)
-
-  def test_end_to_end_xla_2_gpu_fp16(self, flags_key):
-    """Trains on two GPUs with the mirrored strategy, XLA and fp16."""
-    self._skip_unless_gpus(2)
-    self._run_synthetic(flags_key, [
-        "-num_gpus", "2",
-        "-dtype", "fp16",
-        "-enable_xla", "true",
-        "-distribution_strategy", "mirrored",
-    ], skip_if_pruning=True)
-
-
-# Run the tests only when this file is executed directly as a script.
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/benchmark/models/resnet_imagenet_test_tpu.py
+++ b/official/benchmark/models/resnet_imagenet_test_tpu.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test the keras ResNet model with ImageNet data on TPU."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import tensorflow as tf
-from official.benchmark.models import resnet_imagenet_main
-from official.utils.testing import integration
-from official.vision.image_classification.resnet import imagenet_preprocessing
-
-
-class KerasImagenetTest(tf.test.TestCase, parameterized.TestCase):
-  """Unit tests for Keras Models with ImageNet on TPU.
-
-  Each parameterized test receives a `flags_key` that selects the base flag
-  list from `_extra_flags_dict`; scenario-specific flags are prepended to it.
-  """
-
-  # Base flags per configuration. Every flag name and value must be its own
-  # list element: adjacent string literals without a comma are silently
-  # concatenated by Python.
-  _extra_flags_dict = {
-      "resnet": [
-          "-batch_size", "4",
-          "-train_steps", "1",
-          "-use_synthetic_data", "true",
-          "-model", "resnet50_v1.5",
-          "-optimizer", "resnet50_default",
-      ],
-      "resnet_polynomial_decay": [
-          "-batch_size", "4",
-          "-train_steps", "1",
-          "-use_synthetic_data", "true",
-          "-model", "resnet50_v1.5",
-          "-optimizer", "resnet50_default",
-          "-pruning_method", "polynomial_decay",
-      ],
-  }
-  _tempdir = None
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    """Registers the trainer's flags once for the whole class."""
-    super(KerasImagenetTest, cls).setUpClass()
-    resnet_imagenet_main.define_imagenet_keras_flags()
-
-  def setUp(self):
-    """Shrinks the validation set and remembers the global precision policy."""
-    super(KerasImagenetTest, self).setUp()
-    imagenet_preprocessing.NUM_IMAGES["validation"] = 4
-    # Saved so tearDown can restore it after the test.
-    self.policy = \
-        tf.keras.mixed_precision.experimental.global_policy()
-
-  def tearDown(self):
-    """Removes the temp dir and restores the saved precision policy."""
-    super(KerasImagenetTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    tf.keras.mixed_precision.experimental.set_policy(self.policy)
-
-  @parameterized.parameters([
-      "resnet",
-      # "resnet_polynomial_decay"  b/151854314
-  ])
-  def test_end_to_end_tpu(self, flags_key):
-    """Test Keras model with TPU distribution strategy."""
-
-    extra_flags = [
-        "-distribution_strategy", "tpu",
-        "-data_format", "channels_last",
-        "-enable_checkpoint_and_export", "1",
-    ]
-    extra_flags = extra_flags + self._extra_flags_dict[flags_key]
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  @parameterized.parameters(["resnet"])
-  def test_end_to_end_tpu_bf16(self, flags_key):
-    """Test Keras model with TPU and bfloat16 activation."""
-
-    extra_flags = [
-        "-distribution_strategy", "tpu",
-        "-data_format", "channels_last",
-        "-dtype", "bf16",
-    ]
-    extra_flags = extra_flags + self._extra_flags_dict[flags_key]
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-
-# Run the tests only when this file is executed directly as a script.
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/benchmark/models/shakespeare/README.md
+++ b/official/benchmark/models/shakespeare/README.md
-# Shakespeare character LSTM model
-
-This is an implementation of a simple character LSTM used to generate text.
-
-## Instructions
-
-First download the source data:
-
-```
-wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
-```
-
-Note that files other than shakespeare.txt can also be used to train the model to generate other text.
-
-Then train the model:
-
-```python
-python3 shakespeare_main.py --training_data shakespeare.txt \
-    --model_dir /tmp/shakespeare
-```
-
-This will place model checkpoints in `/tmp/shakespeare`, so that we can use them to make predictions.
-
-Then generate predictions:
-
-```python
-python3 shakespeare_main.py --training_data shakespeare.txt \
-    --model_dir /tmp/shakespeare --notrain --predict_context=ROMEO:
-```
-
-Change `--predict_context` and `--predict_length` to suit your needs.
--- a/official/benchmark/models/shakespeare/__init__.py
+++ b/official/benchmark/models/shakespeare/__init__.py
-
--- a/official/benchmark/models/shakespeare/shakespeare_main.py
+++ b/official/benchmark/models/shakespeare/shakespeare_main.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a character LSTM model trained on Shakespeare."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-
-# pylint: disable=wrong-import-order
-from absl import app
-from absl import flags
-import numpy as np
-import tensorflow as tf
-# pylint: enable=wrong-import-order
-
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-from official.utils.misc import keras_utils
-
-EMBEDDING_DIM = 256
-RNN_UNITS = 1024
-SEQ_LENGTH = 100
-# Calculated by running batch_size=1
-BATCHES_PER_EPOCH = 11043
-
-
-def define_flags():
-  """Registers every flag of the Shakespeare character LSTM script."""
-  # Shared base flags: only the groups switched on here are requested.
-  flags_core.define_base(
-      data_dir=False,
-      clean=False,
-      train_epochs=True,
-      epochs_between_evals=False,
-      stop_threshold=False,
-      num_gpu=True,
-      export_dir=False,
-      run_eagerly=True,
-      distribution_strategy=True)
-
-  # Shared performance flags: dtype, loss scale and XLA only.
-  flags_core.define_performance(
-      num_parallel_calls=False,
-      inter_op=False,
-      intra_op=False,
-      synthetic_data=False,
-      max_train_steps=False,
-      dtype=True,
-      loss_scale=True,
-      enable_xla=True)
-
-  flags_core.set_defaults(train_epochs=43, batch_size=64)
-
-  # Script-specific flags.
-  flags.DEFINE_boolean('enable_eager', True, help='Enable eager?')
-  flags.DEFINE_boolean('train', True, help='If true trains the model.')
-  flags.DEFINE_string(
-      'predict_context', None,
-      help='If set, makes a prediction with the given context.')
-  flags.DEFINE_integer(
-      'predict_length', 1000,
-      help='Length of the predicted text including the context.')
-  flags.DEFINE_integer(
-      'train_steps', None,
-      help='Overrides train_steps per epoch if not None.')
-  flags.DEFINE_integer(
-      'log_steps', 100,
-      help='For every log_steps, we log the timing information such as '
-      'examples per second.')
-  flags.DEFINE_string(
-      'training_data', None,
-      help='Path to file containing the training data.')
-  flags.DEFINE_boolean('cudnn', True, help='Use CuDNN LSTM.')
-
-
-def get_dataset(path_to_file, batch_size=None, seq_length=SEQ_LENGTH):
-  """Builds a character-level training dataset from a UTF-8 text file.
-
-  Args:
-    path_to_file: Path of the text file to read.
-    batch_size: Number of sequences per batch.
-    seq_length: Number of input characters per example.
-
-  Returns:
-    A tuple `(dataset, idx2char, char2idx)`: the batched, shuffled and
-    repeated dataset of (input ids, one-hot targets), the array mapping
-    class id to character, and the dict mapping character to class id.
-  """
-  with tf.io.gfile.GFile(path_to_file, 'rb') as data_file:
-    raw_bytes = data_file.read()
-  text = raw_bytes.decode(encoding='utf-8')
-
-  # The vocabulary is the sorted set of distinct characters in the text.
-  vocab = sorted(set(text))
-  idx2char = np.array(vocab)
-  char2idx = {}
-  for class_id, char in enumerate(vocab):
-    char2idx[char] = class_id
-
-  # Cut the encoded text into chunks of seq_length + 1 ids; each chunk
-  # yields an input and a target shifted by one position.
-  text_as_int = np.array([char2idx[c] for c in text])
-  char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
-  sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
-
-  def to_input_and_target(chunk):
-    # Input drops the last id; target drops the first and is one-hot encoded.
-    return chunk[:-1], tf.one_hot(chunk[1:], len(vocab))
-
-  dataset = (sequences.map(to_input_and_target)
-             .shuffle(10000)
-             .repeat()
-             .batch(batch_size, drop_remainder=True))
-
-  return dataset, idx2char, char2idx
-
-
-def build_model(vocab_size,
-                embedding_dim=EMBEDDING_DIM,
-                rnn_units=RNN_UNITS,
-                batch_size=None,
-                stateful=False,
-                use_cudnn=True):
-  """Assembles the Embedding -> LSTM -> Dense -> Softmax Keras model.
-
-  Args:
-    vocab_size: The number of character classes in the input.
-    embedding_dim: The dimension of the embedding space for each class.
-    rnn_units: The number of RNN units in the layer.
-    batch_size: Fixed batch dimension of the input; only used when
-      `stateful` is true.
-    stateful: If true, the LSTM is stateful.
-    use_cudnn: If true, the LSTM activation is given as the string 'tanh';
-      otherwise it is wrapped in a lambda to keep the layer off CuDNN.
-
-  Returns:
-    A Keras Model.
-  """
-  if use_cudnn:
-    lstm_activation = 'tanh'
-  else:
-    # By indirecting the activation through a lambda, the logic to dispatch
-    # to CuDNN in V2 doesn't trigger and the LSTM runs in non-CuDNN mode.
-    lstm_activation = lambda x: tf.math.tanh(x)
-
-  # Only a stateful model pins the batch dimension.
-  batch_shape = [batch_size if stateful else None, None]
-
-  embedding = tf.keras.layers.Embedding(
-      vocab_size, embedding_dim, batch_input_shape=batch_shape)
-  lstm = tf.keras.layers.LSTM(
-      rnn_units,
-      implementation=2,
-      activation=lstm_activation,
-      return_sequences=True,
-      stateful=stateful,
-      recurrent_initializer='glorot_uniform')
-  logits = tf.keras.layers.Dense(vocab_size)
-  probabilities = tf.keras.layers.Softmax(dtype=tf.float32)
-
-  return tf.keras.Sequential([embedding, lstm, logits, probabilities])
-
-
-def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
-  """Trains a Shakespeare model.
-
-  Args:
-    flags_obj: An object containing parsed flag values.s
-    dataset: the training data set.
-    vocab_size: the number of unique character classes.
-    strategy: distribution strategy to use.
-    checkpoint_dir: if not None, the directory in which to make checkpoints.
-
-  Returns:
-    The training history and callbacks.
-  """
-  if flags_obj.train_steps:
-    train_steps = flags_obj.train_steps
-  else:
-    train_steps = BATCHES_PER_EPOCH // flags_obj.batch_size
-  strategy_scope = distribution_utils.get_strategy_scope(strategy)
-
-  with strategy_scope:
-    model = build_model(vocab_size=vocab_size, batch_size=flags_obj.batch_size,
-                        use_cudnn=flags_obj.cudnn)
-
-   # When keras_use_ctl is False, Model.fit() automatically applies
-   # loss scaling so we don't need to create a LossScaleOptimizer.
-    model.compile(
-        optimizer=tf.keras.optimizers.Adam(),
-        loss=tf.keras.losses.CategoricalCrossentropy(),
-        metrics=[tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
-                 tf.keras.metrics.Recall(top_k=5, name='RecallAt5')],
-        run_eagerly=flags_obj.run_eagerly)
-
-  callbacks = []
-  if checkpoint_dir:
-    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
-    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
-        filepath=checkpoint_prefix,
-        save_weights_only=True)
-    callbacks.append(checkpoint_callback)
-  time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
-                                          flags_obj.log_steps)
-  callbacks.append(time_callback)
-  history = model.fit(dataset,
-                      epochs=flags_obj.train_epochs,
-                      steps_per_epoch=train_steps,
-                      callbacks=callbacks,
-                      verbose=2)
-  return history, callbacks
-
-
-def make_prediction(checkpoint_dir, length, context, idx2char, char2idx):
-  """Make predictions from a Shakespeare model.
-
-  Args:
-    checkpoint_dir: the directory from which to load checkpoints
-    length: the total length of the generated text (including the context).
-    context: the initial text with which the LSTM is primed.
-    idx2char: the character class to character mapping.
-    char2idx: the character to character class mapping.
-
-  Returns:
-    A generated string of text of the given length.
-  """
-  prediction_model = build_model(
-      vocab_size=len(idx2char), batch_size=1, stateful=True)
-  prediction_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
-  prediction_model.build(tf.TensorShape([1, None]))
-
-  input_eval = [char2idx[s] for s in context]
-  input_eval = tf.expand_dims(input_eval, 0)
-
-  text_generated = []
-
-  prediction_model.reset_states()
-  for _ in range(length - len(context)):
-    predictions = prediction_model(input_eval)
-    predictions = tf.squeeze(predictions, 0)
-
-    # We applied a softmax to the output of the model so that
-    # tf.keras.metrics.Recall would work. We need logits for
-    # tf.random.categorical, so we convert the probabilities back to log odds
-    predictions = tf.math.log(predictions / (1 - predictions))
-
-    random_output = tf.random.categorical(predictions, num_samples=1)
-    selected_id = random_output[-1, 0].numpy()
-    input_eval = tf.expand_dims([selected_id], 0)
-    text_generated.append(idx2char[selected_id])
-
-  return context + ''.join(text_generated)
-
-
-def run(flags_obj):
-  """Run Shakespeare training and predict.
-
-  Args:
-    flags_obj: An object containing parsed flag values.
-
-  Returns:
-    Dictionary with status from the run.
-  """
-  if not flags_obj.training_data:
-    raise ValueError(
-        'Must set the path to a training data file. e.g download the following '
-        'https://storage.googleapis.com/download.tensorflow.org/data/'
-        'shakespeare.txt')
-
-  if flags_obj.dtype == 'fp16':
-    policy = tf.keras.mixed_precision.experimental.Policy(
-        'mixed_float16',
-        loss_scale=flags_core.get_loss_scale(flags_obj,
-                                             default_for_fp16='dynamic'))
-    tf.keras.mixed_precision.experimental.set_policy(policy)
-
-  keras_utils.set_session_config(
-      enable_xla=flags_obj.enable_xla)
-
-  strategy = distribution_utils.get_distribution_strategy(
-      distribution_strategy=flags_obj.distribution_strategy,
-      num_gpus=flags_obj.num_gpus)
-
-  dataset, idx2char, char2idx = get_dataset(flags_obj.training_data,
-                                            batch_size=flags_obj.batch_size)
-  stats = {}
-  if flags_obj.train:
-    history, callbacks = train_model(flags_obj, dataset,
-                                     len(idx2char), strategy,
-                                     checkpoint_dir=flags_obj.model_dir)
-
-    stats['history'] = history.history
-    stats['callbacks'] = callbacks
-
-  if flags_obj.predict_context:
-    if not flags_obj.model_dir:
-      raise ValueError('Must set model_dir to get predictions.')
-    print(make_prediction(flags_obj.model_dir,
-                          flags_obj.predict_length,
-                          flags_obj.predict_context,
-                          idx2char,
-                          char2idx))
-
-  return stats
-
-
-def main(_):
-  flags_obj = flags.FLAGS
-  run(flags_obj)
-
-
-if __name__ == '__main__':
-  define_flags()
-  app.run(main)
--- a/official/benchmark/models/synthetic_util.py
+++ b/official/benchmark/models/synthetic_util.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Helper functions to generate data directly on devices."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import string
-
-from absl import logging
-import tensorflow as tf
-
-
-# The `SyntheticDataset` is a temporary solution for generating synthetic data
-# directly on devices. It is only useful for Keras with Distribution
-# Strategies. We will have better support in `tf.data` or Distribution Strategy
-# later.
-class SyntheticDataset(object):
-  """A dataset that generates synthetic data on each device."""
-
-  def __init__(self, dataset, split_by=1):
-    # dataset.take(1) doesn't have GPU kernel.
-    with tf.device('device:CPU:0'):
-      tensor = tf.data.experimental.get_single_element(dataset.take(1))
-    flat_tensor = tf.nest.flatten(tensor)
-    variable_data = []
-    initializers = []
-    for t in flat_tensor:
-      rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
-      assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
-      v = tf.compat.v1.get_local_variable(self._random_name(),
-                                          initializer=rebatched_t)
-      variable_data.append(v)
-      initializers.append(v.initializer)
-    input_data = tf.nest.pack_sequence_as(tensor, variable_data)
-    self._iterator = SyntheticIterator(input_data, initializers)
-
-  def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
-    return ''.join(random.choice(chars) for _ in range(size))
-
-  def __iter__(self):
-    return self._iterator
-
-  def make_one_shot_iterator(self):
-    return self._iterator
-
-  def make_initializable_iterator(self):
-    return self._iterator
-
-
-class SyntheticIterator(object):
-  """A dataset that generates synthetic data on each device."""
-
-  def __init__(self, input_data, initializers):
-    self._input_data = input_data
-    self._initializers = initializers
-
-  def get_next(self):
-    return self._input_data
-
-  def next(self):
-    return self.__next__()
-
-  def __next__(self):
-    try:
-      return self.get_next()
-    except tf.errors.OutOfRangeError:
-      raise StopIteration
-
-  def initialize(self):
-    if tf.executing_eagerly():
-      return tf.no_op()
-    else:
-      return self._initializers
-
-
-def _monkey_patch_dataset_method(strategy):
-  """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
-  def make_dataset(self, dataset):
-    logging.info('Using pure synthetic data.')
-    with self.scope():
-      if self.extended._global_batch_size:  # pylint: disable=protected-access
-        return SyntheticDataset(dataset, self.num_replicas_in_sync)
-      else:
-        return SyntheticDataset(dataset)
-
-  def make_iterator(self, dataset):
-    dist_dataset = make_dataset(self, dataset)
-    return iter(dist_dataset)
-
-  strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
-  strategy.make_dataset_iterator = make_iterator
-  strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
-  strategy.experimental_distribute_dataset = make_dataset
-
-
-def _undo_monkey_patch_dataset_method(strategy):
-  if hasattr(strategy, 'orig_make_dataset_iterator'):
-    strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
-  if hasattr(strategy, 'orig_distribute_dataset'):
-    strategy.make_dataset_iterator = strategy.orig_distribute_dataset
-
-
-def set_up_synthetic_data():
-  _monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
-  _monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
-  _monkey_patch_dataset_method(
-      tf.distribute.experimental.MultiWorkerMirroredStrategy)
-
-
-def undo_set_up_synthetic_data():
-  _undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
-  _undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
-  _undo_monkey_patch_dataset_method(
-      tf.distribute.experimental.MultiWorkerMirroredStrategy)
--- a/official/benchmark/ncf_keras_benchmark.py
+++ b/official/benchmark/ncf_keras_benchmark.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Keras benchmarks and accuracy tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-
-from absl import flags
-from absl import logging
-from absl.testing import flagsaver
-import tensorflow as tf
-from official.benchmark import benchmark_wrappers
-from official.benchmark import owner_utils
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-from official.recommendation import ncf_common
-from official.recommendation import ncf_keras_main
-from official.utils.flags import core
-
-FLAGS = flags.FLAGS
-NCF_DATA_DIR_NAME = 'movielens_data'
-NCF_TF_REGRESSION_DATA_DIR_NAME = 'gs://tf-regression/ncf/data'
-
-
-class NCFKerasBenchmarkBase(PerfZeroBenchmark):
-  """Base class for NCF model benchmark."""
-
-  def __init__(self, output_dir=None, default_flags=None, **kwargs):
-    super(NCFKerasBenchmarkBase, self).__init__(output_dir, default_flags,
-                                                **kwargs)
-
-    # Run all benchmarks with ml_perf flag.
-    self.default_flags['ml_perf'] = True
-
-  def _setup(self):
-    """Sets up and resets flags before each test."""
-    logging.set_verbosity(logging.INFO)
-    if NCFKerasBenchmarkBase.local_flags is None:
-      ncf_common.define_ncf_flags()
-      # Loads flags to get defaults to then override. List cannot be empty.
-      flags.FLAGS(['foo'])
-      core.set_defaults(**self.default_flags)
-      saved_flag_values = flagsaver.save_flag_values()
-      NCFKerasBenchmarkBase.local_flags = saved_flag_values
-    else:
-      flagsaver.restore_flag_values(NCFKerasBenchmarkBase.local_flags)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self, hr_at_10_min=0, hr_at_10_max=0):
-    start_time_sec = time.time()
-    stats = ncf_keras_main.run_ncf(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    metrics = []
-    metrics.append({'name': 'exp_per_second',
-                    'value': stats['avg_exp_per_second']})
-
-    if hr_at_10_min > 0:
-      metrics.append({'name': 'hr_at_10',
-                      'value': stats['eval_hit_rate'],
-                      'min_value': hr_at_10_min,
-                      'max_value': hr_at_10_max})
-
-      metrics.append({'name': 'train_loss',
-                      'value': stats['loss']})
-
-    self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
-
-
-class NCFKerasAccuracy(NCFKerasBenchmarkBase):
-  """Benchmark NCF model using real data."""
-
-  def __init__(self,
-               output_dir=None,
-               root_data_dir=None,
-               default_flags=None,
-               **kwargs):
-    root_data_dir = root_data_dir if root_data_dir else ''
-    default_flags = {}
-    default_flags['dataset'] = 'ml-20m'
-    default_flags['num_gpus'] = 1
-    default_flags['train_epochs'] = 10
-    default_flags['clean'] = True
-    default_flags['batch_size'] = 99000
-    default_flags['learning_rate'] = 0.00382059
-    default_flags['beta1'] = 0.783529
-    default_flags['beta2'] = 0.909003
-    default_flags['epsilon'] = 1.45439e-07
-    default_flags['layers'] = [256, 256, 128, 64]
-    default_flags['num_factors'] = 64
-    default_flags['hr_threshold'] = 0.635
-    default_flags['ml_perf'] = True
-    default_flags['use_synthetic_data'] = False
-    default_flags['data_dir'] = os.path.join(root_data_dir, NCF_DATA_DIR_NAME)
-
-    super(NCFKerasAccuracy, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        **kwargs)
-
-  def _run_and_report_benchmark_mlperf_like(self):
-    """Run test and report results.
-
-    Note: MLPerf like tests are not tuned to hit a specific hr@10 value, but
-    we want it recorded.
-    """
-    self._run_and_report_benchmark(hr_at_10_min=0.61)
-
-  def _run_and_report_benchmark(self, hr_at_10_min=0.630, hr_at_10_max=0.645):
-    """Run test and report results.
-
-    Note: Target is 0.635, but some runs are below that level. Until we have
-    multi-run tests, we have to accept a lower target.
-
-    Args:
-      hr_at_10_min: Minimum acceptable hr@10 value.
-      hr_at_10_max: Maximum acceptable hr@10 value.
-    """
-    super(NCFKerasAccuracy, self)._run_and_report_benchmark(
-        hr_at_10_min=hr_at_10_min,
-        hr_at_10_max=hr_at_10_max)
-
-  def _set_8_gpu_defaults(self):
-    FLAGS.num_gpus = 8
-    FLAGS.learning_rate = 0.0045
-    FLAGS.beta1 = 0.25
-    FLAGS.beta2 = 0.5
-    FLAGS.epsilon = 1e-8
-    FLAGS.train_epochs = 14
-    FLAGS.batch_size = 99000
-    FLAGS.eval_batch_size = 160000
-    FLAGS.train_dataset_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
-                                            'training_cycle_*/*')
-    FLAGS.eval_dataset_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
-                                           'eval_data/*')
-    FLAGS.input_meta_data_path = os.path.join(NCF_TF_REGRESSION_DATA_DIR_NAME,
-                                              'metadata')
-    FLAGS.data_dir = NCF_TF_REGRESSION_DATA_DIR_NAME
-
-  def benchmark_1_gpu_early_stop(self):
-    self._setup()
-    FLAGS.early_stopping = True
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_early_stop(self):
-    self._setup()
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.early_stopping = True
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
-    self._setup()
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.early_stopping = True
-    FLAGS.run_eagerly = True
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_early_stop(self):
-    self._setup()
-    FLAGS.early_stopping = True
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_ctl_early_stop(self):
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.early_stopping = True
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.early_stopping = True
-    FLAGS.run_eagerly = True
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_ctl_early_stop(self):
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.early_stopping = True
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_2_gpus_early_stop(self):
-    self._setup()
-    FLAGS.early_stopping = True
-    FLAGS.num_gpus = 2
-    FLAGS.eval_batch_size = 160000
-    self._run_and_report_benchmark()
-
-  def benchmark_2_gpus_ctl_early_stop(self):
-    """NCF with custom training loop. Works only in TF 2.0."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.early_stopping = True
-    FLAGS.num_gpus = 2
-    FLAGS.eval_batch_size = 160000
-    self._run_and_report_benchmark()
-
-#############################################
-# Tests below with mlperf in the test name are of two types:
-#  1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
-#  2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
-#
-# The purpose of both is to get a number to compare to existing results. To do
-# this the number of epochs is held constant rather than a race to a given
-# accuracy. The accuracy validation is done by the "early_stop" tests.
-#############################################
-
-  def benchmark_1_gpu_mlperf_like(self):
-    """1 GPU using keras fit/compile."""
-    self._setup()
-    FLAGS.train_epochs = 7
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
-    """1 GPU using compile/fit without dist_strat."""
-    self._setup()
-    FLAGS.train_epochs = 7
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_mlperf_like(self):
-    self._setup()
-    FLAGS.train_epochs = 7
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.run_eagerly = True
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_xla_1_gpu_mlperf_like(self):
-    """1 GPU using compile/fit with XLA."""
-    self._setup()
-    FLAGS.train_epochs = 7
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_ctl_mlperf_like(self):
-    """1 GPU using CTL."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.train_epochs = 7
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_ctl_fp16_mlperf_like(self):
-    """1 GPU using CTL and FP16."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.train_epochs = 7
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_fp16_mlperf_like(self):
-    """1 GPU using FP16."""
-    self._setup()
-    FLAGS.train_epochs = 7
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_ctl_fp16_graph_rewrite_mlperf_like(self):
-    """1 GPU using CTL and FP16 graph rewrite."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.train_epochs = 7
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_fp16_graph_rewrite_mlperf_like(self):
-    """1 GPU using FP16 graph rewrite."""
-    self._setup()
-    FLAGS.train_epochs = 7
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
-    """1 GPU using CTL with eager and distribution strategy."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.run_eagerly = True
-    FLAGS.train_epochs = 7
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_ctl_mlperf_like(self):
-    """1 GPU using CTL with XLA."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.enable_xla = True
-    FLAGS.train_epochs = 7
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_xla_1_gpu_fp16_mlperf_like(self):
-    """1 GPU using with XLA and FP16."""
-    self._setup()
-    FLAGS.enable_xla = True
-    FLAGS.train_epochs = 7
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_xla_1_gpu_ctl_fp16_mlperf_like(self):
-    """1 GPU using CTL with XLA and FP16."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.enable_xla = True
-    FLAGS.train_epochs = 7
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_8_gpu_mlperf_like(self):
-    """8 GPU using keras fit/compile."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.train_epochs = 17
-    FLAGS.batch_size = 1048576
-    FLAGS.eval_batch_size = 160000
-    FLAGS.learning_rate = 0.0045
-    FLAGS.beta1 = 0.25
-    FLAGS.beta2 = 0.5
-    FLAGS.epsilon = 1e-8
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_8_gpu_ctl_mlperf_like(self):
-    """8 GPU using CTL."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.num_gpus = 8
-    FLAGS.train_epochs = 17
-    FLAGS.batch_size = 1048576
-    FLAGS.eval_batch_size = 160000
-    FLAGS.learning_rate = 0.0045
-    FLAGS.beta1 = 0.25
-    FLAGS.beta2 = 0.5
-    FLAGS.epsilon = 1e-8
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_8_gpu_tf_data_ctl_mlperf_like(self):
-    """8 GPU using CTL."""
-    self._setup()
-    self._set_8_gpu_defaults()
-    FLAGS.keras_use_ctl = True
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_8_gpu_tf_data_fp16_mlperf_like(self):
-    """8 GPU FP16."""
-    self._setup()
-    self._set_8_gpu_defaults()
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_8_gpu_tf_data_ctl_fp16_mlperf_like(self):
-    """8 GPU FP16 using CTL."""
-    self._setup()
-    self._set_8_gpu_defaults()
-    FLAGS.keras_use_ctl = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-  def benchmark_8_gpu_tf_data_ctl_fp16_graph_rewrite_mlperf_like(self):
-    """8 GPU FP16 graph rewrite using CTL."""
-    self._setup()
-    self._set_8_gpu_defaults()
-    FLAGS.keras_use_ctl = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.loss_scale = 8192
-    self._run_and_report_benchmark_mlperf_like()
-
-
-class NCFKerasBenchmarkReal(NCFKerasBenchmarkBase):
-  """NCF Keras throughput benchmarks."""
-
-  def __init__(self,
-               output_dir=None,
-               root_data_dir=None,
-               default_flags=None,
-               **kwargs):
-
-    root_data_dir = root_data_dir if root_data_dir else ''
-    default_flags = {}
-    default_flags['dataset'] = 'ml-20m'
-    default_flags['num_gpus'] = 1
-    default_flags['train_epochs'] = 14
-    default_flags['clean'] = True
-    default_flags['batch_size'] = 99000
-    default_flags['eval_batch_size'] = 160000
-    default_flags['learning_rate'] = 0.00382059
-    default_flags['beta1'] = 0.783529
-    default_flags['beta2'] = 0.909003
-    default_flags['epsilon'] = 1.45439e-07
-    default_flags['layers'] = [256, 256, 128, 64]
-    default_flags['num_factors'] = 64
-    default_flags['hr_threshold'] = 0.635
-    default_flags['ml_perf'] = True
-    default_flags['use_synthetic_data'] = False
-    default_flags['train_dataset_path'] = os.path.join(
-        NCF_TF_REGRESSION_DATA_DIR_NAME, 'training_cycle_*/*')
-    default_flags['eval_dataset_path'] = os.path.join(
-        NCF_TF_REGRESSION_DATA_DIR_NAME, 'eval_data/*')
-    default_flags['input_meta_data_path'] = os.path.join(
-        NCF_TF_REGRESSION_DATA_DIR_NAME, 'metadata')
-    default_flags['data_dir'] = NCF_TF_REGRESSION_DATA_DIR_NAME
-
-    super(NCFKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=default_flags, **kwargs)
-
-  def benchmark_2x2_tpu(self):
-    """2x2 TPU using CTL with distribution strategy."""
-    self._setup()
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.keras_use_ctl = True
-    FLAGS.num_gpus = 0
-    FLAGS.train_epochs = 1
-    self._run_and_report_benchmark()
-
-  @owner_utils.Owner('tf-graph-compiler')
-  def benchmark_2x2_tpu_mlir(self):
-    """2x2 TPU using CTL with distribution strategy using the MLIR bridge."""
-    self._setup()
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.keras_use_ctl = True
-    FLAGS.num_gpus = 0
-    FLAGS.train_epochs = 1
-    tf.config.experimental.enable_mlir_bridge()
-    self._run_and_report_benchmark()
-
-
-class NCFKerasSynth(NCFKerasBenchmarkBase):
-  """Benchmark NCF model using synthetic data."""
-
-  def __init__(self,
-               output_dir=None,
-               default_flags=None,
-               **kwargs):
-
-    default_flags = {}
-    default_flags['dataset'] = 'ml-20m'
-    default_flags['num_gpus'] = 1
-    default_flags['train_epochs'] = 8
-    default_flags['batch_size'] = 99000
-    default_flags['eval_batch_size'] = 160000
-    default_flags['learning_rate'] = 0.00382059
-    default_flags['beta1'] = 0.783529
-    default_flags['beta2'] = 0.909003
-    default_flags['epsilon'] = 1.45439e-07
-    default_flags['layers'] = [256, 256, 128, 64]
-    default_flags['num_factors'] = 64
-    default_flags['hr_threshold'] = 0.635
-    default_flags['use_synthetic_data'] = True
-
-    super(NCFKerasSynth, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        **kwargs)
-
-  def benchmark_1_gpu(self):
-    self._setup()
-    self._run_and_report_benchmark()
-
-  def benchmark_2_gpus(self):
-    self._setup()
-    FLAGS.num_gpus = 2
-    self._run_and_report_benchmark()
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/nhnet_benchmark.py
+++ b/official/benchmark/nhnet_benchmark.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes benchmark testing for bert pretraining."""
-# pylint: disable=line-too-long
-from __future__ import print_function
-
-import time
-from typing import Optional
-
-from absl import flags
-import tensorflow as tf
-
-from official.benchmark import benchmark_wrappers
-from official.benchmark import owner_utils
-from official.benchmark import perfzero_benchmark
-from official.nlp.nhnet import trainer
-from official.utils.flags import core as flags_core
-
-MIN_LOSS = 0.40
-MAX_LOSS = 0.55
-NHNET_DATA = 'gs://tf-perfzero-data/nhnet/v1/processed/train.tfrecord*'
-PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_model.ckpt'
-
-FLAGS = flags.FLAGS
-
-
-class NHNetBenchmark(perfzero_benchmark.PerfZeroBenchmark):
-  """Base benchmark class for NHNet."""
-
-  def __init__(self, output_dir=None, default_flags=None, tpu=None, **kwargs):
-    self.default_flags = default_flags or {}
-    flag_methods = trainer.define_flags()
-    super(NHNetBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        flag_methods=flag_methods,
-        tpu=tpu,
-        **kwargs)
-
-  def _report_benchmark(self,
-                        stats,
-                        wall_time_sec,
-                        max_value=None,
-                        min_value=None):
-    """Report benchmark results by writing to local protobuf file.
-
-    Args:
-      stats: dict returned from keras models with known entries.
-      wall_time_sec: the during of the benchmark execution in seconds
-      max_value: highest passing level.
-      min_value: lowest passing level.
-    """
-
-    metrics = []
-    metrics.append({
-        'name': 'training_loss',
-        'value': stats['training_loss'],
-        'min_value': min_value,
-        'max_value': max_value
-    })
-    # These metrics are placeholders to avoid PerfZero failure.
-    metrics.append({
-        'name': 'exp_per_second',
-        'value': 0.0,
-    })
-    metrics.append({
-        'name': 'startup_time',
-        'value': 9999.,
-    })
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(
-        iters=-1,
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_str})
-
-
-class NHNetAccuracyBenchmark(NHNetBenchmark):
-  """Benchmark accuracy tests for NHNet.
-
-  Each benchmark resets the flags through `_setup`, overrides the ones it
-  needs on the global FLAGS, runs `trainer.run()` and hands the returned
-  stats to the inherited `_report_benchmark`.
-  """
-
-  def __init__(self,
-               output_dir: Optional[str] = None,
-               tpu: Optional[str] = None,
-               **kwargs):
-    """Builds the default training flags and forwards them to the parent.
-
-    Args:
-      output_dir: Forwarded unchanged to the parent constructor.
-      tpu: Forwarded unchanged to the parent constructor.
-      **kwargs: Any further named arguments, forwarded to the parent.
-    """
-    default_flags = dict(
-        mode='train',
-        train_file_pattern=NHNET_DATA,
-        train_batch_size=1024,
-        model_type='nhnet',
-        len_title=15,
-        len_passage=200,
-        num_encoder_layers=12,
-        num_decoder_layers=12,
-        num_nhnet_articles=5,
-        steps_per_loop=1000,
-        params_override='init_from_bert2bert=false')
-    super(NHNetAccuracyBenchmark, self).__init__(
-        output_dir=output_dir, default_flags=default_flags, tpu=tpu, **kwargs)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self, max_value=MAX_LOSS, min_value=MIN_LOSS):
-    """Runs the trainer, times it and reports the result.
-
-    Args:
-      max_value: Upper bound forwarded to `_report_benchmark`.
-      min_value: Lower bound forwarded to `_report_benchmark`.
-    """
-    start_time_sec = time.time()
-    stats = trainer.run()
-    wall_time_sec = time.time() - start_time_sec
-    self._report_benchmark(
-        stats, wall_time_sec, max_value=max_value, min_value=min_value)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_accuracy_4x4_tpu_f32_50k_steps(self):
-    """Test NHNet training with 4x4 TPU for 50k steps from a checkpoint."""
-    # This is used for accuracy test.
-    self._setup()
-    FLAGS.train_steps = 50000
-    # Checkpoint only once, at the end of training.
-    FLAGS.checkpoint_interval = FLAGS.train_steps
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
-    # The directory name mirrors the benchmark method name.
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_accuracy_4x4_tpu_f32_50k_steps')
-    self._run_and_report_benchmark()
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_accuracy_4x4_tpu_f32_1k_steps(self):
-    """Test NHNet training with 4x4 TPU for 1k steps."""
-    self._setup()
-    FLAGS.train_steps = 1000
-    # Checkpoint only once, at the end of training.
-    FLAGS.checkpoint_interval = FLAGS.train_steps
-    FLAGS.distribution_strategy = 'tpu'
-    # The directory name mirrors the benchmark method name.
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_accuracy_4x4_tpu_f32_1k_steps')
-    self._run_and_report_benchmark()
-
-
-# Hand control to tf.test.main() only when executed as a script.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/owner_utils.py
+++ b/official/benchmark/owner_utils.py
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utils to set Owner annotations on benchmarks.
-
-@owner_utils.Owner('owner_team/user') can be set either at the benchmark class
-level / benchmark method level or both.
-
-Runner frameworks can use owner_utils.GetOwner(benchmark_method) to get the
-actual owner. Python inheritance for the owner attribute is respected (e.g.,
-a method-level owner takes precedence over a class-level owner).
-
-See owner_utils_test for associated tests and more examples.
-
-The decorator can be applied both at the method level and at the class level.
-
-Simple example:
-===============
-
-class MLBenchmark:
-
-  @Owner('example_id')
-  def benchmark_method_1_gpu(self):
-    return True
-"""
-
-
-def Owner(owner_name):
-  """Builds a decorator that tags its target with `owner_name`.
-
-  Args:
-    owner_name: Value stored in the target's `__benchmark__owner__` attribute.
-
-  Returns:
-    A decorator that sets the attribute on the function or class it receives
-    and returns that same object.
-  """
-
-  def _tag(target):
-    """Stores the owner on `target` and hands `target` back."""
-    setattr(target, '__benchmark__owner__', owner_name)
-    return target
-
-  return _tag
-
-
-def GetOwner(benchmark_method_or_class):
-  """Looks up the owner recorded for a benchmark method or class.
-
-  The object's own `__benchmark__owner__` wins. If it has none but is bound
-  to something (it exposes `__self__`), the owner of that bound object is
-  used instead.
-
-  Args:
-    benchmark_method_or_class: A benchmark method or class.
-
-  Returns:
-    string - the associated owner if present / None.
-  """
-  missing = object()
-
-  owner = getattr(benchmark_method_or_class, '__benchmark__owner__', missing)
-  if owner is not missing:
-    return owner
-
-  bound_to = getattr(benchmark_method_or_class, '__self__', missing)
-  if bound_to is missing:
-    return None
-  return getattr(bound_to, '__benchmark__owner__', None)
--- a/official/benchmark/owner_utils_test.py
+++ b/official/benchmark/owner_utils_test.py
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for official.benchmark.owner_utils."""
-
-from absl.testing import absltest
-
-from official.benchmark import owner_utils
-
-
-@owner_utils.Owner('static_owner')
-def static_function(foo=5):
-  """Module-level fixture tagged 'static_owner'; returns `foo` unchanged."""
-  return foo
-
-
-def static_function_without_owner(foo=5):
-  """Module-level fixture with no owner tag; returns `foo` unchanged."""
-  return foo
-
-
-class BenchmarkClassWithoutOwner:
-  """Fixture class with no class-level owner."""
-
-  def method_without_owner(self):
-    """Untagged method on an untagged class."""
-    return 100
-
-  @owner_utils.Owner('method_owner')
-  def method_with_owner(self):
-    """Method tagged 'method_owner' on an untagged class."""
-    return 200
-
-
-@owner_utils.Owner('class_owner')
-class SomeBenchmarkClass:
-  """Fixture class tagged 'class_owner'."""
-
-  def method_inherited_owner(self):
-    """Untagged method; the tests expect it to report the class owner."""
-    return 123
-
-  @owner_utils.Owner('method_owner')
-  def method_override_owner(self):
-    """Method carrying its own 'method_owner' tag."""
-    return 345
-
-
-@owner_utils.Owner('new_class_owner')
-class InheritedClass(SomeBenchmarkClass):
-  """Subclass fixture re-tagged 'new_class_owner'; overrides both methods."""
-
-  def method_inherited_owner(self):
-    """Untagged override; the tests expect the subclass owner here."""
-    return 456
-
-  @owner_utils.Owner('new_method_owner')
-  def method_override_owner(self):
-    """Override carrying its own 'new_method_owner' tag."""
-    return 567
-
-
-class OwnerUtilsTest(absltest.TestCase):
-  """Tests to assert for owner decorator functionality."""
-
-  def test_owner_tag_missing(self):
-    """Untagged callables have no owner; a tagged method reports its own."""
-    self.assertIsNone(owner_utils.GetOwner(static_function_without_owner))
-
-    benchmark_class = BenchmarkClassWithoutOwner()
-    self.assertIsNone(
-        owner_utils.GetOwner(benchmark_class.method_without_owner))
-    # The lookup must not interfere with calling the method itself.
-    self.assertEqual(100, benchmark_class.method_without_owner())
-
-    self.assertEqual('method_owner',
-                     owner_utils.GetOwner(benchmark_class.method_with_owner))
-    self.assertEqual(200, benchmark_class.method_with_owner())
-
-  def test_owner_attributes_static(self):
-    """A tagged module-level function reports its owner and still runs."""
-    self.assertEqual('static_owner', owner_utils.GetOwner(static_function))
-    self.assertEqual(5, static_function(5))
-
-  def test_owner_attributes_per_class(self):
-    """Untagged methods report the class owner; tagged methods their own."""
-    level1 = SomeBenchmarkClass()
-    self.assertEqual('class_owner',
-                     owner_utils.GetOwner(level1.method_inherited_owner))
-    self.assertEqual(123, level1.method_inherited_owner())
-
-    self.assertEqual('method_owner',
-                     owner_utils.GetOwner(level1.method_override_owner))
-    self.assertEqual(345, level1.method_override_owner())
-
-  def test_owner_attributes_inherited_class(self):
-    """A re-tagged subclass reports its own class and method owners."""
-    level2 = InheritedClass()
-    self.assertEqual('new_class_owner',
-                     owner_utils.GetOwner(level2.method_inherited_owner))
-    self.assertEqual(456, level2.method_inherited_owner())
-
-    self.assertEqual('new_method_owner',
-                     owner_utils.GetOwner(level2.method_override_owner))
-    self.assertEqual(567, level2.method_override_owner())
-
-
-# Hand control to absltest.main() only when executed as a script.
-if __name__ == '__main__':
-  absltest.main()
--- a/official/benchmark/perfzero_benchmark.py
+++ b/official/benchmark/perfzero_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utils for creating PerfZero benchmarks."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl import flags
-from absl import logging
-from absl.testing import flagsaver
-import tensorflow as tf
-
-FLAGS = flags.FLAGS
-
-
-class PerfZeroBenchmark(tf.test.Benchmark):
-  """Shared plumbing for PerfZero benchmarks.
-
-     Resolves the output directory and TPU name, stores the default flags and
-     flag-defining callables, and snapshots / restores flag values between
-     benchmarks through `_setup`.
-  """
-  # Flag values saved by the first `_setup` call. Stored on this base class,
-  # so the snapshot is shared by every subclass and instance.
-  local_flags = None
-
-  def __init__(self,
-               output_dir=None,
-               default_flags=None,
-               root_data_dir=None,
-               flag_methods=None,
-               tpu=None):
-    """Initialize class.
-
-    Args:
-      output_dir: Base directory for all benchmark output. The
-        BENCHMARK_OUTPUT_DIR environment variable takes precedence; '/tmp' is
-        used when neither is set.
-      default_flags: Flag name -> value mapping applied during `_setup`.
-      root_data_dir: Only logged here; child classes use it to locate data.
-      flag_methods: Callables invoked (without arguments) during `_setup`.
-      tpu: (optional) TPU name. The BENCHMARK_TPU environment variable takes
-        precedence.
-    """
-    self.output_dir = (
-        os.getenv('BENCHMARK_OUTPUT_DIR') or output_dir or '/tmp')
-    self.default_flags = default_flags or {}
-    self.flag_methods = flag_methods or {}
-
-    resolved_tpu = os.getenv('BENCHMARK_TPU') or tpu
-    if resolved_tpu:
-      # TPU models are expected to accept a --tpu=name flag. PerfZero creates
-      # the TPU at runtime and passes the TPU's name to this flag.
-      self.default_flags['tpu'] = resolved_tpu
-
-    logging.info('root_data_dir: %s', root_data_dir)
-
-  @property
-  def tpu(self):
-    """The TPU name stored in the default flags, or None when absent."""
-    return self.default_flags.get('tpu')
-
-  def _get_model_dir(self, folder_name):
-    """Returns `folder_name` joined onto the benchmark output directory."""
-    model_dir = os.path.join(self.output_dir, folder_name)
-    return model_dir
-
-  def _setup(self):
-    """Sets up and resets flags before each test."""
-    logging.set_verbosity(logging.INFO)
-
-    if PerfZeroBenchmark.local_flags is not None:
-      # A snapshot already exists: roll every flag back to it.
-      flagsaver.restore_flag_values(PerfZeroBenchmark.local_flags)
-      return
-
-    for define_flags in self.flag_methods:
-      define_flags()
-    # Loads flags to get defaults to then override. List cannot be empty.
-    flags.FLAGS(['foo'])
-    # Overrides flag values with defaults for the class of tests.
-    for flag_name, flag_value in self.default_flags.items():
-      setattr(FLAGS, flag_name, flag_value)
-    PerfZeroBenchmark.local_flags = flagsaver.save_flag_values()
--- a/official/benchmark/resnet_ctl_imagenet_benchmark.py
+++ b/official/benchmark/resnet_ctl_imagenet_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes CTL benchmarks and accuracy tests."""
-# pylint: disable=line-too-long,g-bad-import-order
-from __future__ import print_function
-
-import os
-import time
-
-from absl import flags
-import tensorflow as tf
-
-from official.benchmark import owner_utils
-from official.vision.image_classification.resnet import common
-from official.vision.image_classification.resnet import resnet_ctl_imagenet_main
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-from official.benchmark import benchmark_wrappers
-from official.utils.flags import core as flags_core
-
-# Top-1 accuracy bounds that Resnet50CtlAccuracy passes to the report.
-MIN_TOP_1_ACCURACY = 0.76
-MAX_TOP_1_ACCURACY = 0.77
-
-FLAGS = flags.FLAGS
-
-
-class CtlBenchmark(PerfZeroBenchmark):
-  """Base benchmark class with methods to simplify testing."""
-
-  def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
-    """Normalizes the optional arguments and initializes the PerfZero base.
-
-    Args:
-      output_dir: Forwarded unchanged to PerfZeroBenchmark.
-      default_flags: Flag name -> value mapping; an empty dict when omitted.
-      flag_methods: Flag-defining callables; empty when omitted.
-    """
-    self.default_flags = default_flags or {}
-    self.flag_methods = flag_methods or {}
-    super(CtlBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=self.default_flags,
-        flag_methods=self.flag_methods)
-
-  def _report_benchmark(self,
-                        stats,
-                        wall_time_sec,
-                        top_1_max=None,
-                        top_1_min=None,
-                        total_batch_size=None,
-                        log_steps=None,
-                        warmup=1,
-                        start_time_sec=None):
-    """Report benchmark results by writing to local protobuf file.
-
-    Args:
-      stats: dict returned from keras models with known entries. When it has
-        'eval_acc' it must also have 'eval_loss', 'train_acc' and
-        'train_loss'.
-      wall_time_sec: the duration of the benchmark execution in seconds
-      top_1_max: highest passing level for top_1 accuracy.
-      top_1_min: lowest passing level for top_1 accuracy.
-      total_batch_size: Global batch-size. When None, the `exp_per_second`
-        metric is not reported.
-      log_steps: How often the log was created for stats['step_timestamp_log'].
-        Accepted for interface compatibility; not used by this method.
-      warmup: number of entries in stats['step_timestamp_log'] to ignore.
-      start_time_sec: the start time of the program in seconds since epoch.
-    """
-
-    metrics = []
-    if 'eval_acc' in stats:
-      metrics.append({
-          'name': 'accuracy_top_1',
-          'value': stats['eval_acc'],
-          'min_value': top_1_min,
-          'max_value': top_1_max
-      })
-      metrics.append({'name': 'eval_loss', 'value': stats['eval_loss']})
-
-      metrics.append({
-          'name': 'top_1_train_accuracy',
-          'value': stats['train_acc']
-      })
-      metrics.append({'name': 'train_loss', 'value': stats['train_loss']})
-
-    # Throughput needs a batch size to scale by; without one the product
-    # below would raise TypeError, so the metric is skipped instead.
-    if (warmup and total_batch_size is not None and
-        'step_timestamp_log' in stats and
-        len(stats['step_timestamp_log']) > warmup + 1):
-      # first entry in the time_log is start of step 0. The rest of the
-      # entries are the end of each step recorded
-      time_log = stats['step_timestamp_log']
-      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
-      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      # Identical timestamps would divide by zero; report nothing then.
-      if time_elapsed > 0:
-        examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
-        metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
-
-    if 'avg_exp_per_second' in stats:
-      metrics.append({
-          'name': 'avg_exp_per_second',
-          'value': stats['avg_exp_per_second']
-      })
-
-    if start_time_sec and 'step_timestamp_log' in stats:
-      time_log = stats['step_timestamp_log']
-      # time_log[0] is recorded at the beginning of the first step.
-      startup_time = time_log[0].timestamp - start_time_sec
-      metrics.append({'name': 'startup_time', 'value': startup_time})
-
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(
-        iters=-1,
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_str})
-
-
-class Resnet50CtlAccuracy(CtlBenchmark):
-  """Benchmark accuracy tests for ResNet50 in CTL."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """A benchmark class.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-        constructor forward compatible in case PerfZero provides more named
-        arguments before updating the constructor.
-    """
-    self.data_dir = os.path.join(root_data_dir, 'imagenet')
-    super(Resnet50CtlAccuracy, self).__init__(
-        output_dir=output_dir, flag_methods=[common.define_keras_flags])
-
-  def _apply_8_gpu_flags(self, batch_size, run_name, dtype):
-    """Sets the flags that all 8-GPU accuracy runs have in common.
-
-    Args:
-      batch_size: Value assigned to FLAGS.batch_size.
-      run_name: Folder name, under the output directory, used as model_dir.
-      dtype: Value assigned to FLAGS.dtype.
-    """
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = batch_size
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir(run_name)
-    FLAGS.dtype = dtype
-
-  def benchmark_8_gpu(self):
-    """Test Keras model with eager, dist_strat and 8 GPUs."""
-    self._setup()
-    self._apply_8_gpu_flags(128 * 8, 'benchmark_8_gpu', 'fp32')
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16(self):
-    """Test Keras model with eager, 8 GPUs with tf.keras mixed precision."""
-    self._setup()
-    self._apply_8_gpu_flags(256 * 8, 'benchmark_8_gpu_fp16', 'fp16')
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_amp(self):
-    """Test Keras model with 8 GPUs and mixed precision via graph rewrite."""
-    self._setup()
-    self._apply_8_gpu_flags(256 * 8, 'benchmark_8_gpu_amp', 'fp16')
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    self._run_and_report_benchmark()
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    """Runs the CTL ImageNet main and reports against the top-1 bounds."""
-    start_time_sec = time.time()
-    stats = resnet_ctl_imagenet_main.run(flags.FLAGS)
-    elapsed_sec = time.time() - start_time_sec
-
-    super(Resnet50CtlAccuracy, self)._report_benchmark(
-        stats,
-        elapsed_sec,
-        top_1_min=MIN_TOP_1_ACCURACY,
-        top_1_max=MAX_TOP_1_ACCURACY,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=100,
-        start_time_sec=start_time_sec)
-
-
-class Resnet50CtlBenchmarkBase(CtlBenchmark):
-  """Resnet50 benchmarks.
-
-  Every `benchmark_*` method resets the flags through `_setup`, overrides the
-  ones it needs on the global FLAGS, then runs and reports.
-  """
-
-  def __init__(self, output_dir=None, default_flags=None):
-    """Passes `common.define_keras_flags` as the only flag method.
-
-    Args:
-      output_dir: Forwarded unchanged to CtlBenchmark.
-      default_flags: Forwarded unchanged to CtlBenchmark.
-    """
-    flag_methods = [common.define_keras_flags]
-
-    super(Resnet50CtlBenchmarkBase, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=default_flags)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    """Runs the CTL ImageNet main under the current FLAGS and reports it."""
-    start_time_sec = time.time()
-    stats = resnet_ctl_imagenet_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    # Warmup means the number of logged step time entries that are excluded in
-    # performance report. Default to exclude 1 FLAGS.log_steps time.
-    super(Resnet50CtlBenchmarkBase, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps,
-        warmup=1,
-        start_time_sec=start_time_sec)
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Test Keras model with 1 GPU, no distribution strategy."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu(self):
-    """Test Keras model with 1 GPU."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16(self):
-    """Test Keras model with 1 GPU with tf.keras mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
-    FLAGS.batch_size = 256
-    FLAGS.dtype = 'fp16'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_amp(self):
-    """Test Keras model with 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
-    FLAGS.batch_size = 256
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_amp(self):
-    """Test Keras model with XLA and 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
-    FLAGS.batch_size = 256
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_eager(self):
-    """Test Keras model with 1 GPU in pure eager mode."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_eager')
-    FLAGS.batch_size = 120
-    FLAGS.use_tf_function = False
-    FLAGS.use_tf_while_loop = False
-    FLAGS.single_l2_loss_op = True
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16_eager(self):
-    """Test Keras model with 1 GPU with fp16 and pure eager mode."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_eager')
-    FLAGS.batch_size = 240
-    FLAGS.dtype = 'fp16'
-    FLAGS.use_tf_function = False
-    FLAGS.use_tf_while_loop = False
-    FLAGS.single_l2_loss_op = True
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu(self):
-    """Test Keras model with 8 GPUs."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.batch_size = 128 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16(self):
-    """Test Keras model with 8 GPUs with tf.keras mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.dtype = 'fp16'
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_eager(self):
-    """Test Keras model with 8 GPUs, eager, fp32."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.use_tf_function = False
-    FLAGS.use_tf_while_loop = False
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_eager_fp16(self):
-    """Test Keras model with 8 GPUs, eager, fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.use_tf_function = False
-    FLAGS.use_tf_while_loop = False
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager_fp16')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_amp(self):
-    """Test Keras model with 8 GPUs with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_amp(self):
-    """Test Keras model with XLA and 8 GPUs with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_amp')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def _set_df_common(self):
-    """Applies the flag overrides shared by the TPU benchmarks below."""
-    FLAGS.steps_per_loop = 500
-    FLAGS.train_epochs = 2
-    # NOTE(review): presumably cleared so train_epochs decides the run length
-    # instead of a subclass's default train_steps -- confirm.
-    FLAGS.train_steps = None
-    FLAGS.skip_eval = True
-    FLAGS.enable_eager = True
-    FLAGS.enable_tensorboard = False
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.report_accuracy_metrics = False
-    FLAGS.log_steps = 50
-    FLAGS.single_l2_loss_op = True
-    FLAGS.use_tf_function = True
-    FLAGS.enable_checkpoint_and_export = False
-
-  def benchmark_2x2_tpu_bf16(self):
-    """Runs with the shared TPU flags, batch size 1024 and bf16."""
-    self._setup()
-    self._set_df_common()
-    FLAGS.batch_size = 1024
-    FLAGS.dtype = 'bf16'
-    self._run_and_report_benchmark()
-
-  def benchmark_4x4_tpu_bf16(self):
-    """Runs with the shared TPU flags, batch size 4096 and bf16."""
-    self._setup()
-    self._set_df_common()
-    FLAGS.batch_size = 4096
-    FLAGS.dtype = 'bf16'
-    self._run_and_report_benchmark()
-
-  @owner_utils.Owner('tf-graph-compiler')
-  def benchmark_4x4_tpu_bf16_mlir(self):
-    """Run resnet model on 4x4 with the MLIR Bridge enabled."""
-    self._setup()
-    self._set_df_common()
-    FLAGS.batch_size = 4096
-    FLAGS.dtype = 'bf16'
-    tf.config.experimental.enable_mlir_bridge()
-    self._run_and_report_benchmark()
-
-  def benchmark_8x16_tpu_bf16(self):
-    """Runs with the shared TPU flags, batch size 8192 and bf16."""
-    self._setup()
-    self._set_df_common()
-    FLAGS.batch_size = 8192
-    FLAGS.dtype = 'bf16'
-    self._run_and_report_benchmark()
-
-  def fill_report_object(self, stats):
-    """Delegates to the parent's `fill_report_object` with current FLAGS.
-
-    NOTE(review): neither CtlBenchmark nor PerfZeroBenchmark defines
-    `fill_report_object`; confirm a further base class provides it, otherwise
-    calling this raises AttributeError.
-
-    Args:
-      stats: Passed through unchanged to the parent implementation.
-    """
-    super(Resnet50CtlBenchmarkBase, self).fill_report_object(
-        stats, total_batch_size=FLAGS.batch_size, log_steps=FLAGS.log_steps)
-
-
-class Resnet50CtlBenchmarkSynth(Resnet50CtlBenchmarkBase):
-  """Resnet50 benchmarks that run with `use_synthetic_data` enabled."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Builds the synthetic-data default flags.
-
-    Args:
-      output_dir: Forwarded unchanged to the parent constructor.
-      root_data_dir: Accepted but not used by this class.
-      **kwargs: Accepted but not used by this class.
-    """
-    synthetic_defaults = {
-        'skip_eval': True,
-        'use_synthetic_data': True,
-        'train_steps': 110,
-        'steps_per_loop': 20,
-        'log_steps': 10,
-    }
-    super(Resnet50CtlBenchmarkSynth, self).__init__(
-        output_dir=output_dir, default_flags=synthetic_defaults)
-
-
-class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
-  """Resnet50 benchmarks that read the 'imagenet' data directory."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Builds the real-data default flags.
-
-    Args:
-      output_dir: Forwarded unchanged to the parent constructor.
-      root_data_dir: Directory containing the 'imagenet' folder. Joined with
-        os.path.join, so it must be a path, not None.
-      **kwargs: Accepted but not used by this class.
-    """
-    real_data_defaults = {
-        'skip_eval': True,
-        'data_dir': os.path.join(root_data_dir, 'imagenet'),
-        'train_steps': 110,
-        'steps_per_loop': 20,
-        'log_steps': 10,
-    }
-    super(Resnet50CtlBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=real_data_defaults)
-
-
-# Hand control to tf.test.main() only when executed as a script.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/retinanet_benchmark.py
+++ b/official/benchmark/retinanet_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes RetinaNet benchmarks and accuracy tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=g-bad-import-order
-import json
-import time
-
-from absl import flags
-from absl.testing import flagsaver
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.benchmark import benchmark_wrappers
-from official.benchmark import perfzero_benchmark
-from official.utils.flags import core as flags_core
-from official.utils.misc import keras_utils
-from official.vision.detection import main as detection
-from official.vision.detection.configs import base_config
-
-FLAGS = flags.FLAGS
-
-# pylint: disable=line-too-long
-# Train / eval data patterns, eval annotation file and ResNet checkpoint
-# location that RetinanetBenchmarkBase copies onto each benchmark instance.
-COCO_TRAIN_DATA = 'gs://tf-perfzero-data/coco/train*'
-COCO_EVAL_DATA = 'gs://tf-perfzero-data/coco/val*'
-COCO_EVAL_JSON = 'gs://tf-perfzero-data/coco/instances_val2017.json'
-RESNET_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07'
-# pylint: enable=line-too-long
-
-
-class DetectionBenchmarkBase(perfzero_benchmark.PerfZeroBenchmark):
-  """Base class to hold methods common to test classes."""
-
-  def __init__(self, **kwargs):
-    """Forwards all arguments to PerfZeroBenchmark and clears the timer.
-
-    Args:
-      **kwargs: Named arguments passed unchanged to the parent constructor.
-    """
-    super(DetectionBenchmarkBase, self).__init__(**kwargs)
-    # Subclasses install a timing callback here before running a benchmark.
-    self.timer_callback = None
-
-  def _report_benchmark(self, stats, start_time_sec, wall_time_sec, min_ap,
-                        max_ap, warmup):
-    """Report benchmark results by writing to local protobuf file.
-
-    Args:
-      stats: dict returned from Detection models with known entries. Must
-        contain 'total_loss' and 'total_steps'; 'AP' is reported when present.
-      start_time_sec: the start of the benchmark execution in seconds
-      wall_time_sec: the duration of the benchmark execution in seconds
-      min_ap: Minimum detection AP constraint to verify correctness of the
-        model.
-      max_ap: Maximum detection AP accuracy constraint to verify correctness of
-        the model.
-      warmup: Number of time log entries to ignore when computing examples/sec.
-    """
-    metrics = [{
-        'name': 'total_loss',
-        'value': stats['total_loss'],
-    }]
-    if self.timer_callback:
-      metrics.append({
-          'name': 'exp_per_second',
-          'value': self.timer_callback.get_examples_per_sec(warmup)
-      })
-      metrics.append({
-          'name': 'startup_time',
-          'value': self.timer_callback.get_startup_time(start_time_sec)
-      })
-    else:
-      # Placeholder so the metric is always present in the report.
-      metrics.append({
-          'name': 'exp_per_second',
-          'value': 0.0,
-      })
-
-    # Gate on the key that is actually read. Testing for 'eval_metrics' while
-    # reading stats['AP'] either skips the metric or raises KeyError.
-    if 'AP' in stats:
-      metrics.append({
-          'name': 'AP',
-          'value': stats['AP'],
-          'min_value': min_ap,
-          'max_value': max_ap,
-      })
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(
-        iters=stats['total_steps'],
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_str})
-
-
-class RetinanetBenchmarkBase(DetectionBenchmarkBase):
-  """Holds the data paths and the run helper shared by RetinaNet tests."""
-
-  def __init__(self, **kwargs):
-    """Records the data and checkpoint locations, then initializes the base.
-
-    Args:
-      **kwargs: Named arguments passed unchanged to the parent constructor.
-    """
-    self.train_data_path = COCO_TRAIN_DATA
-    self.eval_data_path = COCO_EVAL_DATA
-    self.eval_json_path = COCO_EVAL_JSON
-    self.resnet_checkpoint_path = RESNET_CHECKPOINT_PATH
-    super(RetinanetBenchmarkBase, self).__init__(**kwargs)
-
-  def _run_detection_main(self):
-    """Runs the detection job, attaching the timer callback when one is set."""
-    if not self.timer_callback:
-      return detection.run()
-
-    FLAGS.log_steps = 0  # prevent detection.run from adding the same callback
-    return detection.run(callbacks=[self.timer_callback])
-
-
-class RetinanetAccuracy(RetinanetBenchmarkBase):
-  """Accuracy test for RetinaNet model.
-
-  Tests RetinaNet detection task model accuracy. The naming
-  convention of below test cases follow
-  `benchmark_(number of gpus)_gpu_(dataset type)` format.
-  """
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                params,
-                                min_ap=0.325,
-                                max_ap=0.35,
-                                do_eval=True,
-                                warmup=1):
-    """Trains, optionally evaluates, and reports a RetinaNet run.
-
-    Args:
-      params: Nested dict serialized into FLAGS.params_override. Its
-        params['train'] entry must provide 'batch_size' and 'total_steps'.
-      min_ap: Lower AP bound forwarded to `_report_benchmark`.
-      max_ap: Upper AP bound forwarded to `_report_benchmark`.
-      do_eval: Whether to run an 'eval' pass after training and merge its
-        result into the training summary.
-      warmup: Forwarded to `_report_benchmark`.
-    """
-    FLAGS.params_override = json.dumps(params)
-    # Need timer callback to measure performance
-    self.timer_callback = keras_utils.TimeHistory(
-        batch_size=params['train']['batch_size'], log_steps=FLAGS.log_steps)
-
-    # Only the training pass counts towards the reported wall time.
-    start_time_sec = time.time()
-    FLAGS.mode = 'train'
-    summary, _ = self._run_detection_main()
-    wall_time_sec = time.time() - start_time_sec
-
-    if do_eval:
-      FLAGS.mode = 'eval'
-      summary.update(self._run_detection_main())
-
-    summary['total_steps'] = params['train']['total_steps']
-    self._report_benchmark(
-        summary, start_time_sec, wall_time_sec, min_ap, max_ap, warmup)
-
-  def _setup(self):
-    """Resets the flags, then selects the 'retinanet' model."""
-    super(RetinanetAccuracy, self)._setup()
-    FLAGS.model = 'retinanet'
-
-  def _params(self):
-    """Returns the parameter overrides for the accuracy run."""
-    architecture_params = {'use_bfloat16': True}
-    train_params = {
-        'batch_size': 64,
-        'iterations_per_loop': 100,
-        'total_steps': 22500,
-        'train_file_pattern': self.train_data_path,
-        'checkpoint': {
-            'path': self.resnet_checkpoint_path,
-            'prefix': 'resnet50/'
-        },
-        # Speed up ResNet training when loading from the checkpoint.
-        'frozen_variable_prefix': base_config.RESNET_FROZEN_VAR_PREFIX,
-    }
-    eval_params = {
-        'batch_size': 8,
-        'eval_samples': 5000,
-        'val_json_file': self.eval_json_path,
-        'eval_file_pattern': self.eval_data_path,
-    }
-    # Key order is kept because this dict is serialized with json.dumps.
-    return {
-        'architecture': architecture_params,
-        'train': train_params,
-        'eval': eval_params,
-    }
-
-  @flagsaver.flagsaver
-  def benchmark_8_gpu_coco(self):
-    """Run RetinaNet model accuracy test with 8 GPUs."""
-    self._setup()
-    run_params = self._params()
-    FLAGS.num_gpus = 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_coco')
-    FLAGS.strategy_type = 'mirrored'
-    self._run_and_report_benchmark(run_params)
-
-
-class RetinanetBenchmarkReal(RetinanetAccuracy):
-  """Short benchmark performance tests for RetinaNet model.
-
-  Tests RetinaNet performance in different GPU configurations.
-  The naming convention of below test cases follow
-  `benchmark_(number of gpus)_gpu` format.
-
-  Each test starts from the parent's `_params()` defaults and overrides the
-  entries it needs before calling `_run_and_report_benchmark`.
-  """
-
-  def _setup(self):
-    """Runs the parent setup and turns off checkpoint saving."""
-    super(RetinanetBenchmarkReal, self)._setup()
-    # Use negative value to avoid saving checkpoints.
-    FLAGS.save_checkpoint_freq = -1
-
-  @flagsaver.flagsaver
-  def benchmark_8_gpu_coco(self):
-    """Run a short RetinaNet benchmark with 8 GPUs (bfloat16 disabled)."""
-    self._setup()
-    params = self._params()
-    params['architecture']['use_bfloat16'] = False
-    params['train']['total_steps'] = 1875  # One epoch.
-    # The iterations_per_loop must be one, otherwise the number of examples per
-    # second would be wrong. Currently only support calling callback per batch
-    # when each loop only runs on one batch, i.e. host loop for one step. The
-    # performance of this situation might be lower than the case of
-    # iterations_per_loop > 1.
-    # Related bug: b/135933080
-    params['train']['iterations_per_loop'] = 1
-    params['eval']['eval_samples'] = 8
-    FLAGS.num_gpus = 8
-    FLAGS.model_dir = self._get_model_dir('real_benchmark_8_gpu_coco')
-    FLAGS.strategy_type = 'mirrored'
-    self._run_and_report_benchmark(params)
-
-  @flagsaver.flagsaver
-  def benchmark_1_gpu_coco(self):
-    """Run a short RetinaNet benchmark with 1 GPU (200 steps, batch size 8)."""
-    self._setup()
-    params = self._params()
-    params['architecture']['use_bfloat16'] = False
-    params['train']['batch_size'] = 8
-    params['train']['total_steps'] = 200
-    params['train']['iterations_per_loop'] = 1
-    params['eval']['eval_samples'] = 8
-    FLAGS.num_gpus = 1
-    FLAGS.model_dir = self._get_model_dir('real_benchmark_1_gpu_coco')
-    FLAGS.strategy_type = 'one_device'
-    self._run_and_report_benchmark(params)
-
-  @flagsaver.flagsaver
-  def benchmark_xla_1_gpu_coco(self):
-    """Run a short RetinaNet benchmark with 1 GPU and XLA enabled."""
-    self._setup()
-    params = self._params()
-    params['architecture']['use_bfloat16'] = False
-    params['train']['batch_size'] = 8
-    params['train']['total_steps'] = 200
-    params['train']['iterations_per_loop'] = 1
-    params['eval']['eval_samples'] = 8
-    FLAGS.num_gpus = 1
-    FLAGS.model_dir = self._get_model_dir('real_benchmark_xla_1_gpu_coco')
-    FLAGS.strategy_type = 'one_device'
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(params)
-
-  @flagsaver.flagsaver
-  def benchmark_2x2_tpu_coco(self):
-    """Run a training-only RetinaNet benchmark on a 2x2 TPU (eval skipped)."""
-    self._setup()
-    params = self._params()
-    params['train']['batch_size'] = 64
-    params['train']['total_steps'] = 1875  # One epoch.
-    params['train']['iterations_per_loop'] = 500
-    FLAGS.model_dir = self._get_model_dir('real_benchmark_2x2_tpu_coco')
-    FLAGS.strategy_type = 'tpu'
-    self._run_and_report_benchmark(params, do_eval=False, warmup=0)
-
-  @flagsaver.flagsaver
-  def benchmark_2x2_tpu_spinenet_coco(self):
-    """Run a training-only SpineNet-backbone RetinaNet benchmark on a 2x2 TPU."""
-    self._setup()
-    params = self._params()
-    params['architecture']['backbone'] = 'spinenet'
-    params['architecture']['multilevel_features'] = 'identity'
-    params['architecture']['use_bfloat16'] = False
-    params['train']['batch_size'] = 64
-    params['train']['total_steps'] = 1875  # One epoch.
-    params['train']['iterations_per_loop'] = 500
-    # Clear the ResNet checkpoint path inherited from the default params.
-    params['train']['checkpoint']['path'] = ''
-    FLAGS.model_dir = self._get_model_dir(
-        'real_benchmark_2x2_tpu_spinenet_coco')
-    FLAGS.strategy_type = 'tpu'
-    self._run_and_report_benchmark(params, do_eval=False, warmup=0)
-
-
-# Script entry point: hand control to the TensorFlow test runner.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/shakespeare_benchmark.py
+++ b/official/benchmark/shakespeare_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Shakespeare (LSTM) benchmark and accuracy tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-
-from absl import flags
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.benchmark.models.shakespeare import shakespeare_main
-from official.utils.flags import core as flags_core
-from official.utils.misc import keras_utils
-from official.benchmark import benchmark_wrappers
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-
-# Training text location, joined onto a benchmark's root data directory.
-SHAKESPEARE_TRAIN_DATA = 'shakespeare/shakespeare.txt'
-# Default root data directory; os.getenv returns None when TMPDIR is unset.
-TMP_DIR = os.getenv('TMPDIR')
-# absl flags object that the benchmarks below read and mutate.
-FLAGS = flags.FLAGS
-
-
-class ShakespeareBenchmarkBase(PerfZeroBenchmark):
-  """Base class for Shakespeare (LSTM) benchmark and accuracy tests."""
-
-  def __init__(self, output_dir=None, default_flags=None, root_data_dir=None):
-    """Registers the Shakespeare flag definitions with the PerfZero base.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      default_flags: flag defaults forwarded to the base class.
-      root_data_dir: accepted so subclasses can pass it through; unused here.
-    """
-    super(ShakespeareBenchmarkBase, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        flag_methods=[shakespeare_main.define_flags])
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                top_1_train_min=0.91,
-                                top_1_train_max=0.94,
-                                warmup=1,
-                                log_steps=100):
-    """Report benchmark results by writing to local protobuf file.
-
-    Average epoch time is calculated by skipping the first epoch. This average
-    ignores time spent between epoch and is recorded by begin and end epoch. To
-    skip accuracy check set `top_1_train_min=None`.
-
-    Args:
-      top_1_train_min: lowest passing value.
-      top_1_train_max: highest passing value.
-      warmup: number of entries in `timestamp_log` to ignore.
-      log_steps: How often the log was created for `timestamp_log`.
-    """
-    total_batch_size = FLAGS.batch_size
-    metrics = []
-    start_time_sec = time.time()
-    stats = shakespeare_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    # Compare against None so the documented "None skips the check" contract
-    # also holds for a falsy bound such as 0.0.
-    if top_1_train_min is not None:
-      metrics.append({'name': 'accuracy_top_1_train',
-                      'value': stats['history']['RecallAt1'][-1],
-                      'min_value': top_1_train_min,
-                      'max_value': top_1_train_max})
-
-    # Look for the time history callback which was used during keras.fit
-    for callback in stats['callbacks']:
-      # Only TimeHistory is known to carry the timing logs read below; reading
-      # them from any other callback type would raise AttributeError.
-      if not isinstance(callback, keras_utils.TimeHistory):
-        continue
-
-      epoch_timings = callback.epoch_runtime_log
-      if len(epoch_timings) > 1:
-        # The first epoch is excluded from the average.
-        later_epochs = epoch_timings[1:]
-        metrics.append({'name': 'avg_epoch_time',
-                        'value': sum(later_epochs) / len(later_epochs)})
-
-      # First entry in timestamp_log is the start of step 1. The rest of the
-      # entries are the end of each step recorded.
-      time_log = callback.timestamp_log
-      if len(time_log) <= warmup:
-        # Too few entries to index past the warmup; no throughput to report.
-        continue
-      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      num_examples = (
-          total_batch_size * log_steps * (len(time_log) - warmup - 1))
-      if elapsed > 0:
-        examples_per_sec = num_examples / elapsed
-        metrics.append({'name': 'exp_per_second',
-                        'value': examples_per_sec})
-
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(iters=-1, wall_time=wall_time_sec,
-                          metrics=metrics,
-                          extras={'flags': flags_str})
-
-
-class ShakespeareAccuracy(ShakespeareBenchmarkBase):
-  """Shakespeare accuracy tests.
-
-  This is not an ideal test. The best we can use for the accuracy check is to
-  validate top_1 of the training set. At batch size 64 the top_1 training
-  stabilizes to ~0.92 around 40-45 epochs.
-  """
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Shakespeare accuracy tests.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset. It is joined
-                with the training-data path below, so it must be a real path;
-                the `None` default makes `os.path.join` raise TypeError.
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor. They are
-                accepted and ignored (not forwarded to the base class).
-    """
-    self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
-    super(ShakespeareAccuracy, self).__init__(
-        output_dir=output_dir, root_data_dir=root_data_dir)
-
-  def benchmark_cpu(self):
-    """Benchmark cpu."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_ds_run_eagerly(self):
-    """Benchmark cpu without distribution strategies and run eagerly."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu(self):
-    """Benchmark 1 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_ds(self):
-    """Benchmark 1 gpu without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_ds_run_eagerly(self):
-    """Benchmark 1 gpu without distribution strategies and run eagerly."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu(self):
-    """Benchmark 1 gpu w/xla."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu(self):
-    """Benchmark 8 gpu.
-
-    This test is for accuracy, not scaling. The batch size is not scaled to
-    the number of gpus.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    self._run_and_report_benchmark()
-
-
-class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
-  """Short Shakespeare (LSTM) performance benchmarks.
-
-  Every benchmark runs through the `_run_and_report_benchmark` override at the
-  bottom of the class, which passes `top_1_train_min=None` so the accuracy
-  check is skipped.
-  """
-
-  def __init__(self, output_dir=None, root_data_dir=TMP_DIR, **kwargs):
-    """Benchmark tests w/Keras.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset. Defaults to
-                the module-level TMP_DIR, which is None when TMPDIR is unset;
-                `os.path.join` raises TypeError in that case.
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-    self.train_data = os.path.join(root_data_dir, SHAKESPEARE_TRAIN_DATA)
-
-    def_flags = {}
-    def_flags['training_data'] = self.train_data
-    def_flags['model_dir'] = ''
-    def_flags['train_epochs'] = 4
-    def_flags['log_steps'] = 50
-
-    super(ShakespeareKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir,
-        root_data_dir=root_data_dir,
-        default_flags=def_flags)
-
-  def benchmark_cpu(self):
-    """Benchmark cpu."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.batch_size = 64
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_ds_run_eagerly(self):
-    """Benchmark cpu without distribution strategy and run eagerly."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.batch_size = 64
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.run_eagerly = True
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_ds(self):
-    """Benchmark cpu without distribution strategy."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.batch_size = 64
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_ds_force_v2(self):
-    """Benchmark cpu no ds, and force v2."""
-    # NOTE(review): sets exactly the same flags as benchmark_cpu_no_ds; no
-    # v2-specific flag is set here. Confirm whether one is still needed.
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.batch_size = 64
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu(self):
-    """Benchmark 1 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 64
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_cudnn(self):
-    """Benchmark 1 gpu with CuDNN disabled."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 64
-    FLAGS.cudnn = False
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_ds(self):
-    """Benchmark 1 gpu without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 64
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_ds_run_eagerly(self):
-    """Benchmark 1 gpu without distribution strategies and run eagerly."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 64
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu(self):
-    """Benchmark 1 gpu w/xla."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 64
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_no_cudnn(self):
-    """Benchmark 1 gpu w/xla and CuDNN disabled."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 64
-    FLAGS.cudnn = False
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu(self):
-    """Benchmark 8 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = 64 * 8
-    FLAGS.log_steps = 10
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_no_cudnn(self):
-    """Benchmark 8 gpu with CuDNN disabled."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = 64 * 8
-    FLAGS.log_steps = 10
-    FLAGS.cudnn = False
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu(self):
-    """Benchmark 8 gpu w/xla."""
-    self._setup()
-    # Was 1, which contradicted the method name, the 64 * 8 batch size and the
-    # other 8-gpu benchmarks in this class.
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = 64 * 8
-    FLAGS.log_steps = 10
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_no_cudnn(self):
-    """Benchmark 8 gpu w/xla and CuDNN disabled."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = 64 * 8
-    FLAGS.log_steps = 10
-    FLAGS.cudnn = False
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-
-  def _run_and_report_benchmark(self):
-    """Run and report benchmark with the accuracy check disabled.
-
-    Forwards the current `FLAGS.log_steps` so the throughput computation in
-    the base class uses the same logging interval as the run.
-    """
-    super(ShakespeareKerasBenchmarkReal, self)._run_and_report_benchmark(
-        top_1_train_min=None, log_steps=FLAGS.log_steps)
-
-
-# Script entry point: hand control to the TensorFlow test runner.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/tfhub_memory_usage_benchmark.py
+++ b/official/benchmark/tfhub_memory_usage_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a memory usage benchmark for a Tensorflow Hub model.
-
-Loads a SavedModel and records memory usage.
-"""
-import functools
-import time
-
-from absl import flags
-import tensorflow as tf
-import tensorflow_hub as hub
-
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-
-FLAGS = flags.FLAGS
-
-
-class TfHubMemoryUsageBenchmark(PerfZeroBenchmark):
-  """Loads TF Hub SavedModels under the PerfZero benchmark harness.
-
-  The code here reports only the wall time of the load with an empty metrics
-  list; memory figures are presumably collected by the harness - TODO confirm.
-  """
-
-  def __init__(self,
-               hub_model_handle_list=None,
-               output_dir=None,
-               default_flags=None,
-               root_data_dir=None,
-               **kwargs):
-    """Sets up the base class and adds one benchmark method per model handle.
-
-    Args:
-      hub_model_handle_list: optional ';'-separated string of TF Hub handles.
-      output_dir: forwarded to the base class.
-      default_flags: forwarded to the base class.
-      root_data_dir: accepted but not used by this class.
-      **kwargs: forwarded to the base class.
-    """
-    super(TfHubMemoryUsageBenchmark, self).__init__(
-        output_dir=output_dir, default_flags=default_flags, **kwargs)
-    if not hub_model_handle_list:
-      return
-
-    for handle in hub_model_handle_list.split(';'):
-      # Turn a handle such as https://tfhub.dev/google/nnlm-en-dim128/1 into a
-      # valid python method suffix such as google_nnlm_en_dim128_1.
-      suffix = handle.replace('https://tfhub.dev', '')
-      for separator in ('/', '-'):
-        suffix = suffix.replace(separator, '_')
-      suffix = suffix.strip('_')
-      bound_benchmark = functools.partial(self.benchmark_memory_usage, handle)
-      setattr(self, 'benchmark_' + suffix, bound_benchmark)
-
-  def benchmark_memory_usage(
-      self, hub_model_handle='https://tfhub.dev/google/nnlm-en-dim128/1'):
-    """Loads the given model and reports how long the load took."""
-    began_sec = time.time()
-    self.load_model(hub_model_handle)
-    elapsed_sec = time.time() - began_sec
-
-    self.report_benchmark(iters=-1, wall_time=elapsed_sec, metrics=[])
-
-  def load_model(self, hub_model_handle):
-    """Loads a TF Hub module."""
-    hub.load(hub_model_handle)
-
-
-# Script entry point: hand control to the TensorFlow test runner.
-if __name__ == '__main__':
-  tf.test.main()