Commit c57e975a authored by saberkun's avatar saberkun

Merge pull request #10338 from srihari-humbarwadi:readme

PiperOrigin-RevId: 413033276
parents 7fb4f3cd acf4156e
@@ -12,7 +12,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-CV package definition."""
# pylint: disable=wildcard-import
from official.vision.keras_cv import losses
from official.vision.keras_cv import ops
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common util functions and classes used by both keras cifar and imagenet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
import tensorflow as tf
import tensorflow_model_optimization as tfmot
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
BASE_LEARNING_RATE = 0.1 # This matches Jing's version.
TRAIN_TOP_1 = 'training_accuracy_top_1'
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
class PiecewiseConstantDecayWithWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Piecewise constant decay with warmup schedule."""
def __init__(self,
batch_size,
epoch_size,
warmup_epochs,
boundaries,
multipliers,
compute_lr_on_cpu=True,
name=None):
super(PiecewiseConstantDecayWithWarmup, self).__init__()
if len(boundaries) != len(multipliers) - 1:
raise ValueError('The length of boundaries must be 1 less than the '
'length of multipliers')
base_lr_batch_size = 256
steps_per_epoch = epoch_size // batch_size
self.rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
self.step_boundaries = [float(steps_per_epoch) * x for x in boundaries]
self.lr_values = [self.rescaled_lr * m for m in multipliers]
self.warmup_steps = warmup_epochs * steps_per_epoch
self.compute_lr_on_cpu = compute_lr_on_cpu
self.name = name
self.learning_rate_ops_cache = {}
def __call__(self, step):
if tf.executing_eagerly():
return self._get_learning_rate(step)
    # In an eager function or graph, the current implementation of the
    # optimizer repeatedly calls, and thus creates ops for, the learning rate
    # schedule. To avoid this, we cache the ops when not executing eagerly.
graph = tf.compat.v1.get_default_graph()
if graph not in self.learning_rate_ops_cache:
if self.compute_lr_on_cpu:
with tf.device('/device:CPU:0'):
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
else:
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
return self.learning_rate_ops_cache[graph]
def _get_learning_rate(self, step):
"""Compute learning rate at given step."""
with tf.name_scope('PiecewiseConstantDecayWithWarmup'):
def warmup_lr(step):
return self.rescaled_lr * (
tf.cast(step, tf.float32) / tf.cast(self.warmup_steps, tf.float32))
def piecewise_lr(step):
return tf.compat.v1.train.piecewise_constant(step, self.step_boundaries,
self.lr_values)
return tf.cond(step < self.warmup_steps, lambda: warmup_lr(step),
lambda: piecewise_lr(step))
def get_config(self):
return {
'rescaled_lr': self.rescaled_lr,
'step_boundaries': self.step_boundaries,
'lr_values': self.lr_values,
'warmup_steps': self.warmup_steps,
'compute_lr_on_cpu': self.compute_lr_on_cpu,
'name': self.name
}
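# Editor's illustrative sketch (an addition, not part of the original file):
# how the LR_SCHEDULE tuples above feed this schedule, mirroring the wiring in
# resnet_runnable.py. The first tuple supplies the warmup epochs; the
# remaining tuples supply the boundaries; all tuples supply the multipliers.
#
#   schedule = PiecewiseConstantDecayWithWarmup(
#       batch_size=256,
#       epoch_size=1281167,  # ImageNet training-set size.
#       warmup_epochs=LR_SCHEDULE[0][1],
#       boundaries=[p[1] for p in LR_SCHEDULE[1:]],
#       multipliers=[p[0] for p in LR_SCHEDULE])
#   schedule(tf.constant(0))  # 0.0: warmup ramps linearly up to 0.1.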
def get_optimizer(learning_rate=0.1):
"""Returns optimizer to use."""
  # The learning_rate is overwritten at the beginning of each step by a
  # callback.
return tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
def get_callbacks(pruning_method=None,
enable_checkpoint_and_export=False,
model_dir=None):
"""Returns common callbacks."""
time_callback = keras_utils.TimeHistory(
FLAGS.batch_size,
FLAGS.log_steps,
logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
callbacks = [time_callback]
if FLAGS.enable_tensorboard:
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=FLAGS.model_dir, profile_batch=FLAGS.profile_steps)
callbacks.append(tensorboard_callback)
is_pruning_enabled = pruning_method is not None
if is_pruning_enabled:
callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
if model_dir is not None:
callbacks.append(
tfmot.sparsity.keras.PruningSummaries(
log_dir=model_dir, profile_batch=0))
if enable_checkpoint_and_export:
if model_dir is not None:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(
tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True))
return callbacks
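# Editor's usage sketch (hypothetical model_dir; assumes the common flags have
# already been parsed, since get_callbacks reads FLAGS.batch_size etc.):
#   callbacks = get_callbacks(
#       pruning_method=None,
#       enable_checkpoint_and_export=True,
#       model_dir='/tmp/resnet')
#   model.fit(train_ds, epochs=90, callbacks=callbacks)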
def build_stats(history, eval_output, callbacks):
"""Normalizes and returns dictionary of stats.
Args:
history: Results of the training step. Supports both categorical_accuracy
and sparse_categorical_accuracy.
eval_output: Output of the eval step. Assumes first value is eval_loss and
second value is accuracy_top_1.
callbacks: a list of callbacks which might include a time history callback
used during keras.fit.
Returns:
Dictionary of normalized results.
"""
stats = {}
if eval_output:
stats['accuracy_top_1'] = float(eval_output[1])
stats['eval_loss'] = float(eval_output[0])
if history and history.history:
train_hist = history.history
# Gets final loss from training.
stats['loss'] = float(train_hist['loss'][-1])
# Gets top_1 training accuracy.
if 'categorical_accuracy' in train_hist:
stats[TRAIN_TOP_1] = float(train_hist['categorical_accuracy'][-1])
elif 'sparse_categorical_accuracy' in train_hist:
stats[TRAIN_TOP_1] = float(train_hist['sparse_categorical_accuracy'][-1])
elif 'accuracy' in train_hist:
stats[TRAIN_TOP_1] = float(train_hist['accuracy'][-1])
if not callbacks:
return stats
# Look for the time history callback which was used during keras.fit
for callback in callbacks:
if isinstance(callback, keras_utils.TimeHistory):
timestamp_log = callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = callback.train_finish_time
if callback.epoch_runtime_log:
stats['avg_exp_per_second'] = callback.average_examples_per_second
return stats
def define_keras_flags(model=False,
optimizer=False,
pretrained_filepath=False):
"""Define flags for Keras models."""
flags_core.define_base(
clean=True,
num_gpu=True,
run_eagerly=True,
train_epochs=True,
epochs_between_evals=True,
distribution_strategy=True)
flags_core.define_performance(
num_parallel_calls=False,
synthetic_data=True,
dtype=True,
all_reduce_alg=True,
num_packs=True,
tf_gpu_thread_mode=True,
datasets_num_private_threads=True,
loss_scale=True,
fp16_implementation=True,
tf_data_experimental_slack=True,
enable_xla=True,
training_dataset_cache=True)
flags_core.define_image()
flags_core.define_benchmark()
flags_core.define_distribution()
flags.adopt_module_key_flags(flags_core)
flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
# TODO(b/135607288): Remove this flag once we understand the root cause of
# slowdown when setting the learning phase in Keras backend.
flags.DEFINE_boolean(
name='set_learning_phase_to_train',
default=True,
help='If skip eval, also set Keras learning phase to 1 (training).')
flags.DEFINE_boolean(
name='explicit_gpu_placement',
default=False,
help='If not using distribution strategy, explicitly set device scope '
'for the Keras training loop.')
flags.DEFINE_boolean(
name='use_trivial_model',
default=False,
help='Whether to use a trivial Keras model.')
flags.DEFINE_boolean(
name='report_accuracy_metrics',
default=True,
help='Report metrics during training and evaluation.')
flags.DEFINE_boolean(
name='use_tensor_lr',
default=True,
help='Use learning rate tensor instead of a callback.')
flags.DEFINE_boolean(
name='enable_tensorboard',
default=False,
help='Whether to enable TensorBoard callback.')
flags.DEFINE_string(
name='profile_steps',
default=None,
help='Save profiling data to model dir at given range of global steps. The '
'value must be a comma separated pair of positive integers, specifying '
'the first and last step to profile. For example, "--profile_steps=2,4" '
'triggers the profiler to process 3 steps, starting from the 2nd step. '
'Note that profiler has a non-trivial performance overhead, and the '
'output file can be gigantic if profiling many steps.')
flags.DEFINE_integer(
name='train_steps',
default=None,
help='The number of steps to run for training. If it is larger than '
'# batches per epoch, then use # batches per epoch. This flag will be '
'ignored if train_epochs is set to be larger than 1. ')
flags.DEFINE_boolean(
name='batchnorm_spatial_persistent',
default=True,
    help='Enable the spatial persistent mode for the cuDNN batch norm kernel.')
flags.DEFINE_boolean(
name='enable_get_next_as_optional',
default=False,
help='Enable get_next_as_optional behavior in DistributedIterator.')
flags.DEFINE_boolean(
name='enable_checkpoint_and_export',
default=False,
help='Whether to enable a checkpoint callback and export the savedmodel.')
flags.DEFINE_string(name='tpu', default='', help='TPU address to connect to.')
flags.DEFINE_integer(
name='steps_per_loop',
default=None,
    help='Number of steps per training loop. Only the training step happens '
    'inside the loop; callbacks will not be called inside it. Will be capped '
    'at steps per epoch.')
flags.DEFINE_boolean(
name='use_tf_while_loop',
default=True,
help='Whether to build a tf.while_loop inside the training loop on the '
'host. Setting it to True is critical to have peak performance on '
'TPU.')
if model:
flags.DEFINE_string('model', 'resnet50_v1.5',
'Name of model preset. (mobilenet, resnet50_v1.5)')
if optimizer:
flags.DEFINE_string(
'optimizer', 'resnet50_default', 'Name of optimizer preset. '
'(mobilenet_default, resnet50_default)')
# TODO(kimjaehong): Replace as general hyper-params not only for mobilenet.
flags.DEFINE_float(
'initial_learning_rate_per_sample', 0.00007,
'Initial value of learning rate per sample for '
'mobilenet_default.')
flags.DEFINE_float('lr_decay_factor', 0.94,
'Learning rate decay factor for mobilenet_default.')
flags.DEFINE_float('num_epochs_per_decay', 2.5,
'Number of epochs per decay for mobilenet_default.')
if pretrained_filepath:
flags.DEFINE_string('pretrained_filepath', '', 'Pretrained file path.')
def get_synth_data(height, width, num_channels, num_classes, dtype):
"""Creates a set of synthetic random data.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
    num_classes: Number of classes that should be represented in the fake
      labels tensor.
dtype: Data type for features/images.
Returns:
A tuple of tensors representing the inputs and labels.
"""
# Synthetic input should be within [0, 255].
inputs = tf.random.truncated_normal([height, width, num_channels],
dtype=dtype,
mean=127,
stddev=60,
name='synthetic_inputs')
  # Note: tf.random.uniform's maxval is exclusive, so use num_classes to
  # cover all class ids in [0, num_classes).
  labels = tf.random.uniform([1],
                             minval=0,
                             maxval=num_classes,
                             dtype=tf.int32,
                             name='synthetic_labels')
return inputs, labels
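# Editor's note: a minimal usage sketch. get_synth_data returns a single
# unbatched (image, label) pair; get_synth_input_fn below repeats and batches
# it into a throughput-benchmarking dataset.
#   image, label = get_synth_data(
#       height=224, width=224, num_channels=3, num_classes=1000,
#       dtype=tf.float32)
#   # image.shape == [224, 224, 3]; label.shape == [1].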
def define_pruning_flags():
"""Define flags for pruning methods."""
  flags.DEFINE_string(
      'pruning_method', None, 'Pruning method. '
      'None (no pruning) or polynomial_decay.')
flags.DEFINE_float('pruning_initial_sparsity', 0.0,
'Initial sparsity for pruning.')
flags.DEFINE_float('pruning_final_sparsity', 0.5,
'Final sparsity for pruning.')
flags.DEFINE_integer('pruning_begin_step', 0, 'Begin step for pruning.')
flags.DEFINE_integer('pruning_end_step', 100000, 'End step for pruning.')
flags.DEFINE_integer('pruning_frequency', 100, 'Frequency for pruning.')
def define_clustering_flags():
"""Define flags for clustering methods."""
flags.DEFINE_string('clustering_method', None,
'None (no clustering) or selective_clustering '
'(cluster last three Conv2D layers of the model).')
def get_synth_input_fn(height,
width,
num_channels,
num_classes,
dtype=tf.float32,
drop_remainder=True):
"""Returns an input function that returns a dataset with random data.
  This input_fn returns a dataset that iterates over a set of random data and
  bypasses all preprocessing, e.g. jpeg decode and copy. The host-to-device
  copy is still included. This is used to find the upper throughput bound when
  tuning the full input pipeline.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
    num_classes: Number of classes that should be represented in the fake
      labels tensor.
dtype: Data type for features/images.
drop_remainder: A boolean indicates whether to drop the remainder of the
batches. If True, the batch dimension will be static.
Returns:
An input_fn that can be used in place of a real one to return a dataset
that can be used for iteration.
"""
# pylint: disable=unused-argument
def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
"""Returns dataset filled with random data."""
inputs, labels = get_synth_data(
height=height,
width=width,
num_channels=num_channels,
num_classes=num_classes,
dtype=dtype)
# Cast to float32 for Keras model.
labels = tf.cast(labels, dtype=tf.float32)
data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
# `drop_remainder` will make dataset produce outputs with known shapes.
data = data.batch(batch_size, drop_remainder=drop_remainder)
data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return data
return input_fn
def set_cudnn_batchnorm_mode():
"""Set CuDNN batchnorm mode for better performance.
Note: Spatial Persistent mode may lead to accuracy losses for certain
models.
"""
if FLAGS.batchnorm_spatial_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
else:
os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Provides utilities to preprocess images.
Training images are sampled using the provided bounding boxes, and subsequently
cropped to the sampled bounding box. Images are additionally flipped randomly,
then resized to the target output size (without aspect-ratio preservation).
Images used during evaluation are resized (with aspect-ratio preservation) and
centrally cropped.
All images undergo mean color subtraction.
Note that these steps are colloquially referred to as "ResNet preprocessing,"
and they differ from "VGG preprocessing," which does not use bounding boxes
and instead does an aspect-preserving resize followed by random crop during
training. (These both differ from "Inception preprocessing," which introduces
color distortion steps.)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
DEFAULT_IMAGE_SIZE = 224
NUM_CHANNELS = 3
NUM_CLASSES = 1001
NUM_IMAGES = {
'train': 1281167,
'validation': 50000,
}
_NUM_TRAIN_FILES = 1024
_SHUFFLE_BUFFER = 10000
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
# The lower bound for the smallest side of the image for aspect-preserving
# resizing. For example, if an image is 500 x 1000, it will be resized to
# _RESIZE_MIN x (_RESIZE_MIN * 2).
_RESIZE_MIN = 256
def process_record_dataset(dataset,
is_training,
batch_size,
shuffle_buffer,
parse_record_fn,
dtype=tf.float32,
datasets_num_private_threads=None,
drop_remainder=False,
tf_data_experimental_slack=False):
"""Given a Dataset with raw records, return an iterator over the records.
Args:
dataset: A Dataset representing raw records
is_training: A boolean denoting whether the input is for training.
batch_size: The number of samples per batch.
shuffle_buffer: The buffer size to use when shuffling records. A larger
value results in better randomness, but smaller values reduce startup time
and use less memory.
parse_record_fn: A function that takes a raw record and returns the
corresponding (image, label) pair.
dtype: Data type to use for images/features.
datasets_num_private_threads: Number of threads for a private threadpool
created for all datasets computation.
drop_remainder: A boolean indicates whether to drop the remainder of the
batches. If True, the batch dimension will be static.
tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack`
option.
Returns:
Dataset of (image, label) pairs ready for iteration.
"""
# Defines a specific size thread pool for tf.data operations.
if datasets_num_private_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = (
datasets_num_private_threads)
dataset = dataset.with_options(options)
logging.info('datasets_num_private_threads: %s',
datasets_num_private_threads)
if is_training:
# Shuffles records before repeating to respect epoch boundaries.
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
# Repeats the dataset for the number of epochs to train.
dataset = dataset.repeat()
# Parses the raw records into images and labels.
dataset = dataset.map(
lambda value: parse_record_fn(value, is_training, dtype),
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
# Operations between the final prefetch and the get_next call to the iterator
# will happen synchronously during run time. We prefetch here again to
# background all of the above processing work and keep it out of the
# critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
# allows DistributionStrategies to adjust how many batches to fetch based
# on how many devices are present.
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
options = tf.data.Options()
options.experimental_slack = tf_data_experimental_slack
dataset = dataset.with_options(options)
return dataset
def get_filenames(is_training, data_dir):
"""Return filenames for dataset."""
if is_training:
return [
os.path.join(data_dir, 'train-%05d-of-01024' % i)
for i in range(_NUM_TRAIN_FILES)
]
else:
return [
os.path.join(data_dir, 'validation-%05d-of-00128' % i)
for i in range(128)
]
def parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
The output of the build_image_data.py image preprocessing script is a dataset
containing serialized Example protocol buffers. Each Example proto contains
the following fields (values are included as examples):
image/height: 462
image/width: 581
image/colorspace: 'RGB'
image/channels: 3
image/class/label: 615
image/class/synset: 'n03623198'
image/class/text: 'knee pad'
image/object/bbox/xmin: 0.1
image/object/bbox/xmax: 0.9
image/object/bbox/ymin: 0.2
image/object/bbox/ymax: 0.6
image/object/bbox/label: 615
image/format: 'JPEG'
image/filename: 'ILSVRC2012_val_00041207.JPEG'
image/encoded: <JPEG encoded string>
Args:
example_serialized: scalar Tensor tf.string containing a serialized Example
protocol buffer.
Returns:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
"""
# Dense features in Example proto.
feature_map = {
'image/encoded':
tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
'image/class/label':
tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1),
'image/class/text':
tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
}
sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
# Sparse features in Example proto.
feature_map.update({
k: sparse_float32 for k in [
'image/object/bbox/xmin', 'image/object/bbox/ymin',
'image/object/bbox/xmax', 'image/object/bbox/ymax'
]
})
features = tf.io.parse_single_example(
serialized=example_serialized, features=feature_map)
label = tf.cast(features['image/class/label'], dtype=tf.int32)
xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
  # Note that the coordinates are ordered (y, x) to match the format expected
  # by the TF image ops that consume these boxes.
bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
# Force the variable number of bounding boxes into the shape
# [1, num_boxes, coords].
bbox = tf.expand_dims(bbox, 0)
bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
return features['image/encoded'], label, bbox
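# Editor's note on the bbox layout returned above: for two boxes the tensor
# has shape [1, 2, 4], i.e.
#   [[[ymin0, xmin0, ymax0, xmax0],
#     [ymin1, xmin1, ymax1, xmax1]]]
# which is the [1, num_boxes, coords] format that
# tf.image.sample_distorted_bounding_box expects.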
def parse_record(raw_record, is_training, dtype):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
Args:
raw_record: scalar Tensor tf.string containing a serialized Example protocol
buffer.
is_training: A boolean denoting whether the input is for training.
dtype: data type to use for images/features.
Returns:
Tuple with processed image tensor in a channel-last format and
one-hot-encoded label tensor.
"""
image_buffer, label, bbox = parse_example_proto(raw_record)
image = preprocess_image(
image_buffer=image_buffer,
bbox=bbox,
output_height=DEFAULT_IMAGE_SIZE,
output_width=DEFAULT_IMAGE_SIZE,
num_channels=NUM_CHANNELS,
is_training=is_training)
image = tf.cast(image, dtype)
# Subtract one so that labels are in [0, 1000), and cast to float32 for
# Keras model.
label = tf.cast(
tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1,
dtype=tf.float32)
return image, label
def get_parse_record_fn(use_keras_image_data_format=False):
"""Get a function for parsing the records, accounting for image format.
  This is useful for handling different types of Keras models. For instance,
the current resnet_model.resnet50 input format is always channel-last,
whereas the keras_applications mobilenet input format depends on
tf.keras.backend.image_data_format(). We should set
use_keras_image_data_format=False for the former and True for the latter.
Args:
use_keras_image_data_format: A boolean denoting whether data format is keras
backend image data format. If False, the image format is channel-last. If
True, the image format matches tf.keras.backend.image_data_format().
Returns:
Function to use for parsing the records.
"""
def parse_record_fn(raw_record, is_training, dtype):
image, label = parse_record(raw_record, is_training, dtype)
if use_keras_image_data_format:
if tf.keras.backend.image_data_format() == 'channels_first':
image = tf.transpose(image, perm=[2, 0, 1])
return image, label
return parse_record_fn
def input_fn(is_training,
data_dir,
batch_size,
dtype=tf.float32,
datasets_num_private_threads=None,
parse_record_fn=parse_record,
input_context=None,
drop_remainder=False,
tf_data_experimental_slack=False,
training_dataset_cache=False,
filenames=None):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
dtype: Data type to use for images/features
datasets_num_private_threads: Number of private threads for tf.data.
parse_record_fn: Function to use for parsing the records.
input_context: A `tf.distribute.InputContext` object passed in by
`tf.distribute.Strategy`.
drop_remainder: A boolean indicates whether to drop the remainder of the
batches. If True, the batch dimension will be static.
tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack`
option.
training_dataset_cache: Whether to cache the training dataset on workers.
Typically used to improve training performance when training data is in
remote storage and can fit into worker memory.
filenames: Optional field for providing the file names of the TFRecords.
Returns:
A dataset that can be used for iteration.
"""
if filenames is None:
filenames = get_filenames(is_training, data_dir)
dataset = tf.data.Dataset.from_tensor_slices(filenames)
if input_context:
logging.info(
'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
input_context.input_pipeline_id, input_context.num_input_pipelines)
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
if is_training:
# Shuffle the input files
dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)
# Convert to individual records.
# cycle_length = 10 means that up to 10 files will be read and deserialized in
# parallel. You may want to increase this number if you have a large number of
# CPU cores.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if is_training and training_dataset_cache:
# Improve training performance when training data is in remote storage and
# can fit into worker memory.
dataset = dataset.cache()
return process_record_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=_SHUFFLE_BUFFER,
parse_record_fn=parse_record_fn,
dtype=dtype,
datasets_num_private_threads=datasets_num_private_threads,
drop_remainder=drop_remainder,
tf_data_experimental_slack=tf_data_experimental_slack,
)
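# Editor's usage sketch (hypothetical data_dir): building the training
# pipeline with the format-aware parser defined above.
#   parse_fn = get_parse_record_fn(use_keras_image_data_format=True)
#   train_ds = input_fn(
#       is_training=True,
#       data_dir='/path/to/imagenet',
#       batch_size=256,
#       parse_record_fn=parse_fn)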
def _decode_crop_and_flip(image_buffer, bbox, num_channels):
"""Crops the given image to a random part of the image, and randomly flips.
We use the fused decode_and_crop op, which performs better than the two ops
used separately in series, but note that this requires that the image be
passed in as an un-decoded string Tensor.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as [ymin,
xmin, ymax, xmax].
num_channels: Integer depth of the image buffer for decoding.
Returns:
3-D tensor with cropped image.
"""
# A large fraction of image datasets contain a human-annotated bounding box
# delineating the region of the image containing the object of interest. We
# choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.image.extract_jpeg_shape(image_buffer),
bounding_boxes=bbox,
min_object_covered=0.1,
aspect_ratio_range=[0.75, 1.33],
area_range=[0.05, 1.0],
max_attempts=100,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, _ = sample_distorted_bounding_box
# Reassemble the bounding box in the format the crop op requires.
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
# Use the fused decode and crop op here, which is faster than each in series.
cropped = tf.image.decode_and_crop_jpeg(
image_buffer, crop_window, channels=num_channels)
# Flip to add a little more random distortion in.
cropped = tf.image.random_flip_left_right(cropped)
return cropped
def _central_crop(image, crop_height, crop_width):
"""Performs central crops of the given image list.
Args:
image: a 3-D image tensor
crop_height: the height of the image following the crop.
crop_width: the width of the image following the crop.
Returns:
3-D tensor with cropped image.
"""
shape = tf.shape(input=image)
height, width = shape[0], shape[1]
amount_to_be_cropped_h = (height - crop_height)
crop_top = amount_to_be_cropped_h // 2
amount_to_be_cropped_w = (width - crop_width)
crop_left = amount_to_be_cropped_w // 2
return tf.slice(image, [crop_top, crop_left, 0],
[crop_height, crop_width, -1])
def _mean_image_subtraction(image, means, num_channels):
"""Subtracts the given means from each image channel.
  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means, num_channels=3)
Note that the rank of `image` must be known.
Args:
image: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
num_channels: number of color channels in the image that will be distorted.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
# We have a 1-D tensor of means; convert to 3-D.
# Note(b/130245863): we explicitly call `broadcast` instead of simply
# expanding dimensions for better performance.
means = tf.broadcast_to(means, tf.shape(image))
return image - means
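# Editor's sketch: mean subtraction as applied in preprocess_image below.
#   image = tf.ones([224, 224, 3]) * 128.0
#   centered = _mean_image_subtraction(image, CHANNEL_MEANS, NUM_CHANNELS)
#   # centered[..., 0] == 128.0 - 123.68 == 4.32 for the red channel.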
def _smallest_size_at_least(height, width, resize_min):
"""Computes new shape with the smallest side equal to `smallest_side`.
Computes new shape with the smallest side equal to `smallest_side` while
preserving the original aspect ratio.
Args:
height: an int32 scalar tensor indicating the current height.
width: an int32 scalar tensor indicating the current width.
resize_min: A python integer or scalar `Tensor` indicating the size of the
smallest side after resize.
Returns:
new_height: an int32 scalar tensor indicating the new height.
new_width: an int32 scalar tensor indicating the new width.
"""
resize_min = tf.cast(resize_min, tf.float32)
# Convert to floats to make subsequent calculations go smoothly.
height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
smaller_dim = tf.minimum(height, width)
scale_ratio = resize_min / smaller_dim
# Convert back to ints to make heights and widths that TF ops will accept.
new_height = tf.cast(height * scale_ratio, tf.int32)
new_width = tf.cast(width * scale_ratio, tf.int32)
return new_height, new_width
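# Editor's worked example: a 500 x 1000 image with resize_min=256 gives
# scale_ratio = 256 / 500 = 0.512, so the new shape is
# (int(500 * 0.512), int(1000 * 0.512)) == (256, 512), matching the
# _RESIZE_MIN comment near the top of this file.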
def _aspect_preserving_resize(image, resize_min):
"""Resize images preserving the original aspect ratio.
Args:
image: A 3-D image `Tensor`.
resize_min: A python integer or scalar `Tensor` indicating the size of the
smallest side after resize.
Returns:
resized_image: A 3-D tensor containing the resized image.
"""
shape = tf.shape(input=image)
height, width = shape[0], shape[1]
new_height, new_width = _smallest_size_at_least(height, width, resize_min)
return _resize_image(image, new_height, new_width)
def _resize_image(image, height, width):
"""Simple wrapper around tf.resize_images.
This is primarily to make sure we use the same `ResizeMethod` and other
details each time.
Args:
image: A 3-D image `Tensor`.
height: The target height for the resized image.
width: The target width for the resized image.
Returns:
resized_image: A 3-D tensor containing the resized image. The first two
dimensions have the shape [height, width].
"""
return tf.compat.v1.image.resize(
image, [height, width],
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
def preprocess_image(image_buffer,
bbox,
output_height,
output_width,
num_channels,
is_training=False):
"""Preprocesses the given image.
Preprocessing includes decoding, cropping, and resizing for both training
and eval images. Training preprocessing, however, introduces some random
distortion of the image to improve accuracy.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as [ymin,
xmin, ymax, xmax].
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
num_channels: Integer depth of the image buffer for decoding.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
Returns:
A preprocessed image.
"""
if is_training:
# For training, we want to randomize some of the distortions.
image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
image = _resize_image(image, output_height, output_width)
else:
# For validation, we want to decode, resize, then just crop the middle.
image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
image = _aspect_preserving_resize(image, _RESIZE_MIN)
image = _central_crop(image, output_height, output_width)
image.set_shape([output_height, output_width, num_channels])
return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels)
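# Editor's end-to-end sketch of the eval path, assuming `serialized` holds one
# ImageNet Example proto:
#   image_buffer, label, bbox = parse_example_proto(serialized)
#   image = preprocess_image(
#       image_buffer, bbox, DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE,
#       NUM_CHANNELS, is_training=False)
#   # image: mean-subtracted float tensor of shape [224, 224, 3].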
@@ -12,46 +12,45 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common configuration settings."""
# pylint:disable=wildcard-import
import dataclasses
# Lint as: python3
"""Configuration definitions for ResNet losses, learning rates, and optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from official.core.config_definitions import *
import dataclasses
from official.legacy.image_classification.configs import base_configs
from official.modeling.hyperparams import base_config
# TODO(hongkuny): These configs are used in models that are going to deprecate.
# Once those models are removed, we should delete this file to avoid confusion.
# Users should not use this file anymore.
@dataclasses.dataclass
class TensorboardConfig(base_config.Config):
"""Configuration for Tensorboard.
Attributes:
track_lr: Whether or not to track the learning rate in Tensorboard. Defaults
to True.
write_model_weights: Whether or not to write the model weights as images in
Tensorboard. Defaults to False.
"""
track_lr: bool = True
write_model_weights: bool = False
@dataclasses.dataclass
class CallbacksConfig(base_config.Config):
"""Configuration for Callbacks.
Attributes:
enable_checkpoint_and_export: Whether or not to enable checkpoints as a
Callback. Defaults to True.
    enable_backup_and_restore: Whether or not to add the BackupAndRestore
      callback. Defaults to False.
enable_tensorboard: Whether or not to enable Tensorboard as a Callback.
Defaults to True.
enable_time_history: Whether or not to enable TimeHistory Callbacks.
Defaults to True.
"""
enable_checkpoint_and_export: bool = True
enable_backup_and_restore: bool = False
enable_tensorboard: bool = True
enable_time_history: bool = True
@dataclasses.dataclass
class ResNetModelConfig(base_configs.ModelConfig):
"""Configuration for the ResNet model."""
name: str = 'ResNet'
num_classes: int = 1000
model_params: base_config.Config = dataclasses.field(
# pylint: disable=g-long-lambda
default_factory=lambda: {
'num_classes': 1000,
'batch_size': None,
'use_l2_regularizer': True,
'rescale_inputs': False,
})
# pylint: enable=g-long-lambda
loss: base_configs.LossConfig = base_configs.LossConfig(
name='sparse_categorical_crossentropy')
optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig(
name='momentum',
decay=0.9,
epsilon=0.001,
momentum=0.9,
moving_average_decay=None)
learning_rate: base_configs.LearningRateConfig = (
base_configs.LearningRateConfig(
name='stepwise',
initial_lr=0.1,
examples_per_epoch=1281167,
boundaries=[30, 60, 80],
warmup_epochs=5,
scale_by_batch_size=1. / 256.,
multipliers=[0.1 / 256, 0.01 / 256, 0.001 / 256, 0.0001 / 256]))
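# Editor's sketch: like the nested configs above, these dataclasses accept
# keyword overrides at construction time, e.g.
#   config = ResNetModelConfig(num_classes=10)
#   config.learning_rate.boundaries  # [30, 60, 80]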
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
import math
import os
# Import libraries
from absl import app
from absl import flags
from absl import logging
import orbit
import tensorflow as tf
from official.common import distribute_utils
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.legacy.image_classification.resnet import resnet_runnable
from official.modeling import performance
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers
flags.DEFINE_boolean(name='use_tf_function', default=True,
help='Wrap the train and test step inside a '
'tf.function.')
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
help='Calculate L2_loss on concatenated weights, '
'instead of using Keras per-layer L2 loss.')
def build_stats(runnable, time_callback):
"""Normalizes and returns dictionary of stats.
Args:
    runnable: The `ResnetRunnable` object containing all the training and
      evaluation metrics.
time_callback: Time tracking callback instance.
Returns:
Dictionary of normalized results.
"""
stats = {}
if not runnable.flags_obj.skip_eval:
stats['eval_loss'] = runnable.test_loss.result().numpy()
stats['eval_acc'] = runnable.test_accuracy.result().numpy()
stats['train_loss'] = runnable.train_loss.result().numpy()
stats['train_acc'] = runnable.train_accuracy.result().numpy()
if time_callback:
timestamp_log = time_callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = time_callback.train_finish_time
if time_callback.epoch_runtime_log:
stats['avg_exp_per_second'] = time_callback.average_examples_per_second
return stats
def get_num_train_iterations(flags_obj):
"""Returns the number of training steps, train and test epochs."""
train_steps = (
imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
train_epochs = flags_obj.train_epochs
if flags_obj.train_steps:
train_steps = min(flags_obj.train_steps, train_steps)
train_epochs = 1
eval_steps = math.ceil(1.0 * imagenet_preprocessing.NUM_IMAGES['validation'] /
flags_obj.batch_size)
return train_steps, train_epochs, eval_steps
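# Editor's worked example: with batch_size=256 and no --train_steps override,
# train_steps = 1281167 // 256 = 5004 steps per epoch and
# eval_steps = ceil(50000 / 256) = 196.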
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
keras_utils.set_session_config()
performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
if tf.config.list_physical_devices('GPU'):
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu)
per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
flags_obj)
if flags_obj.steps_per_loop is None:
steps_per_loop = per_epoch_steps
elif flags_obj.steps_per_loop > per_epoch_steps:
steps_per_loop = per_epoch_steps
logging.warn('Setting steps_per_loop to %d to respect epoch boundary.',
steps_per_loop)
else:
steps_per_loop = flags_obj.steps_per_loop
logging.info(
'Training %d epochs, each epoch has %d steps, '
'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
train_epochs * per_epoch_steps, eval_steps)
time_callback = keras_utils.TimeHistory(
flags_obj.batch_size,
flags_obj.log_steps,
logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
with distribute_utils.get_strategy_scope(strategy):
runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
per_epoch_steps)
eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
checkpoint_interval = (
steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
directory=flags_obj.model_dir,
max_to_keep=10,
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
resnet_controller = orbit.Controller(
strategy=strategy,
trainer=runnable,
evaluator=runnable if not flags_obj.skip_eval else None,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
summary_dir=flags_obj.model_dir,
eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))
time_callback.on_train_begin()
if not flags_obj.skip_eval:
resnet_controller.train_and_evaluate(
train_steps=per_epoch_steps * train_epochs,
eval_steps=eval_steps,
eval_interval=eval_interval)
else:
resnet_controller.train(steps=per_epoch_steps * train_epochs)
time_callback.on_train_end()
stats = build_stats(runnable, time_callback)
return stats
def main(_):
model_helpers.apply_clean(flags.FLAGS)
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
common.define_keras_flags()
app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
- https://arxiv.org/pdf/1603.05027v2.pdf
- http://torch.ch/blog/2016/02/04/resnets.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.image_classification.resnet import imagenet_preprocessing
layers = tf.keras.layers
def _gen_l2_regularizer(use_l2_regularizer=True, l2_weight_decay=1e-4):
return tf.keras.regularizers.L2(
l2_weight_decay) if use_l2_regularizer else None
def identity_block(input_tensor,
kernel_size,
filters,
stage,
block,
use_l2_regularizer=True,
batch_norm_decay=0.9,
batch_norm_epsilon=1e-5):
"""The identity block is the block that has no conv layer at shortcut.
Args:
input_tensor: input tensor
    kernel_size: default 3, the kernel size of the middle conv layer in the
      main path
    filters: list of integers, the filters of the 3 conv layers in the main
      path
    stage: integer, current stage label, used for generating layer names
    block: 'a','b'..., current block label, used for generating layer names
    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
    batch_norm_decay: Momentum of the batch norm layers.
    batch_norm_epsilon: Epsilon of the batch norm layers.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2c')(
x)
x = layers.add([x, input_tensor])
x = layers.Activation('relu')(x)
return x
def conv_block(input_tensor,
kernel_size,
filters,
stage,
block,
strides=(2, 2),
use_l2_regularizer=True,
batch_norm_decay=0.9,
batch_norm_epsilon=1e-5):
"""A block that has a conv layer at shortcut.
  Note that from stage 3 onward, the second conv layer in the main path has
  strides=(2, 2), and the shortcut has strides=(2, 2) as well.
Args:
input_tensor: input tensor
    kernel_size: default 3, the kernel size of the middle conv layer in the
      main path
    filters: list of integers, the filters of the 3 conv layers in the main
      path
    stage: integer, current stage label, used for generating layer names
    block: 'a','b'..., current block label, used for generating layer names
    strides: Strides for the second conv layer in the block.
    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
    batch_norm_decay: Momentum of the batch norm layers.
    batch_norm_epsilon: Epsilon of the batch norm layers.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
strides=strides,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2c')(
x)
shortcut = layers.Conv2D(
filters3, (1, 1),
strides=strides,
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '1')(
input_tensor)
shortcut = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '1')(
shortcut)
x = layers.add([x, shortcut])
x = layers.Activation('relu')(x)
return x
def resnet50(num_classes,
batch_size=None,
use_l2_regularizer=True,
rescale_inputs=False,
batch_norm_decay=0.9,
batch_norm_epsilon=1e-5):
"""Instantiates the ResNet50 architecture.
Args:
num_classes: `int` number of classes for image classification.
batch_size: Size of the batches for each step.
use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
    rescale_inputs: whether inputs are in the range [0, 1] and should be
      rescaled to the range expected by the trained model.
    batch_norm_decay: Momentum of the batch norm layers.
    batch_norm_epsilon: Epsilon of the batch norm layers.
Returns:
A Keras model instance.
"""
input_shape = (224, 224, 3)
img_input = layers.Input(shape=input_shape, batch_size=batch_size)
if rescale_inputs:
# Hub image modules expect inputs in the range [0, 1]. This rescales these
# inputs to the range expected by the trained model.
x = layers.Lambda(
lambda x: x * 255.0 - tf.keras.backend.constant( # pylint: disable=g-long-lambda
imagenet_preprocessing.CHANNEL_MEANS,
shape=[1, 1, 3],
dtype=x.dtype),
name='rescale')(
img_input)
else:
x = img_input
if tf.keras.backend.image_data_format() == 'channels_first':
x = layers.Permute((3, 1, 2))(x)
bn_axis = 1
else: # channels_last
bn_axis = 3
block_config = dict(
use_l2_regularizer=use_l2_regularizer,
batch_norm_decay=batch_norm_decay,
batch_norm_epsilon=batch_norm_epsilon)
x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
x = layers.Conv2D(
64, (7, 7),
strides=(2, 2),
padding='valid',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='conv1')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name='bn_conv1')(
x)
x = layers.Activation('relu')(x)
x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
x = conv_block(
x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), **block_config)
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', **block_config)
x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', **block_config)
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', **block_config)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', **block_config)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', **block_config)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', **block_config)
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', **block_config)
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', **block_config)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', **block_config)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', **block_config)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(
num_classes,
kernel_initializer=tf.initializers.random_normal(stddev=0.01),
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='fc1000')(
x)
  # A softmax that is followed by the model loss cannot be done in float16
  # due to numeric issues, so we pass dtype='float32'.
x = layers.Activation('softmax', dtype='float32')(x)
# Create model.
return tf.keras.Model(img_input, x, name='resnet50')
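# Editor's usage sketch: building the model as the custom training loop does
# (see resnet_runnable.py), with roughly 25.6M parameters.
#   model = resnet50(num_classes=1001)
#   model.count_params()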
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
import orbit
import tensorflow as tf
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.legacy.image_classification.resnet import resnet_model
from official.modeling import grad_utils
from official.modeling import performance
from official.utils.flags import core as flags_core
class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback, epoch_steps):
self.strategy = tf.distribute.get_strategy()
self.flags_obj = flags_obj
self.dtype = flags_core.get_tf_dtype(flags_obj)
self.time_callback = time_callback
# Input pipeline related
batch_size = flags_obj.batch_size
if batch_size % self.strategy.num_replicas_in_sync != 0:
raise ValueError(
'Batch size must be divisible by number of replicas : {}'.format(
self.strategy.num_replicas_in_sync))
    # Auto-rebatching is not supported by the
    # `distribute_datasets_from_function()` API, which is required when
    # cloning the dataset to multiple workers in eager mode, so we use the
    # per-replica batch size.
self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)
if self.flags_obj.use_synthetic_data:
self.input_fn = common.get_synth_input_fn(
height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
num_channels=imagenet_preprocessing.NUM_CHANNELS,
num_classes=imagenet_preprocessing.NUM_CLASSES,
dtype=self.dtype,
drop_remainder=True)
else:
self.input_fn = imagenet_preprocessing.input_fn
self.model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES,
use_l2_regularizer=not flags_obj.single_l2_loss_op)
lr_schedule = common.PiecewiseConstantDecayWithWarmup(
batch_size=flags_obj.batch_size,
epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
warmup_epochs=common.LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
self.optimizer = common.get_optimizer(lr_schedule)
# Make sure iterations variable is created inside scope.
self.global_step = self.optimizer.iterations
self.optimizer = performance.configure_optimizer(
self.optimizer,
use_float16=self.dtype == tf.float16,
loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128))
self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'train_accuracy', dtype=tf.float32)
self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'test_accuracy', dtype=tf.float32)
self.checkpoint = tf.train.Checkpoint(
model=self.model, optimizer=self.optimizer)
# Handling epochs.
self.epoch_steps = epoch_steps
self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)
train_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=True,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=True)
orbit.StandardTrainer.__init__(
self,
train_dataset,
options=orbit.StandardTrainerOptions(
use_tf_while_loop=flags_obj.use_tf_while_loop,
use_tf_function=flags_obj.use_tf_function))
if not flags_obj.skip_eval:
eval_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
dtype=self.dtype)
orbit.StandardEvaluator.__init__(
self,
eval_dataset,
options=orbit.StandardEvaluatorOptions(
use_tf_function=flags_obj.use_tf_function))
def train_loop_begin(self):
"""See base class."""
# Reset all metrics
self.train_loss.reset_states()
self.train_accuracy.reset_states()
self._epoch_begin()
self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
def train_step(self, iterator):
"""See base class."""
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
with tf.GradientTape() as tape:
logits = self.model(images, training=True)
prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
labels, logits)
loss = tf.reduce_sum(prediction_loss) * (1.0 /
self.flags_obj.batch_size)
num_replicas = self.strategy.num_replicas_in_sync
l2_weight_decay = 1e-4
if self.flags_obj.single_l2_loss_op:
l2_loss = l2_weight_decay * 2 * tf.add_n([
tf.nn.l2_loss(v)
for v in self.model.trainable_variables
if 'bn' not in v.name
])
loss += (l2_loss / num_replicas)
else:
loss += (tf.reduce_sum(self.model.losses) / num_replicas)
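      # Aggregate gradients across replicas with an explicit all-reduce so
      # loss-scale unscaling (for fp16) and gradient aggregation happen in a
      # well-defined order, then apply them without further aggregation.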
grad_utils.minimize_using_explicit_allreduce(
tape, self.optimizer, loss, self.model.trainable_variables)
self.train_loss.update_state(loss)
self.train_accuracy.update_state(labels, logits)
if self.flags_obj.enable_xla:
step_fn = tf.function(step_fn, jit_compile=True)
self.strategy.run(step_fn, args=(next(iterator),))
def train_loop_end(self):
"""See base class."""
metrics = {
'train_loss': self.train_loss.result(),
'train_accuracy': self.train_accuracy.result(),
}
self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)
self._epoch_end()
return metrics
def eval_begin(self):
"""See base class."""
self.test_loss.reset_states()
self.test_accuracy.reset_states()
def eval_step(self, iterator):
"""See base class."""
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
logits = self.model(images, training=False)
loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size)
self.test_loss.update_state(loss)
self.test_accuracy.update_state(labels, logits)
self.strategy.run(step_fn, args=(next(iterator),))
def eval_end(self):
"""See base class."""
return {
'test_loss': self.test_loss.result(),
'test_accuracy': self.test_accuracy.result()
}
def _epoch_begin(self):
if self.epoch_helper.epoch_begin():
self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch)
def _epoch_end(self):
if self.epoch_helper.epoch_end():
self.time_callback.on_epoch_end(self.epoch_helper.current_epoch)
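# A minimal driver sketch (illustrative, not part of this file): a class like
# the trainer above, which subclasses both orbit.StandardTrainer and
# orbit.StandardEvaluator, is typically run through orbit.Controller. The
# `train_epochs` flag below is an assumption standing in for the real setup.
import orbit

def run_resnet_trainer(trainer, flags_obj, epoch_steps):
  """Drives the trainer's inner loops with an orbit.Controller."""
  controller = orbit.Controller(
      trainer=trainer,
      evaluator=None if flags_obj.skip_eval else trainer,
      global_step=trainer.global_step,
      steps_per_loop=epoch_steps)
  # One call runs train_loop_begin/train_step/train_loop_end repeatedly.
  controller.train(steps=flags_obj.train_epochs * epoch_steps)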
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A script to export TF-Hub SavedModel."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
# Import libraries
from absl import app
from absl import flags
import tensorflow as tf
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.legacy.image_classification.resnet import resnet_model
FLAGS = flags.FLAGS
flags.DEFINE_string("model_path", None,
"File path to TF model checkpoint or H5 file.")
flags.DEFINE_string("export_path", None,
"TF-Hub SavedModel destination path to export.")
def export_tfhub(model_path, hub_destination):
"""Restores a tf.keras.Model and saves for TF-Hub."""
model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES, rescale_inputs=True)
model.load_weights(model_path)
model.save(
os.path.join(hub_destination, "classification"), include_optimizer=False)
  # Extract a sub-model that uses the pooled feature vector as its output.
image_input = model.get_layer(index=0).get_output_at(0)
feature_vector_output = model.get_layer(name="reduce_mean").get_output_at(0)
hub_model = tf.keras.Model(image_input, feature_vector_output)
# Exports a SavedModel.
hub_model.save(
os.path.join(hub_destination, "feature-vector"), include_optimizer=False)
def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
export_tfhub(FLAGS.model_path, FLAGS.export_path)
if __name__ == "__main__":
app.run(main)
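# A hedged usage sketch: after exporting, the "feature-vector" SavedModel can
# be consumed through tensorflow_hub. The paths and the Dense head below are
# placeholders, not part of this script.
#
#   python export_tfhub.py --model_path=/tmp/resnet_ckpt --export_path=/tmp/hub
import tensorflow_hub as hub

feature_extractor = hub.KerasLayer("/tmp/hub/feature-vector", trainable=False)
images = tf.keras.layers.Input(shape=(224, 224, 3))
logits = tf.keras.layers.Dense(10)(feature_extractor(images))
fine_tune_model = tf.keras.Model(images, logits)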
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test utilities for image classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def trivial_model(num_classes):
"""Trivial model for ImageNet dataset."""
input_shape = (224, 224, 3)
img_input = tf.keras.layers.Input(shape=input_shape)
x = tf.keras.layers.Lambda(
lambda x: tf.keras.backend.reshape(x, [-1, 224 * 224 * 3]),
name='reshape')(img_input)
x = tf.keras.layers.Dense(1, name='fc1')(x)
x = tf.keras.layers.Dense(num_classes, name='fc1000')(x)
x = tf.keras.layers.Activation('softmax', dtype='float32')(x)
return tf.keras.models.Model(img_input, x, name='trivial')
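# Illustrative check (not part of the original file): the trivial model maps a
# batch of 224x224x3 images to `num_classes` softmax outputs.
if __name__ == '__main__':
  model = trivial_model(num_classes=1000)
  probs = model(tf.zeros((2, 224, 224, 3)))
  assert probs.shape == (2, 1000)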
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for grad_utils."""
import tensorflow as tf
from official.modeling import grad_utils
from official.modeling import performance
class GradUtilsTest(tf.test.TestCase):
def test_minimize(self):
optimizer = tf.keras.optimizers.SGD(0.1)
with tf.GradientTape() as tape:
model = tf.keras.layers.Dense(2)
outputs = model(tf.zeros((2, 2), tf.float32))
loss = tf.reduce_mean(outputs)
grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
model.trainable_variables)
def test_minimize_fp16(self):
optimizer = performance.configure_optimizer(
tf.keras.optimizers.SGD(0.1), use_float16=True)
performance.set_mixed_precision_policy(tf.float16)
with tf.GradientTape() as tape:
model = tf.keras.layers.Dense(2)
outputs = model(tf.zeros((2, 2), tf.float16))
loss = tf.reduce_mean(outputs)
grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
model.trainable_variables)
# Test other fp16 settings.
def _clip_by_global_norm(grads_and_vars):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
return zip(grads, tvars)
with tf.GradientTape() as tape:
model = tf.keras.layers.Dense(2)
outputs = model(tf.zeros((2, 2), tf.float16))
loss = tf.reduce_mean(outputs)
optimizer = performance.configure_optimizer(
tf.keras.optimizers.SGD(0.1), use_float16=True, loss_scale=128)
grad_utils.minimize_using_explicit_allreduce(
tape,
optimizer,
loss,
model.trainable_variables,
pre_allreduce_callbacks=[_clip_by_global_norm],
post_allreduce_callbacks=[_clip_by_global_norm])
def test_set_mixed_precision_policy(self):
performance.set_mixed_precision_policy(tf.float16)
performance.set_mixed_precision_policy(tf.bfloat16)
performance.set_mixed_precision_policy(tf.float32)
with self.assertRaises(ValueError):
performance.set_mixed_precision_policy(tf.int32)
if __name__ == '__main__':
tf.test.main()
......@@ -14,14 +14,19 @@
"""Functions and classes related to training performance."""
from absl import logging
import tensorflow as tf
def configure_optimizer(optimizer,
                        use_float16=False,
                        loss_scale=None,
                        use_graph_rewrite=None):
"""Configures optimizer object with performance options."""
if use_graph_rewrite is not None:
logging.warning('`use_graph_rewrite` is deprecated inside '
'`configure_optimizer`. Please remove the usage.')
del use_graph_rewrite
if use_float16:
if loss_scale in (None, 'dynamic'):
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
......@@ -29,13 +34,6 @@ def configure_optimizer(optimizer,
# loss_scale is a number. We interpret that as a fixed loss scale.
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
optimizer, dynamic=False, initial_scale=loss_scale)
return optimizer
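# Hedged examples of the two float16 paths above (not part of this change):
# leaving `loss_scale` at None or 'dynamic' wraps the optimizer in a dynamic
# LossScaleOptimizer, while a number selects a fixed loss scale.
dynamic_opt = configure_optimizer(tf.keras.optimizers.SGD(0.1),
                                  use_float16=True)
fixed_opt = configure_optimizer(tf.keras.optimizers.SGD(0.1),
                                use_float16=True, loss_scale=128)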
......
......@@ -110,6 +110,8 @@ def get_activation(identifier, use_keras_layer=False):
"swish": "swish",
"sigmoid": "sigmoid",
"relu6": tf.nn.relu6,
"hard_swish": activations.hard_swish,
"hard_sigmoid": activations.hard_sigmoid,
}
if identifier in keras_layer_allowlist:
return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
......
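# A hedged example of the allowlist additions above; the module path
# `official.modeling.tf_utils` is an assumption based on this hunk's context.
import tensorflow as tf
from official.modeling import tf_utils

# With `use_keras_layer=True`, string identifiers resolve to
# tf.keras.layers.Activation wrappers around the listed callables.
act = tf_utils.get_activation("hard_swish", use_keras_layer=True)
y = act(tf.constant([-3.0, 0.0, 3.0]))  # hard_swish applied elementwise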
......@@ -44,9 +44,6 @@ READMEs for specific papers.
4. [Transformer for translation](transformer):
[Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et
al., 2017
5. [NHNet](nhnet):
[Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386)
   by Gu et al., 2020
### Common Training Driver
......
......@@ -121,9 +121,5 @@ def use_float16():
return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16
def get_loss_scale():
return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic')
......@@ -150,8 +150,7 @@ def run_bert_classifier(strategy,
FLAGS.optimizer_type)
classifier_model.optimizer = performance.configure_optimizer(
optimizer,
      use_float16=common_flags.use_float16())
return classifier_model, core_model
# tf.keras.losses objects accept optional sample_weight arguments (e.g. coming
......
......@@ -125,8 +125,7 @@ def run_customized_training(strategy,
end_lr, optimizer_type)
pretrain_model.optimizer = performance.configure_optimizer(
optimizer,
      use_float16=common_flags.use_float16())
return pretrain_model, core_model
trained_model = model_training_utils.run_customized_training_loop(
......
......@@ -252,8 +252,7 @@ def train_squad(strategy,
squad_model.optimizer = performance.configure_optimizer(
optimizer,
      use_float16=common_flags.use_float16())
return squad_model, core_model
# Only when explicit_allreduce = True, post_allreduce_callbacks and
......
......@@ -170,6 +170,32 @@ class KernelEncoderConfig(hyperparams.Config):
scale: Optional[float] = None
@dataclasses.dataclass
class ReuseEncoderConfig(hyperparams.Config):
"""Reuse encoder configuration."""
vocab_size: int = 30522
hidden_size: int = 768
num_layers: int = 12
num_attention_heads: int = 12
hidden_activation: str = "gelu"
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
max_position_embeddings: int = 512
type_vocab_size: int = 2
initializer_range: float = 0.02
embedding_size: Optional[int] = None
output_range: Optional[int] = None
return_all_encoder_outputs: bool = False
# Pre/Post-LN Transformer
norm_first: bool = False
# Reuse transformer
reuse_attention: int = -1
use_relative_pe: bool = False
pe_max_seq_length: int = 512
max_reuse_layer_idx: int = 6
@dataclasses.dataclass
class XLNetEncoderConfig(hyperparams.Config):
"""XLNet encoder configuration."""
......@@ -205,6 +231,7 @@ class EncoderConfig(hyperparams.OneOfConfig):
bigbird: BigBirdEncoderConfig = BigBirdEncoderConfig()
kernel: KernelEncoderConfig = KernelEncoderConfig()
mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig()
reuse: ReuseEncoderConfig = ReuseEncoderConfig()
teams: BertEncoderConfig = BertEncoderConfig()
xlnet: XLNetEncoderConfig = XLNetEncoderConfig()
......@@ -472,6 +499,43 @@ def build_encoder(config: EncoderConfig,
dict_outputs=True)
return networks.EncoderScaffold(**kwargs)
if encoder_type == "reuse":
embedding_cfg = dict(
vocab_size=encoder_cfg.vocab_size,
type_vocab_size=encoder_cfg.type_vocab_size,
hidden_size=encoder_cfg.hidden_size,
max_seq_length=encoder_cfg.max_position_embeddings,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
dropout_rate=encoder_cfg.dropout_rate)
hidden_cfg = dict(
num_attention_heads=encoder_cfg.num_attention_heads,
inner_dim=encoder_cfg.intermediate_size,
inner_activation=tf_utils.get_activation(
encoder_cfg.hidden_activation),
output_dropout=encoder_cfg.dropout_rate,
attention_dropout=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
reuse_attention=encoder_cfg.reuse_attention,
use_relative_pe=encoder_cfg.use_relative_pe,
pe_max_seq_length=encoder_cfg.pe_max_seq_length,
max_reuse_layer_idx=encoder_cfg.max_reuse_layer_idx)
kwargs = dict(
embedding_cfg=embedding_cfg,
hidden_cls=layers.ReuseTransformer,
hidden_cfg=hidden_cfg,
num_hidden_instances=encoder_cfg.num_layers,
pooled_output_dim=encoder_cfg.hidden_size,
pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
return_all_layer_outputs=False,
dict_outputs=True,
feed_layer_idx=True,
recursive=True)
return networks.EncoderScaffold(**kwargs)
bert_encoder_cls = networks.BertEncoder
if encoder_type == "bert_v2":
bert_encoder_cls = networks.BertEncoderV2
......
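# A hedged sketch of selecting the new encoder through the one-of config; the
# import path below is an assumption based on this file's location.
from official.nlp.configs import encoders

config = encoders.EncoderConfig(
    type="reuse",
    reuse=encoders.ReuseEncoderConfig(num_layers=6, max_reuse_layer_idx=3))
encoder = encoders.build_encoder(config)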
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loads dataset for the dual encoder (retrieval) task."""
import functools
import itertools
from typing import Iterable, Mapping, Optional, Tuple
import dataclasses
import tensorflow as tf
import tensorflow_hub as hub
from official.core import config_definitions as cfg
from official.core import input_reader
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory
from official.nlp.modeling import layers
@dataclasses.dataclass
class DualEncoderDataConfig(cfg.DataConfig):
"""Data config for dual encoder task (tasks/dual_encoder)."""
# Either set `input_path`...
input_path: str = ''
# ...or `tfds_name` and `tfds_split` to specify input.
tfds_name: str = ''
tfds_split: str = ''
global_batch_size: int = 32
# Either build preprocessing with Python code by specifying these values...
vocab_file: str = ''
lower_case: bool = True
# ...or load preprocessing from a SavedModel at this location.
preprocessing_hub_module_url: str = ''
  left_text_fields: Tuple[str, ...] = ('left_input',)
  right_text_fields: Tuple[str, ...] = ('right_input',)
is_training: bool = True
seq_length: int = 128
@data_loader_factory.register_data_loader_cls(DualEncoderDataConfig)
class DualEncoderDataLoader(data_loader.DataLoader):
"""A class to load dataset for dual encoder task (tasks/dual_encoder)."""
def __init__(self, params):
if bool(params.tfds_name) == bool(params.input_path):
raise ValueError('Must specify either `tfds_name` and `tfds_split` '
'or `input_path`.')
if bool(params.vocab_file) == bool(params.preprocessing_hub_module_url):
raise ValueError('Must specify exactly one of vocab_file (with matching '
'lower_case flag) or preprocessing_hub_module_url.')
self._params = params
self._seq_length = params.seq_length
self._left_text_fields = params.left_text_fields
self._right_text_fields = params.right_text_fields
if params.preprocessing_hub_module_url:
preprocessing_hub_module = hub.load(params.preprocessing_hub_module_url)
self._tokenizer = preprocessing_hub_module.tokenize
self._pack_inputs = functools.partial(
preprocessing_hub_module.bert_pack_inputs,
seq_length=params.seq_length)
else:
self._tokenizer = layers.BertTokenizer(
vocab_file=params.vocab_file, lower_case=params.lower_case)
self._pack_inputs = layers.BertPackInputs(
seq_length=params.seq_length,
special_tokens_dict=self._tokenizer.get_special_tokens_dict())
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
name_to_features = {
x: tf.io.FixedLenFeature([], tf.string)
for x in itertools.chain(
*[self._left_text_fields, self._right_text_fields])
}
example = tf.io.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
t = example[name]
if t.dtype == tf.int64:
t = tf.cast(t, tf.int32)
example[name] = t
return example
def _bert_tokenize(
self, record: Mapping[str, tf.Tensor],
text_fields: Iterable[str]) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
"""Tokenize the input in text_fields using BERT tokenizer.
Args:
record: A tfexample record contains the features.
text_fields: A list of fields to be tokenzied.
Returns:
The tokenized features in a tuple of (input_word_ids, input_mask,
input_type_ids).
"""
segments_text = [record[x] for x in text_fields]
segments_tokens = [self._tokenizer(s) for s in segments_text]
segments = [tf.cast(x.merge_dims(1, 2), tf.int32) for x in segments_tokens]
return self._pack_inputs(segments)
def _bert_preprocess(
self, record: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
"""Perform the bert word piece tokenization for left and right inputs."""
def _switch_prefix(string, old, new):
if string.startswith(old): return new + string[len(old):]
raise ValueError('Expected {} to start with {}'.format(string, old))
def _switch_key_prefix(d, old, new):
return {_switch_prefix(key, old, new): value for key, value in d.items()}
model_inputs = _switch_key_prefix(
self._bert_tokenize(record, self._left_text_fields),
'input_', 'left_')
model_inputs.update(_switch_key_prefix(
self._bert_tokenize(record, self._right_text_fields),
'input_', 'right_'))
return model_inputs
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
reader = input_reader.InputReader(
params=self._params,
# Skip `decoder_fn` for tfds input.
decoder_fn=self._decode if self._params.input_path else None,
dataset_fn=tf.data.TFRecordDataset,
postprocess_fn=self._bert_preprocess)
return reader.read(input_context)
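# Minimal usage sketch (paths are placeholders; the loader is normally built
# via data_loader_factory from a registered DualEncoderDataConfig):
#
#   params = DualEncoderDataConfig(
#       input_path='/tmp/train.tf_record',
#       vocab_file='/tmp/vocab.txt',
#       global_batch_size=32)
#   dataset = DualEncoderDataLoader(params).load()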
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.dual_encoder_dataloader."""
import os
from absl.testing import parameterized
import tensorflow as tf
from official.nlp.data import dual_encoder_dataloader
_LEFT_FEATURE_NAME = 'left_input'
_RIGHT_FEATURE_NAME = 'right_input'
def _create_fake_dataset(output_path):
"""Creates a fake dataset contains examples for training a dual encoder model.
The created dataset contains examples with two byteslist features keyed by
_LEFT_FEATURE_NAME and _RIGHT_FEATURE_NAME.
Args:
output_path: The output path of the fake dataset.
"""
def create_str_feature(values):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
with tf.io.TFRecordWriter(output_path) as writer:
for _ in range(100):
features = {}
features[_LEFT_FEATURE_NAME] = create_str_feature([b'hello world.'])
features[_RIGHT_FEATURE_NAME] = create_str_feature([b'world hello.'])
tf_example = tf.train.Example(
features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
def _make_vocab_file(vocab, output_path):
with tf.io.gfile.GFile(output_path, 'w') as f:
f.write('\n'.join(vocab + ['']))
class DualEncoderDataTest(tf.test.TestCase, parameterized.TestCase):
def test_load_dataset(self):
seq_length = 16
batch_size = 10
train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
vocab_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
_create_fake_dataset(train_data_path)
_make_vocab_file(
['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'he', '#llo', 'world'], vocab_path)
data_config = dual_encoder_dataloader.DualEncoderDataConfig(
input_path=train_data_path,
seq_length=seq_length,
vocab_file=vocab_path,
lower_case=True,
left_text_fields=(_LEFT_FEATURE_NAME,),
right_text_fields=(_RIGHT_FEATURE_NAME,),
global_batch_size=batch_size)
dataset = dual_encoder_dataloader.DualEncoderDataLoader(
data_config).load()
features = next(iter(dataset))
self.assertCountEqual(
['left_word_ids', 'left_mask', 'left_type_ids', 'right_word_ids',
'right_mask', 'right_type_ids'],
features.keys())
self.assertEqual(features['left_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['left_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['left_type_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['right_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['right_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['right_type_ids'].shape, (batch_size, seq_length))
@parameterized.parameters(False, True)
def test_load_tfds(self, use_preprocessing_hub):
seq_length = 16
batch_size = 10
if use_preprocessing_hub:
vocab_path = ''
preprocessing_hub = (
'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3')
else:
vocab_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
_make_vocab_file(
['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'he', '#llo', 'world'],
vocab_path)
preprocessing_hub = ''
data_config = dual_encoder_dataloader.DualEncoderDataConfig(
tfds_name='para_crawl/enmt',
tfds_split='train',
seq_length=seq_length,
vocab_file=vocab_path,
lower_case=True,
left_text_fields=('en',),
right_text_fields=('mt',),
preprocessing_hub_module_url=preprocessing_hub,
global_batch_size=batch_size)
dataset = dual_encoder_dataloader.DualEncoderDataLoader(
data_config).load()
features = next(iter(dataset))
self.assertCountEqual(
['left_word_ids', 'left_mask', 'left_type_ids', 'right_word_ids',
'right_mask', 'right_type_ids'],
features.keys())
self.assertEqual(features['left_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['left_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['left_type_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['right_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['right_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['right_type_ids'].shape, (batch_size, seq_length))
if __name__ == '__main__':
tf.test.main()