Update code to v2.8.0

9485aa1d · qianyj · 89cfa348 · f5fc733a · 9485aa1d · 9485aa1d
Commit 9485aa1d authored Nov 28, 2023 by qianyj
20 changed files
--- a/official/legacy/image_classification/mnist_test.py
+++ b/official/legacy/image_classification/mnist_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test the Keras MNIST model on GPU."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import functools
+from absl.testing import parameterized
+import tensorflow as tf
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.legacy.image_classification import mnist_main
+from official.utils.testing import integration
+mnist_main.define_mnist_flags()
+def eager_strategy_combinations():
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.cloud_tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+      ],)
+class KerasMnistTest(tf.test.TestCase, parameterized.TestCase):
+  """Unit tests for sample Keras MNIST model."""
+  _tempdir = None
+  @classmethod
+  def setUpClass(cls):  # pylint: disable=invalid-name
+    super(KerasMnistTest, cls).setUpClass()
+  def tearDown(self):
+    super(KerasMnistTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+  @combinations.generate(eager_strategy_combinations())
+  def test_end_to_end(self, distribution):
+    """Test Keras MNIST model with `strategy`."""
+    extra_flags = [
+        "-train_epochs",
+        "1",
+        # Let TFDS find the metadata folder automatically
+        "--data_dir="
+    ]
+    dummy_data = (
+        tf.ones(shape=(10, 28, 28, 1), dtype=tf.int32),
+        tf.range(10),
+    )
+    datasets = (
+        tf.data.Dataset.from_tensor_slices(dummy_data),
+        tf.data.Dataset.from_tensor_slices(dummy_data),
+    )
+    run = functools.partial(
+        mnist_main.run,
+        datasets_override=datasets,
+        strategy_override=distribution)
+    integration.run_synthetic(
+        main=run,
+        synth=False,
+        tmp_root=self.create_tempdir().full_path,
+        extra_flags=extra_flags)
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/legacy/image_classification/optimizer_factory.py
+++ b/official/legacy/image_classification/optimizer_factory.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimizer factory for vision tasks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from typing import Any, Dict, Optional, Text
+from absl import logging
+import tensorflow as tf
+import tensorflow_addons as tfa
+from official.legacy.image_classification import learning_rate
+from official.legacy.image_classification.configs import base_configs
+from official.modeling import optimization
+# pylint: disable=protected-access
+def build_optimizer(
+    optimizer_name: Text,
+    base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
+    params: Dict[Text, Any],
+    model: Optional[tf.keras.Model] = None):
+  """Build the optimizer based on name.
+  Args:
+    optimizer_name: String representation of the optimizer name. Examples: sgd,
+      momentum, rmsprop.
+    base_learning_rate: `tf.keras.optimizers.schedules.LearningRateSchedule`
+      base learning rate.
+    params: String -> Any dictionary representing the optimizer params. This
+      should contain optimizer specific parameters such as `base_learning_rate`,
+      `decay`, etc.
+    model: The `tf.keras.Model`. This is used for the shadow copy if using
+      `ExponentialMovingAverage`.
+  Returns:
+    A tf.keras.Optimizer.
+  Raises:
+    ValueError if the provided optimizer_name is not supported.
+  """
+  optimizer_name = optimizer_name.lower()
+  logging.info('Building %s optimizer with params %s', optimizer_name, params)
+  if optimizer_name == 'sgd':
+    logging.info('Using SGD optimizer')
+    nesterov = params.get('nesterov', False)
+    optimizer = tf.keras.optimizers.SGD(
+        learning_rate=base_learning_rate, nesterov=nesterov)
+  elif optimizer_name == 'momentum':
+    logging.info('Using momentum optimizer')
+    nesterov = params.get('nesterov', False)
+    optimizer = tf.keras.optimizers.SGD(
+        learning_rate=base_learning_rate,
+        momentum=params['momentum'],
+        nesterov=nesterov)
+  elif optimizer_name == 'rmsprop':
+    logging.info('Using RMSProp')
+    rho = params.get('decay', None) or params.get('rho', 0.9)
+    momentum = params.get('momentum', 0.9)
+    epsilon = params.get('epsilon', 1e-07)
+    optimizer = tf.keras.optimizers.RMSprop(
+        learning_rate=base_learning_rate,
+        rho=rho,
+        momentum=momentum,
+        epsilon=epsilon)
+  elif optimizer_name == 'adam':
+    logging.info('Using Adam')
+    beta_1 = params.get('beta_1', 0.9)
+    beta_2 = params.get('beta_2', 0.999)
+    epsilon = params.get('epsilon', 1e-07)
+    optimizer = tf.keras.optimizers.Adam(
+        learning_rate=base_learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon)
+  elif optimizer_name == 'adamw':
+    logging.info('Using AdamW')
+    weight_decay = params.get('weight_decay', 0.01)
+    beta_1 = params.get('beta_1', 0.9)
+    beta_2 = params.get('beta_2', 0.999)
+    epsilon = params.get('epsilon', 1e-07)
+    optimizer = tfa.optimizers.AdamW(
+        weight_decay=weight_decay,
+        learning_rate=base_learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon)
+  else:
+    raise ValueError('Unknown optimizer %s' % optimizer_name)
+  if params.get('lookahead', None):
+    logging.info('Using lookahead optimizer.')
+    optimizer = tfa.optimizers.Lookahead(optimizer)
+  # Moving average should be applied last, as it's applied at test time
+  moving_average_decay = params.get('moving_average_decay', 0.)
+  if moving_average_decay is not None and moving_average_decay > 0.:
+    if model is None:
+      raise ValueError(
+          '`model` must be provided if using `ExponentialMovingAverage`.')
+    logging.info('Including moving average decay.')
+    optimizer = optimization.ExponentialMovingAverage(
+        optimizer=optimizer, average_decay=moving_average_decay)
+    optimizer.shadow_copy(model)
+  return optimizer
+def build_learning_rate(params: base_configs.LearningRateConfig,
+                        batch_size: Optional[int] = None,
+                        train_epochs: Optional[int] = None,
+                        train_steps: Optional[int] = None):
+  """Build the learning rate given the provided configuration."""
+  decay_type = params.name
+  base_lr = params.initial_lr
+  decay_rate = params.decay_rate
+  if params.decay_epochs is not None:
+    decay_steps = params.decay_epochs * train_steps
+  else:
+    decay_steps = 0
+  if params.warmup_epochs is not None:
+    warmup_steps = params.warmup_epochs * train_steps
+  else:
+    warmup_steps = 0
+  lr_multiplier = params.scale_by_batch_size
+  if lr_multiplier and lr_multiplier > 0:
+    # Scale the learning rate based on the batch size and a multiplier
+    base_lr *= lr_multiplier * batch_size
+    logging.info(
+        'Scaling the learning rate based on the batch size '
+        'multiplier. New base_lr: %f', base_lr)
+  if decay_type == 'exponential':
+    logging.info(
+        'Using exponential learning rate with: '
+        'initial_learning_rate: %f, decay_steps: %d, '
+        'decay_rate: %f', base_lr, decay_steps, decay_rate)
+    lr = tf.keras.optimizers.schedules.ExponentialDecay(
+        initial_learning_rate=base_lr,
+        decay_steps=decay_steps,
+        decay_rate=decay_rate,
+        staircase=params.staircase)
+  elif decay_type == 'stepwise':
+    steps_per_epoch = params.examples_per_epoch // batch_size
+    boundaries = [boundary * steps_per_epoch for boundary in params.boundaries]
+    multipliers = [batch_size * multiplier for multiplier in params.multipliers]
+    logging.info(
+        'Using stepwise learning rate. Parameters: '
+        'boundaries: %s, values: %s', boundaries, multipliers)
+    lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
+        boundaries=boundaries, values=multipliers)
+  elif decay_type == 'cosine_with_warmup':
+    lr = learning_rate.CosineDecayWithWarmup(
+        batch_size=batch_size,
+        total_steps=train_epochs * train_steps,
+        warmup_steps=warmup_steps)
+  if warmup_steps > 0:
+    if decay_type not in ['cosine_with_warmup']:
+      logging.info('Applying %d warmup steps to the learning rate',
+                   warmup_steps)
+      lr = learning_rate.WarmupDecaySchedule(
+          lr, warmup_steps, warmup_lr=base_lr)
+  return lr
--- a/official/legacy/image_classification/optimizer_factory_test.py
+++ b/official/legacy/image_classification/optimizer_factory_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for optimizer_factory."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl.testing import parameterized
+import tensorflow as tf
+from official.legacy.image_classification import optimizer_factory
+from official.legacy.image_classification.configs import base_configs
+class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
+  def build_toy_model(self) -> tf.keras.Model:
+    """Creates a toy `tf.Keras.Model`."""
+    model = tf.keras.Sequential()
+    model.add(tf.keras.layers.Dense(1, input_shape=(1,)))
+    return model
+  @parameterized.named_parameters(
+      ('sgd', 'sgd', 0., False), ('momentum', 'momentum', 0., False),
+      ('rmsprop', 'rmsprop', 0., False), ('adam', 'adam', 0., False),
+      ('adamw', 'adamw', 0., False),
+      ('momentum_lookahead', 'momentum', 0., True),
+      ('sgd_ema', 'sgd', 0.999, False),
+      ('momentum_ema', 'momentum', 0.999, False),
+      ('rmsprop_ema', 'rmsprop', 0.999, False))
+  def test_optimizer(self, optimizer_name, moving_average_decay, lookahead):
+    """Smoke test to be sure no syntax errors."""
+    model = self.build_toy_model()
+    params = {
+        'learning_rate': 0.001,
+        'rho': 0.09,
+        'momentum': 0.,
+        'epsilon': 1e-07,
+        'moving_average_decay': moving_average_decay,
+        'lookahead': lookahead,
+    }
+    optimizer = optimizer_factory.build_optimizer(
+        optimizer_name=optimizer_name,
+        base_learning_rate=params['learning_rate'],
+        params=params,
+        model=model)
+    self.assertTrue(issubclass(type(optimizer), tf.keras.optimizers.Optimizer))
+  def test_unknown_optimizer(self):
+    with self.assertRaises(ValueError):
+      optimizer_factory.build_optimizer(
+          optimizer_name='this_optimizer_does_not_exist',
+          base_learning_rate=None,
+          params=None)
+  def test_learning_rate_without_decay_or_warmups(self):
+    params = base_configs.LearningRateConfig(
+        name='exponential',
+        initial_lr=0.01,
+        decay_rate=0.01,
+        decay_epochs=None,
+        warmup_epochs=None,
+        scale_by_batch_size=0.01,
+        examples_per_epoch=1,
+        boundaries=[0],
+        multipliers=[0, 1])
+    batch_size = 1
+    train_steps = 1
+    lr = optimizer_factory.build_learning_rate(
+        params=params, batch_size=batch_size, train_steps=train_steps)
+    self.assertTrue(
+        issubclass(
+            type(lr), tf.keras.optimizers.schedules.LearningRateSchedule))
+  @parameterized.named_parameters(('exponential', 'exponential'),
+                                  ('cosine_with_warmup', 'cosine_with_warmup'))
+  def test_learning_rate_with_decay_and_warmup(self, lr_decay_type):
+    """Basic smoke test for syntax."""
+    params = base_configs.LearningRateConfig(
+        name=lr_decay_type,
+        initial_lr=0.01,
+        decay_rate=0.01,
+        decay_epochs=1,
+        warmup_epochs=1,
+        scale_by_batch_size=0.01,
+        examples_per_epoch=1,
+        boundaries=[0],
+        multipliers=[0, 1])
+    batch_size = 1
+    train_epochs = 1
+    train_steps = 1
+    lr = optimizer_factory.build_learning_rate(
+        params=params,
+        batch_size=batch_size,
+        train_epochs=train_epochs,
+        train_steps=train_steps)
+    self.assertTrue(
+        issubclass(
+            type(lr), tf.keras.optimizers.schedules.LearningRateSchedule))
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/legacy/image_classification/preprocessing.py
+++ b/official/legacy/image_classification/preprocessing.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Preprocessing functions for images."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from typing import List, Optional, Text, Tuple
+import tensorflow as tf
+from official.legacy.image_classification import augment
+# Calculated from the ImageNet training set
+# Per-channel statistics, each scaled by 255; used as the defaults of
+# `normalize_images` and by the train/eval preprocessing functions.
+MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
+STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
+# Default output height/width for the resize and preprocessing functions.
+IMAGE_SIZE = 224
+# Default `crop_padding` of `decode_and_center_crop`.
+CROP_PADDING = 32
+def mean_image_subtraction(
+    image_bytes: tf.Tensor,
+    means: Tuple[float, ...],
+    num_channels: int = 3,
+    dtype: tf.dtypes.DType = tf.float32,
+) ->  tf.Tensor:
+  """Subtracts the given means from each image channel.
+  For example:
+    means = [123.68, 116.779, 103.939]
+    image_bytes = mean_image_subtraction(image_bytes, means)
+  Note that the rank of `image` must be known.
+  Args:
+    image_bytes: a tensor of size [height, width, C].
+    means: a C-vector of values to subtract from each channel.
+    num_channels: number of color channels in the image that will be distorted.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+  Returns:
+    the centered image.
+  Raises:
+    ValueError: If the rank of `image` is unknown, if `image` has a rank other
+      than three or if the number of channels in `image` doesn't match the
+      number of values in `means`.
+  """
+  if image_bytes.get_shape().ndims != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+  if len(means) != num_channels:
+    raise ValueError('len(means) must match the number of channels')
+  # We have a 1-D tensor of means; convert to 3-D.
+  # Note(b/130245863): we explicitly call `broadcast` instead of simply
+  # expanding dimensions for better performance.
+  means = tf.broadcast_to(means, tf.shape(image_bytes))
+  if dtype is not None:
+    means = tf.cast(means, dtype=dtype)
+  return image_bytes - means
+def standardize_image(
+    image_bytes: tf.Tensor,
+    stddev: Tuple[float, ...],
+    num_channels: int = 3,
+    dtype: tf.dtypes.DType = tf.float32,
+) ->  tf.Tensor:
+  """Divides the given stddev from each image channel.
+  For example:
+    stddev = [123.68, 116.779, 103.939]
+    image_bytes = standardize_image(image_bytes, stddev)
+  Note that the rank of `image` must be known.
+  Args:
+    image_bytes: a tensor of size [height, width, C].
+    stddev: a C-vector of values to divide from each channel.
+    num_channels: number of color channels in the image that will be distorted.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+  Returns:
+    the centered image.
+  Raises:
+    ValueError: If the rank of `image` is unknown, if `image` has a rank other
+      than three or if the number of channels in `image` doesn't match the
+      number of values in `stddev`.
+  """
+  if image_bytes.get_shape().ndims != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+  if len(stddev) != num_channels:
+    raise ValueError('len(stddev) must match the number of channels')
+  # We have a 1-D tensor of stddev; convert to 3-D.
+  # Note(b/130245863): we explicitly call `broadcast` instead of simply
+  # expanding dimensions for better performance.
+  stddev = tf.broadcast_to(stddev, tf.shape(image_bytes))
+  if dtype is not None:
+    stddev = tf.cast(stddev, dtype=dtype)
+  return image_bytes / stddev
+def normalize_images(features: tf.Tensor,
+                     mean_rgb: Tuple[float, ...] = MEAN_RGB,
+                     stddev_rgb: Tuple[float, ...] = STDDEV_RGB,
+                     num_channels: int = 3,
+                     dtype: tf.dtypes.DType = tf.float32,
+                     data_format: Text = 'channels_last') -> tf.Tensor:
+  """Normalizes the input image channels with the given mean and stddev.
+  Args:
+    features: `Tensor` representing decoded images in float format.
+    mean_rgb: the mean of the channels to subtract.
+    stddev_rgb: the stddev of the channels to divide.
+    num_channels: the number of channels in the input image tensor.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+    data_format: the format of the input image tensor
+                 ['channels_first', 'channels_last'].
+  Returns:
+    A normalized image `Tensor`.
+  """
+  # TODO(allencwang) - figure out how to use mean_image_subtraction and
+  # standardize_image on batches of images and replace the following.
+  if data_format == 'channels_first':
+    stats_shape = [num_channels, 1, 1]
+  else:
+    stats_shape = [1, 1, num_channels]
+  if dtype is not None:
+    features = tf.image.convert_image_dtype(features, dtype=dtype)
+  if mean_rgb is not None:
+    mean_rgb = tf.constant(mean_rgb,
+                           shape=stats_shape,
+                           dtype=features.dtype)
+    mean_rgb = tf.broadcast_to(mean_rgb, tf.shape(features))
+    features = features - mean_rgb
+  if stddev_rgb is not None:
+    stddev_rgb = tf.constant(stddev_rgb,
+                             shape=stats_shape,
+                             dtype=features.dtype)
+    stddev_rgb = tf.broadcast_to(stddev_rgb, tf.shape(features))
+    features = features / stddev_rgb
+  return features
+def decode_and_center_crop(image_bytes: tf.Tensor,
+                           image_size: int = IMAGE_SIZE,
+                           crop_padding: int = CROP_PADDING) -> tf.Tensor:
+  """Crops to center of image with padding then scales image_size.
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    image_size: image height/width dimension.
+    crop_padding: the padding size to use when centering the crop.
+  Returns:
+    A decoded and cropped image `Tensor`.
+  """
+  decoded = image_bytes.dtype != tf.string
+  shape = (tf.shape(image_bytes) if decoded
+           else tf.image.extract_jpeg_shape(image_bytes))
+  image_height = shape[0]
+  image_width = shape[1]
+  padded_center_crop_size = tf.cast(
+      ((image_size / (image_size + crop_padding)) *
+       tf.cast(tf.minimum(image_height, image_width), tf.float32)),
+      tf.int32)
+  offset_height = ((image_height - padded_center_crop_size) + 1) // 2
+  offset_width = ((image_width - padded_center_crop_size) + 1) // 2
+  crop_window = tf.stack([offset_height, offset_width,
+                          padded_center_crop_size, padded_center_crop_size])
+  if decoded:
+    image = tf.image.crop_to_bounding_box(
+        image_bytes,
+        offset_height=offset_height,
+        offset_width=offset_width,
+        target_height=padded_center_crop_size,
+        target_width=padded_center_crop_size)
+  else:
+    image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+  image = resize_image(image_bytes=image,
+                       height=image_size,
+                       width=image_size)
+  return image
+def decode_crop_and_flip(image_bytes: tf.Tensor) -> tf.Tensor:
+  """Crops an image to a random part of the image, then randomly flips.
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+  Returns:
+    A decoded and cropped image `Tensor`.
+  """
+  decoded = image_bytes.dtype != tf.string
+  bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
+  shape = (tf.shape(image_bytes) if decoded
+           else tf.image.extract_jpeg_shape(image_bytes))
+  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+      shape,
+      bounding_boxes=bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=[0.75, 1.33],
+      area_range=[0.05, 1.0],
+      max_attempts=100,
+      use_image_if_no_bounding_boxes=True)
+  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+  # Reassemble the bounding box in the format the crop op requires.
+  offset_height, offset_width, _ = tf.unstack(bbox_begin)
+  target_height, target_width, _ = tf.unstack(bbox_size)
+  crop_window = tf.stack([offset_height, offset_width,
+                          target_height, target_width])
+  if decoded:
+    cropped = tf.image.crop_to_bounding_box(
+        image_bytes,
+        offset_height=offset_height,
+        offset_width=offset_width,
+        target_height=target_height,
+        target_width=target_width)
+  else:
+    cropped = tf.image.decode_and_crop_jpeg(image_bytes,
+                                            crop_window,
+                                            channels=3)
+  # Flip to add a little more random distortion in.
+  cropped = tf.image.random_flip_left_right(cropped)
+  return cropped
+def resize_image(image_bytes: tf.Tensor,
+                 height: int = IMAGE_SIZE,
+                 width: int = IMAGE_SIZE) -> tf.Tensor:
+  """Resizes an image to a given height and width.
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    height: image height dimension.
+    width: image width dimension.
+  Returns:
+    A tensor containing the resized image.
+  """
+  print(height, width)
+  return tf.compat.v1.image.resize(
+      image_bytes,
+      tf.convert_to_tensor([height, width]),
+      method=tf.image.ResizeMethod.BILINEAR,
+      align_corners=False)
+def preprocess_for_eval(
+    image_bytes: tf.Tensor,
+    image_size: int = IMAGE_SIZE,
+    num_channels: int = 3,
+    mean_subtract: bool = False,
+    standardize: bool = False,
+    dtype: tf.dtypes.DType = tf.float32
+) -> tf.Tensor:
+  """Preprocesses the given image for evaluation.
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    image_size: image height/width dimension.
+    num_channels: number of image input channels.
+    mean_subtract: whether or not to apply mean subtraction.
+    standardize: whether or not to apply standardization.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  images = decode_and_center_crop(image_bytes, image_size)
+  images = tf.reshape(images, [image_size, image_size, num_channels])
+  if mean_subtract:
+    images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
+  if standardize:
+    images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
+  if dtype is not None:
+    images = tf.image.convert_image_dtype(images, dtype=dtype)
+  return images
+def load_eval_image(filename: Text, image_size: int = IMAGE_SIZE) -> tf.Tensor:
+  """Reads an image from the filesystem and applies image preprocessing.
+  Args:
+    filename: a filename path of an image.
+    image_size: image height/width dimension.
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  image_bytes = tf.io.read_file(filename)
+  image = preprocess_for_eval(image_bytes, image_size)
+  return image
+def build_eval_dataset(filenames: List[Text],
+                       labels: Optional[List[int]] = None,
+                       image_size: int = IMAGE_SIZE,
+                       batch_size: int = 1) -> tf.Tensor:
+  """Builds a tf.data.Dataset from a list of filenames and labels.
+  Args:
+    filenames: a list of filename paths of images.
+    labels: a list of labels corresponding to each image.
+    image_size: image height/width dimension.
+    batch_size: the batch size used by the dataset
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  if labels is None:
+    labels = [0] * len(filenames)
+  filenames = tf.constant(filenames)
+  labels = tf.constant(labels)
+  dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
+  dataset = dataset.map(
+      lambda filename, label: (load_eval_image(filename, image_size), label))
+  dataset = dataset.batch(batch_size)
+  return dataset
+def preprocess_for_train(image_bytes: tf.Tensor,
+                         image_size: int = IMAGE_SIZE,
+                         augmenter: Optional[augment.ImageAugment] = None,
+                         mean_subtract: bool = False,
+                         standardize: bool = False,
+                         dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
+  """Preprocesses the given image for training.
+  Args:
+    image_bytes: `Tensor` representing an image binary of
+      arbitrary size of dtype tf.uint8.
+    image_size: image height/width dimension.
+    augmenter: the image augmenter to apply.
+    mean_subtract: whether or not to apply mean subtraction.
+    standardize: whether or not to apply standardization.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  images = decode_crop_and_flip(image_bytes=image_bytes)
+  images = resize_image(images, height=image_size, width=image_size)
+  if augmenter is not None:
+    images = augmenter.distort(images)
+  if mean_subtract:
+    images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
+  if standardize:
+    images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
+  if dtype is not None:
+    images = tf.image.convert_image_dtype(images, dtype)
+  return images
--- a/official/legacy/image_classification/resnet/README.md
+++ b/official/legacy/image_classification/resnet/README.md
+This folder contains a
+[custom training loop (CTL)](#resnet-custom-training-loop) implementation for
+ResNet50.
+## Before you begin
+Please refer to the [README](../README.md) in the parent directory for
+information on setup and preparing the data.
+## ResNet (custom training loop)
+Similar to the [estimator implementation](../../../r1/resnet), the Keras
+implementation has code for the ImageNet dataset. The ImageNet
+version uses a ResNet50 model implemented in
+[`resnet_model.py`](./resnet_model.py).
+### Pretrained Models
+* [ResNet50 Checkpoints](https://storage.googleapis.com/cloud-tpu-checkpoints/resnet/resnet50.tar.gz)
+* ResNet50 TFHub: [feature vector](https://tfhub.dev/tensorflow/resnet_50/feature_vector/1)
+and [classification](https://tfhub.dev/tensorflow/resnet_50/classification/1)
+Again, if you did not download the data to the default directory, specify the
+location with the `--data_dir` flag:
+```bash
+python3 resnet_ctl_imagenet_main.py --data_dir=/path/to/imagenet
+```
+There are more flag options you can specify. Here are some examples:
+- `--use_synthetic_data`: when set to true, synthetic data, rather than real
+data, are used;
+- `--batch_size`: the batch size used for the model;
+- `--model_dir`: the directory to save the model checkpoint;
+- `--train_epochs`: number of epochs to run for training the model;
+- `--train_steps`: number of steps to run for training the model. We now only
+support a number that is smaller than the number of batches in an epoch.
+- `--skip_eval`: when set to true, evaluation as well as validation during
+training is skipped
+For example, this is a typical command line to run with ImageNet data with
+batch size 128 per GPU:
+```bash
+python3 resnet_ctl_imagenet_main.py \
+    --model_dir=/tmp/model_dir/something \
+    --num_gpus=2 \
+    --batch_size=128 \
+    --train_epochs=90 \
+    --train_steps=10 \
+    --use_synthetic_data=false
+```
+See [`common.py`](common.py) for full list of options.
+### Using multiple GPUs
+You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
+You can read more about them in this
+[guide](https://www.tensorflow.org/guide/distribute_strategy).
+In this example, we have made it easier to use with just a command line flag
+`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA,
+and 0 otherwise.
+- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device.
+- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device.
+- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous
+distributed training across the GPUs.
+If you wish to run without `tf.distribute.Strategy`, you can do so by setting
+`--distribution_strategy=off`.
+### Running on multiple GPU hosts
+You can also train these models on multiple hosts, each with GPUs, using
+`tf.distribute.Strategy`.
+The easiest way to run multi-host benchmarks is to set the
+[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
+appropriately at each host.  e.g., to run using `MultiWorkerMirroredStrategy` on
+2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
+host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
+"index": i}`.  `MultiWorkerMirroredStrategy` will automatically use all the
+available GPUs at each host.
+### Running on Cloud TPUs
+Note: This model will **not** work with TPUs on Colab.
+You can train the ResNet CTL model on Cloud TPUs using
+`tf.distribute.TPUStrategy`. If you are not familiar with Cloud TPUs, it is
+strongly recommended that you go through the
+[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
+create a TPU and GCE VM.
+To run ResNet model on a TPU, you must set `--distribution_strategy=tpu` and
+`--tpu=$TPU_NAME`, where `$TPU_NAME` is the name of your TPU in the Cloud Console.
+From a GCE VM, you can run the following command to train ResNet for one epoch
+on a v2-8 or v3-8 TPU by setting `TRAIN_EPOCHS` to 1:
+```bash
+python3 resnet_ctl_imagenet_main.py \
+  --tpu=$TPU_NAME \
+  --model_dir=$MODEL_DIR \
+  --data_dir=$DATA_DIR \
+  --batch_size=1024 \
+  --steps_per_loop=500 \
+  --train_epochs=$TRAIN_EPOCHS \
+  --use_synthetic_data=false \
+  --dtype=fp32 \
+  --enable_eager=true \
+  --enable_tensorboard=true \
+  --distribution_strategy=tpu \
+  --log_steps=50 \
+  --single_l2_loss_op=true \
+  --use_tf_function=true
+```
+To train the ResNet to convergence, run it for 90 epochs by setting
+`TRAIN_EPOCHS` to 90.
+Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths.
--- a/official/legacy/image_classification/resnet/__init__.py
+++ b/official/legacy/image_classification/resnet/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/official/legacy/image_classification/resnet/common.py
+++ b/official/legacy/image_classification/resnet/common.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Common util functions and classes used by both keras cifar and imagenet."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+from absl import flags
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+from official.utils.flags import core as flags_core
+from official.utils.misc import keras_utils
+# absl flag registry read by the helpers in this module.
+FLAGS = flags.FLAGS
+# Learning rate before rescaling; PiecewiseConstantDecayWithWarmup scales it
+# by batch_size / 256.
+BASE_LEARNING_RATE = 0.1  # This matches Jing's version.
+# Key under which build_stats() stores the final top-1 training accuracy.
+TRAIN_TOP_1 = 'training_accuracy_top_1'
+LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
+    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
+]
+class PiecewiseConstantDecayWithWarmup(
+    tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Piecewise constant decay with warmup schedule.
+  While `step < warmup_steps` the learning rate grows linearly from 0 towards
+  `rescaled_lr`; afterwards it follows a piecewise-constant schedule whose
+  values are `rescaled_lr` scaled by the given multipliers.
+  """
+  def __init__(self,
+               batch_size,
+               epoch_size,
+               warmup_epochs,
+               boundaries,
+               multipliers,
+               compute_lr_on_cpu=True,
+               name=None):
+    """Precomputes the step boundaries and learning-rate values.
+    Args:
+      batch_size: Batch size; used to derive steps per epoch and to rescale
+        `BASE_LEARNING_RATE` linearly relative to a batch size of 256.
+      epoch_size: Size of one epoch (presumably a number of examples); it is
+        integer-divided by `batch_size` to obtain steps per epoch.
+      warmup_epochs: Length of the warmup phase, in epochs.
+      boundaries: Epoch numbers at which the learning rate changes; they are
+        converted to step counts here.
+      multipliers: Factors applied to the rescaled learning rate, one per
+        interval; must have exactly one more entry than `boundaries`.
+      compute_lr_on_cpu: If True, the ops built when not executing eagerly are
+        placed on `/device:CPU:0`.
+      name: Optional name; stored and returned by `get_config`.
+    Raises:
+      ValueError: If `len(boundaries) != len(multipliers) - 1`.
+    """
+    super(PiecewiseConstantDecayWithWarmup, self).__init__()
+    if len(boundaries) != len(multipliers) - 1:
+      raise ValueError('The length of boundaries must be 1 less than the '
+                       'length of multipliers')
+    # Batch size for which BASE_LEARNING_RATE is used unscaled.
+    base_lr_batch_size = 256
+    steps_per_epoch = epoch_size // batch_size
+    self.rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
+    # Boundaries are given in epochs; the schedule itself works in steps.
+    self.step_boundaries = [float(steps_per_epoch) * x for x in boundaries]
+    self.lr_values = [self.rescaled_lr * m for m in multipliers]
+    self.warmup_steps = warmup_epochs * steps_per_epoch
+    self.compute_lr_on_cpu = compute_lr_on_cpu
+    self.name = name
+    # Maps a graph object to the learning-rate op already built for it.
+    self.learning_rate_ops_cache = {}
+  def __call__(self, step):
+    """Returns the learning rate for `step`.
+    When executing eagerly the value is recomputed on every call. Otherwise
+    the op is built once per default graph and the cached op is returned on
+    later calls for that graph.
+    """
+    if tf.executing_eagerly():
+      return self._get_learning_rate(step)
+    # In an eager function or graph, the current implementation of optimizer
+    # repeatedly call and thus create ops for the learning rate schedule. To
+    # avoid this, we cache the ops if not executing eagerly.
+    graph = tf.compat.v1.get_default_graph()
+    if graph not in self.learning_rate_ops_cache:
+      if self.compute_lr_on_cpu:
+        with tf.device('/device:CPU:0'):
+          self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
+      else:
+        self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
+    return self.learning_rate_ops_cache[graph]
+  def _get_learning_rate(self, step):
+    """Compute learning rate at given step."""
+    with tf.name_scope('PiecewiseConstantDecayWithWarmup'):
+      def warmup_lr(step):
+        # Linear ramp: rescaled_lr * step / warmup_steps.
+        return self.rescaled_lr * (
+            tf.cast(step, tf.float32) / tf.cast(self.warmup_steps, tf.float32))
+      def piecewise_lr(step):
+        return tf.compat.v1.train.piecewise_constant(step, self.step_boundaries,
+                                                     self.lr_values)
+      # Warmup applies strictly before warmup_steps; the piecewise schedule
+      # takes over from that step on.
+      return tf.cond(step < self.warmup_steps, lambda: warmup_lr(step),
+                     lambda: piecewise_lr(step))
+  def get_config(self):
+    """Returns the derived schedule values as a dict."""
+    # NOTE(review): these keys are derived attributes, not the __init__
+    # argument names, so rebuilding the schedule via `cls(**config)` would
+    # presumably fail -- confirm whether deserialization is ever needed.
+    return {
+        'rescaled_lr': self.rescaled_lr,
+        'step_boundaries': self.step_boundaries,
+        'lr_values': self.lr_values,
+        'warmup_steps': self.warmup_steps,
+        'compute_lr_on_cpu': self.compute_lr_on_cpu,
+        'name': self.name
+    }
+def get_optimizer(learning_rate=0.1):
+  """Returns optimizer to use."""
+  # The learning_rate is overwritten at the beginning of each step by callback.
+  return tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
+def get_callbacks(pruning_method=None,
+                  enable_checkpoint_and_export=False,
+                  model_dir=None):
+  """Returns common callbacks."""
+  time_callback = keras_utils.TimeHistory(
+      FLAGS.batch_size,
+      FLAGS.log_steps,
+      logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
+  callbacks = [time_callback]
+  if FLAGS.enable_tensorboard:
+    tensorboard_callback = tf.keras.callbacks.TensorBoard(
+        log_dir=FLAGS.model_dir, profile_batch=FLAGS.profile_steps)
+    callbacks.append(tensorboard_callback)
+  is_pruning_enabled = pruning_method is not None
+  if is_pruning_enabled:
+    callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
+    if model_dir is not None:
+      callbacks.append(
+          tfmot.sparsity.keras.PruningSummaries(
+              log_dir=model_dir, profile_batch=0))
+  if enable_checkpoint_and_export:
+    if model_dir is not None:
+      ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
+      callbacks.append(
+          tf.keras.callbacks.ModelCheckpoint(
+              ckpt_full_path, save_weights_only=True))
+  return callbacks
+def build_stats(history, eval_output, callbacks):
+  """Normalizes and returns dictionary of stats.
+  Args:
+    history: Results of the training step. Supports both categorical_accuracy
+      and sparse_categorical_accuracy.
+    eval_output: Output of the eval step. Assumes first value is eval_loss and
+      second value is accuracy_top_1.
+    callbacks: a list of callbacks which might include a time history callback
+      used during keras.fit.
+  Returns:
+    Dictionary of normalized results.
+  """
+  stats = {}
+  if eval_output:
+    stats['accuracy_top_1'] = float(eval_output[1])
+    stats['eval_loss'] = float(eval_output[0])
+  if history and history.history:
+    train_hist = history.history
+    # Gets final loss from training.
+    stats['loss'] = float(train_hist['loss'][-1])
+    # Gets top_1 training accuracy.
+    if 'categorical_accuracy' in train_hist:
+      stats[TRAIN_TOP_1] = float(train_hist['categorical_accuracy'][-1])
+    elif 'sparse_categorical_accuracy' in train_hist:
+      stats[TRAIN_TOP_1] = float(train_hist['sparse_categorical_accuracy'][-1])
+    elif 'accuracy' in train_hist:
+      stats[TRAIN_TOP_1] = float(train_hist['accuracy'][-1])
+  if not callbacks:
+    return stats
+  # Look for the time history callback which was used during keras.fit
+  for callback in callbacks:
+    if isinstance(callback, keras_utils.TimeHistory):
+      timestamp_log = callback.timestamp_log
+      stats['step_timestamp_log'] = timestamp_log
+      stats['train_finish_time'] = callback.train_finish_time
+      if callback.epoch_runtime_log:
+        stats['avg_exp_per_second'] = callback.average_examples_per_second
+  return stats
+def define_keras_flags(model=False,
+                       optimizer=False,
+                       pretrained_filepath=False):
+  """Define flags for Keras models.
+  Registers the shared base, performance, image, benchmark and distribution
+  flags through `flags_core`, followed by the Keras-specific flags below.
+  Args:
+    model: If truthy, also define the `model` flag.
+    optimizer: If truthy, also define the `optimizer` flag and the
+      learning-rate hyper-parameter flags for the mobilenet preset.
+    pretrained_filepath: If truthy, also define the `pretrained_filepath`
+      flag.
+  """
+  flags_core.define_base(
+      clean=True,
+      num_gpu=True,
+      run_eagerly=True,
+      train_epochs=True,
+      epochs_between_evals=True,
+      distribution_strategy=True)
+  flags_core.define_performance(
+      num_parallel_calls=False,
+      synthetic_data=True,
+      dtype=True,
+      all_reduce_alg=True,
+      num_packs=True,
+      tf_gpu_thread_mode=True,
+      datasets_num_private_threads=True,
+      loss_scale=True,
+      fp16_implementation=True,
+      tf_data_experimental_slack=True,
+      enable_xla=True,
+      training_dataset_cache=True)
+  flags_core.define_image()
+  flags_core.define_benchmark()
+  flags_core.define_distribution()
+  flags.adopt_module_key_flags(flags_core)
+  # Flags specific to the Keras training scripts.
+  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
+  flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
+  # TODO(b/135607288): Remove this flag once we understand the root cause of
+  # slowdown when setting the learning phase in Keras backend.
+  flags.DEFINE_boolean(
+      name='set_learning_phase_to_train',
+      default=True,
+      help='If skip eval, also set Keras learning phase to 1 (training).')
+  flags.DEFINE_boolean(
+      name='explicit_gpu_placement',
+      default=False,
+      help='If not using distribution strategy, explicitly set device scope '
+      'for the Keras training loop.')
+  flags.DEFINE_boolean(
+      name='use_trivial_model',
+      default=False,
+      help='Whether to use a trivial Keras model.')
+  flags.DEFINE_boolean(
+      name='report_accuracy_metrics',
+      default=True,
+      help='Report metrics during training and evaluation.')
+  flags.DEFINE_boolean(
+      name='use_tensor_lr',
+      default=True,
+      help='Use learning rate tensor instead of a callback.')
+  flags.DEFINE_boolean(
+      name='enable_tensorboard',
+      default=False,
+      help='Whether to enable TensorBoard callback.')
+  flags.DEFINE_string(
+      name='profile_steps',
+      default=None,
+      help='Save profiling data to model dir at given range of global steps. The '
+      'value must be a comma separated pair of positive integers, specifying '
+      'the first and last step to profile. For example, "--profile_steps=2,4" '
+      'triggers the profiler to process 3 steps, starting from the 2nd step. '
+      'Note that profiler has a non-trivial performance overhead, and the '
+      'output file can be gigantic if profiling many steps.')
+  flags.DEFINE_integer(
+      name='train_steps',
+      default=None,
+      help='The number of steps to run for training. If it is larger than '
+      '# batches per epoch, then use # batches per epoch. This flag will be '
+      'ignored if train_epochs is set to be larger than 1. ')
+  flags.DEFINE_boolean(
+      name='batchnorm_spatial_persistent',
+      default=True,
+      help='Enable the spacial persistent mode for CuDNN batch norm kernel.')
+  flags.DEFINE_boolean(
+      name='enable_get_next_as_optional',
+      default=False,
+      help='Enable get_next_as_optional behavior in DistributedIterator.')
+  flags.DEFINE_boolean(
+      name='enable_checkpoint_and_export',
+      default=False,
+      help='Whether to enable a checkpoint callback and export the savedmodel.')
+  flags.DEFINE_string(name='tpu', default='', help='TPU address to connect to.')
+  flags.DEFINE_integer(
+      name='steps_per_loop',
+      default=None,
+      help='Number of steps per training loop. Only training step happens '
+      'inside the loop. Callbacks will not be called inside. Will be capped at '
+      'steps per epoch.')
+  flags.DEFINE_boolean(
+      name='use_tf_while_loop',
+      default=True,
+      help='Whether to build a tf.while_loop inside the training loop on the '
+      'host. Setting it to True is critical to have peak performance on '
+      'TPU.')
+  # Optional flag groups, registered only when the caller asks for them.
+  if model:
+    flags.DEFINE_string('model', 'resnet50_v1.5',
+                        'Name of model preset. (mobilenet, resnet50_v1.5)')
+  if optimizer:
+    flags.DEFINE_string(
+        'optimizer', 'resnet50_default', 'Name of optimizer preset. '
+        '(mobilenet_default, resnet50_default)')
+    # TODO(kimjaehong): Replace as general hyper-params not only for mobilenet.
+    flags.DEFINE_float(
+        'initial_learning_rate_per_sample', 0.00007,
+        'Initial value of learning rate per sample for '
+        'mobilenet_default.')
+    flags.DEFINE_float('lr_decay_factor', 0.94,
+                       'Learning rate decay factor for mobilenet_default.')
+    flags.DEFINE_float('num_epochs_per_decay', 2.5,
+                       'Number of epochs per decay for mobilenet_default.')
+  if pretrained_filepath:
+    flags.DEFINE_string('pretrained_filepath', '', 'Pretrained file path.')
+def get_synth_data(height, width, num_channels, num_classes, dtype):
+  """Creates a set of synthetic random data.
+  Args:
+    height: Integer height that will be used to create a fake image tensor.
+    width: Integer width that will be used to create a fake image tensor.
+    num_channels: Integer depth that will be used to create a fake image tensor.
+    num_classes: Number of classes that should be represented in the fake labels
+      tensor
+    dtype: Data type for features/images.
+  Returns:
+    A tuple of tensors representing the inputs and labels.
+  """
+  # Synthetic input should be within [0, 255].
+  inputs = tf.random.truncated_normal([height, width, num_channels],
+                                      dtype=dtype,
+                                      mean=127,
+                                      stddev=60,
+                                      name='synthetic_inputs')
+  labels = tf.random.uniform([1],
+                             minval=0,
+                             maxval=num_classes - 1,
+                             dtype=tf.int32,
+                             name='synthetic_labels')
+  return inputs, labels
+def define_pruning_flags():
+  """Define flags for pruning methods."""
+  flags.DEFINE_string(
+      'pruning_method', None, 'Pruning method.'
+      'None (no pruning) or polynomial_decay.')
+  flags.DEFINE_float('pruning_initial_sparsity', 0.0,
+                     'Initial sparsity for pruning.')
+  flags.DEFINE_float('pruning_final_sparsity', 0.5,
+                     'Final sparsity for pruning.')
+  flags.DEFINE_integer('pruning_begin_step', 0, 'Begin step for pruning.')
+  flags.DEFINE_integer('pruning_end_step', 100000, 'End step for pruning.')
+  flags.DEFINE_integer('pruning_frequency', 100, 'Frequency for pruning.')
+def define_clustering_flags():
+  """Define flags for clustering methods."""
+  flags.DEFINE_string('clustering_method', None,
+                      'None (no clustering) or selective_clustering '
+                      '(cluster last three Conv2D layers of the model).')
+def get_synth_input_fn(height,
+                       width,
+                       num_channels,
+                       num_classes,
+                       dtype=tf.float32,
+                       drop_remainder=True):
+  """Returns an input function that returns a dataset with random data.
+  This input_fn returns a data set that iterates over a set of random data and
+  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
+  copy is still included. This used to find the upper throughput bound when
+  tuning the full input pipeline.
+  Args:
+    height: Integer height that will be used to create a fake image tensor.
+    width: Integer width that will be used to create a fake image tensor.
+    num_channels: Integer depth that will be used to create a fake image tensor.
+    num_classes: Number of classes that should be represented in the fake labels
+      tensor
+    dtype: Data type for features/images.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+  Returns:
+    An input_fn that can be used in place of a real one to return a dataset
+    that can be used for iteration.
+  """
+  # pylint: disable=unused-argument
+  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
+    """Returns dataset filled with random data."""
+    inputs, labels = get_synth_data(
+        height=height,
+        width=width,
+        num_channels=num_channels,
+        num_classes=num_classes,
+        dtype=dtype)
+    # Cast to float32 for Keras model.
+    labels = tf.cast(labels, dtype=tf.float32)
+    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
+    # `drop_remainder` will make dataset produce outputs with known shapes.
+    data = data.batch(batch_size, drop_remainder=drop_remainder)
+    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+    return data
+  return input_fn
+def set_cudnn_batchnorm_mode():
+  """Set CuDNN batchnorm mode for better performance.
+     Note: Spatial Persistent mode may lead to accuracy losses for certain
+     models.
+  """
+  if FLAGS.batchnorm_spatial_persistent:
+    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
+  else:
+    os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
--- a/official/legacy/image_classification/resnet/imagenet_preprocessing.py
+++ b/official/legacy/image_classification/resnet/imagenet_preprocessing.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Provides utilities to preprocess images.
+Training images are sampled using the provided bounding boxes, and subsequently
+cropped to the sampled bounding box. Images are additionally flipped randomly,
+then resized to the target output size (without aspect-ratio preservation).
+Images used during evaluation are resized (with aspect-ratio preservation) and
+centrally cropped.
+All images undergo mean color subtraction.
+Note that these steps are colloquially referred to as "ResNet preprocessing,"
+and they differ from "VGG preprocessing," which does not use bounding boxes
+and instead does an aspect-preserving resize followed by random crop during
+training. (These both differ from "Inception preprocessing," which introduces
+color distortion steps.)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+from absl import logging
+import tensorflow as tf
+# Output height and width that parse_record() requests from preprocessing.
+DEFAULT_IMAGE_SIZE = 224
+# Channel count that parse_record() passes to preprocessing.
+NUM_CHANNELS = 3
+# NOTE(review): parse_record() shifts labels down by one, so presumably one of
+# these 1001 classes is a background class -- confirm.
+NUM_CLASSES = 1001
+NUM_IMAGES = {
+    'train': 1281167,
+    'validation': 50000,
+}
+# Number of training shard files listed by get_filenames(); also used as the
+# filename shuffle buffer size in input_fn().
+_NUM_TRAIN_FILES = 1024
+# Record-level shuffle buffer size that input_fn() passes on.
+_SHUFFLE_BUFFER = 10000
+# Presumably the per-channel RGB means used for mean color subtraction.
+_R_MEAN = 123.68
+_G_MEAN = 116.78
+_B_MEAN = 103.94
+CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
+# The lower bound for the smallest side of the image for aspect-preserving
+# resizing. For example, if an image is 500 x 1000, it will be resized to
+# _RESIZE_MIN x (_RESIZE_MIN * 2).
+_RESIZE_MIN = 256
+def process_record_dataset(dataset,
+                           is_training,
+                           batch_size,
+                           shuffle_buffer,
+                           parse_record_fn,
+                           dtype=tf.float32,
+                           datasets_num_private_threads=None,
+                           drop_remainder=False,
+                           tf_data_experimental_slack=False):
+  """Given a Dataset with raw records, return a parsed and batched Dataset.
+  The stages are applied in this order: optional private threadpool, shuffle
+  and repeat (training only), parse, batch, prefetch, then the slack option.
+  Args:
+    dataset: A Dataset representing raw records
+    is_training: A boolean denoting whether the input is for training.
+    batch_size: The number of samples per batch.
+    shuffle_buffer: The buffer size to use when shuffling records. A larger
+      value results in better randomness, but smaller values reduce startup time
+      and use less memory.
+    parse_record_fn: A function that takes a raw record and returns the
+      corresponding (image, label) pair. It is called as
+      `parse_record_fn(value, is_training, dtype)`.
+    dtype: Data type to use for images/features.
+    datasets_num_private_threads: Number of threads for a private threadpool
+      created for all datasets computation. Skipped when falsy.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack`
+      option.
+  Returns:
+    Dataset of (image, label) pairs ready for iteration.
+  """
+  # Defines a specific size thread pool for tf.data operations.
+  if datasets_num_private_threads:
+    options = tf.data.Options()
+    options.experimental_threading.private_threadpool_size = (
+        datasets_num_private_threads)
+    dataset = dataset.with_options(options)
+    logging.info('datasets_num_private_threads: %s',
+                 datasets_num_private_threads)
+  if is_training:
+    # Shuffles records before repeating to respect epoch boundaries.
+    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
+    # Repeats the dataset indefinitely (no count is passed to repeat()).
+    dataset = dataset.repeat()
+  # Parses the raw records into images and labels.
+  dataset = dataset.map(
+      lambda value: parse_record_fn(value, is_training, dtype),
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+  # Operations between the final prefetch and the get_next call to the iterator
+  # will happen synchronously during run time. We prefetch here again to
+  # background all of the above processing work and keep it out of the
+  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
+  # allows DistributionStrategies to adjust how many batches to fetch based
+  # on how many devices are present.
+  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+  # A fresh Options object carries the slack setting; it is applied on top of
+  # any threadpool options set above.
+  options = tf.data.Options()
+  options.experimental_slack = tf_data_experimental_slack
+  dataset = dataset.with_options(options)
+  return dataset
+def get_filenames(is_training, data_dir):
+  """Return filenames for dataset."""
+  if is_training:
+    return [
+        os.path.join(data_dir, 'train-%05d-of-01024' % i)
+        for i in range(_NUM_TRAIN_FILES)
+    ]
+  else:
+    return [
+        os.path.join(data_dir, 'validation-%05d-of-00128' % i)
+        for i in range(128)
+    ]
+def parse_example_proto(example_serialized):
+  """Parses an Example proto containing a training example of an image.
+  The output of the build_image_data.py image preprocessing script is a dataset
+  containing serialized Example protocol buffers. Each Example proto contains
+  the following fields (values are included as examples):
+    image/height: 462
+    image/width: 581
+    image/colorspace: 'RGB'
+    image/channels: 3
+    image/class/label: 615
+    image/class/synset: 'n03623198'
+    image/class/text: 'knee pad'
+    image/object/bbox/xmin: 0.1
+    image/object/bbox/xmax: 0.9
+    image/object/bbox/ymin: 0.2
+    image/object/bbox/ymax: 0.6
+    image/object/bbox/label: 615
+    image/format: 'JPEG'
+    image/filename: 'ILSVRC2012_val_00041207.JPEG'
+    image/encoded: <JPEG encoded string>
+  Only `image/encoded`, `image/class/label`, `image/class/text` and the four
+  bounding-box coordinate fields are requested from the parser here.
+  Args:
+    example_serialized: scalar Tensor tf.string containing a serialized Example
+      protocol buffer.
+  Returns:
+    image_buffer: Tensor tf.string containing the contents of a JPEG file.
+    label: Tensor tf.int32 containing the label.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
+  """
+  # Dense features in Example proto.
+  feature_map = {
+      'image/encoded':
+          tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
+      'image/class/label':
+          tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1),
+      'image/class/text':
+          tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
+  }
+  sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
+  # Sparse features in Example proto.
+  feature_map.update({
+      k: sparse_float32 for k in [
+          'image/object/bbox/xmin', 'image/object/bbox/ymin',
+          'image/object/bbox/xmax', 'image/object/bbox/ymax'
+      ]
+  })
+  features = tf.io.parse_single_example(
+      serialized=example_serialized, features=feature_map)
+  # The label is parsed as int64 and narrowed to int32 for the caller.
+  label = tf.cast(features['image/class/label'], dtype=tf.int32)
+  # Each coordinate gets a leading axis so the four can be stacked row-wise
+  # (assuming `.values` is 1-D, each becomes [1, num_boxes]).
+  xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
+  ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
+  xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
+  ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
+  # Note that we impose an ordering of (y, x) just to make life difficult.
+  bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
+  # Force the variable number of bounding boxes into the shape
+  # [1, num_boxes, coords].
+  bbox = tf.expand_dims(bbox, 0)
+  # Swap the last two axes so coordinates come last.
+  bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
+  return features['image/encoded'], label, bbox
+def parse_record(raw_record, is_training, dtype):
+  """Parses a record containing a training example of an image.
+  The input record is parsed into a label and image, and the image is passed
+  through preprocessing steps (cropping, flipping, and so on).
+  Args:
+    raw_record: scalar Tensor tf.string containing a serialized Example protocol
+      buffer.
+    is_training: A boolean denoting whether the input is for training.
+    dtype: data type to use for images/features.
+  Returns:
+    Tuple with processed image tensor in a channel-last format and
+    one-hot-encoded label tensor.
+  """
+  image_buffer, label, bbox = parse_example_proto(raw_record)
+  image = preprocess_image(
+      image_buffer=image_buffer,
+      bbox=bbox,
+      output_height=DEFAULT_IMAGE_SIZE,
+      output_width=DEFAULT_IMAGE_SIZE,
+      num_channels=NUM_CHANNELS,
+      is_training=is_training)
+  image = tf.cast(image, dtype)
+  # Subtract one so that labels are in [0, 1000), and cast to float32 for
+  # Keras model.
+  label = tf.cast(
+      tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1,
+      dtype=tf.float32)
+  return image, label
+def get_parse_record_fn(use_keras_image_data_format=False):
+  """Get a function for parsing the records, accounting for image format.
+  This is useful by handling different types of Keras models. For instance,
+  the current resnet_model.resnet50 input format is always channel-last,
+  whereas the keras_applications mobilenet input format depends on
+  tf.keras.backend.image_data_format(). We should set
+  use_keras_image_data_format=False for the former and True for the latter.
+  Args:
+    use_keras_image_data_format: A boolean denoting whether data format is keras
+      backend image data format. If False, the image format is channel-last. If
+      True, the image format matches tf.keras.backend.image_data_format().
+  Returns:
+    Function to use for parsing the records.
+  """
+  def parse_record_fn(raw_record, is_training, dtype):
+    image, label = parse_record(raw_record, is_training, dtype)
+    if use_keras_image_data_format:
+      if tf.keras.backend.image_data_format() == 'channels_first':
+        image = tf.transpose(image, perm=[2, 0, 1])
+    return image, label
+  return parse_record_fn
+def input_fn(is_training,
+             data_dir,
+             batch_size,
+             dtype=tf.float32,
+             datasets_num_private_threads=None,
+             parse_record_fn=parse_record,
+             input_context=None,
+             drop_remainder=False,
+             tf_data_experimental_slack=False,
+             training_dataset_cache=False,
+             filenames=None):
+  """Input function which provides batches for train or eval.
+  Builds a dataset of file names, optionally shards and shuffles it, reads the
+  TFRecord files in an interleaved fashion, optionally caches the raw records,
+  and hands the result to `process_record_dataset`.
+  Args:
+    is_training: A boolean denoting whether the input is for training.
+    data_dir: The directory containing the input data. Only used when
+      `filenames` is None.
+    batch_size: The number of samples per batch.
+    dtype: Data type to use for images/features
+    datasets_num_private_threads: Number of private threads for tf.data.
+    parse_record_fn: Function to use for parsing the records.
+    input_context: A `tf.distribute.InputContext` object passed in by
+      `tf.distribute.Strategy`.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack`
+      option.
+    training_dataset_cache: Whether to cache the training dataset on workers.
+      Typically used to improve training performance when training data is in
+      remote storage and can fit into worker memory.
+    filenames: Optional field for providing the file names of the TFRecords.
+  Returns:
+    A dataset that can be used for iteration.
+  """
+  if filenames is None:
+    filenames = get_filenames(is_training, data_dir)
+  dataset = tf.data.Dataset.from_tensor_slices(filenames)
+  # Sharding is done on file names, before any record is read.
+  if input_context:
+    logging.info(
+        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
+        input_context.input_pipeline_id, input_context.num_input_pipelines)
+    dataset = dataset.shard(input_context.num_input_pipelines,
+                            input_context.input_pipeline_id)
+  if is_training:
+    # Shuffle the input files
+    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)
+  # Convert to individual records.
+  # cycle_length = 10 means that up to 10 files will be read and deserialized in
+  # parallel. You may want to increase this number if you have a large number of
+  # CPU cores.
+  dataset = dataset.interleave(
+      tf.data.TFRecordDataset,
+      cycle_length=10,
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  # The cache holds raw (unparsed) records, since parsing happens later in
+  # process_record_dataset.
+  if is_training and training_dataset_cache:
+    # Improve training performance when training data is in remote storage and
+    # can fit into worker memory.
+    dataset = dataset.cache()
+  return process_record_dataset(
+      dataset=dataset,
+      is_training=is_training,
+      batch_size=batch_size,
+      shuffle_buffer=_SHUFFLE_BUFFER,
+      parse_record_fn=parse_record_fn,
+      dtype=dtype,
+      datasets_num_private_threads=datasets_num_private_threads,
+      drop_remainder=drop_remainder,
+      tf_data_experimental_slack=tf_data_experimental_slack,
+  )
+def _decode_crop_and_flip(image_buffer, bbox, num_channels):
+  """Crops the given image to a random part of the image, and randomly flips.
+  We use the fused decode_and_crop op, which performs better than the two ops
+  used separately in series, but note that this requires that the image be
+  passed in as an un-decoded string Tensor.
+  Args:
+    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as [ymin,
+      xmin, ymax, xmax].
+    num_channels: Integer depth of the image buffer for decoding.
+  Returns:
+    3-D tensor with cropped image.
+  """
+  # A large fraction of image datasets contain a human-annotated bounding box
+  # delineating the region of the image containing the object of interest.  We
+  # choose to create a new bounding box for the object which is a randomly
+  # distorted version of the human-annotated bounding box that obeys an
+  # allowed range of aspect ratios, sizes and overlap with the human-annotated
+  # bounding box. If no box is supplied, then we assume the bounding box is
+  # the entire image.
+  # The image shape is read from the JPEG data without decoding the pixels.
+  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+      tf.image.extract_jpeg_shape(image_buffer),
+      bounding_boxes=bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=[0.75, 1.33],
+      area_range=[0.05, 1.0],
+      max_attempts=100,
+      use_image_if_no_bounding_boxes=True)
+  # Only the begin offsets and the size are needed; the third output is unused.
+  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+  # Reassemble the bounding box in the format the crop op requires.
+  offset_y, offset_x, _ = tf.unstack(bbox_begin)
+  target_height, target_width, _ = tf.unstack(bbox_size)
+  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+  # Use the fused decode and crop op here, which is faster than each in series.
+  cropped = tf.image.decode_and_crop_jpeg(
+      image_buffer, crop_window, channels=num_channels)
+  # Flip to add a little more random distortion in.
+  cropped = tf.image.random_flip_left_right(cropped)
+  return cropped
+def _central_crop(image, crop_height, crop_width):
+  """Performs central crops of the given image list.
+  Args:
+    image: a 3-D image tensor
+    crop_height: the height of the image following the crop.
+    crop_width: the width of the image following the crop.
+  Returns:
+    3-D tensor with cropped image.
+  """
+  shape = tf.shape(input=image)
+  height, width = shape[0], shape[1]
+  amount_to_be_cropped_h = (height - crop_height)
+  crop_top = amount_to_be_cropped_h // 2
+  amount_to_be_cropped_w = (width - crop_width)
+  crop_left = amount_to_be_cropped_w // 2
+  return tf.slice(image, [crop_top, crop_left, 0],
+                  [crop_height, crop_width, -1])
+def _mean_image_subtraction(image, means, num_channels):
+  """Subtracts the given means from each image channel.
+  For example:
+    means = [123.68, 116.779, 103.939]
+    image = _mean_image_subtraction(image, means)
+  Note that the rank of `image` must be known.
+  Args:
+    image: a tensor of size [height, width, C].
+    means: a C-vector of values to subtract from each channel.
+    num_channels: number of color channels in the image that will be distorted.
+  Returns:
+    the centered image.
+  Raises:
+    ValueError: If the rank of `image` is unknown, if `image` has a rank other
+      than three or if the number of channels in `image` doesn't match the
+      number of values in `means`.
+  """
+  if image.get_shape().ndims != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+  if len(means) != num_channels:
+    raise ValueError('len(means) must match the number of channels')
+  # We have a 1-D tensor of means; convert to 3-D.
+  # Note(b/130245863): we explicitly call `broadcast` instead of simply
+  # expanding dimensions for better performance.
+  means = tf.broadcast_to(means, tf.shape(image))
+  return image - means
+def _smallest_size_at_least(height, width, resize_min):
+  """Computes new shape with the smallest side equal to `smallest_side`.
+  Computes new shape with the smallest side equal to `smallest_side` while
+  preserving the original aspect ratio.
+  Args:
+    height: an int32 scalar tensor indicating the current height.
+    width: an int32 scalar tensor indicating the current width.
+    resize_min: A python integer or scalar `Tensor` indicating the size of the
+      smallest side after resize.
+  Returns:
+    new_height: an int32 scalar tensor indicating the new height.
+    new_width: an int32 scalar tensor indicating the new width.
+  """
+  resize_min = tf.cast(resize_min, tf.float32)
+  # Convert to floats to make subsequent calculations go smoothly.
+  height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
+  smaller_dim = tf.minimum(height, width)
+  scale_ratio = resize_min / smaller_dim
+  # Convert back to ints to make heights and widths that TF ops will accept.
+  new_height = tf.cast(height * scale_ratio, tf.int32)
+  new_width = tf.cast(width * scale_ratio, tf.int32)
+  return new_height, new_width
+def _aspect_preserving_resize(image, resize_min):
+  """Resize images preserving the original aspect ratio.
+  Args:
+    image: A 3-D image `Tensor`.
+    resize_min: A python integer or scalar `Tensor` indicating the size of the
+      smallest side after resize.
+  Returns:
+    resized_image: A 3-D tensor containing the resized image.
+  """
+  shape = tf.shape(input=image)
+  height, width = shape[0], shape[1]
+  new_height, new_width = _smallest_size_at_least(height, width, resize_min)
+  return _resize_image(image, new_height, new_width)
+def _resize_image(image, height, width):
+  """Simple wrapper around tf.resize_images.
+  This is primarily to make sure we use the same `ResizeMethod` and other
+  details each time.
+  Args:
+    image: A 3-D image `Tensor`.
+    height: The target height for the resized image.
+    width: The target width for the resized image.
+  Returns:
+    resized_image: A 3-D tensor containing the resized image. The first two
+      dimensions have the shape [height, width].
+  """
+  return tf.compat.v1.image.resize(
+      image, [height, width],
+      method=tf.image.ResizeMethod.BILINEAR,
+      align_corners=False)
+def preprocess_image(image_buffer,
+                     bbox,
+                     output_height,
+                     output_width,
+                     num_channels,
+                     is_training=False):
+  """Preprocesses the given image.
+  Preprocessing includes decoding, cropping, and resizing for both training
+  and eval images. Training preprocessing, however, introduces some random
+  distortion of the image to improve accuracy.
+  Args:
+    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as [ymin,
+      xmin, ymax, xmax].
+    output_height: The height of the image after preprocessing.
+    output_width: The width of the image after preprocessing.
+    num_channels: Integer depth of the image buffer for decoding.
+    is_training: `True` if we're preprocessing the image for training and
+      `False` otherwise.
+  Returns:
+    A preprocessed image.
+  """
+  if is_training:
+    # For training, we want to randomize some of the distortions.
+    image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
+    image = _resize_image(image, output_height, output_width)
+  else:
+    # For validation, we want to decode, resize, then just crop the middle.
+    image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
+    image = _aspect_preserving_resize(image, _RESIZE_MIN)
+    image = _central_crop(image, output_height, output_width)
+  image.set_shape([output_height, output_width, num_channels])
+  return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels)
--- a/official/legacy/image_classification/resnet/resnet_config.py
+++ b/official/legacy/image_classification/resnet/resnet_config.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Configuration definitions for ResNet losses, learning rates, and optimizers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import dataclasses
+from official.legacy.image_classification.configs import base_configs
+from official.modeling.hyperparams import base_config
+@dataclasses.dataclass
+class ResNetModelConfig(base_configs.ModelConfig):
+  """Configuration for the ResNet model."""
+  name: str = 'ResNet'
+  num_classes: int = 1000
+  model_params: base_config.Config = dataclasses.field(
+      # pylint: disable=g-long-lambda
+      default_factory=lambda: {
+          'num_classes': 1000,
+          'batch_size': None,
+          'use_l2_regularizer': True,
+          'rescale_inputs': False,
+      })
+  # pylint: enable=g-long-lambda
+  loss: base_configs.LossConfig = base_configs.LossConfig(
+      name='sparse_categorical_crossentropy')
+  optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig(
+      name='momentum',
+      decay=0.9,
+      epsilon=0.001,
+      momentum=0.9,
+      moving_average_decay=None)
+  learning_rate: base_configs.LearningRateConfig = (
+      base_configs.LearningRateConfig(
+          name='stepwise',
+          initial_lr=0.1,
+          examples_per_epoch=1281167,
+          boundaries=[30, 60, 80],
+          warmup_epochs=5,
+          scale_by_batch_size=1. / 256.,
+          multipliers=[0.1 / 256, 0.01 / 256, 0.001 / 256, 0.0001 / 256]))
--- a/official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py
+++ b/official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
+import math
+import os
+# Import libraries
+from absl import app
+from absl import flags
+from absl import logging
+import orbit
+import tensorflow as tf
+from official.common import distribute_utils
+from official.legacy.image_classification.resnet import common
+from official.legacy.image_classification.resnet import imagenet_preprocessing
+from official.legacy.image_classification.resnet import resnet_runnable
+from official.modeling import performance
+from official.utils.flags import core as flags_core
+from official.utils.misc import keras_utils
+from official.utils.misc import model_helpers
+# Flags specific to the custom-training-loop binary. They are registered at
+# import time, in addition to the flags `common.define_keras_flags()` adds in
+# the `__main__` block.
+# `use_tf_function` is forwarded to the orbit trainer/evaluator options.
+flags.DEFINE_boolean(name='use_tf_function', default=True,
+                     help='Wrap the train and test step inside a '
+                     'tf.function.')
+# When `single_l2_loss_op` is set, the model is built without per-layer Keras
+# L2 regularizers (see `ResnetRunnable.__init__`).
+flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
+                     help='Calculate L2_loss on concatenated weights, '
+                     'instead of using Keras per-layer L2 loss.')
+def build_stats(runnable, time_callback):
+  """Normalizes and returns dictionary of stats.
+  Args:
+    runnable: The module containing all the training and evaluation metrics.
+    time_callback: Time tracking callback instance.
+  Returns:
+    Dictionary of normalized results.
+  """
+  stats = {}
+  if not runnable.flags_obj.skip_eval:
+    stats['eval_loss'] = runnable.test_loss.result().numpy()
+    stats['eval_acc'] = runnable.test_accuracy.result().numpy()
+    stats['train_loss'] = runnable.train_loss.result().numpy()
+    stats['train_acc'] = runnable.train_accuracy.result().numpy()
+  if time_callback:
+    timestamp_log = time_callback.timestamp_log
+    stats['step_timestamp_log'] = timestamp_log
+    stats['train_finish_time'] = time_callback.train_finish_time
+    if time_callback.epoch_runtime_log:
+      stats['avg_exp_per_second'] = time_callback.average_examples_per_second
+  return stats
+def get_num_train_iterations(flags_obj):
+  """Returns the number of training steps, train and test epochs."""
+  train_steps = (
+      imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
+  train_epochs = flags_obj.train_epochs
+  if flags_obj.train_steps:
+    train_steps = min(flags_obj.train_steps, train_steps)
+    train_epochs = 1
+  eval_steps = math.ceil(1.0 * imagenet_preprocessing.NUM_IMAGES['validation'] /
+                         flags_obj.batch_size)
+  return train_steps, train_epochs, eval_steps
+def run(flags_obj):
+  """Run ResNet ImageNet training and eval loop using custom training loops.
+  Args:
+    flags_obj: An object containing parsed flag values.
+  Raises:
+    ValueError: If fp16 is passed as it is not currently supported.
+  Returns:
+    Dictionary of training and eval stats.
+  """
+  keras_utils.set_session_config()
+  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
+  if tf.config.list_physical_devices('GPU'):
+    if flags_obj.tf_gpu_thread_mode:
+      keras_utils.set_gpu_thread_mode_and_count(
+          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
+          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
+          num_gpus=flags_obj.num_gpus,
+          datasets_num_private_threads=flags_obj.datasets_num_private_threads)
+    common.set_cudnn_batchnorm_mode()
+  data_format = flags_obj.data_format
+  if data_format is None:
+    data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
+                   else 'channels_last')
+  tf.keras.backend.set_image_data_format(data_format)
+  strategy = distribute_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus,
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs,
+      tpu_address=flags_obj.tpu)
+  per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
+      flags_obj)
+  if flags_obj.steps_per_loop is None:
+    steps_per_loop = per_epoch_steps
+  elif flags_obj.steps_per_loop > per_epoch_steps:
+    steps_per_loop = per_epoch_steps
+    logging.warn('Setting steps_per_loop to %d to respect epoch boundary.',
+                 steps_per_loop)
+  else:
+    steps_per_loop = flags_obj.steps_per_loop
+  logging.info(
+      'Training %d epochs, each epoch has %d steps, '
+      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
+      train_epochs * per_epoch_steps, eval_steps)
+  time_callback = keras_utils.TimeHistory(
+      flags_obj.batch_size,
+      flags_obj.log_steps,
+      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
+  with distribute_utils.get_strategy_scope(strategy):
+    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
+                                              per_epoch_steps)
+  eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
+  checkpoint_interval = (
+      steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
+  summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None
+  checkpoint_manager = tf.train.CheckpointManager(
+      runnable.checkpoint,
+      directory=flags_obj.model_dir,
+      max_to_keep=10,
+      step_counter=runnable.global_step,
+      checkpoint_interval=checkpoint_interval)
+  resnet_controller = orbit.Controller(
+      strategy=strategy,
+      trainer=runnable,
+      evaluator=runnable if not flags_obj.skip_eval else None,
+      global_step=runnable.global_step,
+      steps_per_loop=steps_per_loop,
+      checkpoint_manager=checkpoint_manager,
+      summary_interval=summary_interval,
+      summary_dir=flags_obj.model_dir,
+      eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))
+  time_callback.on_train_begin()
+  if not flags_obj.skip_eval:
+    resnet_controller.train_and_evaluate(
+        train_steps=per_epoch_steps * train_epochs,
+        eval_steps=eval_steps,
+        eval_interval=eval_interval)
+  else:
+    resnet_controller.train(steps=per_epoch_steps * train_epochs)
+  time_callback.on_train_end()
+  stats = build_stats(runnable, time_callback)
+  return stats
+def main(_):
+  model_helpers.apply_clean(flags.FLAGS)
+  stats = run(flags.FLAGS)
+  logging.info('Run stats:\n%s', stats)
+# Script entry point (only runs when the module is executed directly).
+if __name__ == '__main__':
+  logging.set_verbosity(logging.INFO)
+  # Register the shared flags from `common` before handing control to absl,
+  # which then invokes `main`.
+  common.define_keras_flags()
+  app.run(main)
--- a/official/legacy/image_classification/resnet/resnet_model.py
+++ b/official/legacy/image_classification/resnet/resnet_model.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ResNet50 model for Keras.
+Adapted from tf.keras.applications.resnet50.ResNet50().
+This is ResNet model version 1.5.
+Related papers/blogs:
+- https://arxiv.org/abs/1512.03385
+- https://arxiv.org/pdf/1603.05027v2.pdf
+- http://torch.ch/blog/2016/02/04/resnets.html
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from official.legacy.image_classification.resnet import imagenet_preprocessing
+# Module-level shorthand for `tf.keras.layers`, used by every builder below.
+layers = tf.keras.layers
+def _gen_l2_regularizer(use_l2_regularizer=True, l2_weight_decay=1e-4):
+  return tf.keras.regularizers.L2(
+      l2_weight_decay) if use_l2_regularizer else None
+def identity_block(input_tensor,
+                   kernel_size,
+                   filters,
+                   stage,
+                   block,
+                   use_l2_regularizer=True,
+                   batch_norm_decay=0.9,
+                   batch_norm_epsilon=1e-5):
+  """The identity block is the block that has no conv layer at shortcut.
+  Args:
+    input_tensor: input tensor
+    kernel_size: default 3, the kernel size of middle conv layer at main path
+    filters: list of integers, the filters of 3 conv layer at main path
+    stage: integer, current stage label, used for generating layer names
+    block: 'a','b'..., current block label, used for generating layer names
+    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
+    batch_norm_decay: Moment of batch norm layers.
+    batch_norm_epsilon: Epsilon of batch borm layers.
+  Returns:
+    Output tensor for the block.
+  """
+  filters1, filters2, filters3 = filters
+  if tf.keras.backend.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+  conv_name_base = 'res' + str(stage) + block + '_branch'
+  bn_name_base = 'bn' + str(stage) + block + '_branch'
+  x = layers.Conv2D(
+      filters1, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2a')(
+          input_tensor)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '2a')(
+          x)
+  x = layers.Activation('relu')(x)
+  x = layers.Conv2D(
+      filters2,
+      kernel_size,
+      padding='same',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2b')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '2b')(
+          x)
+  x = layers.Activation('relu')(x)
+  x = layers.Conv2D(
+      filters3, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2c')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '2c')(
+          x)
+  x = layers.add([x, input_tensor])
+  x = layers.Activation('relu')(x)
+  return x
+def conv_block(input_tensor,
+               kernel_size,
+               filters,
+               stage,
+               block,
+               strides=(2, 2),
+               use_l2_regularizer=True,
+               batch_norm_decay=0.9,
+               batch_norm_epsilon=1e-5):
+  """A block that has a conv layer at shortcut.
+  Note that from stage 3,
+  the second conv layer at main path is with strides=(2, 2)
+  And the shortcut should have strides=(2, 2) as well
+  Args:
+    input_tensor: input tensor
+    kernel_size: default 3, the kernel size of middle conv layer at main path
+    filters: list of integers, the filters of 3 conv layer at main path
+    stage: integer, current stage label, used for generating layer names
+    block: 'a','b'..., current block label, used for generating layer names
+    strides: Strides for the second conv layer in the block.
+    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
+    batch_norm_decay: Moment of batch norm layers.
+    batch_norm_epsilon: Epsilon of batch borm layers.
+  Returns:
+    Output tensor for the block.
+  """
+  filters1, filters2, filters3 = filters
+  if tf.keras.backend.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+  conv_name_base = 'res' + str(stage) + block + '_branch'
+  bn_name_base = 'bn' + str(stage) + block + '_branch'
+  x = layers.Conv2D(
+      filters1, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2a')(
+          input_tensor)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '2a')(
+          x)
+  x = layers.Activation('relu')(x)
+  x = layers.Conv2D(
+      filters2,
+      kernel_size,
+      strides=strides,
+      padding='same',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2b')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '2b')(
+          x)
+  x = layers.Activation('relu')(x)
+  x = layers.Conv2D(
+      filters3, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2c')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '2c')(
+          x)
+  shortcut = layers.Conv2D(
+      filters3, (1, 1),
+      strides=strides,
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '1')(
+          input_tensor)
+  shortcut = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name=bn_name_base + '1')(
+          shortcut)
+  x = layers.add([x, shortcut])
+  x = layers.Activation('relu')(x)
+  return x
+def resnet50(num_classes,
+             batch_size=None,
+             use_l2_regularizer=True,
+             rescale_inputs=False,
+             batch_norm_decay=0.9,
+             batch_norm_epsilon=1e-5):
+  """Instantiates the ResNet50 architecture.
+  Args:
+    num_classes: `int` number of classes for image classification.
+    batch_size: Size of the batches for each step.
+    use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
+    rescale_inputs: whether to rescale inputs from 0 to 1.
+    batch_norm_decay: Moment of batch norm layers.
+    batch_norm_epsilon: Epsilon of batch borm layers.
+  Returns:
+      A Keras model instance.
+  """
+  input_shape = (224, 224, 3)
+  img_input = layers.Input(shape=input_shape, batch_size=batch_size)
+  if rescale_inputs:
+    # Hub image modules expect inputs in the range [0, 1]. This rescales these
+    # inputs to the range expected by the trained model.
+    x = layers.Lambda(
+        lambda x: x * 255.0 - tf.keras.backend.constant(    # pylint: disable=g-long-lambda
+            imagenet_preprocessing.CHANNEL_MEANS,
+            shape=[1, 1, 3],
+            dtype=x.dtype),
+        name='rescale')(
+            img_input)
+  else:
+    x = img_input
+  if tf.keras.backend.image_data_format() == 'channels_first':
+    x = layers.Permute((3, 1, 2))(x)
+    bn_axis = 1
+  else:  # channels_last
+    bn_axis = 3
+  block_config = dict(
+      use_l2_regularizer=use_l2_regularizer,
+      batch_norm_decay=batch_norm_decay,
+      batch_norm_epsilon=batch_norm_epsilon)
+  x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
+  x = layers.Conv2D(
+      64, (7, 7),
+      strides=(2, 2),
+      padding='valid',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name='conv1')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=batch_norm_decay,
+      epsilon=batch_norm_epsilon,
+      name='bn_conv1')(
+          x)
+  x = layers.Activation('relu')(x)
+  x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
+  x = conv_block(
+      x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), **block_config)
+  x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', **block_config)
+  x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', **block_config)
+  x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', **block_config)
+  x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', **block_config)
+  x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', **block_config)
+  x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', **block_config)
+  x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', **block_config)
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', **block_config)
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', **block_config)
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', **block_config)
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', **block_config)
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', **block_config)
+  x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', **block_config)
+  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', **block_config)
+  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', **block_config)
+  x = layers.GlobalAveragePooling2D()(x)
+  x = layers.Dense(
+      num_classes,
+      kernel_initializer=tf.initializers.random_normal(stddev=0.01),
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name='fc1000')(
+          x)
+  # A softmax that is followed by the model loss must be done cannot be done
+  # in float16 due to numeric issues. So we pass dtype=float32.
+  x = layers.Activation('softmax', dtype='float32')(x)
+  # Create model.
+  return tf.keras.Model(img_input, x, name='resnet50')
--- a/official/legacy/image_classification/resnet/resnet_runnable.py
+++ b/official/legacy/image_classification/resnet/resnet_runnable.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
+import orbit
+import tensorflow as tf
+from official.legacy.image_classification.resnet import common
+from official.legacy.image_classification.resnet import imagenet_preprocessing
+from official.legacy.image_classification.resnet import resnet_model
+from official.modeling import grad_utils
+from official.modeling import performance
+from official.utils.flags import core as flags_core
+class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator):
+  """Implements the training and evaluation APIs for Resnet model."""
+  def __init__(self, flags_obj, time_callback, epoch_steps):
+    self.strategy = tf.distribute.get_strategy()
+    self.flags_obj = flags_obj
+    self.dtype = flags_core.get_tf_dtype(flags_obj)
+    self.time_callback = time_callback
+    # Input pipeline related
+    batch_size = flags_obj.batch_size
+    if batch_size % self.strategy.num_replicas_in_sync != 0:
+      raise ValueError(
+          'Batch size must be divisible by number of replicas : {}'.format(
+              self.strategy.num_replicas_in_sync))
+    # As auto rebatching is not supported in
+    # `distribute_datasets_from_function()` API, which is
+    # required when cloning dataset to multiple workers in eager mode,
+    # we use per-replica batch size.
+    self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)
+    if self.flags_obj.use_synthetic_data:
+      self.input_fn = common.get_synth_input_fn(
+          height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
+          width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
+          num_channels=imagenet_preprocessing.NUM_CHANNELS,
+          num_classes=imagenet_preprocessing.NUM_CLASSES,
+          dtype=self.dtype,
+          drop_remainder=True)
+    else:
+      self.input_fn = imagenet_preprocessing.input_fn
+    self.model = resnet_model.resnet50(
+        num_classes=imagenet_preprocessing.NUM_CLASSES,
+        use_l2_regularizer=not flags_obj.single_l2_loss_op)
+    lr_schedule = common.PiecewiseConstantDecayWithWarmup(
+        batch_size=flags_obj.batch_size,
+        epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
+        warmup_epochs=common.LR_SCHEDULE[0][1],
+        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
+        multipliers=list(p[0] for p in common.LR_SCHEDULE),
+        compute_lr_on_cpu=True)
+    self.optimizer = common.get_optimizer(lr_schedule)
+    # Make sure iterations variable is created inside scope.
+    self.global_step = self.optimizer.iterations
+    self.optimizer = performance.configure_optimizer(
+        self.optimizer,
+        use_float16=self.dtype == tf.float16,
+        loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128))
+    self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
+    self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        'train_accuracy', dtype=tf.float32)
+    self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
+    self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        'test_accuracy', dtype=tf.float32)
+    self.checkpoint = tf.train.Checkpoint(
+        model=self.model, optimizer=self.optimizer)
+    # Handling epochs.
+    self.epoch_steps = epoch_steps
+    self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)
+    train_dataset = orbit.utils.make_distributed_dataset(
+        self.strategy,
+        self.input_fn,
+        is_training=True,
+        data_dir=self.flags_obj.data_dir,
+        batch_size=self.batch_size,
+        parse_record_fn=imagenet_preprocessing.parse_record,
+        datasets_num_private_threads=self.flags_obj
+        .datasets_num_private_threads,
+        dtype=self.dtype,
+        drop_remainder=True,
+        training_dataset_cache=self.flags_obj.training_dataset_cache)
+    orbit.StandardTrainer.__init__(
+        self,
+        train_dataset,
+        options=orbit.StandardTrainerOptions(
+            use_tf_while_loop=flags_obj.use_tf_while_loop,
+            use_tf_function=flags_obj.use_tf_function))
+    if not flags_obj.skip_eval:
+      eval_dataset = orbit.utils.make_distributed_dataset(
+          self.strategy,
+          self.input_fn,
+          is_training=False,
+          data_dir=self.flags_obj.data_dir,
+          batch_size=self.batch_size,
+          parse_record_fn=imagenet_preprocessing.parse_record,
+          dtype=self.dtype)
+      orbit.StandardEvaluator.__init__(
+          self,
+          eval_dataset,
+          options=orbit.StandardEvaluatorOptions(
+              use_tf_function=flags_obj.use_tf_function))
+  def train_loop_begin(self):
+    """See base class."""
+    # Reset all metrics
+    self.train_loss.reset_states()
+    self.train_accuracy.reset_states()
+    self._epoch_begin()
+    self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
+  def train_step(self, iterator):
+    """See base class."""
+    def step_fn(inputs):
+      """Function to run on the device."""
+      images, labels = inputs
+      with tf.GradientTape() as tape:
+        logits = self.model(images, training=True)
+        prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
+            labels, logits)
+        loss = tf.reduce_sum(prediction_loss) * (1.0 /
+                                                 self.flags_obj.batch_size)
+        num_replicas = self.strategy.num_replicas_in_sync
+        l2_weight_decay = 1e-4
+        if self.flags_obj.single_l2_loss_op:
+          l2_loss = l2_weight_decay * 2 * tf.add_n([
+              tf.nn.l2_loss(v)
+              for v in self.model.trainable_variables
+              if 'bn' not in v.name
+          ])
+          loss += (l2_loss / num_replicas)
+        else:
+          loss += (tf.reduce_sum(self.model.losses) / num_replicas)
+      grad_utils.minimize_using_explicit_allreduce(
+          tape, self.optimizer, loss, self.model.trainable_variables)
+      self.train_loss.update_state(loss)
+      self.train_accuracy.update_state(labels, logits)
+    if self.flags_obj.enable_xla:
+      step_fn = tf.function(step_fn, jit_compile=True)
+    self.strategy.run(step_fn, args=(next(iterator),))
+  def train_loop_end(self):
+    """See base class."""
+    metrics = {
+        'train_loss': self.train_loss.result(),
+        'train_accuracy': self.train_accuracy.result(),
+    }
+    self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)
+    self._epoch_end()
+    return metrics
+  def eval_begin(self):
+    """See base class."""
+    self.test_loss.reset_states()
+    self.test_accuracy.reset_states()
+  def eval_step(self, iterator):
+    """See base class."""
+    def step_fn(inputs):
+      """Function to run on the device."""
+      images, labels = inputs
+      logits = self.model(images, training=False)
+      loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
+      loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size)
+      self.test_loss.update_state(loss)
+      self.test_accuracy.update_state(labels, logits)
+    self.strategy.run(step_fn, args=(next(iterator),))
+  def eval_end(self):
+    """See base class."""
+    return {
+        'test_loss': self.test_loss.result(),
+        'test_accuracy': self.test_accuracy.result()
+    }
+  def _epoch_begin(self):
+    if self.epoch_helper.epoch_begin():
+      self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch)
+  def _epoch_end(self):
+    if self.epoch_helper.epoch_end():
+      self.time_callback.on_epoch_end(self.epoch_helper.current_epoch)
--- a/official/legacy/image_classification/resnet/tfhub_export.py
+++ b/official/legacy/image_classification/resnet/tfhub_export.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A script to export TF-Hub SavedModel."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+# Import libraries
+from absl import app
+from absl import flags
+import tensorflow as tf
+from official.legacy.image_classification.resnet import imagenet_preprocessing
+from official.legacy.image_classification.resnet import resnet_model
+FLAGS = flags.FLAGS
+flags.DEFINE_string("model_path", None,
+                    "File path to TF model checkpoint or H5 file.")
+flags.DEFINE_string("export_path", None,
+                    "TF-Hub SavedModel destination path to export.")
+def export_tfhub(model_path, hub_destination):
+  """Restores a tf.keras.Model and saves for TF-Hub."""
+  model = resnet_model.resnet50(
+      num_classes=imagenet_preprocessing.NUM_CLASSES, rescale_inputs=True)
+  model.load_weights(model_path)
+  model.save(
+      os.path.join(hub_destination, "classification"), include_optimizer=False)
+  # Extracts a sub-model to use pooling feature vector as model output.
+  image_input = model.get_layer(index=0).get_output_at(0)
+  feature_vector_output = model.get_layer(name="reduce_mean").get_output_at(0)
+  hub_model = tf.keras.Model(image_input, feature_vector_output)
+  # Exports a SavedModel.
+  hub_model.save(
+      os.path.join(hub_destination, "feature-vector"), include_optimizer=False)
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+  export_tfhub(FLAGS.model_path, FLAGS.export_path)
+if __name__ == "__main__":
+  app.run(main)
--- a/official/legacy/image_classification/test_utils.py
+++ b/official/legacy/image_classification/test_utils.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test utilities for image classification tasks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+def trivial_model(num_classes):
+  """Trivial model for ImageNet dataset."""
+  input_shape = (224, 224, 3)
+  img_input = tf.keras.layers.Input(shape=input_shape)
+  x = tf.keras.layers.Lambda(
+      lambda x: tf.keras.backend.reshape(x, [-1, 224 * 224 * 3]),
+      name='reshape')(img_input)
+  x = tf.keras.layers.Dense(1, name='fc1')(x)
+  x = tf.keras.layers.Dense(num_classes, name='fc1000')(x)
+  x = tf.keras.layers.Activation('softmax', dtype='float32')(x)
+  return tf.keras.models.Model(img_input, x, name='trivial')
--- a/official/legacy/nlp/albert/README.md
+++ b/official/legacy/nlp/albert/README.md
+# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)
+**WARNING**: This directory is deprecated.
+See `nlp/docs/MODEL_GARDEN.md` for the new ALBERT implementation.
--- a/official/legacy/nlp/albert/__init__.py
+++ b/official/legacy/nlp/albert/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/official/legacy/nlp/albert/configs.py
+++ b/official/legacy/nlp/albert/configs.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The ALBERT configurations."""
+import six
+from official.nlp.bert import configs
+class AlbertConfig(configs.BertConfig):
+  """Configuration for `ALBERT`."""
+  def __init__(self, num_hidden_groups=1, inner_group_num=1, **kwargs):
+    """Constructs AlbertConfig.
+    Args:
+      num_hidden_groups: Number of group for the hidden layers, parameters in
+        the same group are shared. Note that this value and also the following
+        'inner_group_num' has to be 1 for now, because all released ALBERT
+        models set them to 1. We may support arbitary valid values in future.
+      inner_group_num: Number of inner repetition of attention and ffn.
+      **kwargs: The remaining arguments are the same as above 'BertConfig'.
+    """
+    super(AlbertConfig, self).__init__(**kwargs)
+    # TODO(chendouble): 'inner_group_num' and 'num_hidden_groups' are always 1
+    # in the released ALBERT. Support other values in AlbertEncoder if needed.
+    if inner_group_num != 1 or num_hidden_groups != 1:
+      raise ValueError("We only support 'inner_group_num' and "
+                       "'num_hidden_groups' as 1.")
+  @classmethod
+  def from_dict(cls, json_object):
+    """Constructs a `AlbertConfig` from a Python dictionary of parameters."""
+    config = AlbertConfig(vocab_size=None)
+    for (key, value) in six.iteritems(json_object):
+      config.__dict__[key] = value
+    return config
--- a/official/legacy/transformer/README.md
+++ b/official/legacy/transformer/README.md
+# Transformer Translation Model
+This is an implementation of the Transformer translation model as described in
+the [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper. The
+implementation leverages tf.keras and makes sure it is compatible with TF 2.x.
+**Warning: the features in the `transformer/` folder have been fully integrated
+into nlp/modeling.
+Due to its dependencies, we will remove this folder after the model
+garden 2.5 release. The model in `nlp/modeling/models/seq2seq_transformer.py` is
+identical to the model in this folder.**
+## Contents
+  * [Contents](#contents)
+  * [Walkthrough](#walkthrough)
+  * [Detailed instructions](#detailed-instructions)
+    * [Environment preparation](#environment-preparation)
+    * [Download and preprocess datasets](#download-and-preprocess-datasets)
+    * [Model training and evaluation](#model-training-and-evaluation)
+  * [Implementation overview](#implementation-overview)
+    * [Model Definition](#model-definition)
+    * [Model Trainer](#model-trainer)
+    * [Test dataset](#test-dataset)
+## Walkthrough
+Below are the commands for running the Transformer model. See the
+[Detailed instructions](#detailed-instructions) for more details on running the
+model.
+```
+# Ensure that PYTHONPATH is correctly defined as described in
+# https://github.com/tensorflow/models/tree/master/official#requirements
+export PYTHONPATH="$PYTHONPATH:/path/to/models"
+cd /path/to/models/official/legacy/transformer
+# Export variables
+PARAM_SET=big
+DATA_DIR=$HOME/transformer/data
+MODEL_DIR=$HOME/transformer/model_$PARAM_SET
+VOCAB_FILE=$DATA_DIR/vocab.ende.32768
+# Download training/evaluation/test datasets
+python3 data_download.py --data_dir=$DATA_DIR
+# Train the model for 100000 steps and evaluate every 5000 steps on a single GPU.
+# Each train step takes 4096 tokens as a batch budget, with 64 as the maximum
+# sequence length.
+python3 transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
+    --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET \
+    --train_steps=100000 --steps_between_evals=5000 \
+    --batch_size=4096 --max_length=64 \
+    --bleu_source=$DATA_DIR/newstest2014.en \
+    --bleu_ref=$DATA_DIR/newstest2014.de \
+    --num_gpus=1 \
+    --enable_time_history=false
+# Run during training in a separate process to get continuous updates,
+# or after training is complete.
+tensorboard --logdir=$MODEL_DIR
+```
+## Detailed instructions
+0. ### Environment preparation
+   #### Add models repo to PYTHONPATH
+   Follow the instructions described in the [Requirements](https://github.com/tensorflow/models/tree/master/official#requirements) section to add the models folder to the python path.
+   #### Export variables (optional)
+   Export the following variables, or modify the values in each of the snippets below:
+   ```shell
+   PARAM_SET=big
+   DATA_DIR=$HOME/transformer/data
+   MODEL_DIR=$HOME/transformer/model_$PARAM_SET
+   VOCAB_FILE=$DATA_DIR/vocab.ende.32768
+   ```
+1. ### Download and preprocess datasets
+   [data_download.py](data_download.py) downloads and preprocesses the training and evaluation WMT datasets. After the data is downloaded and extracted, the training data is used to generate a vocabulary of subtokens. The evaluation and training strings are tokenized, and the resulting data is sharded, shuffled, and saved as TFRecords.
+   1.75GB of compressed data will be downloaded. In total, the raw files (compressed, extracted, and combined files) take up 8.4GB of disk space. The resulting TFRecord and vocabulary files are 722MB. The script takes around 40 minutes to run, with the bulk of the time spent downloading and ~15 minutes spent on preprocessing.
+   Command to run:
+   ```
+   python3 data_download.py --data_dir=$DATA_DIR
+   ```
+   Arguments:
+   * `--data_dir`: Path where the preprocessed TFRecord data, and vocab file will be saved.
+   * Use the `--help` or `-h` flag to get a full list of possible arguments.
+2. ### Model training and evaluation
+   [transformer_main.py](transformer_main.py) creates a Transformer Keras model
+   and trains it using Keras `model.fit()`.
+   Users need to adjust `batch_size` and `num_gpus` to get good performance
+   when running on multiple GPUs.
+   **Note that:**
+   when using multiple GPUs or TPUs, `batch_size` is the global batch size for all
+   devices. For example, if the batch size is `4096*4` and there are 4 devices,
+   each device will take 4096 tokens as a batch budget.
+   Command to run:
+   ```
+   python3 transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
+       --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET
+   ```
+   Arguments:
+   * `--data_dir`: This should be set to the same directory given to the `data_download`'s `data_dir` argument.
+   * `--model_dir`: Directory to save Transformer model training checkpoints.
+   * `--vocab_file`: Path to subtoken vocabulary file. If data_download was used, you may find the file in `data_dir`.
+   * `--param_set`: Parameter set to use when creating and training the model. Options are `base` and `big` (default).
+   * `--enable_time_history`: Whether to add the TimeHistory callback. If so, `--log_steps` must be specified.
+   * `--batch_size`: The number of tokens to consider in a batch. Combining with
+     `--max_length`, they decide how many sequences are used per batch.
+   * Use the `--help` or `-h` flag to get a full list of possible arguments.
+    #### Using multiple GPUs
+    You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
+    You can read more about them in this
+    [guide](https://www.tensorflow.org/guide/distribute_strategy).
+    In this example, we have made it easier to use with just a command line flag
+    `--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA,
+    and 0 otherwise.
+    - --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device.
+    - --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device.
+    - --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous
+    distributed training across the GPUs.
+   #### Using Cloud TPUs
+   You can train the Transformer model on Cloud TPUs using
+   `tf.distribute.TPUStrategy`. If you are not familiar with Cloud TPUs, it is
+   strongly recommended that you go through the
+   [quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
+   create a TPU and GCE VM.
+   To run the Transformer model on a TPU, you must set
+   `--distribution_strategy=tpu`, `--tpu=$TPU_NAME`, and `--use_ctl=True` where
+   `$TPU_NAME` is the name of your TPU in the Cloud Console.
+   An example command to run Transformer on a v2-8 or v3-8 TPU would be:
+   ```bash
+   python transformer_main.py \
+     --tpu=$TPU_NAME \
+     --model_dir=$MODEL_DIR \
+     --data_dir=$DATA_DIR \
+     --vocab_file=$DATA_DIR/vocab.ende.32768 \
+     --bleu_source=$DATA_DIR/newstest2014.en \
+     --bleu_ref=$DATA_DIR/newstest2014.de \
+     --batch_size=6144 \
+     --train_steps=2000 \
+     --static_batch=true \
+     --use_ctl=true \
+     --param_set=big \
+     --max_length=64 \
+     --decode_batch_size=32 \
+     --decode_max_length=97 \
+     --padded_decode=true \
+     --distribution_strategy=tpu
+   ```
+   Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths.
+   #### Customizing training schedule
+   By default, the model will train for 10 epochs, and evaluate after every epoch. The training schedule may be defined through the flags:
+   * Training with steps:
+     * `--train_steps`: sets the total number of training steps to run.
+     * `--steps_between_evals`: Number of training steps to run between evaluations.
+   #### Compute BLEU score during model evaluation
+   Use these flags to compute the BLEU when the model evaluates:
+   * `--bleu_source`: Path to file containing text to translate.
+   * `--bleu_ref`: Path to file containing the reference translation.
+   When running `transformer_main.py`, use the flags: `--bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de`
+   #### Tensorboard
+   Training and evaluation metrics (loss, accuracy, approximate BLEU score, etc.) are logged, and can be displayed in the browser using Tensorboard.
+   ```
+   tensorboard --logdir=$MODEL_DIR
+   ```
+   The values are displayed at [localhost:6006](http://localhost:6006).
+## Implementation overview
+A brief look at each component in the code:
+### Model Definition
+* [transformer.py](transformer.py): Defines a tf.keras.Model: `Transformer`.
+* [embedding_layer.py](embedding_layer.py): Contains the layer that calculates the embeddings. The embedding weights are also used to calculate the pre-softmax probabilities from the decoder output.
+* [attention_layer.py](attention_layer.py): Defines the multi-headed and self attention layers that are used in the encoder/decoder stacks.
+* [ffn_layer.py](ffn_layer.py): Defines the feedforward network that is used in the encoder/decoder stacks. The network is composed of 2 fully connected layers.
+Other files:
+* [beam_search.py](beam_search.py) contains the beam search implementation, which is used during model inference to find high scoring translations.
+### Model Trainer
+[transformer_main.py](transformer_main.py) creates a `TransformerTask` to train and evaluate the model using tf.keras.
+### Test dataset
+The [newstest2014 files](https://storage.googleapis.com/tf-perf-public/official_transformer/test_data/newstest2014.tgz)
+are extracted from the [NMT Seq2Seq tutorial](https://google.github.io/seq2seq/nmt/#download-data).
+The raw text files are converted from the SGM format of the
+[WMT 2016](http://www.statmt.org/wmt16/translation-task.html) test sets. The
+newstest2014 files are put into the `$DATA_DIR` when executing `data_download.py`.
--- a/official/legacy/transformer/__init__.py
+++ b/official/legacy/transformer/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/official/legacy/transformer/attention_layer.py
+++ b/official/legacy/transformer/attention_layer.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implementation of multiheaded attention and self-attention layers."""
+import math
+import tensorflow as tf
+class Attention(tf.keras.layers.Layer):
+  """Multi-headed attention layer."""
+  def __init__(self, hidden_size, num_heads, attention_dropout):
+    """Initialize Attention.
+    Args:
+      hidden_size: int, output dim of hidden layer.
+      num_heads: int, number of heads to repeat the same attention structure.
+      attention_dropout: float, dropout rate inside attention for training.
+    """
+    if hidden_size % num_heads:
+      raise ValueError(
+          "Hidden size ({}) must be divisible by the number of heads ({})."
+          .format(hidden_size, num_heads))
+    super(Attention, self).__init__()
+    self.hidden_size = hidden_size
+    self.num_heads = num_heads
+    self.attention_dropout = attention_dropout
+  def build(self, input_shape):
+    """Builds the layer."""
+    # Layers for linearly projecting the queries, keys, and values.
+    size_per_head = self.hidden_size // self.num_heads
+    def _glorot_initializer(fan_in, fan_out):
+      limit = math.sqrt(6.0 / (fan_in + fan_out))
+      return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
+    attention_initializer = _glorot_initializer(input_shape.as_list()[-1],
+                                                self.hidden_size)
+    self.query_dense_layer = tf.keras.layers.experimental.EinsumDense(
+        "BTE,ENH->BTNH",
+        output_shape=(None, self.num_heads, size_per_head),
+        kernel_initializer=attention_initializer,
+        bias_axes=None,
+        name="query")
+    self.key_dense_layer = tf.keras.layers.experimental.EinsumDense(
+        "BTE,ENH->BTNH",
+        output_shape=(None, self.num_heads, size_per_head),
+        kernel_initializer=attention_initializer,
+        bias_axes=None,
+        name="key")
+    self.value_dense_layer = tf.keras.layers.experimental.EinsumDense(
+        "BTE,ENH->BTNH",
+        output_shape=(None, self.num_heads, size_per_head),
+        kernel_initializer=attention_initializer,
+        bias_axes=None,
+        name="value")
+    output_initializer = _glorot_initializer(self.hidden_size, self.hidden_size)
+    self.output_dense_layer = tf.keras.layers.experimental.EinsumDense(
+        "BTNH,NHE->BTE",
+        output_shape=(None, self.hidden_size),
+        kernel_initializer=output_initializer,
+        bias_axes=None,
+        name="output_transform")
+    super(Attention, self).build(input_shape)
+  def get_config(self):
+    return {
+        "hidden_size": self.hidden_size,
+        "num_heads": self.num_heads,
+        "attention_dropout": self.attention_dropout,
+    }
+  def call(self,
+           query_input,
+           source_input,
+           bias,
+           training,
+           cache=None,
+           decode_loop_step=None):
+    """Apply attention mechanism to query_input and source_input.
+    Args:
+      query_input: A tensor with shape [batch_size, length_query, hidden_size].
+      source_input: A tensor with shape [batch_size, length_source,
+        hidden_size].
+      bias: A tensor with shape [batch_size, 1, length_query, length_source],
+        the attention bias that will be added to the result of the dot product.
+      training: A bool, whether in training mode or not.
+      cache: (Used during prediction) A dictionary with tensors containing
+        results of previous attentions. The dictionary must have the items:
+            {"k": tensor with shape [batch_size, i, heads, dim_per_head],
+             "v": tensor with shape [batch_size, i, heads, dim_per_head]} where
+               i is the current decoded length for non-padded decode, or max
+               sequence length for padded decode.
+      decode_loop_step: An integer, step number of the decoding loop. Used only
+        for autoregressive inference on TPU.
+    Returns:
+      Attention layer output with shape [batch_size, length_query, hidden_size]
+    """
+    # Linearly project the query, key and value using different learned
+    # projections. Splitting heads is automatically done during the linear
+    # projections --> [batch_size, length, num_heads, dim_per_head].
+    query = self.query_dense_layer(query_input)
+    key = self.key_dense_layer(source_input)
+    value = self.value_dense_layer(source_input)
+    if cache is not None:
+      # Combine cached keys and values with new keys and values.
+      if decode_loop_step is not None:
+        cache_k_shape = cache["k"].shape.as_list()
+        indices = tf.reshape(
+            tf.one_hot(decode_loop_step, cache_k_shape[1], dtype=key.dtype),
+            [1, cache_k_shape[1], 1, 1])
+        key = cache["k"] + key * indices
+        cache_v_shape = cache["v"].shape.as_list()
+        indices = tf.reshape(
+            tf.one_hot(decode_loop_step, cache_v_shape[1], dtype=value.dtype),
+            [1, cache_v_shape[1], 1, 1])
+        value = cache["v"] + value * indices
+      else:
+        key = tf.concat([tf.cast(cache["k"], key.dtype), key], axis=1)
+        value = tf.concat([tf.cast(cache["v"], value.dtype), value], axis=1)
+      # Update cache
+      cache["k"] = key
+      cache["v"] = value
+    # Scale query to prevent the dot product between query and key from growing
+    # too large.
+    depth = (self.hidden_size // self.num_heads)
+    query *= depth**-0.5
+    # Calculate dot product attention
+    logits = tf.einsum("BTNH,BFNH->BNFT", key, query)
+    logits += bias
+    # Note that softmax internally performs math operations using float32
+    # for numeric stability. When training with float16, we keep the input
+    # and output in float16 for better performance.
+    weights = tf.nn.softmax(logits, name="attention_weights")
+    if training:
+      weights = tf.nn.dropout(weights, rate=self.attention_dropout)
+    attention_output = tf.einsum("BNFT,BTNH->BFNH", weights, value)
+    # Run the outputs through another linear projection layer. Recombining heads
+    # is automatically done --> [batch_size, length, hidden_size]
+    attention_output = self.output_dense_layer(attention_output)
+    return attention_output
+class SelfAttention(Attention):
+  """Multiheaded self-attention layer."""
+  def call(self,
+           query_input,
+           bias,
+           training,
+           cache=None,
+           decode_loop_step=None):
+    return super(SelfAttention, self).call(query_input, query_input, bias,
+                                           training, cache, decode_loop_step)