Merge pull request #10338 from srihari-humbarwadi:readme

PiperOrigin-RevId: 413033276

Merge pull request #10338 from srihari-humbarwadi:readme
PiperOrigin-RevId: 413033276
c57e975a · saberkun · 7fb4f3cd · acf4156e · c57e975a · c57e975a
Commit c57e975a authored Nov 29, 2021 by saberkun
20 changed files
--- a/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml
+++ b/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml
+# Training configuration for EfficientNet-b0 trained on ImageNet on GPUs.
+# Takes ~32 minutes per epoch for 8 V100s.
+# Reaches ~76.1% within 350 epochs.
+# Note: This configuration uses a scaled per-replica batch size based on the number of devices.
+runtime:
+  distribution_strategy: 'mirrored'
+  num_gpus: 1
+train_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'train'
+  num_classes: 1000
+  num_examples: 1281167
+  batch_size: 32
+  use_per_replica_batch_size: true
+  dtype: 'float32'
+  augmenter:
+    name: 'autoaugment'
+validation_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'validation'
+  num_classes: 1000
+  num_examples: 50000
+  batch_size: 32
+  use_per_replica_batch_size: true
+  dtype: 'float32'
+model:
+  model_params:
+    model_name: 'efficientnet-b0'
+    overrides:
+      num_classes: 1000
+      batch_norm: 'default'
+      dtype: 'float32'
+      activation: 'swish'
+  optimizer:
+    name: 'rmsprop'
+    momentum: 0.9
+    decay: 0.9
+    moving_average_decay: 0.0
+    lookahead: false
+  learning_rate:
+    name: 'exponential'
+  loss:
+    label_smoothing: 0.1
+train:
+  resume_checkpoint: true
+  epochs: 500
+evaluation:
+  epochs_between_evals: 1
--- a/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
+++ b/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
+# Training configuration for EfficientNet-b0 trained on ImageNet on TPUs.
+# Takes ~2 minutes, 50 seconds per epoch for v3-32.
+# Reaches ~76.1% within 350 epochs.
+# Note: This configuration uses a scaled per-replica batch size based on the number of devices.
+runtime:
+  distribution_strategy: 'tpu'
+train_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'train'
+  num_classes: 1000
+  num_examples: 1281167
+  batch_size: 128
+  use_per_replica_batch_size: true
+  dtype: 'bfloat16'
+  augmenter:
+    name: 'autoaugment'
+validation_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'validation'
+  num_classes: 1000
+  num_examples: 50000
+  batch_size: 128
+  use_per_replica_batch_size: true
+  dtype: 'bfloat16'
+model:
+  model_params:
+    model_name: 'efficientnet-b0'
+    overrides:
+      num_classes: 1000
+      batch_norm: 'tpu'
+      dtype: 'bfloat16'
+      activation: 'swish'
+  optimizer:
+    name: 'rmsprop'
+    momentum: 0.9
+    decay: 0.9
+    moving_average_decay: 0.0
+    lookahead: false
+  learning_rate:
+    name: 'exponential'
+  loss:
+    label_smoothing: 0.1
+train:
+  resume_checkpoint: true
+  epochs: 500
+  set_epoch_loop: true
+evaluation:
+  epochs_between_evals: 1
--- a/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b1-gpu.yaml
+++ b/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b1-gpu.yaml
+# Training configuration for EfficientNet-b1 trained on ImageNet on GPUs.
+# Note: This configuration uses a scaled per-replica batch size based on the number of devices.
+runtime:
+  distribution_strategy: 'mirrored'
+  num_gpus: 1
+train_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'train'
+  num_classes: 1000
+  num_examples: 1281167
+  batch_size: 32
+  use_per_replica_batch_size: true
+  dtype: 'float32'
+validation_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'validation'
+  num_classes: 1000
+  num_examples: 50000
+  batch_size: 32
+  use_per_replica_batch_size: true
+  dtype: 'float32'
+model:
+  model_params:
+    model_name: 'efficientnet-b1'
+    overrides:
+      num_classes: 1000
+      batch_norm: 'default'
+      dtype: 'float32'
+      activation: 'swish'
+  optimizer:
+    name: 'rmsprop'
+    momentum: 0.9
+    decay: 0.9
+    moving_average_decay: 0.0
+    lookahead: false
+  learning_rate:
+    name: 'exponential'
+  loss:
+    label_smoothing: 0.1
+train:
+  resume_checkpoint: true
+  epochs: 500
+evaluation:
+  epochs_between_evals: 1
--- a/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b1-tpu.yaml
+++ b/official/legacy/image_classification/configs/examples/efficientnet/imagenet/efficientnet-b1-tpu.yaml
+# Training configuration for EfficientNet-b1 trained on ImageNet on TPUs.
+# Takes ~3 minutes, 15 seconds per epoch for v3-32.
+# Note: This configuration uses a scaled per-replica batch size based on the number of devices.
+runtime:
+  distribution_strategy: 'tpu'
+train_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'train'
+  num_classes: 1000
+  num_examples: 1281167
+  batch_size: 128
+  use_per_replica_batch_size: true
+  dtype: 'bfloat16'
+  augmenter:
+    name: 'autoaugment'
+validation_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'records'
+  split: 'validation'
+  num_classes: 1000
+  num_examples: 50000
+  batch_size: 128
+  use_per_replica_batch_size: true
+  dtype: 'bfloat16'
+model:
+  model_params:
+    model_name: 'efficientnet-b1'
+    overrides:
+      num_classes: 1000
+      batch_norm: 'tpu'
+      dtype: 'bfloat16'
+      activation: 'swish'
+  optimizer:
+    name: 'rmsprop'
+    momentum: 0.9
+    decay: 0.9
+    moving_average_decay: 0.0
+    lookahead: false
+  learning_rate:
+    name: 'exponential'
+  loss:
+    label_smoothing: 0.1
+train:
+  resume_checkpoint: true
+  epochs: 500
+  set_epoch_loop: true
+evaluation:
+  epochs_between_evals: 1
--- a/official/legacy/image_classification/configs/examples/resnet/imagenet/gpu.yaml
+++ b/official/legacy/image_classification/configs/examples/resnet/imagenet/gpu.yaml
+# Training configuration for ResNet trained on ImageNet on GPUs.
+# Reaches > 76.1% within 90 epochs.
+# Note: This configuration uses a scaled per-replica batch size based on the number of devices.
+runtime:
+  distribution_strategy: 'mirrored'
+  num_gpus: 1
+  batchnorm_spatial_persistent: true
+train_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'tfds'
+  split: 'train'
+  image_size: 224
+  num_classes: 1000
+  num_examples: 1281167
+  batch_size: 256
+  use_per_replica_batch_size: true
+  dtype: 'float16'
+  mean_subtract: true
+  standardize: true
+validation_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'tfds'
+  split: 'validation'
+  image_size: 224
+  num_classes: 1000
+  num_examples: 50000
+  batch_size: 256
+  use_per_replica_batch_size: true
+  dtype: 'float16'
+  mean_subtract: true
+  standardize: true
+model:
+  name: 'resnet'
+  model_params:
+    rescale_inputs: false
+  optimizer:
+    name: 'momentum'
+    momentum: 0.9
+    decay: 0.9
+    epsilon: 0.001
+  loss:
+    label_smoothing: 0.1
+train:
+  resume_checkpoint: true
+  epochs: 90
+evaluation:
+  epochs_between_evals: 1
--- a/official/legacy/image_classification/configs/examples/resnet/imagenet/tpu.yaml
+++ b/official/legacy/image_classification/configs/examples/resnet/imagenet/tpu.yaml
+# Training configuration for ResNet trained on ImageNet on TPUs.
+# Takes ~4 minutes, 30 seconds per epoch for a v3-32.
+# Reaches > 76.1% within 90 epochs.
+# Note: This configuration uses a scaled per-replica batch size based on the number of devices.
+runtime:
+  distribution_strategy: 'tpu'
+train_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'tfds'
+  split: 'train'
+  one_hot: false
+  image_size: 224
+  num_classes: 1000
+  num_examples: 1281167
+  batch_size: 128
+  use_per_replica_batch_size: true
+  mean_subtract: false
+  standardize: false
+  dtype: 'bfloat16'
+validation_dataset:
+  name: 'imagenet2012'
+  data_dir: null
+  builder: 'tfds'
+  split: 'validation'
+  one_hot: false
+  image_size: 224
+  num_classes: 1000
+  num_examples: 50000
+  batch_size: 128
+  use_per_replica_batch_size: true
+  mean_subtract: false
+  standardize: false
+  dtype: 'bfloat16'
+model:
+  name: 'resnet'
+  model_params:
+    rescale_inputs: true
+  optimizer:
+    name: 'momentum'
+    momentum: 0.9
+    decay: 0.9
+    epsilon: 0.001
+    moving_average_decay: 0.
+    lookahead: false
+  loss:
+    label_smoothing: 0.1
+train:
+  callbacks:
+    enable_checkpoint_and_export: true
+  resume_checkpoint: true
+  epochs: 90
+  set_epoch_loop: true
+evaluation:
+  epochs_between_evals: 1
--- a/official/legacy/image_classification/dataset_factory.py
+++ b/official/legacy/image_classification/dataset_factory.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Dataset utilities for vision tasks using TFDS and tf.data.Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+import dataclasses
+import os
+from typing import Any, List, Mapping, Optional, Tuple, Union
+
+from absl import logging
+import tensorflow as tf
+import tensorflow_datasets as tfds
+from official.legacy.image_classification import augment
+from official.legacy.image_classification import preprocessing
+from official.modeling.hyperparams import base_config
+
+AUGMENTERS = {
+    'autoaugment': augment.AutoAugment,
+    'randaugment': augment.RandAugment,
+}
+
+
+@dataclasses.dataclass
+class AugmentConfig(base_config.Config):
+  """Configuration for image augmenters.
+
+  Attributes:
+    name: The name of the image augmentation to use. Possible options are None
+      (default), 'autoaugment', or 'randaugment'.
+    params: Any parameters used to initialize the augmenter.
+  """
+  name: Optional[str] = None
+  params: Optional[Mapping[str, Any]] = None
+
+  def build(self) -> augment.ImageAugment:
+    """Build the augmenter using this config."""
+    params = self.params or {}
+    augmenter = AUGMENTERS.get(self.name, None)
+    return augmenter(**params) if augmenter is not None else None
+
+
+@dataclasses.dataclass
+class DatasetConfig(base_config.Config):
+  """The base configuration for building datasets.
+
+  Attributes:
+    name: The name of the Dataset. Usually should correspond to a TFDS dataset.
+    data_dir: The path where the dataset files are stored, if available.
+    filenames: Optional list of strings representing the TFRecord names.
+    builder: The builder type used to load the dataset. Value should be one of
+      'tfds' (load using TFDS), 'records' (load from TFRecords), or 'synthetic'
+      (generate dummy synthetic data without reading from files).
+    split: The split of the dataset. Usually 'train', 'validation', or 'test'.
+    image_size: The size of the image in the dataset. This assumes that `width`
+      == `height`. Set to 'infer' to infer the image size from TFDS info. This
+      requires `name` to be a registered dataset in TFDS.
+    num_classes: The number of classes given by the dataset. Set to 'infer' to
+      infer the image size from TFDS info. This requires `name` to be a
+      registered dataset in TFDS.
+    num_channels: The number of channels given by the dataset. Set to 'infer' to
+      infer the image size from TFDS info. This requires `name` to be a
+      registered dataset in TFDS.
+    num_examples: The number of examples given by the dataset. Set to 'infer' to
+      infer the image size from TFDS info. This requires `name` to be a
+      registered dataset in TFDS.
+    batch_size: The base batch size for the dataset.
+    use_per_replica_batch_size: Whether to scale the batch size based on
+      available resources. If set to `True`, the dataset builder will return
+      batch_size multiplied by `num_devices`, the number of device replicas
+      (e.g., the number of GPUs or TPU cores). This setting should be `True` if
+      the strategy argument is passed to `build()` and `num_devices > 1`.
+    num_devices: The number of replica devices to use. This should be set by
+      `strategy.num_replicas_in_sync` when using a distribution strategy.
+    dtype: The desired dtype of the dataset. This will be set during
+      preprocessing.
+    one_hot: Whether to apply one hot encoding. Set to `True` to be able to use
+      label smoothing.
+    augmenter: The augmenter config to use. No augmentation is used by default.
+    download: Whether to download data using TFDS.
+    shuffle_buffer_size: The buffer size used for shuffling training data.
+    file_shuffle_buffer_size: The buffer size used for shuffling raw training
+      files.
+    skip_decoding: Whether to skip image decoding when loading from TFDS.
+    cache: whether to cache to dataset examples. Can be used to avoid re-reading
+      from disk on the second epoch. Requires significant memory overhead.
+    tf_data_service: The URI of a tf.data service to offload preprocessing onto
+      during training. The URI should be in the format "protocol://address",
+      e.g. "grpc://tf-data-service:5050".
+    mean_subtract: whether or not to apply mean subtraction to the dataset.
+    standardize: whether or not to apply standardization to the dataset.
+  """
+  name: Optional[str] = None
+  data_dir: Optional[str] = None
+  filenames: Optional[List[str]] = None
+  builder: str = 'tfds'
+  split: str = 'train'
+  image_size: Union[int, str] = 'infer'
+  num_classes: Union[int, str] = 'infer'
+  num_channels: Union[int, str] = 'infer'
+  num_examples: Union[int, str] = 'infer'
+  batch_size: int = 128
+  use_per_replica_batch_size: bool = True
+  num_devices: int = 1
+  dtype: str = 'float32'
+  one_hot: bool = True
+  augmenter: AugmentConfig = AugmentConfig()
+  download: bool = False
+  shuffle_buffer_size: int = 10000
+  file_shuffle_buffer_size: int = 1024
+  skip_decoding: bool = True
+  cache: bool = False
+  tf_data_service: Optional[str] = None
+  mean_subtract: bool = False
+  standardize: bool = False
+
+  @property
+  def has_data(self):
+    """Whether this dataset is has any data associated with it."""
+    return self.name or self.data_dir or self.filenames
+
+
+@dataclasses.dataclass
+class ImageNetConfig(DatasetConfig):
+  """The base ImageNet dataset config.
+
+  Overrides the `DatasetConfig` defaults with fixed values so that image size,
+  channel count, example count and class count are not left as 'infer'.
+  """
+  name: str = 'imagenet2012'
+  # Note: for large datasets like ImageNet, using records is faster than tfds
+  builder: str = 'records'
+  image_size: int = 224
+  num_channels: int = 3
+  # NOTE(review): looks like the train-split count; presumably configs for
+  # other splits must override it -- confirm.
+  num_examples: int = 1281167
+  num_classes: int = 1000
+  batch_size: int = 128
+
+
+@dataclasses.dataclass
+class Cifar10Config(DatasetConfig):
+  """The base CIFAR-10 dataset config.
+
+  Keeps the inherited 'tfds' builder and enables both downloading and caching.
+  """
+  name: str = 'cifar10'
+  # NOTE(review): presumably images are resized to this size during
+  # preprocessing -- confirm against the preprocessing module.
+  image_size: int = 224
+  batch_size: int = 128
+  download: bool = True
+  cache: bool = True
+
+
+class DatasetBuilder:
+  """An object for building datasets.
+
+  Allows building various pipelines fetching examples, preprocessing, etc.
+  Maintains additional state information calculated from the dataset, i.e.,
+  training set split, batch size, and number of steps (batches).
+  """
+
+  def __init__(self, config: DatasetConfig, **overrides: Any):
+    """Initialize the builder from the config."""
+    self.config = config.replace(**overrides)
+    self.builder_info = None
+
+    if self.config.augmenter is not None:
+      logging.info('Using augmentation: %s', self.config.augmenter.name)
+      self.augmenter = self.config.augmenter.build()
+    else:
+      self.augmenter = None
+
+  @property
+  def is_training(self) -> bool:
+    """Whether this is the training set."""
+    return self.config.split == 'train'
+
+  @property
+  def batch_size(self) -> int:
+    """The batch size, multiplied by the number of replicas (if configured)."""
+    if self.config.use_per_replica_batch_size:
+      return self.config.batch_size * self.config.num_devices
+    else:
+      return self.config.batch_size
+
+  @property
+  def global_batch_size(self):
+    """The global batch size across all replicas (same value as `batch_size`)."""
+    return self.batch_size
+
+  @property
+  def local_batch_size(self):
+    """The base unscaled batch size."""
+    if self.config.use_per_replica_batch_size:
+      return self.config.batch_size
+    else:
+      return self.config.batch_size // self.config.num_devices
+
+  @property
+  def num_steps(self) -> int:
+    """The number of steps (batches) to exhaust this dataset."""
+    # Always divide by the global batch size to get the correct # of steps
+    return self.num_examples // self.global_batch_size
+
+  @property
+  def dtype(self) -> tf.dtypes.DType:
+    """Converts the config's dtype string to a tf dtype.
+
+    Returns:
+      A mapping from string representation of a dtype to the `tf.dtypes.DType`.
+
+    Raises:
+      ValueError if the config's dtype is not supported.
+
+    """
+    dtype_map = {
+        'float32': tf.float32,
+        'bfloat16': tf.bfloat16,
+        'float16': tf.float16,
+        'fp32': tf.float32,
+        'bf16': tf.bfloat16,
+    }
+    try:
+      return dtype_map[self.config.dtype]
+    except:
+      raise ValueError('Invalid DType provided. Supported types: {}'.format(
+          dtype_map.keys()))
+
+  @property
+  def image_size(self) -> int:
+    """The size of each image (can be inferred from the dataset)."""
+
+    if self.config.image_size == 'infer':
+      return self.info.features['image'].shape[0]
+    else:
+      return int(self.config.image_size)
+
+  @property
+  def num_channels(self) -> int:
+    """The number of image channels (can be inferred from the dataset)."""
+    if self.config.num_channels == 'infer':
+      return self.info.features['image'].shape[-1]
+    else:
+      return int(self.config.num_channels)
+
+  @property
+  def num_examples(self) -> int:
+    """The number of examples (can be inferred from the dataset)."""
+    if self.config.num_examples == 'infer':
+      return self.info.splits[self.config.split].num_examples
+    else:
+      return int(self.config.num_examples)
+
+  @property
+  def num_classes(self) -> int:
+    """The number of classes (can be inferred from the dataset)."""
+    if self.config.num_classes == 'infer':
+      return self.info.features['label'].num_classes
+    else:
+      return int(self.config.num_classes)
+
+  @property
+  def info(self) -> tfds.core.DatasetInfo:
+    """The TFDS dataset info, if available."""
+    try:
+      if self.builder_info is None:
+        self.builder_info = tfds.builder(self.config.name).info
+    except ConnectionError as e:
+      logging.error('Failed to use TFDS to load info. Please set dataset info '
+                    '(image_size, num_channels, num_examples, num_classes) in '
+                    'the dataset config.')
+      raise e
+    return self.builder_info
+
+  def build(
+      self,
+      strategy: Optional[tf.distribute.Strategy] = None) -> tf.data.Dataset:
+    """Construct a dataset end-to-end and return it using an optional strategy.
+
+    Args:
+      strategy: a strategy that, if passed, will distribute the dataset
+        according to that strategy. If passed and `num_devices > 1`,
+        `use_per_replica_batch_size` must be set to `True`.
+
+    Returns:
+      A TensorFlow dataset outputting batched images and labels.
+    """
+    if strategy:
+      if strategy.num_replicas_in_sync != self.config.num_devices:
+        logging.warn(
+            'Passed a strategy with %d devices, but expected'
+            '%d devices.', strategy.num_replicas_in_sync,
+            self.config.num_devices)
+      dataset = strategy.distribute_datasets_from_function(self._build)
+    else:
+      dataset = self._build()
+
+    return dataset
+
+  def _build(
+      self,
+      input_context: Optional[tf.distribute.InputContext] = None
+  ) -> tf.data.Dataset:
+    """Construct a dataset end-to-end and return it.
+
+    Args:
+      input_context: An optional context provided by `tf.distribute` for
+        cross-replica training.
+
+    Returns:
+      A TensorFlow dataset outputting batched images and labels.
+    """
+    builders = {
+        'tfds': self.load_tfds,
+        'records': self.load_records,
+        'synthetic': self.load_synthetic,
+    }
+
+    builder = builders.get(self.config.builder, None)
+
+    if builder is None:
+      raise ValueError('Unknown builder type {}'.format(self.config.builder))
+
+    self.input_context = input_context
+    dataset = builder()
+    dataset = self.pipeline(dataset)
+
+    return dataset
+
+  def load_tfds(self) -> tf.data.Dataset:
+    """Return a dataset loading files from TFDS."""
+
+    logging.info('Using TFDS to load data.')
+    builder = tfds.builder(self.config.name, data_dir=self.config.data_dir)
+
+    if self.config.download:
+      builder.download_and_prepare()
+
+    decoders = {}
+
+    if self.config.skip_decoding:
+      decoders['image'] = tfds.decode.SkipDecoding()
+
+    read_config = tfds.ReadConfig(
+        interleave_cycle_length=10,
+        interleave_block_length=1,
+        input_context=self.input_context)
+
+    dataset = builder.as_dataset(
+        split=self.config.split,
+        as_supervised=True,
+        shuffle_files=True,
+        decoders=decoders,
+        read_config=read_config)
+
+    return dataset
+
+  def load_records(self) -> tf.data.Dataset:
+    """Return a dataset loading files with TFRecords."""
+    logging.info('Using TFRecords to load data.')
+    if self.config.filenames is None:
+      if self.config.data_dir is None:
+        raise ValueError('Dataset must specify a path for the data files.')
+
+      file_pattern = os.path.join(self.config.data_dir,
+                                  '{}*'.format(self.config.split))
+      dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
+    else:
+      dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames)
+
+    return dataset
+
+  def load_synthetic(self) -> tf.data.Dataset:
+    """Return a dataset generating dummy synthetic data."""
+    logging.info('Generating a synthetic dataset.')
+
+    def generate_data(_):
+      image = tf.zeros([self.image_size, self.image_size, self.num_channels],
+                       dtype=self.dtype)
+      label = tf.zeros([1], dtype=tf.int32)
+      return image, label
+
+    dataset = tf.data.Dataset.range(1)
+    dataset = dataset.repeat()
+    dataset = dataset.map(
+        generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    return dataset
+
+  def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
+    """Build a pipeline fetching, shuffling, and preprocessing the dataset.
+
+    Args:
+      dataset: A `tf.data.Dataset` that loads raw files.
+
+    Returns:
+      A TensorFlow dataset outputting batched images and labels.
+    """
+    if (self.config.builder != 'tfds' and self.input_context and
+        self.input_context.num_input_pipelines > 1):
+      dataset = dataset.shard(self.input_context.num_input_pipelines,
+                              self.input_context.input_pipeline_id)
+      logging.info(
+          'Sharding the dataset: input_pipeline_id=%d '
+          'num_input_pipelines=%d', self.input_context.num_input_pipelines,
+          self.input_context.input_pipeline_id)
+
+    if self.is_training and self.config.builder == 'records':
+      # Shuffle the input files.
+      dataset.shuffle(buffer_size=self.config.file_shuffle_buffer_size)
+
+    if self.is_training and not self.config.cache:
+      dataset = dataset.repeat()
+
+    if self.config.builder == 'records':
+      # Read the data from disk in parallel
+      dataset = dataset.interleave(
+          tf.data.TFRecordDataset,
+          cycle_length=10,
+          block_length=1,
+          num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+    if self.config.cache:
+      dataset = dataset.cache()
+
+    if self.is_training:
+      dataset = dataset.shuffle(self.config.shuffle_buffer_size)
+      dataset = dataset.repeat()
+
+    # Parse, pre-process, and batch the data in parallel
+    if self.config.builder == 'records':
+      preprocess = self.parse_record
+    else:
+      preprocess = self.preprocess
+    dataset = dataset.map(
+        preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+    if self.input_context and self.config.num_devices > 1:
+      if not self.config.use_per_replica_batch_size:
+        raise ValueError(
+            'The builder does not support a global batch size with more than '
+            'one replica. Got {} replicas. Please set a '
+            '`per_replica_batch_size` and enable '
+            '`use_per_replica_batch_size=True`.'.format(
+                self.config.num_devices))
+
+      # The batch size of the dataset will be multiplied by the number of
+      # replicas automatically when strategy.distribute_datasets_from_function
+      # is called, so we use local batch size here.
+      dataset = dataset.batch(
+          self.local_batch_size, drop_remainder=self.is_training)
+    else:
+      dataset = dataset.batch(
+          self.global_batch_size, drop_remainder=self.is_training)
+
+    # Prefetch overlaps in-feed with training
+    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+    if self.config.tf_data_service:
+      if not hasattr(tf.data.experimental, 'service'):
+        raise ValueError('The tf_data_service flag requires Tensorflow version '
+                         '>= 2.3.0, but the version is {}'.format(
+                             tf.__version__))
+      dataset = dataset.apply(
+          tf.data.experimental.service.distribute(
+              processing_mode='parallel_epochs',
+              service=self.config.tf_data_service,
+              job_name='resnet_train'))
+      dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+    return dataset
+
+  def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+    """Parse an ImageNet record from a serialized string Tensor."""
+    keys_to_features = {
+        'image/encoded': tf.io.FixedLenFeature((), tf.string, ''),
+        'image/format': tf.io.FixedLenFeature((), tf.string, 'jpeg'),
+        'image/class/label': tf.io.FixedLenFeature([], tf.int64, -1),
+        'image/class/text': tf.io.FixedLenFeature([], tf.string, ''),
+        'image/object/bbox/xmin': tf.io.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.io.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.io.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.io.VarLenFeature(dtype=tf.float32),
+        'image/object/class/label': tf.io.VarLenFeature(dtype=tf.int64),
+    }
+
+    parsed = tf.io.parse_single_example(record, keys_to_features)
+
+    label = tf.reshape(parsed['image/class/label'], shape=[1])
+
+    # Subtract one so that labels are in [0, 1000)
+    label -= 1
+
+    image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
+    image, label = self.preprocess(image_bytes, label)
+
+    return image, label
+
+  def preprocess(self, image: tf.Tensor,
+                 label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+    """Apply image preprocessing and augmentation to the image and label."""
+    if self.is_training:
+      image = preprocessing.preprocess_for_train(
+          image,
+          image_size=self.image_size,
+          mean_subtract=self.config.mean_subtract,
+          standardize=self.config.standardize,
+          dtype=self.dtype,
+          augmenter=self.augmenter)
+    else:
+      image = preprocessing.preprocess_for_eval(
+          image,
+          image_size=self.image_size,
+          num_channels=self.num_channels,
+          mean_subtract=self.config.mean_subtract,
+          standardize=self.config.standardize,
+          dtype=self.dtype)
+
+    label = tf.cast(label, tf.int32)
+    if self.config.one_hot:
+      label = tf.one_hot(label, self.num_classes)
+      label = tf.reshape(label, [self.num_classes])
+
+    return image, label
+
+  @classmethod
+  def from_params(cls, *args, **kwargs):
+    """Construct a dataset builder from a default config and any overrides."""
+    config = DatasetConfig.from_args(*args, **kwargs)
+    return cls(config)
--- a/official/vision/detection/utils/object_detection/__init__.py
+++ b/official/vision/detection/utils/object_detection/__init__.py
--- a/official/legacy/image_classification/efficientnet/common_modules.py
+++ b/official/legacy/image_classification/efficientnet/common_modules.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Common modeling utilities."""
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+from typing import Optional, Text
+import numpy as np
+import tensorflow as tf
+import tensorflow.compat.v1 as tf1
+from tensorflow.python.tpu import tpu_function
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class TpuBatchNormalization(tf.keras.layers.BatchNormalization):
+  """Cross replica batch normalization.
+
+  Overrides `_moments` so that, on more than 8 shards, the per-shard mean and
+  variance are averaged across groups of TPU replicas.
+  """
+
+  def __init__(self, fused: Optional[bool] = False, **kwargs):
+    # Both an explicit True and None are rejected; only fused=False is accepted.
+    if fused in (True, None):
+      raise ValueError('TpuBatchNormalization does not support fused=True.')
+    super(TpuBatchNormalization, self).__init__(fused=fused, **kwargs)
+
+  def _cross_replica_average(self, t: tf.Tensor, num_shards_per_group: int):
+    """Calculates the average value of input tensor across TPU replicas.
+
+    Args:
+      t: The tensor to average.
+      num_shards_per_group: Number of shards in each averaging group. Must
+        evenly divide the total number of shards when greater than 1.
+
+    Returns:
+      The cross-replica sum of `t` divided by `num_shards_per_group`.
+
+    Raises:
+      ValueError: If the shard count is not a multiple of
+        `num_shards_per_group`.
+    """
+    num_shards = tpu_function.get_tpu_context().number_of_shards
+    group_assignment = None
+    if num_shards_per_group > 1:
+      if num_shards % num_shards_per_group != 0:
+        raise ValueError(
+            'num_shards: %d mod shards_per_group: %d, should be 0' %
+            (num_shards, num_shards_per_group))
+      num_groups = num_shards // num_shards_per_group
+      # Group y holds the consecutive shard ids whose integer quotient by
+      # num_shards_per_group equals y.
+      group_assignment = [[
+          x for x in range(num_shards) if x // num_shards_per_group == y
+      ] for y in range(num_groups)]
+    return tf1.tpu.cross_replica_sum(t, group_assignment) / tf.cast(
+        num_shards_per_group, t.dtype)
+
+  def _moments(self, inputs: tf.Tensor, reduction_axes: int, keep_dims: int):
+    """Compute the mean and variance: it overrides the original _moments.
+
+    Returns:
+      A `(mean, variance)` tuple: the per-shard values when there are at most
+      8 shards, otherwise values averaged across each replica group.
+    """
+    # NOTE(review): `reduction_axes` and `keep_dims` are annotated as int but
+    # are passed straight to the parent `_moments` -- confirm their real types.
+    shard_mean, shard_variance = super(TpuBatchNormalization, self)._moments(
+        inputs, reduction_axes, keep_dims=keep_dims)
+
+    # Fall back to a single shard when the context reports none.
+    num_shards = tpu_function.get_tpu_context().number_of_shards or 1
+    if num_shards <= 8:  # Skip cross_replica for 2x2 or smaller slices.
+      num_shards_per_group = 1
+    else:
+      num_shards_per_group = max(8, num_shards // 8)
+    if num_shards_per_group > 1:
+      # Compute variance using: Var[X]= E[X^2] - E[X]^2.
+      shard_square_of_mean = tf.math.square(shard_mean)
+      shard_mean_of_square = shard_variance + shard_square_of_mean
+      group_mean = self._cross_replica_average(shard_mean, num_shards_per_group)
+      group_mean_of_square = self._cross_replica_average(
+          shard_mean_of_square, num_shards_per_group)
+      group_variance = group_mean_of_square - tf.math.square(group_mean)
+      return (group_mean, group_variance)
+    else:
+      return (shard_mean, shard_variance)
+
+
+def get_batch_norm(batch_norm_type: Text) -> tf.keras.layers.BatchNormalization:
+  """A helper to create a batch normalization getter.
+
+  Args:
+    batch_norm_type: The type of batch normalization layer implementation. `tpu`
+      will use `TpuBatchNormalization`.
+
+  Returns:
+    An instance of `tf.keras.layers.BatchNormalization`.
+  """
+  if batch_norm_type == 'tpu':
+    return TpuBatchNormalization
+
+  return tf.keras.layers.BatchNormalization  # pytype: disable=bad-return-type  # typed-keras
+
+
+def count_params(model, trainable_only=True):
+  """Returns the count of all model parameters, or just trainable ones."""
+  if not trainable_only:
+    return model.count_params()
+  else:
+    return int(
+        np.sum([
+            tf.keras.backend.count_params(p) for p in model.trainable_weights
+        ]))
+
+
+def load_weights(model: tf.keras.Model,
+                 model_weights_path: Text,
+                 weights_format: Text = 'saved_model'):
+  """Load model weights from the given file path.
+
+  Args:
+    model: the model to load weights into
+    model_weights_path: the path of the model weights
+    weights_format: the model weights format. One of 'saved_model', 'h5', or
+      'checkpoint'.
+  """
+  if weights_format == 'saved_model':
+    loaded_model = tf.keras.models.load_model(model_weights_path)
+    model.set_weights(loaded_model.get_weights())
+  else:
+    model.load_weights(model_weights_path)
--- a/official/legacy/image_classification/efficientnet/efficientnet_config.py
+++ b/official/legacy/image_classification/efficientnet/efficientnet_config.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Configuration definitions for EfficientNet losses, learning rates, and optimizers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import dataclasses
+from official.legacy.image_classification.configs import base_configs
+from official.modeling.hyperparams import base_config
+
+
+@dataclasses.dataclass
+class EfficientNetModelConfig(base_configs.ModelConfig):
+  """Configuration for the EfficientNet model.
+
+  This configuration will default to settings used for training efficientnet-b0
+  on a v3-8 TPU on ImageNet.
+
+  Attributes:
+    name: The name of the model. Defaults to 'EfficientNet'.
+    num_classes: The number of classes in the model.
+    model_params: A dictionary that represents the parameters of the
+      EfficientNet model. These will be passed in to the "from_name" function.
+    loss: The configuration for loss. Defaults to a categorical cross entropy
+      implementation.
+    optimizer: The configuration for optimizations. Defaults to an RMSProp
+      configuration.
+    learning_rate: The configuration for learning rate. Defaults to an
+      exponential configuration.
+  """
+  name: str = 'EfficientNet'
+  num_classes: int = 1000
+  model_params: base_config.Config = dataclasses.field(
+      default_factory=lambda: {
+          'model_name': 'efficientnet-b0',
+          'model_weights_path': '',
+          'weights_format': 'saved_model',
+          'overrides': {
+              'batch_norm': 'default',
+              'rescale_input': True,
+              'num_classes': 1000,
+              'activation': 'swish',
+              'dtype': 'float32',
+          }
+      })
+  loss: base_configs.LossConfig = base_configs.LossConfig(
+      name='categorical_crossentropy', label_smoothing=0.1)
+  optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig(
+      name='rmsprop',
+      decay=0.9,
+      epsilon=0.001,
+      momentum=0.9,
+      moving_average_decay=None)
+  learning_rate: base_configs.LearningRateConfig = base_configs.LearningRateConfig(  # pylint: disable=line-too-long
+      name='exponential',
+      initial_lr=0.008,
+      decay_epochs=2.4,
+      decay_rate=0.97,
+      warmup_epochs=5,
+      scale_by_batch_size=1. / 128.,
+      staircase=True)
--- a/official/legacy/image_classification/efficientnet/efficientnet_model.py
+++ b/official/legacy/image_classification/efficientnet/efficientnet_model.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Contains definitions for EfficientNet model.
+
+[1] Mingxing Tan, Quoc V. Le
+  EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks.
+  ICML'19, https://arxiv.org/abs/1905.11946
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import dataclasses
+import math
+from typing import Any, Dict, Optional, Text, Tuple
+
+from absl import logging
+import tensorflow as tf
+from official.legacy.image_classification import preprocessing
+from official.legacy.image_classification.efficientnet import common_modules
+from official.modeling import tf_utils
+from official.modeling.hyperparams import base_config
+
+
+@dataclasses.dataclass
+class BlockConfig(base_config.Config):
+  """Config for a single MB Conv Block."""
+  # NOTE(review): `BlockConfig.from_args` is called with positional values in
+  # this file, so the field order below presumably matters -- confirm before
+  # reordering.
+  # Channel counts before width scaling; `efficientnet` passes both through
+  # `round_filters`.
+  input_filters: int = 0
+  output_filters: int = 0
+  # Kernel size of the depthwise (or fused) convolution.
+  kernel_size: int = 3
+  # Number of times the block is stacked, before depth scaling.
+  num_repeat: int = 1
+  # Multiplier applied to `input_filters` for the expanded channel count; the
+  # expansion conv is skipped when this is 1.
+  expand_ratio: int = 1
+  # Strides of the first block in the stack; repeated blocks use [1, 1].
+  strides: Tuple[int, int] = (1, 1)
+  # Squeeze-and-excitation reduction ratio; `mb_conv_block` asserts it is in
+  # (0, 1] whenever the model config enables SE.
+  se_ratio: Optional[float] = None
+  # Enables the residual connection when strides and filter counts allow it.
+  id_skip: bool = True
+  # When true, a single regular conv replaces the expand + depthwise pair.
+  fused_conv: bool = False
+  # 'no_depthwise' disables the depthwise conv and widens the expansion kernel
+  # to 3x3; any other value keeps the depthwise conv.
+  conv_type: str = 'depthwise'
+
+
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """Default Config for Efficientnet-B0."""
+  # Multiplier applied to filter counts by `round_filters`.
+  width_coefficient: float = 1.0
+  # Multiplier applied to block repeat counts by `round_repeats`.
+  depth_coefficient: float = 1.0
+  resolution: int = 224
+  # Rate of the dropout layer placed before the classifier.
+  dropout_rate: float = 0.2
+  blocks: Tuple[BlockConfig, ...] = (
+      # (input_filters, output_filters, kernel_size, num_repeat,
+      #  expand_ratio, strides, se_ratio)
+      # pylint: disable=bad-whitespace
+      BlockConfig.from_args(32, 16, 3, 1, 1, (1, 1), 0.25),
+      BlockConfig.from_args(16, 24, 3, 2, 6, (2, 2), 0.25),
+      BlockConfig.from_args(24, 40, 5, 2, 6, (2, 2), 0.25),
+      BlockConfig.from_args(40, 80, 3, 3, 6, (2, 2), 0.25),
+      BlockConfig.from_args(80, 112, 5, 3, 6, (1, 1), 0.25),
+      BlockConfig.from_args(112, 192, 5, 4, 6, (2, 2), 0.25),
+      BlockConfig.from_args(192, 320, 3, 1, 6, (1, 1), 0.25),
+      # pylint: enable=bad-whitespace
+  )
+  # Filter counts of the stem and top convolutions before width scaling.
+  stem_base_filters: int = 32
+  top_base_filters: int = 1280
+  # Activation name resolved through `tf_utils.get_activation`.
+  activation: str = 'simple_swish'
+  # Passed to `common_modules.get_batch_norm` to pick the layer class.
+  batch_norm: str = 'default'
+  bn_momentum: float = 0.99
+  bn_epsilon: float = 1e-3
+  # While the original implementation used a weight decay of 1e-5,
+  # tf.nn.l2_loss divides it by 2, so we halve this to compensate in Keras
+  weight_decay: float = 5e-6
+  # Upper bound of the per-block drop-connect rate; `efficientnet` scales it
+  # linearly with the block index.
+  drop_connect_rate: float = 0.2
+  # Filter counts are rounded to a multiple of `depth_divisor`, with
+  # `min_depth` (or the divisor itself when unset) as the lower bound.
+  depth_divisor: int = 8
+  min_depth: Optional[int] = None
+  # Enables the squeeze-and-excitation phase in every block.
+  use_se: bool = True
+  input_channels: int = 3
+  num_classes: int = 1000
+  model_name: str = 'efficientnet'
+  # When true, inputs go through `preprocessing.normalize_images` first.
+  rescale_input: bool = True
+  # NOTE(review): the builder functions in this file query
+  # `tf.keras.backend.image_data_format()` and do not read this field.
+  data_format: str = 'channels_last'
+  dtype: str = 'float32'
+
+
+# Predefined model variants keyed by name; `EfficientNet.from_name` looks the
+# requested model up here.
+MODEL_CONFIGS = {
+    # (width, depth, resolution, dropout)
+    'efficientnet-b0': ModelConfig.from_args(1.0, 1.0, 224, 0.2),
+    'efficientnet-b1': ModelConfig.from_args(1.0, 1.1, 240, 0.2),
+    'efficientnet-b2': ModelConfig.from_args(1.1, 1.2, 260, 0.3),
+    'efficientnet-b3': ModelConfig.from_args(1.2, 1.4, 300, 0.3),
+    'efficientnet-b4': ModelConfig.from_args(1.4, 1.8, 380, 0.4),
+    'efficientnet-b5': ModelConfig.from_args(1.6, 2.2, 456, 0.4),
+    'efficientnet-b6': ModelConfig.from_args(1.8, 2.6, 528, 0.5),
+    'efficientnet-b7': ModelConfig.from_args(2.0, 3.1, 600, 0.5),
+    'efficientnet-b8': ModelConfig.from_args(2.2, 3.6, 672, 0.5),
+    'efficientnet-l2': ModelConfig.from_args(4.3, 5.3, 800, 0.5),
+}
+
+# Initializer spec used for every convolution kernel built by `conv2d_block`.
+CONV_KERNEL_INITIALIZER = {
+    'class_name': 'VarianceScaling',
+    'config': {
+        'scale': 2.0,
+        'mode': 'fan_out',
+        # Note: this is a truncated normal distribution
+        'distribution': 'normal'
+    }
+}
+
+# Initializer spec used for the kernel of the final `logits` dense layer.
+DENSE_KERNEL_INITIALIZER = {
+    'class_name': 'VarianceScaling',
+    'config': {
+        'scale': 1 / 3.0,
+        'mode': 'fan_out',
+        'distribution': 'uniform'
+    }
+}
+
+
+def round_filters(filters: int, config: ModelConfig) -> int:
+  """Round number of filters based on width coefficient."""
+  width_coefficient = config.width_coefficient
+  min_depth = config.min_depth
+  divisor = config.depth_divisor
+  orig_filters = filters
+
+  if not width_coefficient:
+    return filters
+
+  filters *= width_coefficient
+  min_depth = min_depth or divisor
+  new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_filters < 0.9 * filters:
+    new_filters += divisor
+  logging.info('round_filter input=%s output=%s', orig_filters, new_filters)
+  return int(new_filters)
+
+
+def round_repeats(repeats: int, depth_coefficient: float) -> int:
+  """Round number of repeats based on depth coefficient."""
+  return int(math.ceil(depth_coefficient * repeats))
+
+
+def conv2d_block(inputs: tf.Tensor,
+                 conv_filters: Optional[int],
+                 config: ModelConfig,
+                 kernel_size: Any = (1, 1),
+                 strides: Any = (1, 1),
+                 use_batch_norm: bool = True,
+                 use_bias: bool = False,
+                 activation: Optional[Any] = None,
+                 depthwise: bool = False,
+                 name: Optional[Text] = None):
+  """A conv2d followed by batch norm and an activation."""
+  batch_norm = common_modules.get_batch_norm(config.batch_norm)
+  bn_momentum = config.bn_momentum
+  bn_epsilon = config.bn_epsilon
+  data_format = tf.keras.backend.image_data_format()
+  weight_decay = config.weight_decay
+
+  name = name or ''
+
+  # Collect args based on what kind of conv2d block is desired
+  init_kwargs = {
+      'kernel_size': kernel_size,
+      'strides': strides,
+      'use_bias': use_bias,
+      'padding': 'same',
+      'name': name + '_conv2d',
+      'kernel_regularizer': tf.keras.regularizers.l2(weight_decay),
+      'bias_regularizer': tf.keras.regularizers.l2(weight_decay),
+  }
+
+  if depthwise:
+    conv2d = tf.keras.layers.DepthwiseConv2D
+    init_kwargs.update({'depthwise_initializer': CONV_KERNEL_INITIALIZER})
+  else:
+    conv2d = tf.keras.layers.Conv2D
+    init_kwargs.update({
+        'filters': conv_filters,
+        'kernel_initializer': CONV_KERNEL_INITIALIZER
+    })
+
+  x = conv2d(**init_kwargs)(inputs)
+
+  if use_batch_norm:
+    bn_axis = 1 if data_format == 'channels_first' else -1
+    x = batch_norm(
+        axis=bn_axis,
+        momentum=bn_momentum,
+        epsilon=bn_epsilon,
+        name=name + '_bn')(
+            x)
+
+  if activation is not None:
+    x = tf.keras.layers.Activation(activation, name=name + '_activation')(x)
+  return x
+
+
+def mb_conv_block(inputs: tf.Tensor,
+                  block: BlockConfig,
+                  config: ModelConfig,
+                  prefix: Optional[Text] = None):
+  """Mobile Inverted Residual Bottleneck.
+
+  The block is assembled as: expansion (or one fused conv), depthwise conv,
+  optional squeeze-and-excitation, linear projection, and an optional residual
+  connection with drop-connect.
+
+  Args:
+    inputs: the Keras input to the block
+    block: BlockConfig, arguments to create a Block
+    config: ModelConfig, a set of model parameters
+    prefix: prefix for naming all layers
+
+  Returns:
+    the output of the block
+
+  Raises:
+    AssertionError: if `config.use_se` is set and `block.se_ratio` is None or
+      outside (0, 1].
+  """
+  use_se = config.use_se
+  activation = tf_utils.get_activation(config.activation)
+  drop_connect_rate = config.drop_connect_rate
+  data_format = tf.keras.backend.image_data_format()
+  use_depthwise = block.conv_type != 'no_depthwise'
+  prefix = prefix or ''
+
+  # Channel count after expansion; also the width of the SE gate.
+  filters = block.input_filters * block.expand_ratio
+
+  x = inputs
+
+  if block.fused_conv:
+    # If we use fused mbconv, skip expansion and use regular conv.
+    x = conv2d_block(
+        x,
+        filters,
+        config,
+        kernel_size=block.kernel_size,
+        strides=block.strides,
+        activation=activation,
+        name=prefix + 'fused')
+  else:
+    if block.expand_ratio != 1:
+      # Expansion phase
+      # Without a depthwise conv the expansion uses a 3x3 kernel instead of
+      # 1x1.
+      kernel_size = (1, 1) if use_depthwise else (3, 3)
+      x = conv2d_block(
+          x,
+          filters,
+          config,
+          kernel_size=kernel_size,
+          activation=activation,
+          name=prefix + 'expand')
+
+    # Depthwise Convolution
+    if use_depthwise:
+      x = conv2d_block(
+          x,
+          conv_filters=None,
+          config=config,
+          kernel_size=block.kernel_size,
+          strides=block.strides,
+          activation=activation,
+          depthwise=True,
+          name=prefix + 'depthwise')
+
+  # Squeeze and Excitation phase
+  if use_se:
+    assert block.se_ratio is not None
+    assert 0 < block.se_ratio <= 1
+    # The bottleneck width is derived from the block's input filters, not from
+    # the expanded `filters`.
+    num_reduced_filters = max(1, int(block.input_filters * block.se_ratio))
+
+    if data_format == 'channels_first':
+      se_shape = (filters, 1, 1)
+    else:
+      se_shape = (1, 1, filters)
+
+    # Pool to one value per channel, then reshape so the 1x1 convolutions
+    # below can be applied.
+    se = tf.keras.layers.GlobalAveragePooling2D(name=prefix + 'se_squeeze')(x)
+    se = tf.keras.layers.Reshape(se_shape, name=prefix + 'se_reshape')(se)
+
+    se = conv2d_block(
+        se,
+        num_reduced_filters,
+        config,
+        use_bias=True,
+        use_batch_norm=False,
+        activation=activation,
+        name=prefix + 'se_reduce')
+    se = conv2d_block(
+        se,
+        filters,
+        config,
+        use_bias=True,
+        use_batch_norm=False,
+        activation='sigmoid',
+        name=prefix + 'se_expand')
+    # Gate the features with the sigmoid output.
+    x = tf.keras.layers.multiply([x, se], name=prefix + 'se_excite')
+
+  # Output phase
+  # Linear projection: 1x1 conv with batch norm and no activation.
+  x = conv2d_block(
+      x, block.output_filters, config, activation=None, name=prefix + 'project')
+
+  # Add identity so that quantization-aware training can insert quantization
+  # ops correctly.
+  x = tf.keras.layers.Activation(
+      tf_utils.get_activation('identity'), name=prefix + 'id')(
+          x)
+
+  # The residual connection is only added when the output has the same shape
+  # as the input: unit strides and equal input/output filter counts.
+  if (block.id_skip and all(s == 1 for s in block.strides) and
+      block.input_filters == block.output_filters):
+    if drop_connect_rate and drop_connect_rate > 0:
+      # Apply dropconnect
+      # The only difference between dropout and dropconnect in TF is scaling by
+      # drop_connect_rate during training. See:
+      # https://github.com/keras-team/keras/pull/9898#issuecomment-380577612
+      x = tf.keras.layers.Dropout(
+          drop_connect_rate, noise_shape=(None, 1, 1, 1), name=prefix + 'drop')(
+              x)
+
+    x = tf.keras.layers.add([x, inputs], name=prefix + 'add')
+
+  return x
+
+
+def efficientnet(image_input: tf.keras.layers.Input, config: ModelConfig):  # pytype: disable=invalid-annotation  # typed-keras
+  """Creates an EfficientNet graph given the model parameters.
+
+  This function is wrapped by the `EfficientNet` class to make a tf.keras.Model.
+
+  The graph is: optional input normalization, a stem convolution, the stacks
+  of MBConv blocks listed in `config.blocks`, a top convolution, global
+  average pooling, optional dropout and a softmax classifier.
+
+  Args:
+    image_input: the input batch of images
+    config: the model config
+
+  Returns:
+    the output of efficientnet
+  """
+  depth_coefficient = config.depth_coefficient
+  blocks = config.blocks
+  stem_base_filters = config.stem_base_filters
+  top_base_filters = config.top_base_filters
+  activation = tf_utils.get_activation(config.activation)
+  dropout_rate = config.dropout_rate
+  # Captured once here: `config` is rebound inside the loop below with a
+  # per-block rate, while this local keeps the configured maximum.
+  drop_connect_rate = config.drop_connect_rate
+  num_classes = config.num_classes
+  input_channels = config.input_channels
+  rescale_input = config.rescale_input
+  data_format = tf.keras.backend.image_data_format()
+  dtype = config.dtype
+  weight_decay = config.weight_decay
+
+  x = image_input
+  if data_format == 'channels_first':
+    # Happens on GPU/TPU if available.
+    x = tf.keras.layers.Permute((3, 1, 2))(x)
+  if rescale_input:
+    x = preprocessing.normalize_images(
+        x, num_channels=input_channels, dtype=dtype, data_format=data_format)
+
+  # Build stem
+  x = conv2d_block(
+      x,
+      round_filters(stem_base_filters, config),
+      config,
+      kernel_size=[3, 3],
+      strides=[2, 2],
+      activation=activation,
+      name='stem')
+
+  # Build blocks
+  # Total number of blocks after depth scaling; used to spread the
+  # drop-connect rate over the network.
+  num_blocks_total = sum(
+      round_repeats(block.num_repeat, depth_coefficient) for block in blocks)
+  block_num = 0
+
+  for stack_idx, block in enumerate(blocks):
+    assert block.num_repeat > 0
+    # Scale the block's filter counts by the width coefficient and its repeat
+    # count by the depth coefficient.
+    block = block.replace(
+        input_filters=round_filters(block.input_filters, config),
+        output_filters=round_filters(block.output_filters, config),
+        num_repeat=round_repeats(block.num_repeat, depth_coefficient))
+
+    # The first block needs to take care of stride and filter size increase
+    # The drop-connect rate grows linearly with the block index, starting at 0
+    # for the first block.
+    drop_rate = drop_connect_rate * float(block_num) / num_blocks_total
+    config = config.replace(drop_connect_rate=drop_rate)
+    block_prefix = 'stack_{}/block_0/'.format(stack_idx)
+    x = mb_conv_block(x, block, config, block_prefix)
+    block_num += 1
+    if block.num_repeat > 1:
+      # Remaining blocks of the stack keep the channel count and resolution.
+      block = block.replace(input_filters=block.output_filters, strides=[1, 1])
+
+      for block_idx in range(block.num_repeat - 1):
+        drop_rate = drop_connect_rate * float(block_num) / num_blocks_total
+        config = config.replace(drop_connect_rate=drop_rate)
+        block_prefix = 'stack_{}/block_{}/'.format(stack_idx, block_idx + 1)
+        x = mb_conv_block(x, block, config, prefix=block_prefix)
+        block_num += 1
+
+  # Build top
+  x = conv2d_block(
+      x,
+      round_filters(top_base_filters, config),
+      config,
+      activation=activation,
+      name='top')
+
+  # Build classifier
+  x = tf.keras.layers.GlobalAveragePooling2D(name='top_pool')(x)
+  if dropout_rate and dropout_rate > 0:
+    x = tf.keras.layers.Dropout(dropout_rate, name='top_dropout')(x)
+  x = tf.keras.layers.Dense(
+      num_classes,
+      kernel_initializer=DENSE_KERNEL_INITIALIZER,
+      kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
+      bias_regularizer=tf.keras.regularizers.l2(weight_decay),
+      name='logits')(
+          x)
+  x = tf.keras.layers.Activation('softmax', name='probs')(x)
+
+  return x
+
+
+class EfficientNet(tf.keras.Model):
+  """Wrapper class for an EfficientNet Keras model.
+
+  Contains helper methods to build, manage, and save metadata about the model.
+  """
+
+  def __init__(self,
+               config: Optional[ModelConfig] = None,
+               overrides: Optional[Dict[Text, Any]] = None):
+    """Create an EfficientNet model.
+
+    Args:
+      config: (optional) the main model parameters to create the model
+      overrides: (optional) a dict containing keys that can override config
+    """
+    overrides = overrides or {}
+    config = config or ModelConfig()
+
+    # The effective config is the base config with the overrides applied.
+    self.config = config.replace(**overrides)
+
+    input_channels = self.config.input_channels
+    model_name = self.config.model_name
+    input_shape = (None, None, input_channels)  # Should handle any size image
+    image_input = tf.keras.layers.Input(shape=input_shape)
+
+    output = efficientnet(image_input, self.config)
+
+    # Cast to float32 in case we have a different model dtype
+    output = tf.cast(output, tf.float32)
+
+    logging.info('Building model %s with params %s', model_name, self.config)
+
+    # The model is built from the input and output tensors created above.
+    super(EfficientNet, self).__init__(
+        inputs=image_input, outputs=output, name=model_name)
+
+  @classmethod
+  def from_name(cls,
+                model_name: Text,
+                model_weights_path: Optional[Text] = None,
+                weights_format: Text = 'saved_model',
+                overrides: Optional[Dict[Text, Any]] = None):
+    """Construct an EfficientNet model from a predefined model name.
+
+    E.g., `EfficientNet.from_name('efficientnet-b0')`.
+
+    Args:
+      model_name: the predefined model name
+      model_weights_path: the path to the weights (h5 file or saved model dir)
+      weights_format: the model weights format. One of 'saved_model', 'h5', or
+        'checkpoint'.
+      overrides: (optional) a dict containing keys that can override config
+
+    Returns:
+      A constructed EfficientNet instance.
+
+    Raises:
+      ValueError: if `model_name` is neither predefined nor supplied through
+        the `model_config` entry of `overrides`.
+    """
+    # Work on copies so neither the module-level table nor the caller's dict
+    # is modified.
+    model_configs = dict(MODEL_CONFIGS)
+    overrides = dict(overrides) if overrides else {}
+
+    # One can define their own custom models if necessary
+    # The `model_config` entry is removed from `overrides` so it is not
+    # treated as a config field later.
+    model_configs.update(overrides.pop('model_config', {}))
+
+    if model_name not in model_configs:
+      raise ValueError('Unknown model name {}'.format(model_name))
+
+    config = model_configs[model_name]
+
+    model = cls(config=config, overrides=overrides)
+
+    # Weights are only loaded when a non-empty path is given.
+    if model_weights_path:
+      common_modules.load_weights(
+          model, model_weights_path, weights_format=weights_format)
+
+    return model
--- a/official/legacy/image_classification/efficientnet/tfhub_export.py
+++ b/official/legacy/image_classification/efficientnet/tfhub_export.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A script to export TF-Hub SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+import os
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+from official.legacy.image_classification.efficientnet import efficientnet_model
+
+# Command-line flags read by `main`. All three default to None and none of
+# them is marked as required here.
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_name", None, "EfficientNet model name.")
+flags.DEFINE_string("model_path", None, "File path to TF model checkpoint.")
+flags.DEFINE_string("export_path", None,
+                    "TF-Hub SavedModel destination path to export.")
+
+
+def export_tfhub(model_path, hub_destination, model_name):
+  """Restores a tf.keras.Model and saves for TF-Hub."""
+  model_configs = dict(efficientnet_model.MODEL_CONFIGS)
+  config = model_configs[model_name]
+
+  image_input = tf.keras.layers.Input(
+      shape=(None, None, 3), name="image_input", dtype=tf.float32)
+  x = image_input * 255.0
+  ouputs = efficientnet_model.efficientnet(x, config)
+  hub_model = tf.keras.Model(image_input, ouputs)
+  ckpt = tf.train.Checkpoint(model=hub_model)
+  ckpt.restore(model_path).assert_existing_objects_matched()
+  hub_model.save(
+      os.path.join(hub_destination, "classification"), include_optimizer=False)
+
+  feature_vector_output = hub_model.get_layer(name="top_pool").get_output_at(0)
+  hub_model2 = tf.keras.Model(image_input, feature_vector_output)
+  hub_model2.save(
+      os.path.join(hub_destination, "feature-vector"), include_optimizer=False)
+
+
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+
+  export_tfhub(FLAGS.model_path, FLAGS.export_path, FLAGS.model_name)
+
+
+if __name__ == "__main__":
+  app.run(main)
--- a/official/legacy/image_classification/learning_rate.py
+++ b/official/legacy/image_classification/learning_rate.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Learning rate utilities for vision tasks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from typing import Any, Mapping, Optional
+
+import numpy as np
+import tensorflow as tf
+
+# Reference learning rate; `CosineDecayWithWarmup` scales it by
+# `batch_size / 256` to obtain its initial learning rate.
+BASE_LEARNING_RATE = 0.1
+
+
+class WarmupDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """A wrapper for LearningRateSchedule that includes warmup steps."""
+
+  def __init__(self,
+               lr_schedule: tf.keras.optimizers.schedules.LearningRateSchedule,
+               warmup_steps: int,
+               warmup_lr: Optional[float] = None):
+    """Add warmup decay to a learning rate schedule.
+
+    Args:
+      lr_schedule: base learning rate scheduler
+      warmup_steps: number of warmup steps
+      warmup_lr: an optional field for the final warmup learning rate. This
+        should be provided if the base `lr_schedule` does not contain this
+        field.
+    """
+    super(WarmupDecaySchedule, self).__init__()
+    self._lr_schedule = lr_schedule
+    self._warmup_steps = warmup_steps
+    self._warmup_lr = warmup_lr
+
+  def __call__(self, step: int):
+    lr = self._lr_schedule(step)
+    if self._warmup_steps:
+      if self._warmup_lr is not None:
+        initial_learning_rate = tf.convert_to_tensor(
+            self._warmup_lr, name="initial_learning_rate")
+      else:
+        initial_learning_rate = tf.convert_to_tensor(
+            self._lr_schedule.initial_learning_rate,
+            name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      global_step_recomp = tf.cast(step, dtype)
+      warmup_steps = tf.cast(self._warmup_steps, dtype)
+      warmup_lr = initial_learning_rate * global_step_recomp / warmup_steps
+      lr = tf.cond(global_step_recomp < warmup_steps, lambda: warmup_lr,
+                   lambda: lr)
+    return lr
+
+  def get_config(self) -> Mapping[str, Any]:
+    config = self._lr_schedule.get_config()
+    config.update({
+        "warmup_steps": self._warmup_steps,
+        "warmup_lr": self._warmup_lr,
+    })
+    return config
+
+
+class CosineDecayWithWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Class to generate learning rate tensor."""
+
+  def __init__(self, batch_size: int, total_steps: int, warmup_steps: int):
+    """Creates the consine learning rate tensor with linear warmup.
+
+    Args:
+      batch_size: The training batch size used in the experiment.
+      total_steps: Total training steps.
+      warmup_steps: Steps for the warm up period.
+    """
+    super(CosineDecayWithWarmup, self).__init__()
+    base_lr_batch_size = 256
+    self._total_steps = total_steps
+    self._init_learning_rate = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
+    self._warmup_steps = warmup_steps
+
+  def __call__(self, global_step: int):
+    global_step = tf.cast(global_step, dtype=tf.float32)
+    warmup_steps = self._warmup_steps
+    init_lr = self._init_learning_rate
+    total_steps = self._total_steps
+
+    linear_warmup = global_step / warmup_steps * init_lr
+
+    cosine_learning_rate = init_lr * (tf.cos(np.pi *
+                                             (global_step - warmup_steps) /
+                                             (total_steps - warmup_steps)) +
+                                      1.0) / 2.0
+
+    learning_rate = tf.where(global_step < warmup_steps, linear_warmup,
+                             cosine_learning_rate)
+    return learning_rate
+
+  def get_config(self):
+    return {
+        "total_steps": self._total_steps,
+        "warmup_learning_rate": self._warmup_learning_rate,
+        "warmup_steps": self._warmup_steps,
+        "init_learning_rate": self._init_learning_rate,
+    }
--- a/official/legacy/image_classification/learning_rate_test.py
+++ b/official/legacy/image_classification/learning_rate_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for learning_rate."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from official.legacy.image_classification import learning_rate
+
+
+class LearningRateTests(tf.test.TestCase):
+
+  def test_warmup_decay(self):
+    """Basic computational test for warmup decay."""
+    initial_lr = 0.01
+    decay_steps = 100
+    decay_rate = 0.01
+    warmup_steps = 10
+
+    base_lr = tf.keras.optimizers.schedules.ExponentialDecay(
+        initial_learning_rate=initial_lr,
+        decay_steps=decay_steps,
+        decay_rate=decay_rate)
+    lr = learning_rate.WarmupDecaySchedule(
+        lr_schedule=base_lr, warmup_steps=warmup_steps)
+
+    for step in range(warmup_steps - 1):
+      config = lr.get_config()
+      self.assertEqual(config['warmup_steps'], warmup_steps)
+      self.assertAllClose(
+          self.evaluate(lr(step)), step / warmup_steps * initial_lr)
+
+  def test_cosine_decay_with_warmup(self):
+    """Basic computational test for cosine decay with warmup."""
+    expected_lrs = [0.0, 0.1, 0.05, 0.0]
+
+    lr = learning_rate.CosineDecayWithWarmup(
+        batch_size=256, total_steps=3, warmup_steps=1)
+
+    for step in [0, 1, 2, 3]:
+      self.assertAllClose(lr(step), expected_lrs[step])
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/legacy/image_classification/mnist_main.py
+++ b/official/legacy/image_classification/mnist_main.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Runs a simple model on the MNIST dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+# Import libraries
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+import tensorflow_datasets as tfds
+from official.common import distribute_utils
+from official.legacy.image_classification.resnet import common
+from official.utils.flags import core as flags_core
+from official.utils.misc import model_helpers
+
+# Module-level alias for the global absl flag registry.
+FLAGS = flags.FLAGS
+
+
+def build_model():
+  """Constructs the ML model used to predict handwritten digits."""
+
+  image = tf.keras.layers.Input(shape=(28, 28, 1))
+
+  y = tf.keras.layers.Conv2D(filters=32,
+                             kernel_size=5,
+                             padding='same',
+                             activation='relu')(image)
+  y = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
+                                   strides=(2, 2),
+                                   padding='same')(y)
+  y = tf.keras.layers.Conv2D(filters=32,
+                             kernel_size=5,
+                             padding='same',
+                             activation='relu')(y)
+  y = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
+                                   strides=(2, 2),
+                                   padding='same')(y)
+  y = tf.keras.layers.Flatten()(y)
+  y = tf.keras.layers.Dense(1024, activation='relu')(y)
+  y = tf.keras.layers.Dropout(0.4)(y)
+
+  probs = tf.keras.layers.Dense(10, activation='softmax')(y)
+
+  model = tf.keras.models.Model(image, probs, name='mnist')
+
+  return model
+
+
+@tfds.decode.make_decoder(output_dtype=tf.float32)
+def decode_image(example, feature):
+  """Convert image to float32 and normalize from [0, 255] to [0.0, 1.0]."""
+  return tf.cast(feature.decode_example(example), dtype=tf.float32) / 255
+
+
+def run(flags_obj, datasets_override=None, strategy_override=None):
+  """Run MNIST model training and eval loop using native Keras APIs.
+
+  The function starts the profiler server, builds the distribution strategy
+  and input pipelines, compiles and fits the model, exports it as a
+  SavedModel under `flags_obj.model_dir`, and runs a final evaluation.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+    datasets_override: A pair of `tf.data.Dataset` objects to train the model,
+                       representing the train and test sets. When given, it
+                       replaces the TFDS datasets, but the step counts are
+                       still derived from the TFDS MNIST split sizes.
+    strategy_override: A `tf.distribute.Strategy` object to use for model.
+                       When given, it takes precedence over the strategy
+                       described by the flags.
+
+  Returns:
+    Dictionary of training and eval stats.
+  """
+  # Start TF profiler server.
+  tf.profiler.experimental.server.start(flags_obj.profiler_port)
+
+  # An explicit override wins; otherwise the strategy is built from the flags.
+  strategy = strategy_override or distribute_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus,
+      tpu_address=flags_obj.tpu)
+
+  strategy_scope = distribute_utils.get_strategy_scope(strategy)
+
+  # The builder is always created because its `info` supplies the split sizes
+  # used below, even when `datasets_override` is provided.
+  mnist = tfds.builder('mnist', data_dir=flags_obj.data_dir)
+  if flags_obj.download:
+    mnist.download_and_prepare()
+
+  # `as_dataset` is only invoked when no override datasets were passed in.
+  mnist_train, mnist_test = datasets_override or mnist.as_dataset(
+      split=['train', 'test'],
+      decoders={'image': decode_image()},  # pylint: disable=no-value-for-parameter
+      as_supervised=True)
+  # Both pipelines repeat indefinitely; the number of batches consumed per
+  # epoch / evaluation is bounded by the explicit step counts passed to Keras.
+  train_input_dataset = mnist_train.cache().repeat().shuffle(
+      buffer_size=50000).batch(flags_obj.batch_size)
+  eval_input_dataset = mnist_test.cache().repeat().batch(flags_obj.batch_size)
+
+  # The optimizer and model are created inside the strategy scope.
+  with strategy_scope:
+    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
+        0.05, decay_steps=100000, decay_rate=0.96)
+    optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
+
+    model = build_model()
+    model.compile(
+        optimizer=optimizer,
+        loss='sparse_categorical_crossentropy',
+        metrics=['sparse_categorical_accuracy'])
+
+  # Floor division: a trailing partial batch is not counted as a step.
+  num_train_examples = mnist.info.splits['train'].num_examples
+  train_steps = num_train_examples // flags_obj.batch_size
+  train_epochs = flags_obj.train_epochs
+
+  # Weights-only checkpoints named by epoch number, plus TensorBoard logs,
+  # both written under `model_dir`.
+  ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}')
+  callbacks = [
+      tf.keras.callbacks.ModelCheckpoint(
+          ckpt_full_path, save_weights_only=True),
+      tf.keras.callbacks.TensorBoard(log_dir=flags_obj.model_dir),
+  ]
+
+  num_eval_examples = mnist.info.splits['test'].num_examples
+  num_eval_steps = num_eval_examples // flags_obj.batch_size
+
+  history = model.fit(
+      train_input_dataset,
+      epochs=train_epochs,
+      steps_per_epoch=train_steps,
+      callbacks=callbacks,
+      validation_steps=num_eval_steps,
+      validation_data=eval_input_dataset,
+      validation_freq=flags_obj.epochs_between_evals)
+
+  # Export the trained model without optimizer state.
+  export_path = os.path.join(flags_obj.model_dir, 'saved_model')
+  model.save(export_path, include_optimizer=False)
+
+  # Final evaluation over the same number of steps used for validation.
+  eval_output = model.evaluate(
+      eval_input_dataset, steps=num_eval_steps, verbose=2)
+
+  stats = common.build_stats(history, eval_output, callbacks)
+  return stats
+
+
+def define_mnist_flags():
+  """Define command line flags for MNIST model."""
+  flags_core.define_base(
+      clean=True,
+      num_gpu=True,
+      train_epochs=True,
+      epochs_between_evals=True,
+      distribution_strategy=True)
+  flags_core.define_device()
+  flags_core.define_distribution()
+  flags.DEFINE_bool('download', True,
+                    'Whether to download data to `--data_dir`.')
+  flags.DEFINE_integer('profiler_port', 9012,
+                       'Port to start profiler server on.')
+  FLAGS.set_default('batch_size', 1024)
+
+
+def main(_):
+  model_helpers.apply_clean(FLAGS)
+  stats = run(flags.FLAGS)
+  logging.info('Run stats:\n%s', stats)
+
+
+# Script entry point: flags must be defined before `app.run` parses them.
+if __name__ == '__main__':
+  logging.set_verbosity(logging.INFO)
+  define_mnist_flags()
+  app.run(main)
--- a/official/legacy/image_classification/mnist_test.py
+++ b/official/legacy/image_classification/mnist_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test the Keras MNIST model on GPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from absl.testing import parameterized
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.legacy.image_classification import mnist_main
+from official.utils.testing import integration
+
+
+mnist_main.define_mnist_flags()
+
+
+def eager_strategy_combinations():
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.cloud_tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+      ],)
+
+
+class KerasMnistTest(tf.test.TestCase, parameterized.TestCase):
+  """Unit tests for sample Keras MNIST model."""
+  # NOTE(review): not referenced anywhere else in this class; presumably kept
+  # for the base test case -- confirm before removing.
+  _tempdir = None
+
+  @classmethod
+  def setUpClass(cls):  # pylint: disable=invalid-name
+    """Delegates to the parent class; no extra class-level setup is done."""
+    super(KerasMnistTest, cls).setUpClass()
+
+  def tearDown(self):
+    """Removes the directory returned by `get_temp_dir()` after each test."""
+    super(KerasMnistTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+
+  @combinations.generate(eager_strategy_combinations())
+  def test_end_to_end(self, distribution):
+    """Test Keras MNIST model with `strategy`.
+
+    Args:
+      distribution: The distribution strategy supplied by
+        `eager_strategy_combinations()`; passed to `mnist_main.run` as
+        `strategy_override`.
+    """
+
+    extra_flags = [
+        "-train_epochs",
+        "1",
+        # Let TFDS find the metadata folder automatically
+        "--data_dir="
+    ]
+
+    # Ten all-ones 28x28x1 images labelled 0..9; the same tensors back both
+    # the train and the test dataset.
+    dummy_data = (
+        tf.ones(shape=(10, 28, 28, 1), dtype=tf.int32),
+        tf.range(10),
+    )
+    datasets = (
+        tf.data.Dataset.from_tensor_slices(dummy_data),
+        tf.data.Dataset.from_tensor_slices(dummy_data),
+    )
+
+    # Bind the overrides so the integration helper only has to supply flags.
+    run = functools.partial(
+        mnist_main.run,
+        datasets_override=datasets,
+        strategy_override=distribution)
+
+    integration.run_synthetic(
+        main=run,
+        synth=False,
+        tmp_root=self.create_tempdir().full_path,
+        extra_flags=extra_flags)
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/legacy/image_classification/optimizer_factory.py
+++ b/official/legacy/image_classification/optimizer_factory.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Optimizer factory for vision tasks."""
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+from typing import Any, Dict, Optional, Text
+
+from absl import logging
+import tensorflow as tf
+import tensorflow_addons as tfa
+from official.legacy.image_classification import learning_rate
+from official.legacy.image_classification.configs import base_configs
+from official.modeling import optimization
+
+# pylint: disable=protected-access
+
+
+def build_optimizer(
+    optimizer_name: Text,
+    base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
+    params: Dict[Text, Any],
+    model: Optional[tf.keras.Model] = None):
+  """Build the optimizer based on name.
+
+  The base optimizer is selected by `optimizer_name`, then optionally wrapped
+  in Lookahead and, last, in an exponential moving average.
+
+  Args:
+    optimizer_name: String representation of the optimizer name
+      (case-insensitive). Supported: sgd, momentum, rmsprop, adam, adamw.
+    base_learning_rate: `tf.keras.optimizers.schedules.LearningRateSchedule`
+      base learning rate.
+    params: String -> Any dictionary representing the optimizer params. The
+      keys read here are `nesterov`, `momentum`, `decay` / `rho`, `epsilon`,
+      `beta_1`, `beta_2`, `weight_decay`, `lookahead` and
+      `moving_average_decay`. The learning rate is NOT taken from this
+      dictionary; it always comes from `base_learning_rate`.
+    model: The `tf.keras.Model`. This is used for the shadow copy if using
+      `ExponentialMovingAverage`.
+
+  Returns:
+    A tf.keras.Optimizer.
+
+  Raises:
+    ValueError if the provided optimizer_name is not supported, or if
+      `moving_average_decay` is positive and `model` is None.
+    KeyError if `optimizer_name` is 'momentum' and `params` has no
+      `momentum` entry.
+
+  """
+  optimizer_name = optimizer_name.lower()
+  logging.info('Building %s optimizer with params %s', optimizer_name, params)
+
+  if optimizer_name == 'sgd':
+    logging.info('Using SGD optimizer')
+    nesterov = params.get('nesterov', False)
+    optimizer = tf.keras.optimizers.SGD(
+        learning_rate=base_learning_rate, nesterov=nesterov)
+  elif optimizer_name == 'momentum':
+    logging.info('Using momentum optimizer')
+    nesterov = params.get('nesterov', False)
+    # Unlike the other branches, `momentum` has no default here.
+    optimizer = tf.keras.optimizers.SGD(
+        learning_rate=base_learning_rate,
+        momentum=params['momentum'],
+        nesterov=nesterov)
+  elif optimizer_name == 'rmsprop':
+    logging.info('Using RMSProp')
+    # `decay` takes precedence over `rho`; a falsy `decay` (None or 0) falls
+    # back to `rho`, which itself defaults to 0.9.
+    rho = params.get('decay', None) or params.get('rho', 0.9)
+    momentum = params.get('momentum', 0.9)
+    epsilon = params.get('epsilon', 1e-07)
+    optimizer = tf.keras.optimizers.RMSprop(
+        learning_rate=base_learning_rate,
+        rho=rho,
+        momentum=momentum,
+        epsilon=epsilon)
+  elif optimizer_name == 'adam':
+    logging.info('Using Adam')
+    beta_1 = params.get('beta_1', 0.9)
+    beta_2 = params.get('beta_2', 0.999)
+    epsilon = params.get('epsilon', 1e-07)
+    optimizer = tf.keras.optimizers.Adam(
+        learning_rate=base_learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon)
+  elif optimizer_name == 'adamw':
+    logging.info('Using AdamW')
+    weight_decay = params.get('weight_decay', 0.01)
+    beta_1 = params.get('beta_1', 0.9)
+    beta_2 = params.get('beta_2', 0.999)
+    epsilon = params.get('epsilon', 1e-07)
+    optimizer = tfa.optimizers.AdamW(
+        weight_decay=weight_decay,
+        learning_rate=base_learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon)
+  else:
+    # This branch does not touch `params`, so an unknown name raises
+    # ValueError even when `params` is None.
+    raise ValueError('Unknown optimizer %s' % optimizer_name)
+
+  # Optional wrapper: enabled by a truthy `lookahead` entry.
+  if params.get('lookahead', None):
+    logging.info('Using lookahead optimizer.')
+    optimizer = tfa.optimizers.Lookahead(optimizer)
+
+  # Moving average should be applied last, as it's applied at test time
+  moving_average_decay = params.get('moving_average_decay', 0.)
+  if moving_average_decay is not None and moving_average_decay > 0.:
+    if model is None:
+      raise ValueError(
+          '`model` must be provided if using `ExponentialMovingAverage`.')
+    logging.info('Including moving average decay.')
+    optimizer = optimization.ExponentialMovingAverage(
+        optimizer=optimizer, average_decay=moving_average_decay)
+    optimizer.shadow_copy(model)
+  return optimizer
+
+
+def build_learning_rate(params: base_configs.LearningRateConfig,
+                        batch_size: Optional[int] = None,
+                        train_epochs: Optional[int] = None,
+                        train_steps: Optional[int] = None):
+  """Build the learning rate given the provided configuration."""
+  decay_type = params.name
+  base_lr = params.initial_lr
+  decay_rate = params.decay_rate
+  if params.decay_epochs is not None:
+    decay_steps = params.decay_epochs * train_steps
+  else:
+    decay_steps = 0
+  if params.warmup_epochs is not None:
+    warmup_steps = params.warmup_epochs * train_steps
+  else:
+    warmup_steps = 0
+
+  lr_multiplier = params.scale_by_batch_size
+
+  if lr_multiplier and lr_multiplier > 0:
+    # Scale the learning rate based on the batch size and a multiplier
+    base_lr *= lr_multiplier * batch_size
+    logging.info(
+        'Scaling the learning rate based on the batch size '
+        'multiplier. New base_lr: %f', base_lr)
+
+  if decay_type == 'exponential':
+    logging.info(
+        'Using exponential learning rate with: '
+        'initial_learning_rate: %f, decay_steps: %d, '
+        'decay_rate: %f', base_lr, decay_steps, decay_rate)
+    lr = tf.keras.optimizers.schedules.ExponentialDecay(
+        initial_learning_rate=base_lr,
+        decay_steps=decay_steps,
+        decay_rate=decay_rate,
+        staircase=params.staircase)
+  elif decay_type == 'stepwise':
+    steps_per_epoch = params.examples_per_epoch // batch_size
+    boundaries = [boundary * steps_per_epoch for boundary in params.boundaries]
+    multipliers = [batch_size * multiplier for multiplier in params.multipliers]
+    logging.info(
+        'Using stepwise learning rate. Parameters: '
+        'boundaries: %s, values: %s', boundaries, multipliers)
+    lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
+        boundaries=boundaries, values=multipliers)
+  elif decay_type == 'cosine_with_warmup':
+    lr = learning_rate.CosineDecayWithWarmup(
+        batch_size=batch_size,
+        total_steps=train_epochs * train_steps,
+        warmup_steps=warmup_steps)
+  if warmup_steps > 0:
+    if decay_type not in ['cosine_with_warmup']:
+      logging.info('Applying %d warmup steps to the learning rate',
+                   warmup_steps)
+      lr = learning_rate.WarmupDecaySchedule(
+          lr, warmup_steps, warmup_lr=base_lr)
+  return lr
--- a/official/legacy/image_classification/optimizer_factory_test.py
+++ b/official/legacy/image_classification/optimizer_factory_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for optimizer_factory."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+import tensorflow as tf
+from official.legacy.image_classification import optimizer_factory
+from official.legacy.image_classification.configs import base_configs
+
+
+class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
+
+  def build_toy_model(self) -> tf.keras.Model:
+    """Creates a toy `tf.Keras.Model`."""
+    model = tf.keras.Sequential()
+    model.add(tf.keras.layers.Dense(1, input_shape=(1,)))
+    return model
+
+  @parameterized.named_parameters(
+      ('sgd', 'sgd', 0., False), ('momentum', 'momentum', 0., False),
+      ('rmsprop', 'rmsprop', 0., False), ('adam', 'adam', 0., False),
+      ('adamw', 'adamw', 0., False),
+      ('momentum_lookahead', 'momentum', 0., True),
+      ('sgd_ema', 'sgd', 0.999, False),
+      ('momentum_ema', 'momentum', 0.999, False),
+      ('rmsprop_ema', 'rmsprop', 0.999, False))
+  def test_optimizer(self, optimizer_name, moving_average_decay, lookahead):
+    """Smoke test to be sure no syntax errors."""
+    model = self.build_toy_model()
+    params = {
+        'learning_rate': 0.001,
+        'rho': 0.09,
+        'momentum': 0.,
+        'epsilon': 1e-07,
+        'moving_average_decay': moving_average_decay,
+        'lookahead': lookahead,
+    }
+    optimizer = optimizer_factory.build_optimizer(
+        optimizer_name=optimizer_name,
+        base_learning_rate=params['learning_rate'],
+        params=params,
+        model=model)
+    self.assertTrue(issubclass(type(optimizer), tf.keras.optimizers.Optimizer))
+
+  def test_unknown_optimizer(self):
+    with self.assertRaises(ValueError):
+      optimizer_factory.build_optimizer(
+          optimizer_name='this_optimizer_does_not_exist',
+          base_learning_rate=None,
+          params=None)
+
+  def test_learning_rate_without_decay_or_warmups(self):
+    params = base_configs.LearningRateConfig(
+        name='exponential',
+        initial_lr=0.01,
+        decay_rate=0.01,
+        decay_epochs=None,
+        warmup_epochs=None,
+        scale_by_batch_size=0.01,
+        examples_per_epoch=1,
+        boundaries=[0],
+        multipliers=[0, 1])
+    batch_size = 1
+    train_steps = 1
+
+    lr = optimizer_factory.build_learning_rate(
+        params=params, batch_size=batch_size, train_steps=train_steps)
+    self.assertTrue(
+        issubclass(
+            type(lr), tf.keras.optimizers.schedules.LearningRateSchedule))
+
+  @parameterized.named_parameters(('exponential', 'exponential'),
+                                  ('cosine_with_warmup', 'cosine_with_warmup'))
+  def test_learning_rate_with_decay_and_warmup(self, lr_decay_type):
+    """Basic smoke test for syntax."""
+    params = base_configs.LearningRateConfig(
+        name=lr_decay_type,
+        initial_lr=0.01,
+        decay_rate=0.01,
+        decay_epochs=1,
+        warmup_epochs=1,
+        scale_by_batch_size=0.01,
+        examples_per_epoch=1,
+        boundaries=[0],
+        multipliers=[0, 1])
+    batch_size = 1
+    train_epochs = 1
+    train_steps = 1
+
+    lr = optimizer_factory.build_learning_rate(
+        params=params,
+        batch_size=batch_size,
+        train_epochs=train_epochs,
+        train_steps=train_steps)
+    self.assertTrue(
+        issubclass(
+            type(lr), tf.keras.optimizers.schedules.LearningRateSchedule))
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/legacy/image_classification/preprocessing.py
+++ b/official/legacy/image_classification/preprocessing.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Preprocessing functions for images."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+from typing import List, Optional, Text, Tuple
+import tensorflow as tf
+from official.legacy.image_classification import augment
+
+
+# Calculated from the ImageNet training set
+# Per-channel statistics, written as fractions multiplied by 255.
+MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
+STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
+
+# Default output height/width used by the preprocessing functions below.
+IMAGE_SIZE = 224
+# Default `crop_padding` of `decode_and_center_crop`.
+CROP_PADDING = 32
+
+
+def mean_image_subtraction(
+    image_bytes: tf.Tensor,
+    means: Tuple[float, ...],
+    num_channels: int = 3,
+    dtype: tf.dtypes.DType = tf.float32,
+) ->  tf.Tensor:
+  """Subtracts the given means from each image channel.
+
+  For example:
+    means = [123.68, 116.779, 103.939]
+    image_bytes = mean_image_subtraction(image_bytes, means)
+
+  Note that the rank of `image` must be known.
+
+  Args:
+    image_bytes: a tensor of size [height, width, C].
+    means: a C-vector of values to subtract from each channel.
+    num_channels: number of color channels in the image that will be distorted.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+
+  Returns:
+    the centered image.
+
+  Raises:
+    ValueError: If the rank of `image` is unknown, if `image` has a rank other
+      than three or if the number of channels in `image` doesn't match the
+      number of values in `means`.
+  """
+  if image_bytes.get_shape().ndims != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+
+  if len(means) != num_channels:
+    raise ValueError('len(means) must match the number of channels')
+
+  # We have a 1-D tensor of means; convert to 3-D.
+  # Note(b/130245863): we explicitly call `broadcast` instead of simply
+  # expanding dimensions for better performance.
+  means = tf.broadcast_to(means, tf.shape(image_bytes))
+  if dtype is not None:
+    means = tf.cast(means, dtype=dtype)
+
+  return image_bytes - means
+
+
+def standardize_image(
+    image_bytes: tf.Tensor,
+    stddev: Tuple[float, ...],
+    num_channels: int = 3,
+    dtype: tf.dtypes.DType = tf.float32,
+) ->  tf.Tensor:
+  """Divides each image channel by the given stddev.
+
+  For example:
+    stddev = [58.395, 57.12, 57.375]
+    image_bytes = standardize_image(image_bytes, stddev)
+
+  Note that the rank of `image_bytes` must be known.
+
+  Args:
+    image_bytes: a tensor of size [height, width, C].
+    stddev: a C-vector of values to divide each channel by.
+    num_channels: the expected number of entries in `stddev`.
+    dtype: the dtype the broadcast `stddev` tensor is cast to before the
+      division. Set to `None` to skip the cast.
+
+  Returns:
+    the standardized image.
+
+  Raises:
+    ValueError: If the static rank of `image_bytes` is not three (this
+      includes an unknown rank), or if `len(stddev)` differs from
+      `num_channels`.
+  """
+  if image_bytes.get_shape().ndims != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+
+  if len(stddev) != num_channels:
+    raise ValueError('len(stddev) must match the number of channels')
+
+  # We have a 1-D tensor of stddev; convert to 3-D.
+  # Note(b/130245863): we explicitly call `broadcast` instead of simply
+  # expanding dimensions for better performance.
+  stddev = tf.broadcast_to(stddev, tf.shape(image_bytes))
+  if dtype is not None:
+    stddev = tf.cast(stddev, dtype=dtype)
+
+  return image_bytes / stddev
+
+
+def normalize_images(features: tf.Tensor,
+                     mean_rgb: Tuple[float, ...] = MEAN_RGB,
+                     stddev_rgb: Tuple[float, ...] = STDDEV_RGB,
+                     num_channels: int = 3,
+                     dtype: tf.dtypes.DType = tf.float32,
+                     data_format: Text = 'channels_last') -> tf.Tensor:
+  """Normalizes the input image channels with the given mean and stddev.
+
+  Args:
+    features: `Tensor` representing decoded images in float format.
+    mean_rgb: the mean of the channels to subtract.
+    stddev_rgb: the stddev of the channels to divide.
+    num_channels: the number of channels in the input image tensor.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+    data_format: the format of the input image tensor
+                 ['channels_first', 'channels_last'].
+
+  Returns:
+    A normalized image `Tensor`.
+  """
+  # TODO(allencwang) - figure out how to use mean_image_subtraction and
+  # standardize_image on batches of images and replace the following.
+  if data_format == 'channels_first':
+    stats_shape = [num_channels, 1, 1]
+  else:
+    stats_shape = [1, 1, num_channels]
+
+  if dtype is not None:
+    features = tf.image.convert_image_dtype(features, dtype=dtype)
+
+  if mean_rgb is not None:
+    mean_rgb = tf.constant(mean_rgb,
+                           shape=stats_shape,
+                           dtype=features.dtype)
+    mean_rgb = tf.broadcast_to(mean_rgb, tf.shape(features))
+    features = features - mean_rgb
+
+  if stddev_rgb is not None:
+    stddev_rgb = tf.constant(stddev_rgb,
+                             shape=stats_shape,
+                             dtype=features.dtype)
+    stddev_rgb = tf.broadcast_to(stddev_rgb, tf.shape(features))
+    features = features / stddev_rgb
+
+  return features
+
+
+def decode_and_center_crop(image_bytes: tf.Tensor,
+                           image_size: int = IMAGE_SIZE,
+                           crop_padding: int = CROP_PADDING) -> tf.Tensor:
+  """Crops to center of image with padding then scales image_size.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    image_size: image height/width dimension.
+    crop_padding: the padding size to use when centering the crop.
+
+  Returns:
+    A decoded and cropped image `Tensor`.
+  """
+  decoded = image_bytes.dtype != tf.string
+  shape = (tf.shape(image_bytes) if decoded
+           else tf.image.extract_jpeg_shape(image_bytes))
+  image_height = shape[0]
+  image_width = shape[1]
+
+  padded_center_crop_size = tf.cast(
+      ((image_size / (image_size + crop_padding)) *
+       tf.cast(tf.minimum(image_height, image_width), tf.float32)),
+      tf.int32)
+
+  offset_height = ((image_height - padded_center_crop_size) + 1) // 2
+  offset_width = ((image_width - padded_center_crop_size) + 1) // 2
+  crop_window = tf.stack([offset_height, offset_width,
+                          padded_center_crop_size, padded_center_crop_size])
+  if decoded:
+    image = tf.image.crop_to_bounding_box(
+        image_bytes,
+        offset_height=offset_height,
+        offset_width=offset_width,
+        target_height=padded_center_crop_size,
+        target_width=padded_center_crop_size)
+  else:
+    image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+
+  image = resize_image(image_bytes=image,
+                       height=image_size,
+                       width=image_size)
+
+  return image
+
+
+def decode_crop_and_flip(image_bytes: tf.Tensor) -> tf.Tensor:
+  """Crops an image to a random part of the image, then randomly flips.
+
+  The input may be an encoded JPEG string tensor or an already decoded image
+  tensor; any dtype other than `tf.string` is treated as decoded.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+
+  Returns:
+    A decoded and cropped image `Tensor`.
+
+  """
+  decoded = image_bytes.dtype != tf.string
+  # A single bounding box spanning the whole image ([0, 0, 1, 1]).
+  bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
+  # Encoded inputs expose their shape without a full decode.
+  shape = (tf.shape(image_bytes) if decoded
+           else tf.image.extract_jpeg_shape(image_bytes))
+  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+      shape,
+      bounding_boxes=bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=[0.75, 1.33],
+      area_range=[0.05, 1.0],
+      max_attempts=100,
+      use_image_if_no_bounding_boxes=True)
+  # The third returned value is not needed here.
+  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+
+  # Reassemble the bounding box in the format the crop op requires.
+  offset_height, offset_width, _ = tf.unstack(bbox_begin)
+  target_height, target_width, _ = tf.unstack(bbox_size)
+  crop_window = tf.stack([offset_height, offset_width,
+                          target_height, target_width])
+  if decoded:
+    cropped = tf.image.crop_to_bounding_box(
+        image_bytes,
+        offset_height=offset_height,
+        offset_width=offset_width,
+        target_height=target_height,
+        target_width=target_width)
+  else:
+    # Decode and crop in one op for encoded JPEG input.
+    cropped = tf.image.decode_and_crop_jpeg(image_bytes,
+                                            crop_window,
+                                            channels=3)
+
+  # Flip to add a little more random distortion in.
+  cropped = tf.image.random_flip_left_right(cropped)
+  return cropped
+
+
+def resize_image(image_bytes: tf.Tensor,
+                 height: int = IMAGE_SIZE,
+                 width: int = IMAGE_SIZE) -> tf.Tensor:
+  """Resizes an image to a given height and width.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    height: image height dimension.
+    width: image width dimension.
+
+  Returns:
+    A tensor containing the resized image.
+
+  """
+  print(height, width)
+  return tf.compat.v1.image.resize(
+      image_bytes,
+      tf.convert_to_tensor([height, width]),
+      method=tf.image.ResizeMethod.BILINEAR,
+      align_corners=False)
+
+
+def preprocess_for_eval(
+    image_bytes: tf.Tensor,
+    image_size: int = IMAGE_SIZE,
+    num_channels: int = 3,
+    mean_subtract: bool = False,
+    standardize: bool = False,
+    dtype: tf.dtypes.DType = tf.float32
+) -> tf.Tensor:
+  """Preprocesses the given image for evaluation.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    image_size: image height/width dimension.
+    num_channels: number of image input channels.
+    mean_subtract: whether or not to apply mean subtraction.
+    standardize: whether or not to apply standardization.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  images = decode_and_center_crop(image_bytes, image_size)
+  images = tf.reshape(images, [image_size, image_size, num_channels])
+
+  if mean_subtract:
+    images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
+  if standardize:
+    images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
+  if dtype is not None:
+    images = tf.image.convert_image_dtype(images, dtype=dtype)
+
+  return images
+
+
+def load_eval_image(filename: Text, image_size: int = IMAGE_SIZE) -> tf.Tensor:
+  """Reads an image from the filesystem and applies image preprocessing.
+
+  Args:
+    filename: a filename path of an image.
+    image_size: image height/width dimension.
+
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  image_bytes = tf.io.read_file(filename)
+  image = preprocess_for_eval(image_bytes, image_size)
+
+  return image
+
+
+def build_eval_dataset(filenames: List[Text],
+                       labels: Optional[List[int]] = None,
+                       image_size: int = IMAGE_SIZE,
+                       batch_size: int = 1) -> tf.Tensor:
+  """Builds a tf.data.Dataset from a list of filenames and labels.
+
+  Args:
+    filenames: a list of filename paths of images.
+    labels: a list of labels corresponding to each image.
+    image_size: image height/width dimension.
+    batch_size: the batch size used by the dataset
+
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  if labels is None:
+    labels = [0] * len(filenames)
+
+  filenames = tf.constant(filenames)
+  labels = tf.constant(labels)
+  dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
+
+  dataset = dataset.map(
+      lambda filename, label: (load_eval_image(filename, image_size), label))
+  dataset = dataset.batch(batch_size)
+
+  return dataset
+
+
+def preprocess_for_train(image_bytes: tf.Tensor,
+                         image_size: int = IMAGE_SIZE,
+                         augmenter: Optional[augment.ImageAugment] = None,
+                         mean_subtract: bool = False,
+                         standardize: bool = False,
+                         dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
+  """Preprocesses the given image for training.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of
+      arbitrary size of dtype tf.uint8.
+    image_size: image height/width dimension.
+    augmenter: the image augmenter to apply.
+    mean_subtract: whether or not to apply mean subtraction.
+    standardize: whether or not to apply standardization.
+    dtype: the dtype to convert the images to. Set to `None` to skip conversion.
+
+  Returns:
+    A preprocessed and normalized image `Tensor`.
+  """
+  images = decode_crop_and_flip(image_bytes=image_bytes)
+  images = resize_image(images, height=image_size, width=image_size)
+  if augmenter is not None:
+    images = augmenter.distort(images)
+  if mean_subtract:
+    images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
+  if standardize:
+    images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
+  if dtype is not None:
+    images = tf.image.convert_image_dtype(images, dtype)
+
+  return images
--- a/official/legacy/image_classification/resnet/README.md
+++ b/official/legacy/image_classification/resnet/README.md
+This folder contains a
+[custom training loop (CTL)](#resnet-custom-training-loop) implementation for
+ResNet50.
+
+## Before you begin
+Please refer to the [README](../README.md) in the parent directory for
+information on setup and preparing the data.
+
+## ResNet (custom training loop)
+
+Similar to the [estimator implementation](../../../r1/resnet), the Keras
+implementation has code for the ImageNet dataset. The ImageNet
+version uses a ResNet50 model implemented in
+[`resnet_model.py`](./resnet_model.py).
+
+
+### Pretrained Models
+
+* [ResNet50 Checkpoints](https://storage.googleapis.com/cloud-tpu-checkpoints/resnet/resnet50.tar.gz)
+
+* ResNet50 TFHub: [feature vector](https://tfhub.dev/tensorflow/resnet_50/feature_vector/1)
+and [classification](https://tfhub.dev/tensorflow/resnet_50/classification/1)
+
+Again, if you did not download the data to the default directory, specify the
+location with the `--data_dir` flag:
+
+```bash
+python3 resnet_ctl_imagenet_main.py --data_dir=/path/to/imagenet
+```
+
+There are more flag options you can specify. Here are some examples:
+
+- `--use_synthetic_data`: when set to true, synthetic data, rather than real
+data, are used;
+- `--batch_size`: the batch size used for the model;
+- `--model_dir`: the directory to save the model checkpoint;
+- `--train_epochs`: number of epochs to run for training the model;
+- `--train_steps`: number of steps to run for training the model. Currently,
+only values smaller than the number of batches in an epoch are supported;
+- `--skip_eval`: when set to true, evaluation as well as validation during
+training is skipped.
+
+For example, this is a typical command line to run with ImageNet data with
+batch size 128 per GPU:
+
+```bash
+python3 resnet_ctl_imagenet_main.py \
+    --model_dir=/tmp/model_dir/something \
+    --num_gpus=2 \
+    --batch_size=128 \
+    --train_epochs=90 \
+    --train_steps=10 \
+    --use_synthetic_data=false
+```
+
+See [`common.py`](common.py) for the full list of options.
+
+### Using multiple GPUs
+
+You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
+You can read more about them in this
+[guide](https://www.tensorflow.org/guide/distribute_strategy).
+
+In this example, we have made it easier to use with just a command line flag
+`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA,
+and 0 otherwise.
+
+- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device.
+- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device.
+- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous
+distributed training across the GPUs.
+
+If you wish to run without `tf.distribute.Strategy`, you can do so by setting
+`--distribution_strategy=off`.
+
+### Running on multiple GPU hosts
+
+You can also train these models on multiple hosts, each with GPUs, using
+`tf.distribute.Strategy`.
+
+The easiest way to run multi-host benchmarks is to set the
+[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
+appropriately at each host. For example, to run using `MultiWorkerMirroredStrategy` on
+2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
+host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
+"index": i}`.  `MultiWorkerMirroredStrategy` will automatically use all the
+available GPUs at each host.
+
+### Running on Cloud TPUs
+
+Note: This model will **not** work with TPUs on Colab.
+
+You can train the ResNet CTL model on Cloud TPUs using
+`tf.distribute.TPUStrategy`. If you are not familiar with Cloud TPUs, it is
+strongly recommended that you go through the
+[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
+create a TPU and GCE VM.
+
+To run the ResNet model on a TPU, you must set `--distribution_strategy=tpu` and
+`--tpu=$TPU_NAME`, where `$TPU_NAME` is the name of your TPU in the Cloud Console.
+From a GCE VM, you can run the following command to train ResNet for one epoch
+on a v2-8 or v3-8 TPU by setting `TRAIN_EPOCHS` to 1:
+
+```bash
+python3 resnet_ctl_imagenet_main.py \
+  --tpu=$TPU_NAME \
+  --model_dir=$MODEL_DIR \
+  --data_dir=$DATA_DIR \
+  --batch_size=1024 \
+  --steps_per_loop=500 \
+  --train_epochs=$TRAIN_EPOCHS \
+  --use_synthetic_data=false \
+  --dtype=fp32 \
+  --enable_eager=true \
+  --enable_tensorboard=true \
+  --distribution_strategy=tpu \
+  --log_steps=50 \
+  --single_l2_loss_op=true \
+  --use_tf_function=true
+```
+
+To train the ResNet to convergence, run it for 90 epochs by setting
+`TRAIN_EPOCHS` to 90.
+
+Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths.