Commit 1c32ebf2 authored by Fan Yang, committed by A. Unique TensorFlower

Internal change.

PiperOrigin-RevId: 421362994
parent ada0e36b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Configuration definitions for EfficientNet losses, learning rates, and optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from typing import Any, Mapping
import dataclasses
from official.modeling.hyperparams import base_config
from official.vision.image_classification.configs import base_configs
@dataclasses.dataclass
class EfficientNetModelConfig(base_configs.ModelConfig):
"""Configuration for the EfficientNet model.
This configuration will default to settings used for training efficientnet-b0
on a v3-8 TPU on ImageNet.
Attributes:
name: The name of the model. Defaults to 'EfficientNet'.
num_classes: The number of classes in the model.
model_params: A dictionary that represents the parameters of the
EfficientNet model. These will be passed in to the "from_name" function.
loss: The configuration for loss. Defaults to a categorical cross entropy
implementation.
optimizer: The configuration for optimizations. Defaults to an RMSProp
configuration.
learning_rate: The configuration for learning rate. Defaults to an
exponential configuration.
"""
name: str = 'EfficientNet'
num_classes: int = 1000
model_params: base_config.Config = dataclasses.field(
default_factory=lambda: {
'model_name': 'efficientnet-b0',
'model_weights_path': '',
'weights_format': 'saved_model',
'overrides': {
'batch_norm': 'default',
'rescale_input': True,
'num_classes': 1000,
'activation': 'swish',
'dtype': 'float32',
}
})
loss: base_configs.LossConfig = base_configs.LossConfig(
name='categorical_crossentropy', label_smoothing=0.1)
optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig(
name='rmsprop',
decay=0.9,
epsilon=0.001,
momentum=0.9,
moving_average_decay=None)
learning_rate: base_configs.LearningRateConfig = base_configs.LearningRateConfig( # pylint: disable=line-too-long
name='exponential',
initial_lr=0.008,
decay_epochs=2.4,
decay_rate=0.97,
warmup_epochs=5,
scale_by_batch_size=1. / 128.,
staircase=True)
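# A minimal usage sketch (assumes the `official` package imports above
# resolve): build the default config and override a field through the
# `replace` helper that `base_config.Config` provides. The 10-class
# head here is a hypothetical override.
_example_config = EfficientNetModelConfig().replace(num_classes=10)
assert _example_config.loss.name == 'categorical_crossentropy'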
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Contains definitions for EfficientNet model.
[1] Mingxing Tan, Quoc V. Le
EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks.
ICML'19, https://arxiv.org/abs/1905.11946
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
from typing import Any, Dict, Optional, Text, Tuple
from absl import logging
from dataclasses import dataclass
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.vision.image_classification import preprocessing
from official.vision.image_classification.efficientnet import common_modules
@dataclass
class BlockConfig(base_config.Config):
"""Config for a single MB Conv Block."""
input_filters: int = 0
output_filters: int = 0
kernel_size: int = 3
num_repeat: int = 1
expand_ratio: int = 1
strides: Tuple[int, int] = (1, 1)
se_ratio: Optional[float] = None
id_skip: bool = True
fused_conv: bool = False
conv_type: str = 'depthwise'
@dataclass
class ModelConfig(base_config.Config):
"""Default Config for Efficientnet-B0."""
width_coefficient: float = 1.0
depth_coefficient: float = 1.0
resolution: int = 224
dropout_rate: float = 0.2
blocks: Tuple[BlockConfig, ...] = (
# (input_filters, output_filters, kernel_size, num_repeat,
# expand_ratio, strides, se_ratio)
# pylint: disable=bad-whitespace
BlockConfig.from_args(32, 16, 3, 1, 1, (1, 1), 0.25),
BlockConfig.from_args(16, 24, 3, 2, 6, (2, 2), 0.25),
BlockConfig.from_args(24, 40, 5, 2, 6, (2, 2), 0.25),
BlockConfig.from_args(40, 80, 3, 3, 6, (2, 2), 0.25),
BlockConfig.from_args(80, 112, 5, 3, 6, (1, 1), 0.25),
BlockConfig.from_args(112, 192, 5, 4, 6, (2, 2), 0.25),
BlockConfig.from_args(192, 320, 3, 1, 6, (1, 1), 0.25),
# pylint: enable=bad-whitespace
)
stem_base_filters: int = 32
top_base_filters: int = 1280
activation: str = 'simple_swish'
batch_norm: str = 'default'
bn_momentum: float = 0.99
bn_epsilon: float = 1e-3
# While the original implementation used a weight decay of 1e-5,
# tf.nn.l2_loss divides it by 2, so we halve this to compensate in Keras
weight_decay: float = 5e-6
drop_connect_rate: float = 0.2
depth_divisor: int = 8
min_depth: Optional[int] = None
use_se: bool = True
input_channels: int = 3
num_classes: int = 1000
model_name: str = 'efficientnet'
rescale_input: bool = True
data_format: str = 'channels_last'
dtype: str = 'float32'
MODEL_CONFIGS = {
# (width, depth, resolution, dropout)
'efficientnet-b0': ModelConfig.from_args(1.0, 1.0, 224, 0.2),
'efficientnet-b1': ModelConfig.from_args(1.0, 1.1, 240, 0.2),
'efficientnet-b2': ModelConfig.from_args(1.1, 1.2, 260, 0.3),
'efficientnet-b3': ModelConfig.from_args(1.2, 1.4, 300, 0.3),
'efficientnet-b4': ModelConfig.from_args(1.4, 1.8, 380, 0.4),
'efficientnet-b5': ModelConfig.from_args(1.6, 2.2, 456, 0.4),
'efficientnet-b6': ModelConfig.from_args(1.8, 2.6, 528, 0.5),
'efficientnet-b7': ModelConfig.from_args(2.0, 3.1, 600, 0.5),
'efficientnet-b8': ModelConfig.from_args(2.2, 3.6, 672, 0.5),
'efficientnet-l2': ModelConfig.from_args(4.3, 5.3, 800, 0.5),
}
CONV_KERNEL_INITIALIZER = {
'class_name': 'VarianceScaling',
'config': {
'scale': 2.0,
'mode': 'fan_out',
# Note: this is a truncated normal distribution
'distribution': 'normal'
}
}
DENSE_KERNEL_INITIALIZER = {
'class_name': 'VarianceScaling',
'config': {
'scale': 1 / 3.0,
'mode': 'fan_out',
'distribution': 'uniform'
}
}
def round_filters(filters: int, config: ModelConfig) -> int:
"""Round number of filters based on width coefficient."""
width_coefficient = config.width_coefficient
min_depth = config.min_depth
divisor = config.depth_divisor
orig_filters = filters
if not width_coefficient:
return filters
filters *= width_coefficient
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
  # Make sure that rounding down does not reduce the filter count by
  # more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
logging.info('round_filter input=%s output=%s', orig_filters, new_filters)
return int(new_filters)
def round_repeats(repeats: int, depth_coefficient: float) -> int:
"""Round number of repeats based on depth coefficient."""
return int(math.ceil(depth_coefficient * repeats))
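# Worked example of the two rounding rules above, in plain Python: the
# filter count snaps to the nearest multiple of `depth_divisor`, and if
# that rounds down by more than 10%, one divisor is added back.
def _round_filters_demo(filters, width_coefficient, divisor=8):
  filters *= width_coefficient
  new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
  if new_filters < 0.9 * filters:
    new_filters += divisor
  return int(new_filters)
assert _round_filters_demo(32, 1.0) == 32  # efficientnet-b0: unchanged
assert _round_filters_demo(32, 1.4) == 48  # efficientnet-b4: 44.8 -> 48
assert _round_filters_demo(10, 1.0) == 16  # 10 -> 8 loses >10%, so 8 + 8
assert round_repeats(2, 1.2) == 3  # ceil(1.2 * 2) = 3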
def conv2d_block(inputs: tf.Tensor,
conv_filters: Optional[int],
config: ModelConfig,
kernel_size: Any = (1, 1),
strides: Any = (1, 1),
use_batch_norm: bool = True,
use_bias: bool = False,
activation: Optional[Any] = None,
depthwise: bool = False,
name: Optional[Text] = None):
"""A conv2d followed by batch norm and an activation."""
batch_norm = common_modules.get_batch_norm(config.batch_norm)
bn_momentum = config.bn_momentum
bn_epsilon = config.bn_epsilon
data_format = tf.keras.backend.image_data_format()
weight_decay = config.weight_decay
name = name or ''
# Collect args based on what kind of conv2d block is desired
init_kwargs = {
'kernel_size': kernel_size,
'strides': strides,
'use_bias': use_bias,
'padding': 'same',
'name': name + '_conv2d',
'kernel_regularizer': tf.keras.regularizers.l2(weight_decay),
'bias_regularizer': tf.keras.regularizers.l2(weight_decay),
}
if depthwise:
conv2d = tf.keras.layers.DepthwiseConv2D
init_kwargs.update({'depthwise_initializer': CONV_KERNEL_INITIALIZER})
else:
conv2d = tf.keras.layers.Conv2D
init_kwargs.update({
'filters': conv_filters,
'kernel_initializer': CONV_KERNEL_INITIALIZER
})
x = conv2d(**init_kwargs)(inputs)
if use_batch_norm:
bn_axis = 1 if data_format == 'channels_first' else -1
x = batch_norm(
axis=bn_axis,
momentum=bn_momentum,
epsilon=bn_epsilon,
name=name + '_bn')(
x)
if activation is not None:
x = tf.keras.layers.Activation(activation, name=name + '_activation')(x)
return x
def mb_conv_block(inputs: tf.Tensor,
block: BlockConfig,
config: ModelConfig,
prefix: Optional[Text] = None):
"""Mobile Inverted Residual Bottleneck.
Args:
inputs: the Keras input to the block
block: BlockConfig, arguments to create a Block
config: ModelConfig, a set of model parameters
prefix: prefix for naming all layers
Returns:
the output of the block
"""
use_se = config.use_se
activation = tf_utils.get_activation(config.activation)
drop_connect_rate = config.drop_connect_rate
data_format = tf.keras.backend.image_data_format()
use_depthwise = block.conv_type != 'no_depthwise'
prefix = prefix or ''
filters = block.input_filters * block.expand_ratio
x = inputs
if block.fused_conv:
# If we use fused mbconv, skip expansion and use regular conv.
x = conv2d_block(
x,
filters,
config,
kernel_size=block.kernel_size,
strides=block.strides,
activation=activation,
name=prefix + 'fused')
else:
if block.expand_ratio != 1:
# Expansion phase
kernel_size = (1, 1) if use_depthwise else (3, 3)
x = conv2d_block(
x,
filters,
config,
kernel_size=kernel_size,
activation=activation,
name=prefix + 'expand')
# Depthwise Convolution
if use_depthwise:
x = conv2d_block(
x,
conv_filters=None,
config=config,
kernel_size=block.kernel_size,
strides=block.strides,
activation=activation,
depthwise=True,
name=prefix + 'depthwise')
# Squeeze and Excitation phase
if use_se:
assert block.se_ratio is not None
assert 0 < block.se_ratio <= 1
num_reduced_filters = max(1, int(block.input_filters * block.se_ratio))
if data_format == 'channels_first':
se_shape = (filters, 1, 1)
else:
se_shape = (1, 1, filters)
se = tf.keras.layers.GlobalAveragePooling2D(name=prefix + 'se_squeeze')(x)
se = tf.keras.layers.Reshape(se_shape, name=prefix + 'se_reshape')(se)
se = conv2d_block(
se,
num_reduced_filters,
config,
use_bias=True,
use_batch_norm=False,
activation=activation,
name=prefix + 'se_reduce')
se = conv2d_block(
se,
filters,
config,
use_bias=True,
use_batch_norm=False,
activation='sigmoid',
name=prefix + 'se_expand')
x = tf.keras.layers.multiply([x, se], name=prefix + 'se_excite')
# Output phase
x = conv2d_block(
x, block.output_filters, config, activation=None, name=prefix + 'project')
# Add identity so that quantization-aware training can insert quantization
# ops correctly.
x = tf.keras.layers.Activation(
tf_utils.get_activation('identity'), name=prefix + 'id')(
x)
if (block.id_skip and all(s == 1 for s in block.strides) and
block.input_filters == block.output_filters):
if drop_connect_rate and drop_connect_rate > 0:
# Apply dropconnect
# The only difference between dropout and dropconnect in TF is scaling by
# drop_connect_rate during training. See:
# https://github.com/keras-team/keras/pull/9898#issuecomment-380577612
x = tf.keras.layers.Dropout(
drop_connect_rate, noise_shape=(None, 1, 1, 1), name=prefix + 'drop')(
x)
x = tf.keras.layers.add([x, inputs], name=prefix + 'add')
return x
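# Hedged usage sketch: a single MBConv block matching the first b0
# stage above, applied to a symbolic feature map. The names here are
# illustrative only.
def _example_mb_conv_block() -> tf.Tensor:
  block = BlockConfig.from_args(32, 16, 3, 1, 1, (1, 1), 0.25)
  features = tf.keras.layers.Input(shape=(112, 112, 32))
  return mb_conv_block(features, block, ModelConfig(), prefix='demo/')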
def efficientnet(image_input: tf.keras.layers.Input, config: ModelConfig): # pytype: disable=invalid-annotation # typed-keras
"""Creates an EfficientNet graph given the model parameters.
This function is wrapped by the `EfficientNet` class to make a tf.keras.Model.
Args:
image_input: the input batch of images
config: the model config
Returns:
the output of efficientnet
"""
depth_coefficient = config.depth_coefficient
blocks = config.blocks
stem_base_filters = config.stem_base_filters
top_base_filters = config.top_base_filters
activation = tf_utils.get_activation(config.activation)
dropout_rate = config.dropout_rate
drop_connect_rate = config.drop_connect_rate
num_classes = config.num_classes
input_channels = config.input_channels
rescale_input = config.rescale_input
data_format = tf.keras.backend.image_data_format()
dtype = config.dtype
weight_decay = config.weight_decay
x = image_input
if data_format == 'channels_first':
# Happens on GPU/TPU if available.
x = tf.keras.layers.Permute((3, 1, 2))(x)
if rescale_input:
x = preprocessing.normalize_images(
x, num_channels=input_channels, dtype=dtype, data_format=data_format)
# Build stem
x = conv2d_block(
x,
round_filters(stem_base_filters, config),
config,
kernel_size=[3, 3],
strides=[2, 2],
activation=activation,
name='stem')
# Build blocks
num_blocks_total = sum(
round_repeats(block.num_repeat, depth_coefficient) for block in blocks)
block_num = 0
for stack_idx, block in enumerate(blocks):
assert block.num_repeat > 0
# Update block input and output filters based on depth multiplier
block = block.replace(
input_filters=round_filters(block.input_filters, config),
output_filters=round_filters(block.output_filters, config),
num_repeat=round_repeats(block.num_repeat, depth_coefficient))
# The first block needs to take care of stride and filter size increase
drop_rate = drop_connect_rate * float(block_num) / num_blocks_total
config = config.replace(drop_connect_rate=drop_rate)
block_prefix = 'stack_{}/block_0/'.format(stack_idx)
x = mb_conv_block(x, block, config, block_prefix)
block_num += 1
if block.num_repeat > 1:
block = block.replace(input_filters=block.output_filters, strides=[1, 1])
for block_idx in range(block.num_repeat - 1):
drop_rate = drop_connect_rate * float(block_num) / num_blocks_total
config = config.replace(drop_connect_rate=drop_rate)
block_prefix = 'stack_{}/block_{}/'.format(stack_idx, block_idx + 1)
x = mb_conv_block(x, block, config, prefix=block_prefix)
block_num += 1
# Build top
x = conv2d_block(
x,
round_filters(top_base_filters, config),
config,
activation=activation,
name='top')
# Build classifier
x = tf.keras.layers.GlobalAveragePooling2D(name='top_pool')(x)
if dropout_rate and dropout_rate > 0:
x = tf.keras.layers.Dropout(dropout_rate, name='top_dropout')(x)
x = tf.keras.layers.Dense(
num_classes,
kernel_initializer=DENSE_KERNEL_INITIALIZER,
kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
bias_regularizer=tf.keras.regularizers.l2(weight_decay),
name='logits')(
x)
x = tf.keras.layers.Activation('softmax', name='probs')(x)
return x
class EfficientNet(tf.keras.Model):
"""Wrapper class for an EfficientNet Keras model.
Contains helper methods to build, manage, and save metadata about the model.
"""
def __init__(self,
config: Optional[ModelConfig] = None,
overrides: Optional[Dict[Text, Any]] = None):
"""Create an EfficientNet model.
Args:
config: (optional) the main model parameters to create the model
overrides: (optional) a dict containing keys that can override config
"""
overrides = overrides or {}
config = config or ModelConfig()
self.config = config.replace(**overrides)
input_channels = self.config.input_channels
model_name = self.config.model_name
input_shape = (None, None, input_channels) # Should handle any size image
image_input = tf.keras.layers.Input(shape=input_shape)
output = efficientnet(image_input, self.config)
# Cast to float32 in case we have a different model dtype
output = tf.cast(output, tf.float32)
logging.info('Building model %s with params %s', model_name, self.config)
super(EfficientNet, self).__init__(
inputs=image_input, outputs=output, name=model_name)
@classmethod
def from_name(cls,
model_name: Text,
model_weights_path: Optional[Text] = None,
weights_format: Text = 'saved_model',
overrides: Optional[Dict[Text, Any]] = None):
"""Construct an EfficientNet model from a predefined model name.
E.g., `EfficientNet.from_name('efficientnet-b0')`.
Args:
model_name: the predefined model name
model_weights_path: the path to the weights (h5 file or saved model dir)
weights_format: the model weights format. One of 'saved_model', 'h5', or
'checkpoint'.
overrides: (optional) a dict containing keys that can override config
Returns:
A constructed EfficientNet instance.
"""
model_configs = dict(MODEL_CONFIGS)
overrides = dict(overrides) if overrides else {}
# One can define their own custom models if necessary
model_configs.update(overrides.pop('model_config', {}))
if model_name not in model_configs:
raise ValueError('Unknown model name {}'.format(model_name))
config = model_configs[model_name]
model = cls(config=config, overrides=overrides)
if model_weights_path:
common_modules.load_weights(
model, model_weights_path, weights_format=weights_format)
return model
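# A minimal usage sketch (the weights path is hypothetical): construct
# a b0 variant with a smaller head, optionally restoring pretrained
# weights in SavedModel format.
def _example_from_name() -> 'EfficientNet':
  return EfficientNet.from_name(
      'efficientnet-b0',
      model_weights_path='',  # e.g. '/path/to/saved_model' to restore
      overrides={'num_classes': 10, 'dtype': 'float32'})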
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A script to export TF-Hub SavedModel."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
import tensorflow as tf
from official.vision.image_classification.efficientnet import efficientnet_model
FLAGS = flags.FLAGS
flags.DEFINE_string("model_name", None, "EfficientNet model name.")
flags.DEFINE_string("model_path", None, "File path to TF model checkpoint.")
flags.DEFINE_string("export_path", None,
"TF-Hub SavedModel destination path to export.")
def export_tfhub(model_path, hub_destination, model_name):
"""Restores a tf.keras.Model and saves for TF-Hub."""
model_configs = dict(efficientnet_model.MODEL_CONFIGS)
config = model_configs[model_name]
image_input = tf.keras.layers.Input(
shape=(None, None, 3), name="image_input", dtype=tf.float32)
x = image_input * 255.0
  outputs = efficientnet_model.efficientnet(x, config)
  hub_model = tf.keras.Model(image_input, outputs)
ckpt = tf.train.Checkpoint(model=hub_model)
ckpt.restore(model_path).assert_existing_objects_matched()
hub_model.save(
os.path.join(hub_destination, "classification"), include_optimizer=False)
feature_vector_output = hub_model.get_layer(name="top_pool").get_output_at(0)
hub_model2 = tf.keras.Model(image_input, feature_vector_output)
hub_model2.save(
os.path.join(hub_destination, "feature-vector"), include_optimizer=False)
def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
export_tfhub(FLAGS.model_path, FLAGS.export_path, FLAGS.model_name)
if __name__ == "__main__":
app.run(main)
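# Hedged usage sketch (paths are hypothetical): the exporter can also
# be driven directly from Python rather than through absl flags.
def _example_export():
  export_tfhub(
      model_path="/tmp/efficientnet-b0/ckpt-42",
      hub_destination="/tmp/efficientnet-b0-hub",
      model_name="efficientnet-b0")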
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Learning rate utilities for vision tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from typing import Any, Mapping, Optional
import numpy as np
import tensorflow as tf
BASE_LEARNING_RATE = 0.1
class WarmupDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
"""A wrapper for LearningRateSchedule that includes warmup steps."""
def __init__(self,
lr_schedule: tf.keras.optimizers.schedules.LearningRateSchedule,
warmup_steps: int,
warmup_lr: Optional[float] = None):
"""Add warmup decay to a learning rate schedule.
Args:
lr_schedule: base learning rate scheduler
warmup_steps: number of warmup steps
warmup_lr: an optional field for the final warmup learning rate. This
should be provided if the base `lr_schedule` does not contain this
field.
"""
super(WarmupDecaySchedule, self).__init__()
self._lr_schedule = lr_schedule
self._warmup_steps = warmup_steps
self._warmup_lr = warmup_lr
def __call__(self, step: int):
lr = self._lr_schedule(step)
if self._warmup_steps:
if self._warmup_lr is not None:
initial_learning_rate = tf.convert_to_tensor(
self._warmup_lr, name="initial_learning_rate")
else:
initial_learning_rate = tf.convert_to_tensor(
self._lr_schedule.initial_learning_rate,
name="initial_learning_rate")
dtype = initial_learning_rate.dtype
global_step_recomp = tf.cast(step, dtype)
warmup_steps = tf.cast(self._warmup_steps, dtype)
warmup_lr = initial_learning_rate * global_step_recomp / warmup_steps
lr = tf.cond(global_step_recomp < warmup_steps, lambda: warmup_lr,
lambda: lr)
return lr
def get_config(self) -> Mapping[str, Any]:
config = self._lr_schedule.get_config()
config.update({
"warmup_steps": self._warmup_steps,
"warmup_lr": self._warmup_lr,
})
return config
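# Hedged usage sketch: wrap an exponential schedule with 5 linear
# warmup steps. During warmup the rate ramps linearly toward the base
# schedule's initial_learning_rate: step 1 -> 0.02, step 4 -> 0.08.
_example_warmup = WarmupDecaySchedule(
    tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.1, decay_steps=1000, decay_rate=0.97),
    warmup_steps=5)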
class CosineDecayWithWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Class to generate learning rate tensor."""
def __init__(self, batch_size: int, total_steps: int, warmup_steps: int):
"""Creates the consine learning rate tensor with linear warmup.
Args:
batch_size: The training batch size used in the experiment.
total_steps: Total training steps.
warmup_steps: Steps for the warm up period.
"""
super(CosineDecayWithWarmup, self).__init__()
base_lr_batch_size = 256
self._total_steps = total_steps
self._init_learning_rate = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
self._warmup_steps = warmup_steps
def __call__(self, global_step: int):
global_step = tf.cast(global_step, dtype=tf.float32)
warmup_steps = self._warmup_steps
init_lr = self._init_learning_rate
total_steps = self._total_steps
linear_warmup = global_step / warmup_steps * init_lr
cosine_learning_rate = init_lr * (tf.cos(np.pi *
(global_step - warmup_steps) /
(total_steps - warmup_steps)) +
1.0) / 2.0
learning_rate = tf.where(global_step < warmup_steps, linear_warmup,
cosine_learning_rate)
return learning_rate
def get_config(self):
return {
"total_steps": self._total_steps,
"warmup_learning_rate": self._warmup_learning_rate,
"warmup_steps": self._warmup_steps,
"init_learning_rate": self._init_learning_rate,
}
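# Worked example of the schedule above: batch_size=256 gives
# init_lr = BASE_LEARNING_RATE = 0.1. With total_steps=3 and
# warmup_steps=1, the emitted rates are:
#   step 0: 0/1 * 0.1                   = 0.0   (linear warmup)
#   step 1: 0.1 * (cos(0) + 1) / 2      = 0.1
#   step 2: 0.1 * (cos(pi / 2) + 1) / 2 = 0.05
#   step 3: 0.1 * (cos(pi) + 1) / 2     = 0.0
_example_cosine = CosineDecayWithWarmup(
    batch_size=256, total_steps=3, warmup_steps=1)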
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for learning_rate."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.image_classification import learning_rate
class LearningRateTests(tf.test.TestCase):
def test_warmup_decay(self):
"""Basic computational test for warmup decay."""
initial_lr = 0.01
decay_steps = 100
decay_rate = 0.01
warmup_steps = 10
base_lr = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=initial_lr,
decay_steps=decay_steps,
decay_rate=decay_rate)
lr = learning_rate.WarmupDecaySchedule(
lr_schedule=base_lr, warmup_steps=warmup_steps)
for step in range(warmup_steps - 1):
config = lr.get_config()
self.assertEqual(config['warmup_steps'], warmup_steps)
self.assertAllClose(
self.evaluate(lr(step)), step / warmup_steps * initial_lr)
def test_cosine_decay_with_warmup(self):
"""Basic computational test for cosine decay with warmup."""
expected_lrs = [0.0, 0.1, 0.05, 0.0]
lr = learning_rate.CosineDecayWithWarmup(
batch_size=256, total_steps=3, warmup_steps=1)
for step in [0, 1, 2, 3]:
self.assertAllClose(lr(step), expected_lrs[step])
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs a simple model on the MNIST dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
from official.common import distribute_utils
from official.utils.flags import core as flags_core
from official.utils.misc import model_helpers
from official.vision.image_classification.resnet import common
FLAGS = flags.FLAGS
def build_model():
"""Constructs the ML model used to predict handwritten digits."""
image = tf.keras.layers.Input(shape=(28, 28, 1))
y = tf.keras.layers.Conv2D(filters=32,
kernel_size=5,
padding='same',
activation='relu')(image)
y = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
strides=(2, 2),
padding='same')(y)
y = tf.keras.layers.Conv2D(filters=32,
kernel_size=5,
padding='same',
activation='relu')(y)
y = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
strides=(2, 2),
padding='same')(y)
y = tf.keras.layers.Flatten()(y)
y = tf.keras.layers.Dense(1024, activation='relu')(y)
y = tf.keras.layers.Dropout(0.4)(y)
probs = tf.keras.layers.Dense(10, activation='softmax')(y)
model = tf.keras.models.Model(image, probs, name='mnist')
return model
@tfds.decode.make_decoder(output_dtype=tf.float32)
def decode_image(example, feature):
"""Convert image to float32 and normalize from [0, 255] to [0.0, 1.0]."""
return tf.cast(feature.decode_example(example), dtype=tf.float32) / 255
def run(flags_obj, datasets_override=None, strategy_override=None):
"""Run MNIST model training and eval loop using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
datasets_override: A pair of `tf.data.Dataset` objects to train the model,
representing the train and test sets.
strategy_override: A `tf.distribute.Strategy` object to use for model.
Returns:
Dictionary of training and eval stats.
"""
# Start TF profiler server.
tf.profiler.experimental.server.start(flags_obj.profiler_port)
strategy = strategy_override or distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
tpu_address=flags_obj.tpu)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
mnist = tfds.builder('mnist', data_dir=flags_obj.data_dir)
if flags_obj.download:
mnist.download_and_prepare()
mnist_train, mnist_test = datasets_override or mnist.as_dataset(
split=['train', 'test'],
decoders={'image': decode_image()}, # pylint: disable=no-value-for-parameter
as_supervised=True)
train_input_dataset = mnist_train.cache().repeat().shuffle(
buffer_size=50000).batch(flags_obj.batch_size)
eval_input_dataset = mnist_test.cache().repeat().batch(flags_obj.batch_size)
with strategy_scope:
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
0.05, decay_steps=100000, decay_rate=0.96)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
model = build_model()
model.compile(
optimizer=optimizer,
loss='sparse_categorical_crossentropy',
metrics=['sparse_categorical_accuracy'])
num_train_examples = mnist.info.splits['train'].num_examples
train_steps = num_train_examples // flags_obj.batch_size
train_epochs = flags_obj.train_epochs
ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}')
callbacks = [
tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True),
tf.keras.callbacks.TensorBoard(log_dir=flags_obj.model_dir),
]
num_eval_examples = mnist.info.splits['test'].num_examples
num_eval_steps = num_eval_examples // flags_obj.batch_size
history = model.fit(
train_input_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
callbacks=callbacks,
validation_steps=num_eval_steps,
validation_data=eval_input_dataset,
validation_freq=flags_obj.epochs_between_evals)
export_path = os.path.join(flags_obj.model_dir, 'saved_model')
model.save(export_path, include_optimizer=False)
eval_output = model.evaluate(
eval_input_dataset, steps=num_eval_steps, verbose=2)
stats = common.build_stats(history, eval_output, callbacks)
return stats
def define_mnist_flags():
"""Define command line flags for MNIST model."""
flags_core.define_base(
clean=True,
num_gpu=True,
train_epochs=True,
epochs_between_evals=True,
distribution_strategy=True)
flags_core.define_device()
flags_core.define_distribution()
flags.DEFINE_bool('download', True,
'Whether to download data to `--data_dir`.')
flags.DEFINE_integer('profiler_port', 9012,
'Port to start profiler server on.')
FLAGS.set_default('batch_size', 1024)
def main(_):
model_helpers.apply_clean(FLAGS)
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_mnist_flags()
app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test the Keras MNIST model on GPU."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.utils.testing import integration
from official.vision.image_classification import mnist_main
mnist_main.define_mnist_flags()
def eager_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],)
class KerasMnistTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for sample Keras MNIST model."""
_tempdir = None
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasMnistTest, cls).setUpClass()
def tearDown(self):
super(KerasMnistTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
@combinations.generate(eager_strategy_combinations())
def test_end_to_end(self, distribution):
"""Test Keras MNIST model with `strategy`."""
extra_flags = [
"-train_epochs",
"1",
# Let TFDS find the metadata folder automatically
"--data_dir="
]
dummy_data = (
tf.ones(shape=(10, 28, 28, 1), dtype=tf.int32),
tf.range(10),
)
datasets = (
tf.data.Dataset.from_tensor_slices(dummy_data),
tf.data.Dataset.from_tensor_slices(dummy_data),
)
run = functools.partial(
mnist_main.run,
datasets_override=datasets,
strategy_override=distribution)
integration.run_synthetic(
main=run,
synth=False,
tmp_root=self.create_tempdir().full_path,
extra_flags=extra_flags)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizer factory for vision tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from typing import Any, Dict, Optional, Text
from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa
from official.modeling import optimization
from official.vision.image_classification import learning_rate
from official.vision.image_classification.configs import base_configs
# pylint: disable=protected-access
def build_optimizer(
optimizer_name: Text,
base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
params: Dict[Text, Any],
model: Optional[tf.keras.Model] = None):
"""Build the optimizer based on name.
Args:
optimizer_name: String representation of the optimizer name. Examples: sgd,
momentum, rmsprop.
base_learning_rate: `tf.keras.optimizers.schedules.LearningRateSchedule`
base learning rate.
params: String -> Any dictionary representing the optimizer params. This
should contain optimizer specific parameters such as `base_learning_rate`,
`decay`, etc.
model: The `tf.keras.Model`. This is used for the shadow copy if using
`ExponentialMovingAverage`.
Returns:
A tf.keras.Optimizer.
Raises:
ValueError if the provided optimizer_name is not supported.
"""
optimizer_name = optimizer_name.lower()
logging.info('Building %s optimizer with params %s', optimizer_name, params)
if optimizer_name == 'sgd':
logging.info('Using SGD optimizer')
nesterov = params.get('nesterov', False)
optimizer = tf.keras.optimizers.SGD(
learning_rate=base_learning_rate, nesterov=nesterov)
elif optimizer_name == 'momentum':
logging.info('Using momentum optimizer')
nesterov = params.get('nesterov', False)
optimizer = tf.keras.optimizers.SGD(
learning_rate=base_learning_rate,
momentum=params['momentum'],
nesterov=nesterov)
elif optimizer_name == 'rmsprop':
logging.info('Using RMSProp')
rho = params.get('decay', None) or params.get('rho', 0.9)
momentum = params.get('momentum', 0.9)
epsilon = params.get('epsilon', 1e-07)
optimizer = tf.keras.optimizers.RMSprop(
learning_rate=base_learning_rate,
rho=rho,
momentum=momentum,
epsilon=epsilon)
elif optimizer_name == 'adam':
logging.info('Using Adam')
beta_1 = params.get('beta_1', 0.9)
beta_2 = params.get('beta_2', 0.999)
epsilon = params.get('epsilon', 1e-07)
optimizer = tf.keras.optimizers.Adam(
learning_rate=base_learning_rate,
beta_1=beta_1,
beta_2=beta_2,
epsilon=epsilon)
elif optimizer_name == 'adamw':
logging.info('Using AdamW')
weight_decay = params.get('weight_decay', 0.01)
beta_1 = params.get('beta_1', 0.9)
beta_2 = params.get('beta_2', 0.999)
epsilon = params.get('epsilon', 1e-07)
optimizer = tfa.optimizers.AdamW(
weight_decay=weight_decay,
learning_rate=base_learning_rate,
beta_1=beta_1,
beta_2=beta_2,
epsilon=epsilon)
else:
raise ValueError('Unknown optimizer %s' % optimizer_name)
if params.get('lookahead', None):
logging.info('Using lookahead optimizer.')
optimizer = tfa.optimizers.Lookahead(optimizer)
# Moving average should be applied last, as it's applied at test time
moving_average_decay = params.get('moving_average_decay', 0.)
if moving_average_decay is not None and moving_average_decay > 0.:
if model is None:
raise ValueError(
'`model` must be provided if using `ExponentialMovingAverage`.')
logging.info('Including moving average decay.')
optimizer = optimization.ExponentialMovingAverage(
optimizer=optimizer, average_decay=moving_average_decay)
optimizer.shadow_copy(model)
return optimizer
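# Hedged usage sketch: the RMSprop configuration used by the
# EfficientNet config in this commit ('decay' maps onto RMSprop's
# `rho` in the branch above).
def _example_rmsprop() -> tf.keras.optimizers.Optimizer:
  params = {'decay': 0.9, 'epsilon': 0.001, 'momentum': 0.9,
            'moving_average_decay': None, 'lookahead': None}
  return build_optimizer('rmsprop', base_learning_rate=0.008, params=params)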
def build_learning_rate(params: base_configs.LearningRateConfig,
batch_size: Optional[int] = None,
train_epochs: Optional[int] = None,
train_steps: Optional[int] = None):
"""Build the learning rate given the provided configuration."""
decay_type = params.name
base_lr = params.initial_lr
decay_rate = params.decay_rate
if params.decay_epochs is not None:
decay_steps = params.decay_epochs * train_steps
else:
decay_steps = 0
if params.warmup_epochs is not None:
warmup_steps = params.warmup_epochs * train_steps
else:
warmup_steps = 0
lr_multiplier = params.scale_by_batch_size
if lr_multiplier and lr_multiplier > 0:
# Scale the learning rate based on the batch size and a multiplier
base_lr *= lr_multiplier * batch_size
logging.info(
'Scaling the learning rate based on the batch size '
'multiplier. New base_lr: %f', base_lr)
if decay_type == 'exponential':
logging.info(
'Using exponential learning rate with: '
'initial_learning_rate: %f, decay_steps: %d, '
'decay_rate: %f', base_lr, decay_steps, decay_rate)
lr = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=base_lr,
decay_steps=decay_steps,
decay_rate=decay_rate,
staircase=params.staircase)
elif decay_type == 'stepwise':
steps_per_epoch = params.examples_per_epoch // batch_size
boundaries = [boundary * steps_per_epoch for boundary in params.boundaries]
multipliers = [batch_size * multiplier for multiplier in params.multipliers]
logging.info(
'Using stepwise learning rate. Parameters: '
'boundaries: %s, values: %s', boundaries, multipliers)
lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=boundaries, values=multipliers)
elif decay_type == 'cosine_with_warmup':
lr = learning_rate.CosineDecayWithWarmup(
batch_size=batch_size,
total_steps=train_epochs * train_steps,
warmup_steps=warmup_steps)
if warmup_steps > 0:
if decay_type not in ['cosine_with_warmup']:
logging.info('Applying %d warmup steps to the learning rate',
warmup_steps)
lr = learning_rate.WarmupDecaySchedule(
lr, warmup_steps, warmup_lr=base_lr)
return lr
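# Hedged usage sketch: the exponential schedule from the EfficientNet
# config in this commit, where `train_steps` means steps per epoch. A
# global batch of 4096 scales the base rate to
# 0.008 * (1 / 128) * 4096 = 0.256, and warmup_epochs=5 wraps the
# result in WarmupDecaySchedule. The epoch/step counts are
# illustrative only.
def _example_learning_rate():
  params = base_configs.LearningRateConfig(
      name='exponential', initial_lr=0.008, decay_epochs=2.4,
      decay_rate=0.97, warmup_epochs=5, scale_by_batch_size=1. / 128.,
      staircase=True)
  return build_learning_rate(params, batch_size=4096,
                             train_epochs=350, train_steps=312)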
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimizer_factory."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
def build_toy_model(self) -> tf.keras.Model:
"""Creates a toy `tf.Keras.Model`."""
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(1, input_shape=(1,)))
return model
@parameterized.named_parameters(
('sgd', 'sgd', 0., False), ('momentum', 'momentum', 0., False),
('rmsprop', 'rmsprop', 0., False), ('adam', 'adam', 0., False),
('adamw', 'adamw', 0., False),
('momentum_lookahead', 'momentum', 0., True),
('sgd_ema', 'sgd', 0.999, False),
('momentum_ema', 'momentum', 0.999, False),
('rmsprop_ema', 'rmsprop', 0.999, False))
def test_optimizer(self, optimizer_name, moving_average_decay, lookahead):
"""Smoke test to be sure no syntax errors."""
model = self.build_toy_model()
params = {
'learning_rate': 0.001,
'rho': 0.09,
'momentum': 0.,
'epsilon': 1e-07,
'moving_average_decay': moving_average_decay,
'lookahead': lookahead,
}
optimizer = optimizer_factory.build_optimizer(
optimizer_name=optimizer_name,
base_learning_rate=params['learning_rate'],
params=params,
model=model)
self.assertTrue(issubclass(type(optimizer), tf.keras.optimizers.Optimizer))
def test_unknown_optimizer(self):
with self.assertRaises(ValueError):
optimizer_factory.build_optimizer(
optimizer_name='this_optimizer_does_not_exist',
base_learning_rate=None,
params=None)
def test_learning_rate_without_decay_or_warmups(self):
params = base_configs.LearningRateConfig(
name='exponential',
initial_lr=0.01,
decay_rate=0.01,
decay_epochs=None,
warmup_epochs=None,
scale_by_batch_size=0.01,
examples_per_epoch=1,
boundaries=[0],
multipliers=[0, 1])
batch_size = 1
train_steps = 1
lr = optimizer_factory.build_learning_rate(
params=params, batch_size=batch_size, train_steps=train_steps)
self.assertTrue(
issubclass(
type(lr), tf.keras.optimizers.schedules.LearningRateSchedule))
@parameterized.named_parameters(('exponential', 'exponential'),
('cosine_with_warmup', 'cosine_with_warmup'))
def test_learning_rate_with_decay_and_warmup(self, lr_decay_type):
"""Basic smoke test for syntax."""
params = base_configs.LearningRateConfig(
name=lr_decay_type,
initial_lr=0.01,
decay_rate=0.01,
decay_epochs=1,
warmup_epochs=1,
scale_by_batch_size=0.01,
examples_per_epoch=1,
boundaries=[0],
multipliers=[0, 1])
batch_size = 1
train_epochs = 1
train_steps = 1
lr = optimizer_factory.build_learning_rate(
params=params,
batch_size=batch_size,
train_epochs=train_epochs,
train_steps=train_steps)
self.assertTrue(
issubclass(
type(lr), tf.keras.optimizers.schedules.LearningRateSchedule))
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Preprocessing functions for images."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from typing import List, Optional, Text, Tuple
from official.vision.image_classification import augment
# Calculated from the ImageNet training set
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
IMAGE_SIZE = 224
CROP_PADDING = 32
def mean_image_subtraction(
image_bytes: tf.Tensor,
means: Tuple[float, ...],
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32,
) -> tf.Tensor:
"""Subtracts the given means from each image channel.
For example:
means = [123.68, 116.779, 103.939]
image_bytes = mean_image_subtraction(image_bytes, means)
Note that the rank of `image` must be known.
Args:
image_bytes: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
num_channels: number of color channels in the image that will be distorted.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image_bytes.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
# We have a 1-D tensor of means; convert to 3-D.
# Note(b/130245863): we explicitly call `broadcast` instead of simply
# expanding dimensions for better performance.
means = tf.broadcast_to(means, tf.shape(image_bytes))
if dtype is not None:
means = tf.cast(means, dtype=dtype)
return image_bytes - means
def standardize_image(
image_bytes: tf.Tensor,
stddev: Tuple[float, ...],
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32,
) -> tf.Tensor:
"""Divides the given stddev from each image channel.
For example:
    stddev = [58.395, 57.12, 57.375]
image_bytes = standardize_image(image_bytes, stddev)
Note that the rank of `image` must be known.
Args:
image_bytes: a tensor of size [height, width, C].
stddev: a C-vector of values to divide from each channel.
num_channels: number of color channels in the image that will be distorted.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
    the standardized image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `stddev`.
"""
if image_bytes.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
if len(stddev) != num_channels:
raise ValueError('len(stddev) must match the number of channels')
# We have a 1-D tensor of stddev; convert to 3-D.
# Note(b/130245863): we explicitly call `broadcast` instead of simply
# expanding dimensions for better performance.
stddev = tf.broadcast_to(stddev, tf.shape(image_bytes))
if dtype is not None:
stddev = tf.cast(stddev, dtype=dtype)
return image_bytes / stddev
def normalize_images(features: tf.Tensor,
mean_rgb: Tuple[float, ...] = MEAN_RGB,
stddev_rgb: Tuple[float, ...] = STDDEV_RGB,
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32,
data_format: Text = 'channels_last') -> tf.Tensor:
"""Normalizes the input image channels with the given mean and stddev.
Args:
features: `Tensor` representing decoded images in float format.
mean_rgb: the mean of the channels to subtract.
stddev_rgb: the stddev of the channels to divide.
num_channels: the number of channels in the input image tensor.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
data_format: the format of the input image tensor
['channels_first', 'channels_last'].
Returns:
A normalized image `Tensor`.
"""
# TODO(allencwang) - figure out how to use mean_image_subtraction and
# standardize_image on batches of images and replace the following.
if data_format == 'channels_first':
stats_shape = [num_channels, 1, 1]
else:
stats_shape = [1, 1, num_channels]
if dtype is not None:
features = tf.image.convert_image_dtype(features, dtype=dtype)
if mean_rgb is not None:
mean_rgb = tf.constant(mean_rgb,
shape=stats_shape,
dtype=features.dtype)
mean_rgb = tf.broadcast_to(mean_rgb, tf.shape(features))
features = features - mean_rgb
if stddev_rgb is not None:
stddev_rgb = tf.constant(stddev_rgb,
shape=stats_shape,
dtype=features.dtype)
stddev_rgb = tf.broadcast_to(stddev_rgb, tf.shape(features))
features = features / stddev_rgb
return features
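# Hedged usage sketch: normalizing a batch of decoded float images
# (pixel values in [0, 255]) with the ImageNet statistics above.
def _example_normalize() -> tf.Tensor:
  batch = tf.random.uniform([8, 224, 224, 3], maxval=255.0)
  return normalize_images(batch)  # approximately zero mean, unit stddev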
def decode_and_center_crop(image_bytes: tf.Tensor,
image_size: int = IMAGE_SIZE,
crop_padding: int = CROP_PADDING) -> tf.Tensor:
"""Crops to center of image with padding then scales image_size.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
image_size: image height/width dimension.
crop_padding: the padding size to use when centering the crop.
Returns:
A decoded and cropped image `Tensor`.
"""
decoded = image_bytes.dtype != tf.string
shape = (tf.shape(image_bytes) if decoded
else tf.image.extract_jpeg_shape(image_bytes))
image_height = shape[0]
image_width = shape[1]
padded_center_crop_size = tf.cast(
((image_size / (image_size + crop_padding)) *
tf.cast(tf.minimum(image_height, image_width), tf.float32)),
tf.int32)
offset_height = ((image_height - padded_center_crop_size) + 1) // 2
offset_width = ((image_width - padded_center_crop_size) + 1) // 2
crop_window = tf.stack([offset_height, offset_width,
padded_center_crop_size, padded_center_crop_size])
if decoded:
image = tf.image.crop_to_bounding_box(
image_bytes,
offset_height=offset_height,
offset_width=offset_width,
target_height=padded_center_crop_size,
target_width=padded_center_crop_size)
else:
image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
image = resize_image(image_bytes=image,
height=image_size,
width=image_size)
return image
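# Worked example of the crop arithmetic above, in plain Python: for a
# 500x375 source and image_size=224, the centered crop is
# 224 / (224 + 32) * min(500, 375) = 0.875 * 375 -> 328 pixels square.
_h, _w, _size, _pad = 500, 375, 224, 32
_crop = int(_size / (_size + _pad) * min(_h, _w))
assert _crop == 328
assert ((_h - _crop + 1) // 2, (_w - _crop + 1) // 2) == (86, 24)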
def decode_crop_and_flip(image_bytes: tf.Tensor) -> tf.Tensor:
"""Crops an image to a random part of the image, then randomly flips.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
Returns:
A decoded and cropped image `Tensor`.
"""
decoded = image_bytes.dtype != tf.string
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
shape = (tf.shape(image_bytes) if decoded
else tf.image.extract_jpeg_shape(image_bytes))
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
shape,
bounding_boxes=bbox,
min_object_covered=0.1,
aspect_ratio_range=[0.75, 1.33],
area_range=[0.05, 1.0],
max_attempts=100,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, _ = sample_distorted_bounding_box
# Reassemble the bounding box in the format the crop op requires.
offset_height, offset_width, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_height, offset_width,
target_height, target_width])
if decoded:
cropped = tf.image.crop_to_bounding_box(
image_bytes,
offset_height=offset_height,
offset_width=offset_width,
target_height=target_height,
target_width=target_width)
else:
cropped = tf.image.decode_and_crop_jpeg(image_bytes,
crop_window,
channels=3)
# Flip to add a little more random distortion in.
cropped = tf.image.random_flip_left_right(cropped)
return cropped
def resize_image(image_bytes: tf.Tensor,
height: int = IMAGE_SIZE,
width: int = IMAGE_SIZE) -> tf.Tensor:
"""Resizes an image to a given height and width.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
height: image height dimension.
width: image width dimension.
Returns:
A tensor containing the resized image.
"""
return tf.compat.v1.image.resize(
image_bytes, [height, width], method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
def preprocess_for_eval(
image_bytes: tf.Tensor,
image_size: int = IMAGE_SIZE,
num_channels: int = 3,
mean_subtract: bool = False,
standardize: bool = False,
dtype: tf.dtypes.DType = tf.float32
) -> tf.Tensor:
"""Preprocesses the given image for evaluation.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
image_size: image height/width dimension.
num_channels: number of image input channels.
mean_subtract: whether or not to apply mean subtraction.
standardize: whether or not to apply standardization.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
A preprocessed and normalized image `Tensor`.
"""
images = decode_and_center_crop(image_bytes, image_size)
images = tf.reshape(images, [image_size, image_size, num_channels])
if mean_subtract:
images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
if standardize:
images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
if dtype is not None:
images = tf.image.convert_image_dtype(images, dtype=dtype)
return images
def load_eval_image(filename: Text, image_size: int = IMAGE_SIZE) -> tf.Tensor:
"""Reads an image from the filesystem and applies image preprocessing.
Args:
filename: a filename path of an image.
image_size: image height/width dimension.
Returns:
A preprocessed and normalized image `Tensor`.
"""
image_bytes = tf.io.read_file(filename)
image = preprocess_for_eval(image_bytes, image_size)
return image
def build_eval_dataset(filenames: List[Text],
labels: Optional[List[int]] = None,
image_size: int = IMAGE_SIZE,
                       batch_size: int = 1) -> tf.data.Dataset:
"""Builds a tf.data.Dataset from a list of filenames and labels.
Args:
filenames: a list of filename paths of images.
labels: a list of labels corresponding to each image.
image_size: image height/width dimension.
batch_size: the batch size used by the dataset
  Returns:
    A `tf.data.Dataset` of preprocessed, batched (image, label) pairs.
"""
if labels is None:
labels = [0] * len(filenames)
filenames = tf.constant(filenames)
labels = tf.constant(labels)
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(
lambda filename, label: (load_eval_image(filename, image_size), label))
dataset = dataset.batch(batch_size)
return dataset
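# Hedged usage sketch (file path and label are hypothetical):
def _example_eval_dataset() -> tf.data.Dataset:
  return build_eval_dataset(
      filenames=['/data/imagenet/val/image_0001.jpg'],
      labels=[65], image_size=224, batch_size=1)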
def preprocess_for_train(image_bytes: tf.Tensor,
image_size: int = IMAGE_SIZE,
augmenter: Optional[augment.ImageAugment] = None,
mean_subtract: bool = False,
standardize: bool = False,
dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
"""Preprocesses the given image for training.
Args:
image_bytes: `Tensor` representing an image binary of
arbitrary size of dtype tf.uint8.
image_size: image height/width dimension.
augmenter: the image augmenter to apply.
mean_subtract: whether or not to apply mean subtraction.
standardize: whether or not to apply standardization.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
A preprocessed and normalized image `Tensor`.
"""
images = decode_crop_and_flip(image_bytes=image_bytes)
images = resize_image(images, height=image_size, width=image_size)
if augmenter is not None:
images = augmenter.distort(images)
if mean_subtract:
images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
if standardize:
images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
if dtype is not None:
images = tf.image.convert_image_dtype(images, dtype)
return images
This folder contains a
[custom training loop (CTL)](#resnet-custom-training-loop) implementation for
ResNet50.
## Before you begin
Please refer to the [README](../README.md) in the parent directory for
information on setup and preparing the data.
## ResNet (custom training loop)
Similar to the [estimator implementation](../../../r1/resnet), the Keras
implementation has code for the ImageNet dataset. The ImageNet
version uses a ResNet50 model implemented in
[`resnet_model.py`](./resnet_model.py).
### Pretrained Models
* [ResNet50 Checkpoints](https://storage.googleapis.com/cloud-tpu-checkpoints/resnet/resnet50.tar.gz)
* ResNet50 TFHub: [feature vector](https://tfhub.dev/tensorflow/resnet_50/feature_vector/1)
and [classification](https://tfhub.dev/tensorflow/resnet_50/classification/1)
Again, if you did not download the data to the default directory, specify the
location with the `--data_dir` flag:
```bash
python3 resnet_ctl_imagenet_main.py --data_dir=/path/to/imagenet
```
There are more flag options you can specify. Here are some examples:
- `--use_synthetic_data`: when set to true, synthetic data, rather than real
data, are used;
- `--batch_size`: the batch size used for the model;
- `--model_dir`: the directory to save the model checkpoint;
- `--train_epochs`: number of epochs to run for training the model;
- `--train_steps`: number of steps to run for training the model. We now only
support a number that is smaller than the number of batches in an epoch.
- `--skip_eval`: when set to true, evaluation as well as validation during
training is skipped.
For example, this is a typical command line to run with ImageNet data with
batch size 128 per GPU:
```bash
python3 resnet_ctl_imagenet_main.py \
--model_dir=/tmp/model_dir/something \
--num_gpus=2 \
--batch_size=128 \
--train_epochs=90 \
--train_steps=10 \
--use_synthetic_data=false
```
See [`common.py`](common.py) for full list of options.
### Using multiple GPUs
You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
You can read more about them in this
[guide](https://www.tensorflow.org/guide/distribute_strategy).
In this example, we have made it easier to use with just the command line flag
`--num_gpus`. By default, this flag is 1 if TensorFlow is compiled with CUDA,
and 0 otherwise.
- `--num_gpus=0`: Uses `tf.distribute.OneDeviceStrategy` with CPU as the device.
- `--num_gpus=1`: Uses `tf.distribute.OneDeviceStrategy` with GPU as the device.
- `--num_gpus=2+`: Uses `tf.distribute.MirroredStrategy` to run synchronous
distributed training across the GPUs.
If you wish to run without `tf.distribute.Strategy`, you can do so by setting
`--distribution_strategy=off`.
### Running on multiple GPU hosts
You can also train these models on multiple hosts, each with GPUs, using
`tf.distribute.Strategy`.
The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
available GPUs at each host.
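For illustration (hostnames and ports below are placeholders), worker 0 of a
two-host cluster could set `TF_CONFIG` like this before launching the
training script:

```python
import json
import os

# Worker 1 would use the same cluster spec with "index": 1.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {'worker': ['host1:port', 'host2:port']},
    'task': {'type': 'worker', 'index': 0},
})
```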
### Running on Cloud TPUs
Note: This model will **not** work with TPUs on Colab.
You can train the ResNet CTL model on Cloud TPUs using
`tf.distribute.TPUStrategy`. If you are not familiar with Cloud TPUs, it is
strongly recommended that you go through the
[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
create a TPU and GCE VM.
To run the ResNet model on a TPU, you must set `--distribution_strategy=tpu`
and `--tpu=$TPU_NAME`, where `$TPU_NAME` is the name of your TPU in the Cloud
Console.
From a GCE VM, you can run the following command to train ResNet for one epoch
on a v2-8 or v3-8 TPU by setting `TRAIN_EPOCHS` to 1:
```bash
python3 resnet_ctl_imagenet_main.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--batch_size=1024 \
--steps_per_loop=500 \
--train_epochs=$TRAIN_EPOCHS \
--use_synthetic_data=false \
--dtype=fp32 \
--enable_eager=true \
--enable_tensorboard=true \
--distribution_strategy=tpu \
--log_steps=50 \
--single_l2_loss_op=true \
--use_tf_function=true
```
To train the ResNet to convergence, run it for 90 epochs by setting
`TRAIN_EPOCHS` to 90.
Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common util functions and classes used by both keras cifar and imagenet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
import tensorflow as tf
import tensorflow_model_optimization as tfmot
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
BASE_LEARNING_RATE = 0.1 # This matches Jing's version.
TRAIN_TOP_1 = 'training_accuracy_top_1'
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
class PiecewiseConstantDecayWithWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Piecewise constant decay with warmup schedule."""
def __init__(self,
batch_size,
epoch_size,
warmup_epochs,
boundaries,
multipliers,
compute_lr_on_cpu=True,
name=None):
super(PiecewiseConstantDecayWithWarmup, self).__init__()
if len(boundaries) != len(multipliers) - 1:
raise ValueError('The length of boundaries must be 1 less than the '
'length of multipliers')
base_lr_batch_size = 256
steps_per_epoch = epoch_size // batch_size
self.rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
self.step_boundaries = [float(steps_per_epoch) * x for x in boundaries]
self.lr_values = [self.rescaled_lr * m for m in multipliers]
self.warmup_steps = warmup_epochs * steps_per_epoch
self.compute_lr_on_cpu = compute_lr_on_cpu
self.name = name
self.learning_rate_ops_cache = {}
def __call__(self, step):
if tf.executing_eagerly():
return self._get_learning_rate(step)
# Inside a tf.function or graph, the current optimizer implementation
# repeatedly calls this schedule and thus creates new ops for it each time.
# To avoid this, we cache the ops when not executing eagerly.
graph = tf.compat.v1.get_default_graph()
if graph not in self.learning_rate_ops_cache:
if self.compute_lr_on_cpu:
with tf.device('/device:CPU:0'):
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
else:
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
return self.learning_rate_ops_cache[graph]
def _get_learning_rate(self, step):
"""Compute learning rate at given step."""
with tf.name_scope('PiecewiseConstantDecayWithWarmup'):
def warmup_lr(step):
return self.rescaled_lr * (
tf.cast(step, tf.float32) / tf.cast(self.warmup_steps, tf.float32))
def piecewise_lr(step):
return tf.compat.v1.train.piecewise_constant(step, self.step_boundaries,
self.lr_values)
return tf.cond(step < self.warmup_steps, lambda: warmup_lr(step),
lambda: piecewise_lr(step))
def get_config(self):
return {
'rescaled_lr': self.rescaled_lr,
'step_boundaries': self.step_boundaries,
'lr_values': self.lr_values,
'warmup_steps': self.warmup_steps,
'compute_lr_on_cpu': self.compute_lr_on_cpu,
'name': self.name
}
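# Illustrative usage (kept as comments so nothing runs at import time),
# assuming the ImageNet epoch size and batch_size=256 so rescaled_lr == 0.1:
#
#   schedule = PiecewiseConstantDecayWithWarmup(
#       batch_size=256,
#       epoch_size=1281167,
#       warmup_epochs=LR_SCHEDULE[0][1],
#       boundaries=[p[1] for p in LR_SCHEDULE[1:]],
#       multipliers=[p[0] for p in LR_SCHEDULE])
#   steps_per_epoch = 1281167 // 256  # 5004
#   schedule(tf.constant(0.0))                           # 0.0 (warmup start)
#   schedule(tf.constant(5.0 * steps_per_epoch))         # 0.1 (warmup done)
#   schedule(tf.constant(30.0 * steps_per_epoch + 1.0))  # 0.01 (first decay)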
def get_optimizer(learning_rate=0.1):
"""Returns optimizer to use."""
# The learning_rate is overwritten at the beginning of each step by callback.
return tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
def get_callbacks(pruning_method=None,
enable_checkpoint_and_export=False,
model_dir=None):
"""Returns common callbacks."""
time_callback = keras_utils.TimeHistory(
FLAGS.batch_size,
FLAGS.log_steps,
logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
callbacks = [time_callback]
if FLAGS.enable_tensorboard:
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=FLAGS.model_dir, profile_batch=FLAGS.profile_steps)
callbacks.append(tensorboard_callback)
is_pruning_enabled = pruning_method is not None
if is_pruning_enabled:
callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
if model_dir is not None:
callbacks.append(
tfmot.sparsity.keras.PruningSummaries(
log_dir=model_dir, profile_batch=0))
if enable_checkpoint_and_export:
if model_dir is not None:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(
tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True))
return callbacks
def build_stats(history, eval_output, callbacks):
"""Normalizes and returns dictionary of stats.
Args:
history: Results of the training step. Supports both categorical_accuracy
and sparse_categorical_accuracy.
eval_output: Output of the eval step. Assumes first value is eval_loss and
second value is accuracy_top_1.
callbacks: a list of callbacks which might include a time history callback
used during keras.fit.
Returns:
Dictionary of normalized results.
"""
stats = {}
if eval_output:
stats['accuracy_top_1'] = float(eval_output[1])
stats['eval_loss'] = float(eval_output[0])
if history and history.history:
train_hist = history.history
# Gets final loss from training.
stats['loss'] = float(train_hist['loss'][-1])
# Gets top_1 training accuracy.
if 'categorical_accuracy' in train_hist:
stats[TRAIN_TOP_1] = float(train_hist['categorical_accuracy'][-1])
elif 'sparse_categorical_accuracy' in train_hist:
stats[TRAIN_TOP_1] = float(train_hist['sparse_categorical_accuracy'][-1])
elif 'accuracy' in train_hist:
stats[TRAIN_TOP_1] = float(train_hist['accuracy'][-1])
if not callbacks:
return stats
# Look for the time history callback which was used during keras.fit
for callback in callbacks:
if isinstance(callback, keras_utils.TimeHistory):
timestamp_log = callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = callback.train_finish_time
if callback.epoch_runtime_log:
stats['avg_exp_per_second'] = callback.average_examples_per_second
return stats
def define_keras_flags(model=False,
optimizer=False,
pretrained_filepath=False):
"""Define flags for Keras models."""
flags_core.define_base(
clean=True,
num_gpu=True,
run_eagerly=True,
train_epochs=True,
epochs_between_evals=True,
distribution_strategy=True)
flags_core.define_performance(
num_parallel_calls=False,
synthetic_data=True,
dtype=True,
all_reduce_alg=True,
num_packs=True,
tf_gpu_thread_mode=True,
datasets_num_private_threads=True,
loss_scale=True,
fp16_implementation=True,
tf_data_experimental_slack=True,
enable_xla=True,
training_dataset_cache=True)
flags_core.define_image()
flags_core.define_benchmark()
flags_core.define_distribution()
flags.adopt_module_key_flags(flags_core)
flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
# TODO(b/135607288): Remove this flag once we understand the root cause of
# slowdown when setting the learning phase in Keras backend.
flags.DEFINE_boolean(
name='set_learning_phase_to_train',
default=True,
help='If skip eval, also set Keras learning phase to 1 (training).')
flags.DEFINE_boolean(
name='explicit_gpu_placement',
default=False,
help='If not using distribution strategy, explicitly set device scope '
'for the Keras training loop.')
flags.DEFINE_boolean(
name='use_trivial_model',
default=False,
help='Whether to use a trivial Keras model.')
flags.DEFINE_boolean(
name='report_accuracy_metrics',
default=True,
help='Report metrics during training and evaluation.')
flags.DEFINE_boolean(
name='use_tensor_lr',
default=True,
help='Use learning rate tensor instead of a callback.')
flags.DEFINE_boolean(
name='enable_tensorboard',
default=False,
help='Whether to enable Tensorboard callback.')
flags.DEFINE_string(
name='profile_steps',
default=None,
help='Save profiling data to model dir at given range of global steps. The '
'value must be a comma separated pair of positive integers, specifying '
'the first and last step to profile. For example, "--profile_steps=2,4" '
'triggers the profiler to process 3 steps, starting from the 2nd step. '
'Note that profiler has a non-trivial performance overhead, and the '
'output file can be gigantic if profiling many steps.')
flags.DEFINE_integer(
name='train_steps',
default=None,
help='The number of steps to run for training. If it is larger than '
'# batches per epoch, then use # batches per epoch. This flag will be '
'ignored if train_epochs is set to be larger than 1. ')
flags.DEFINE_boolean(
name='batchnorm_spatial_persistent',
default=True,
help='Enable the spatial persistent mode for the CuDNN batch norm kernel.')
flags.DEFINE_boolean(
name='enable_get_next_as_optional',
default=False,
help='Enable get_next_as_optional behavior in DistributedIterator.')
flags.DEFINE_boolean(
name='enable_checkpoint_and_export',
default=False,
help='Whether to enable a checkpoint callback and export the savedmodel.')
flags.DEFINE_string(name='tpu', default='', help='TPU address to connect to.')
flags.DEFINE_integer(
name='steps_per_loop',
default=None,
help='Number of steps per training loop. Only training steps happen '
'inside the loop. Callbacks will not be called inside. Will be capped at '
'steps per epoch.')
flags.DEFINE_boolean(
name='use_tf_while_loop',
default=True,
help='Whether to build a tf.while_loop inside the training loop on the '
'host. Setting it to True is critical to have peak performance on '
'TPU.')
if model:
flags.DEFINE_string('model', 'resnet50_v1.5',
'Name of model preset. (mobilenet, resnet50_v1.5)')
if optimizer:
flags.DEFINE_string(
'optimizer', 'resnet50_default', 'Name of optimizer preset. '
'(mobilenet_default, resnet50_default)')
# TODO(kimjaehong): Replace with general hyper-params, not only for mobilenet.
flags.DEFINE_float(
'initial_learning_rate_per_sample', 0.00007,
'Initial value of learning rate per sample for '
'mobilenet_default.')
flags.DEFINE_float('lr_decay_factor', 0.94,
'Learning rate decay factor for mobilenet_default.')
flags.DEFINE_float('num_epochs_per_decay', 2.5,
'Number of epochs per decay for mobilenet_default.')
if pretrained_filepath:
flags.DEFINE_string('pretrained_filepath', '', 'Pretrained file path.')
def get_synth_data(height, width, num_channels, num_classes, dtype):
"""Creates a set of synthetic random data.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
num_classes: Number of classes that should be represented in the fake labels
tensor.
dtype: Data type for features/images.
Returns:
A tuple of tensors representing the inputs and labels.
"""
# Synthetic input should be within [0, 255].
inputs = tf.random.truncated_normal([height, width, num_channels],
dtype=dtype,
mean=127,
stddev=60,
name='synthetic_inputs')
labels = tf.random.uniform([1],
minval=0,
maxval=num_classes - 1,
dtype=tf.int32,
name='synthetic_labels')
return inputs, labels
def define_pruning_flags():
"""Define flags for pruning methods."""
flags.DEFINE_string(
'pruning_method', None, 'Pruning method. '
'None (no pruning) or polynomial_decay.')
flags.DEFINE_float('pruning_initial_sparsity', 0.0,
'Initial sparsity for pruning.')
flags.DEFINE_float('pruning_final_sparsity', 0.5,
'Final sparsity for pruning.')
flags.DEFINE_integer('pruning_begin_step', 0, 'Begin step for pruning.')
flags.DEFINE_integer('pruning_end_step', 100000, 'End step for pruning.')
flags.DEFINE_integer('pruning_frequency', 100, 'Frequency for pruning.')
def define_clustering_flags():
"""Define flags for clustering methods."""
flags.DEFINE_string('clustering_method', None,
'None (no clustering) or selective_clustering '
'(cluster last three Conv2D layers of the model).')
def get_synth_input_fn(height,
width,
num_channels,
num_classes,
dtype=tf.float32,
drop_remainder=True):
"""Returns an input function that returns a dataset with random data.
This input_fn returns a data set that iterates over a set of random data and
bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
copy is still included. This is used to find the upper throughput bound when
tuning the full input pipeline.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
num_classes: Number of classes that should be represented in the fake labels
tensor.
dtype: Data type for features/images.
drop_remainder: A boolean indicates whether to drop the remainder of the
batches. If True, the batch dimension will be static.
Returns:
An input_fn that can be used in place of a real one to return a dataset
that can be used for iteration.
"""
# pylint: disable=unused-argument
def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
"""Returns dataset filled with random data."""
inputs, labels = get_synth_data(
height=height,
width=width,
num_channels=num_channels,
num_classes=num_classes,
dtype=dtype)
# Cast to float32 for Keras model.
labels = tf.cast(labels, dtype=tf.float32)
data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
# `drop_remainder` will make dataset produce outputs with known shapes.
data = data.batch(batch_size, drop_remainder=drop_remainder)
data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return data
return input_fn
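# Illustrative usage (kept as comments; values are placeholders): building a
# synthetic ImageNet-shaped dataset to measure the training loop without
# real input-pipeline costs:
#
#   synth_fn = get_synth_input_fn(height=224, width=224, num_channels=3,
#                                 num_classes=1000)
#   ds = synth_fn(is_training=True, data_dir=None, batch_size=32)
#   images, labels = next(iter(ds))
#   # images: (32, 224, 224, 3), labels: (32, 1)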
def set_cudnn_batchnorm_mode():
"""Set CuDNN batchnorm mode for better performance.
Note: Spatial Persistent mode may lead to accuracy losses for certain
models.
"""
if FLAGS.batchnorm_spatial_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
else:
os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Provides utilities to preprocess images.
Training images are sampled using the provided bounding boxes, and subsequently
cropped to the sampled bounding box. Images are additionally flipped randomly,
then resized to the target output size (without aspect-ratio preservation).
Images used during evaluation are resized (with aspect-ratio preservation) and
centrally cropped.
All images undergo mean color subtraction.
Note that these steps are colloquially referred to as "ResNet preprocessing,"
and they differ from "VGG preprocessing," which does not use bounding boxes
and instead does an aspect-preserving resize followed by random crop during
training. (These both differ from "Inception preprocessing," which introduces
color distortion steps.)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
DEFAULT_IMAGE_SIZE = 224
NUM_CHANNELS = 3
NUM_CLASSES = 1001
NUM_IMAGES = {
'train': 1281167,
'validation': 50000,
}
_NUM_TRAIN_FILES = 1024
_SHUFFLE_BUFFER = 10000
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
# The lower bound for the smallest side of the image for aspect-preserving
# resizing. For example, if an image is 500 x 1000, it will be resized to
# _RESIZE_MIN x (_RESIZE_MIN * 2).
_RESIZE_MIN = 256
def process_record_dataset(dataset,
is_training,
batch_size,
shuffle_buffer,
parse_record_fn,
dtype=tf.float32,
datasets_num_private_threads=None,
drop_remainder=False,
tf_data_experimental_slack=False):
"""Given a Dataset with raw records, return an iterator over the records.
Args:
dataset: A Dataset representing raw records
is_training: A boolean denoting whether the input is for training.
batch_size: The number of samples per batch.
shuffle_buffer: The buffer size to use when shuffling records. A larger
value results in better randomness, but smaller values reduce startup time
and use less memory.
parse_record_fn: A function that takes a raw record and returns the
corresponding (image, label) pair.
dtype: Data type to use for images/features.
datasets_num_private_threads: Number of threads for a private threadpool
created for all datasets computation.
drop_remainder: A boolean indicates whether to drop the remainder of the
batches. If True, the batch dimension will be static.
tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack`
option.
Returns:
Dataset of (image, label) pairs ready for iteration.
"""
# Defines a specific size thread pool for tf.data operations.
if datasets_num_private_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = (
datasets_num_private_threads)
dataset = dataset.with_options(options)
logging.info('datasets_num_private_threads: %s',
datasets_num_private_threads)
if is_training:
# Shuffles records before repeating to respect epoch boundaries.
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
# Repeats the dataset for the number of epochs to train.
dataset = dataset.repeat()
# Parses the raw records into images and labels.
dataset = dataset.map(
lambda value: parse_record_fn(value, is_training, dtype),
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
# Operations between the final prefetch and the get_next call to the iterator
# will happen synchronously during run time. We prefetch here again to
# background all of the above processing work and keep it out of the
# critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
# allows DistributionStrategies to adjust how many batches to fetch based
# on how many devices are present.
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
options = tf.data.Options()
options.experimental_slack = tf_data_experimental_slack
dataset = dataset.with_options(options)
return dataset
def get_filenames(is_training, data_dir):
"""Return filenames for dataset."""
if is_training:
return [
os.path.join(data_dir, 'train-%05d-of-01024' % i)
for i in range(_NUM_TRAIN_FILES)
]
else:
return [
os.path.join(data_dir, 'validation-%05d-of-00128' % i)
for i in range(128)
]
def parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
The output of the build_image_data.py image preprocessing script is a dataset
containing serialized Example protocol buffers. Each Example proto contains
the following fields (values are included as examples):
image/height: 462
image/width: 581
image/colorspace: 'RGB'
image/channels: 3
image/class/label: 615
image/class/synset: 'n03623198'
image/class/text: 'knee pad'
image/object/bbox/xmin: 0.1
image/object/bbox/xmax: 0.9
image/object/bbox/ymin: 0.2
image/object/bbox/ymax: 0.6
image/object/bbox/label: 615
image/format: 'JPEG'
image/filename: 'ILSVRC2012_val_00041207.JPEG'
image/encoded: <JPEG encoded string>
Args:
example_serialized: scalar Tensor tf.string containing a serialized Example
protocol buffer.
Returns:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
"""
# Dense features in Example proto.
feature_map = {
'image/encoded':
tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
'image/class/label':
tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1),
'image/class/text':
tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
}
sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
# Sparse features in Example proto.
feature_map.update({
k: sparse_float32 for k in [
'image/object/bbox/xmin', 'image/object/bbox/ymin',
'image/object/bbox/xmax', 'image/object/bbox/ymax'
]
})
features = tf.io.parse_single_example(
serialized=example_serialized, features=feature_map)
label = tf.cast(features['image/class/label'], dtype=tf.int32)
xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
# Note that we impose an ordering of (y, x) just to make life difficult.
bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
# Force the variable number of bounding boxes into the shape
# [1, num_boxes, coords].
bbox = tf.expand_dims(bbox, 0)
bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
return features['image/encoded'], label, bbox
def parse_record(raw_record, is_training, dtype):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
Args:
raw_record: scalar Tensor tf.string containing a serialized Example protocol
buffer.
is_training: A boolean denoting whether the input is for training.
dtype: data type to use for images/features.
Returns:
Tuple with processed image tensor in a channel-last format and
one-hot-encoded label tensor.
"""
image_buffer, label, bbox = parse_example_proto(raw_record)
image = preprocess_image(
image_buffer=image_buffer,
bbox=bbox,
output_height=DEFAULT_IMAGE_SIZE,
output_width=DEFAULT_IMAGE_SIZE,
num_channels=NUM_CHANNELS,
is_training=is_training)
image = tf.cast(image, dtype)
# Subtract one so that labels are in [0, 1000), and cast to float32 for
# Keras model.
label = tf.cast(
tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1,
dtype=tf.float32)
return image, label
def get_parse_record_fn(use_keras_image_data_format=False):
"""Get a function for parsing the records, accounting for image format.
This is useful for handling different types of Keras models. For instance,
the current resnet_model.resnet50 input format is always channel-last,
whereas the keras_applications mobilenet input format depends on
tf.keras.backend.image_data_format(). We should set
use_keras_image_data_format=False for the former and True for the latter.
Args:
use_keras_image_data_format: A boolean denoting whether data format is keras
backend image data format. If False, the image format is channel-last. If
True, the image format matches tf.keras.backend.image_data_format().
Returns:
Function to use for parsing the records.
"""
def parse_record_fn(raw_record, is_training, dtype):
image, label = parse_record(raw_record, is_training, dtype)
if use_keras_image_data_format:
if tf.keras.backend.image_data_format() == 'channels_first':
image = tf.transpose(image, perm=[2, 0, 1])
return image, label
return parse_record_fn
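# Illustrative usage ('/data' is a placeholder); input_fn below does the same
# thing with sharding, shuffling, and batching on top:
#
#   parse_fn = get_parse_record_fn(use_keras_image_data_format=False)
#   ds = tf.data.TFRecordDataset(
#       get_filenames(is_training=True, data_dir='/data'))
#   ds = ds.map(lambda rec: parse_fn(rec, True, tf.float32))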
def input_fn(is_training,
data_dir,
batch_size,
dtype=tf.float32,
datasets_num_private_threads=None,
parse_record_fn=parse_record,
input_context=None,
drop_remainder=False,
tf_data_experimental_slack=False,
training_dataset_cache=False,
filenames=None):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
dtype: Data type to use for images/features.
datasets_num_private_threads: Number of private threads for tf.data.
parse_record_fn: Function to use for parsing the records.
input_context: A `tf.distribute.InputContext` object passed in by
`tf.distribute.Strategy`.
drop_remainder: A boolean indicates whether to drop the remainder of the
batches. If True, the batch dimension will be static.
tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack`
option.
training_dataset_cache: Whether to cache the training dataset on workers.
Typically used to improve training performance when training data is in
remote storage and can fit into worker memory.
filenames: Optional field for providing the file names of the TFRecords.
Returns:
A dataset that can be used for iteration.
"""
if filenames is None:
filenames = get_filenames(is_training, data_dir)
dataset = tf.data.Dataset.from_tensor_slices(filenames)
if input_context:
logging.info(
'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
input_context.input_pipeline_id, input_context.num_input_pipelines)
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
if is_training:
# Shuffle the input files
dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)
# Convert to individual records.
# cycle_length = 10 means that up to 10 files will be read and deserialized in
# parallel. You may want to increase this number if you have a large number of
# CPU cores.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if is_training and training_dataset_cache:
# Improve training performance when training data is in remote storage and
# can fit into worker memory.
dataset = dataset.cache()
return process_record_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=_SHUFFLE_BUFFER,
parse_record_fn=parse_record_fn,
dtype=dtype,
datasets_num_private_threads=datasets_num_private_threads,
drop_remainder=drop_remainder,
tf_data_experimental_slack=tf_data_experimental_slack,
)
def _decode_crop_and_flip(image_buffer, bbox, num_channels):
"""Crops the given image to a random part of the image, and randomly flips.
We use the fused decode_and_crop op, which performs better than the two ops
used separately in series, but note that this requires that the image be
passed in as an un-decoded string Tensor.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as [ymin,
xmin, ymax, xmax].
num_channels: Integer depth of the image buffer for decoding.
Returns:
3-D tensor with cropped image.
"""
# A large fraction of image datasets contain a human-annotated bounding box
# delineating the region of the image containing the object of interest. We
# choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.image.extract_jpeg_shape(image_buffer),
bounding_boxes=bbox,
min_object_covered=0.1,
aspect_ratio_range=[0.75, 1.33],
area_range=[0.05, 1.0],
max_attempts=100,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, _ = sample_distorted_bounding_box
# Reassemble the bounding box in the format the crop op requires.
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
# Use the fused decode and crop op here, which is faster than each in series.
cropped = tf.image.decode_and_crop_jpeg(
image_buffer, crop_window, channels=num_channels)
# Flip to add a little more random distortion in.
cropped = tf.image.random_flip_left_right(cropped)
return cropped
def _central_crop(image, crop_height, crop_width):
"""Performs central crops of the given image list.
Args:
image: a 3-D image tensor
crop_height: the height of the image following the crop.
crop_width: the width of the image following the crop.
Returns:
3-D tensor with cropped image.
"""
shape = tf.shape(input=image)
height, width = shape[0], shape[1]
amount_to_be_cropped_h = (height - crop_height)
crop_top = amount_to_be_cropped_h // 2
amount_to_be_cropped_w = (width - crop_width)
crop_left = amount_to_be_cropped_w // 2
return tf.slice(image, [crop_top, crop_left, 0],
[crop_height, crop_width, -1])
def _mean_image_subtraction(image, means, num_channels):
"""Subtracts the given means from each image channel.
For example:
means = [123.68, 116.779, 103.939]
image = _mean_image_subtraction(image, means, num_channels=3)
Note that the rank of `image` must be known.
Args:
image: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
num_channels: number of color channels in the image that will be distorted.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
# We have a 1-D tensor of means; convert to 3-D.
# Note(b/130245863): we explicitly call `broadcast` instead of simply
# expanding dimensions for better performance.
means = tf.broadcast_to(means, tf.shape(image))
return image - means
def _smallest_size_at_least(height, width, resize_min):
"""Computes new shape with the smallest side equal to `smallest_side`.
Computes new shape with the smallest side equal to `smallest_side` while
preserving the original aspect ratio.
Args:
height: an int32 scalar tensor indicating the current height.
width: an int32 scalar tensor indicating the current width.
resize_min: A python integer or scalar `Tensor` indicating the size of the
smallest side after resize.
Returns:
new_height: an int32 scalar tensor indicating the new height.
new_width: an int32 scalar tensor indicating the new width.
"""
resize_min = tf.cast(resize_min, tf.float32)
# Convert to floats to make subsequent calculations go smoothly.
height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
smaller_dim = tf.minimum(height, width)
scale_ratio = resize_min / smaller_dim
# Convert back to ints to make heights and widths that TF ops will accept.
new_height = tf.cast(height * scale_ratio, tf.int32)
new_width = tf.cast(width * scale_ratio, tf.int32)
return new_height, new_width
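# Worked example of the arithmetic above: for a 500 x 1000 image and
# resize_min=256, scale_ratio = 256 / 500 = 0.512, so the new shape is
# 256 x 512 (matching the _RESIZE_MIN comment near the top of this file):
#
#   h, w = _smallest_size_at_least(tf.constant(500), tf.constant(1000), 256)
#   # h == 256, w == 512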
def _aspect_preserving_resize(image, resize_min):
"""Resize images preserving the original aspect ratio.
Args:
image: A 3-D image `Tensor`.
resize_min: A python integer or scalar `Tensor` indicating the size of the
smallest side after resize.
Returns:
resized_image: A 3-D tensor containing the resized image.
"""
shape = tf.shape(input=image)
height, width = shape[0], shape[1]
new_height, new_width = _smallest_size_at_least(height, width, resize_min)
return _resize_image(image, new_height, new_width)
def _resize_image(image, height, width):
"""Simple wrapper around tf.resize_images.
This is primarily to make sure we use the same `ResizeMethod` and other
details each time.
Args:
image: A 3-D image `Tensor`.
height: The target height for the resized image.
width: The target width for the resized image.
Returns:
resized_image: A 3-D tensor containing the resized image. The first two
dimensions have the shape [height, width].
"""
return tf.compat.v1.image.resize(
image, [height, width],
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
def preprocess_image(image_buffer,
bbox,
output_height,
output_width,
num_channels,
is_training=False):
"""Preprocesses the given image.
Preprocessing includes decoding, cropping, and resizing for both training
and eval images. Training preprocessing, however, introduces some random
distortion of the image to improve accuracy.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as [ymin,
xmin, ymax, xmax].
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
num_channels: Integer depth of the image buffer for decoding.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
Returns:
A preprocessed image.
"""
if is_training:
# For training, we want to randomize some of the distortions.
image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
image = _resize_image(image, output_height, output_width)
else:
# For validation, we want to decode, resize, then just crop the middle.
image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
image = _aspect_preserving_resize(image, _RESIZE_MIN)
image = _central_crop(image, output_height, output_width)
image.set_shape([output_height, output_width, num_channels])
return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels)
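# Illustrative usage (kept as comments; the file path is a placeholder). A
# bounding box spanning the whole image behaves like the "no bounding box"
# case described in _decode_crop_and_flip:
#
#   image_bytes = tf.io.read_file('/path/to/image.jpg')
#   bbox = tf.reshape(tf.constant([0.0, 0.0, 1.0, 1.0]), [1, 1, 4])
#   image = preprocess_image(image_bytes, bbox,
#                            output_height=DEFAULT_IMAGE_SIZE,
#                            output_width=DEFAULT_IMAGE_SIZE,
#                            num_channels=NUM_CHANNELS,
#                            is_training=True)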
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Configuration definitions for ResNet losses, learning rates, and optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dataclasses
from official.modeling.hyperparams import base_config
from official.vision.image_classification.configs import base_configs
@dataclasses.dataclass
class ResNetModelConfig(base_configs.ModelConfig):
"""Configuration for the ResNet model."""
name: str = 'ResNet'
num_classes: int = 1000
model_params: base_config.Config = dataclasses.field(
default_factory=lambda: {
'num_classes': 1000,
'batch_size': None,
'use_l2_regularizer': True,
'rescale_inputs': False,
})
loss: base_configs.LossConfig = base_configs.LossConfig(
name='sparse_categorical_crossentropy')
optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig(
name='momentum',
decay=0.9,
epsilon=0.001,
momentum=0.9,
moving_average_decay=None)
learning_rate: base_configs.LearningRateConfig = (
base_configs.LearningRateConfig(
name='stepwise',
initial_lr=0.1,
examples_per_epoch=1281167,
boundaries=[30, 60, 80],
warmup_epochs=5,
scale_by_batch_size=1. / 256.,
multipliers=[0.1 / 256, 0.01 / 256, 0.001 / 256, 0.0001 / 256]))
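# Illustrative usage: materializing the default config and reading a couple
# of its fields:
#
#   cfg = ResNetModelConfig()
#   cfg.loss.name                 # 'sparse_categorical_crossentropy'
#   cfg.learning_rate.boundaries  # [30, 60, 80]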
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
import math
import os
# Import libraries
from absl import app
from absl import flags
from absl import logging
import orbit
import tensorflow as tf
from official.common import distribute_utils
from official.modeling import performance
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing
from official.vision.image_classification.resnet import resnet_runnable
flags.DEFINE_boolean(name='use_tf_function', default=True,
help='Wrap the train and test step inside a '
'tf.function.')
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
help='Calculate L2_loss on concatenated weights, '
'instead of using Keras per-layer L2 loss.')
def build_stats(runnable, time_callback):
"""Normalizes and returns dictionary of stats.
Args:
runnable: The module containing all the training and evaluation metrics.
time_callback: Time tracking callback instance.
Returns:
Dictionary of normalized results.
"""
stats = {}
if not runnable.flags_obj.skip_eval:
stats['eval_loss'] = runnable.test_loss.result().numpy()
stats['eval_acc'] = runnable.test_accuracy.result().numpy()
stats['train_loss'] = runnable.train_loss.result().numpy()
stats['train_acc'] = runnable.train_accuracy.result().numpy()
if time_callback:
timestamp_log = time_callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = time_callback.train_finish_time
if time_callback.epoch_runtime_log:
stats['avg_exp_per_second'] = time_callback.average_examples_per_second
return stats
def get_num_train_iterations(flags_obj):
"""Returns the number of training steps, train and test epochs."""
train_steps = (
imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
train_epochs = flags_obj.train_epochs
if flags_obj.train_steps:
train_steps = min(flags_obj.train_steps, train_steps)
train_epochs = 1
eval_steps = math.ceil(1.0 * imagenet_preprocessing.NUM_IMAGES['validation'] /
flags_obj.batch_size)
return train_steps, train_epochs, eval_steps
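# Worked example of the arithmetic above for --batch_size=1024 on ImageNet:
#   train_steps = 1281167 // 1024 = 1251 steps per epoch
#   eval_steps  = ceil(50000 / 1024) = 49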
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
keras_utils.set_session_config()
performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
if tf.config.list_physical_devices('GPU'):
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu)
per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
flags_obj)
if flags_obj.steps_per_loop is None:
steps_per_loop = per_epoch_steps
elif flags_obj.steps_per_loop > per_epoch_steps:
steps_per_loop = per_epoch_steps
logging.warning('Setting steps_per_loop to %d to respect epoch boundary.',
steps_per_loop)
else:
steps_per_loop = flags_obj.steps_per_loop
logging.info(
'Training %d epochs, each epoch has %d steps, '
'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
train_epochs * per_epoch_steps, eval_steps)
time_callback = keras_utils.TimeHistory(
flags_obj.batch_size,
flags_obj.log_steps,
logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
with distribute_utils.get_strategy_scope(strategy):
runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
per_epoch_steps)
eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
checkpoint_interval = (
steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
directory=flags_obj.model_dir,
max_to_keep=10,
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
resnet_controller = orbit.Controller(
strategy=strategy,
trainer=runnable,
evaluator=runnable if not flags_obj.skip_eval else None,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
summary_dir=flags_obj.model_dir,
eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))
time_callback.on_train_begin()
if not flags_obj.skip_eval:
resnet_controller.train_and_evaluate(
train_steps=per_epoch_steps * train_epochs,
eval_steps=eval_steps,
eval_interval=eval_interval)
else:
resnet_controller.train(steps=per_epoch_steps * train_epochs)
time_callback.on_train_end()
stats = build_stats(runnable, time_callback)
return stats
def main(_):
model_helpers.apply_clean(flags.FLAGS)
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
common.define_keras_flags()
app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
- https://arxiv.org/pdf/1603.05027v2.pdf
- http://torch.ch/blog/2016/02/04/resnets.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.image_classification.resnet import imagenet_preprocessing
layers = tf.keras.layers
def _gen_l2_regularizer(use_l2_regularizer=True, l2_weight_decay=1e-4):
return tf.keras.regularizers.L2(
l2_weight_decay) if use_l2_regularizer else None
def identity_block(input_tensor,
kernel_size,
filters,
stage,
block,
use_l2_regularizer=True,
batch_norm_decay=0.9,
batch_norm_epsilon=1e-5):
"""The identity block is the block that has no conv layer at shortcut.
Args:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
use_l2_regularizer: whether to use L2 regularizer on Conv layer.
batch_norm_decay: Momentum of batch norm layers.
batch_norm_epsilon: Epsilon of batch norm layers.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2c')(
x)
x = layers.add([x, input_tensor])
x = layers.Activation('relu')(x)
return x
def conv_block(input_tensor,
kernel_size,
filters,
stage,
block,
strides=(2, 2),
use_l2_regularizer=True,
batch_norm_decay=0.9,
batch_norm_epsilon=1e-5):
"""A block that has a conv layer at shortcut.
Note that from stage 3 onward,
the second conv layer in the main path uses strides=(2, 2),
and the shortcut has strides=(2, 2) as well.
Args:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
strides: Strides for the second conv layer in the block.
use_l2_regularizer: whether to use L2 regularizer on Conv layer.
batch_norm_decay: Momentum of batch norm layers.
batch_norm_epsilon: Epsilon of batch norm layers.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
strides=strides,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '2c')(
x)
shortcut = layers.Conv2D(
filters3, (1, 1),
strides=strides,
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '1')(
input_tensor)
shortcut = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name=bn_name_base + '1')(
shortcut)
x = layers.add([x, shortcut])
x = layers.Activation('relu')(x)
return x
def resnet50(num_classes,
batch_size=None,
use_l2_regularizer=True,
rescale_inputs=False,
batch_norm_decay=0.9,
batch_norm_epsilon=1e-5):
"""Instantiates the ResNet50 architecture.
Args:
num_classes: `int` number of classes for image classification.
batch_size: Size of the batches for each step.
use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
rescale_inputs: whether to rescale inputs from 0 to 1.
batch_norm_decay: Momentum of batch norm layers.
batch_norm_epsilon: Epsilon of batch norm layers.
Returns:
A Keras model instance.
"""
input_shape = (224, 224, 3)
img_input = layers.Input(shape=input_shape, batch_size=batch_size)
if rescale_inputs:
# Hub image modules expect inputs in the range [0, 1]. This rescales these
# inputs to the range expected by the trained model.
x = layers.Lambda(
lambda x: x * 255.0 - tf.keras.backend.constant( # pylint: disable=g-long-lambda
imagenet_preprocessing.CHANNEL_MEANS,
shape=[1, 1, 3],
dtype=x.dtype),
name='rescale')(
img_input)
else:
x = img_input
if tf.keras.backend.image_data_format() == 'channels_first':
x = layers.Permute((3, 1, 2))(x)
bn_axis = 1
else: # channels_last
bn_axis = 3
block_config = dict(
use_l2_regularizer=use_l2_regularizer,
batch_norm_decay=batch_norm_decay,
batch_norm_epsilon=batch_norm_epsilon)
x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
x = layers.Conv2D(
64, (7, 7),
strides=(2, 2),
padding='valid',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='conv1')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=batch_norm_decay,
epsilon=batch_norm_epsilon,
name='bn_conv1')(
x)
x = layers.Activation('relu')(x)
x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
x = conv_block(
x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), **block_config)
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', **block_config)
x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', **block_config)
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', **block_config)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', **block_config)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', **block_config)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', **block_config)
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', **block_config)
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', **block_config)
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', **block_config)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', **block_config)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', **block_config)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(
num_classes,
kernel_initializer=tf.initializers.random_normal(stddev=0.01),
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='fc1000')(
x)
# A softmax that is followed by the model loss cannot be done in float16 due
# to numeric issues, so we pass dtype=float32.
x = layers.Activation('softmax', dtype='float32')(x)
# Create model.
return tf.keras.Model(img_input, x, name='resnet50')
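# Illustrative sanity check (kept as comments; assumes channels_last and
# float32 inputs):
#
#   model = resnet50(num_classes=1001)
#   probs = model(tf.zeros([1, 224, 224, 3]))  # shape (1, 1001), softmax out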
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
import orbit
import tensorflow as tf
from official.modeling import grad_utils
from official.modeling import performance
from official.utils.flags import core as flags_core
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing
from official.vision.image_classification.resnet import resnet_model
class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback, epoch_steps):
self.strategy = tf.distribute.get_strategy()
self.flags_obj = flags_obj
self.dtype = flags_core.get_tf_dtype(flags_obj)
self.time_callback = time_callback
# Input pipeline related
batch_size = flags_obj.batch_size
if batch_size % self.strategy.num_replicas_in_sync != 0:
raise ValueError(
'Batch size must be divisible by number of replicas: {}'.format(
self.strategy.num_replicas_in_sync))
# Since auto-rebatching is not supported in the
# `distribute_datasets_from_function()` API, which is required when cloning
# the dataset to multiple workers in eager mode, we use the per-replica
# batch size.
self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)
if self.flags_obj.use_synthetic_data:
self.input_fn = common.get_synth_input_fn(
height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
num_channels=imagenet_preprocessing.NUM_CHANNELS,
num_classes=imagenet_preprocessing.NUM_CLASSES,
dtype=self.dtype,
drop_remainder=True)
else:
self.input_fn = imagenet_preprocessing.input_fn
self.model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES,
use_l2_regularizer=not flags_obj.single_l2_loss_op)
lr_schedule = common.PiecewiseConstantDecayWithWarmup(
batch_size=flags_obj.batch_size,
epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
warmup_epochs=common.LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
self.optimizer = common.get_optimizer(lr_schedule)
# Make sure iterations variable is created inside scope.
self.global_step = self.optimizer.iterations
self.optimizer = performance.configure_optimizer(
self.optimizer,
use_float16=self.dtype == tf.float16,
loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128))
self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'train_accuracy', dtype=tf.float32)
self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'test_accuracy', dtype=tf.float32)
self.checkpoint = tf.train.Checkpoint(
model=self.model, optimizer=self.optimizer)
# Handling epochs.
self.epoch_steps = epoch_steps
self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)
train_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=True,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=True)
orbit.StandardTrainer.__init__(
self,
train_dataset,
options=orbit.StandardTrainerOptions(
use_tf_while_loop=flags_obj.use_tf_while_loop,
use_tf_function=flags_obj.use_tf_function))
if not flags_obj.skip_eval:
eval_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
dtype=self.dtype)
orbit.StandardEvaluator.__init__(
self,
eval_dataset,
options=orbit.StandardEvaluatorOptions(
use_tf_function=flags_obj.use_tf_function))
def train_loop_begin(self):
"""See base class."""
    # Reset all training metrics.
self.train_loss.reset_states()
self.train_accuracy.reset_states()
self._epoch_begin()
self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
def train_step(self, iterator):
"""See base class."""
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
with tf.GradientTape() as tape:
logits = self.model(images, training=True)
prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
labels, logits)
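        # Dividing the per-replica sum by the *global* batch size means that
        # summing losses/gradients across replicas yields a mean over the
        # full global batch.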
loss = tf.reduce_sum(prediction_loss) * (1.0 /
self.flags_obj.batch_size)
num_replicas = self.strategy.num_replicas_in_sync
l2_weight_decay = 1e-4
if self.flags_obj.single_l2_loss_op:
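          # tf.nn.l2_loss(v) is sum(v ** 2) / 2, so the factor of 2 recovers
          # the plain squared norm. Batch norm variables ('bn' in the name)
          # are excluded from weight decay.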
l2_loss = l2_weight_decay * 2 * tf.add_n([
tf.nn.l2_loss(v)
for v in self.model.trainable_variables
if 'bn' not in v.name
])
loss += (l2_loss / num_replicas)
else:
loss += (tf.reduce_sum(self.model.losses) / num_replicas)
grad_utils.minimize_using_explicit_allreduce(
tape, self.optimizer, loss, self.model.trainable_variables)
self.train_loss.update_state(loss)
self.train_accuracy.update_state(labels, logits)
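    # Optionally compile the per-replica step function with XLA;
    # `jit_compile=True` fuses the whole step into one optimized computation.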
if self.flags_obj.enable_xla:
step_fn = tf.function(step_fn, jit_compile=True)
self.strategy.run(step_fn, args=(next(iterator),))
def train_loop_end(self):
"""See base class."""
metrics = {
'train_loss': self.train_loss.result(),
'train_accuracy': self.train_accuracy.result(),
}
self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)
self._epoch_end()
return metrics
def eval_begin(self):
"""See base class."""
self.test_loss.reset_states()
self.test_accuracy.reset_states()
def eval_step(self, iterator):
"""See base class."""
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
logits = self.model(images, training=False)
loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size)
self.test_loss.update_state(loss)
self.test_accuracy.update_state(labels, logits)
self.strategy.run(step_fn, args=(next(iterator),))
def eval_end(self):
"""See base class."""
return {
'test_loss': self.test_loss.result(),
'test_accuracy': self.test_accuracy.result()
}
def _epoch_begin(self):
if self.epoch_helper.epoch_begin():
self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch)
def _epoch_end(self):
if self.epoch_helper.epoch_end():
self.time_callback.on_epoch_end(self.epoch_helper.current_epoch)
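

# Usage sketch (illustrative; not part of the original file): ResnetRunnable
# is designed to be driven by an orbit.Controller from a surrounding training
# script. `flags_obj`, `time_callback`, and `per_epoch_steps` are assumed to
# be provided by that script, and `_run_with_controller` is a hypothetical
# helper name.
def _run_with_controller(flags_obj, time_callback, per_epoch_steps):
  """Hypothetical helper showing how ResnetRunnable plugs into Orbit."""
  runnable = ResnetRunnable(flags_obj, time_callback, per_epoch_steps)
  controller = orbit.Controller(
      trainer=runnable,
      evaluator=None if flags_obj.skip_eval else runnable,
      global_step=runnable.global_step,
      steps_per_loop=per_epoch_steps)
  # Train for the requested number of epochs, measured in optimizer steps.
  controller.train(steps=per_epoch_steps * flags_obj.train_epochs)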
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A script to export TF-Hub SavedModel."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
# Import libraries
from absl import app
from absl import flags
import tensorflow as tf
from official.vision.image_classification.resnet import imagenet_preprocessing
from official.vision.image_classification.resnet import resnet_model
FLAGS = flags.FLAGS
flags.DEFINE_string("model_path", None,
"File path to TF model checkpoint or H5 file.")
flags.DEFINE_string("export_path", None,
"TF-Hub SavedModel destination path to export.")
def export_tfhub(model_path, hub_destination):
"""Restores a tf.keras.Model and saves for TF-Hub."""
model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES, rescale_inputs=True)
model.load_weights(model_path)
model.save(
os.path.join(hub_destination, "classification"), include_optimizer=False)
  # Extracts a sub-model that uses the pooled feature vector as its output.
image_input = model.get_layer(index=0).get_output_at(0)
feature_vector_output = model.get_layer(name="reduce_mean").get_output_at(0)
hub_model = tf.keras.Model(image_input, feature_vector_output)
# Exports a SavedModel.
hub_model.save(
os.path.join(hub_destination, "feature-vector"), include_optimizer=False)
def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
export_tfhub(FLAGS.model_path, FLAGS.export_path)
if __name__ == "__main__":
app.run(main)
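

# Example invocation (illustrative; the paths below are placeholders):
#
#   python tfhub_export.py \
#     --model_path=/tmp/resnet50_checkpoint \
#     --export_path=/tmp/resnet50_hub
#
# This writes two SavedModels under the export path: "classification" (the
# full model with its softmax head) and "feature-vector" (the sub-model that
# emits the pooled features).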
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test utilities for image classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def trivial_model(num_classes):
"""Trivial model for ImageNet dataset."""
input_shape = (224, 224, 3)
img_input = tf.keras.layers.Input(shape=input_shape)
x = tf.keras.layers.Lambda(
lambda x: tf.keras.backend.reshape(x, [-1, 224 * 224 * 3]),
name='reshape')(img_input)
x = tf.keras.layers.Dense(1, name='fc1')(x)
x = tf.keras.layers.Dense(num_classes, name='fc1000')(x)
x = tf.keras.layers.Activation('softmax', dtype='float32')(x)
return tf.keras.models.Model(img_input, x, name='trivial')
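

# Usage sketch (illustrative): build the trivial model and sanity-check its
# output shape on a random batch; it is a cheap stand-in for ResNet when
# testing input pipelines and training loops.
#
#   model = trivial_model(num_classes=1000)
#   images = tf.random.uniform([2, 224, 224, 3])
#   probs = model(images, training=False)  # -> shape (2, 1000)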