Unverified Commit 965cc3ee authored by Ayushman Kumar, committed by GitHub

Merge pull request #7 from tensorflow/master

updated
parents 1f3247f4 1f685c54
......@@ -14,84 +14,18 @@
# ==============================================================================
"""Config template to train Retinanet."""
# pylint: disable=line-too-long
from official.modeling.hyperparams import params_dict
from official.vision.detection.configs import base_config
# For ResNet-50, this freezes the variables of the first conv1 and conv2_x
# layers [1], which leads to higher training speed and slightly better testing
# accuracy. The intuition is that the low-level architecture (e.g., ResNet-50)
# is able to capture low-level features such as edges; therefore, it does not
# need to be fine-tuned for the detection task.
# Note that we need the trailing `/` to avoid an incorrect match.
# [1]: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L198
RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
REGULARIZATION_VAR_REGEX = r'.*(kernel|weight):0$'
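# Editor's note: a quick, hypothetical spot-check of the two regexes above.
# The variable names are illustrative TF-style names, not from a real model.
import re
assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/conv2d_3/kernel:0')
# Without the trailing `/` this would wrongly match `conv2d_30` via `conv2d_3`:
assert not re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/conv2d_30/kernel:0')
assert re.match(REGULARIZATION_VAR_REGEX, 'retinanet_head/class-0/kernel:0')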
# pylint: disable=line-too-long
RETINANET_CFG = {
RETINANET_CFG = params_dict.ParamsDict(base_config.BASE_CFG)
RETINANET_CFG.override({
'type': 'retinanet',
'model_dir': '',
'use_tpu': True,
'strategy_type': 'tpu',
'train': {
'batch_size': 64,
'iterations_per_loop': 500,
'total_steps': 22500,
'optimizer': {
'type': 'momentum',
'momentum': 0.9,
'nesterov': True, # `False` is better for TPU v3-128.
},
'learning_rate': {
'type': 'step',
'warmup_learning_rate': 0.0067,
'warmup_steps': 500,
'init_learning_rate': 0.08,
'learning_rate_levels': [0.008, 0.0008],
'learning_rate_steps': [15000, 20000],
},
'checkpoint': {
'path': '',
'prefix': '',
},
'frozen_variable_prefix': RESNET_FROZEN_VAR_PREFIX,
'train_file_pattern': '',
# TODO(b/142174042): Support transpose_input option.
'transpose_input': False,
'regularization_variable_regex': REGULARIZATION_VAR_REGEX,
'l2_weight_decay': 0.0001,
'input_sharding': False,
},
'eval': {
'batch_size': 8,
'min_eval_interval': 180,
'eval_timeout': None,
'eval_samples': 5000,
'type': 'box',
'val_json_file': '',
'eval_file_pattern': '',
'input_sharding': True,
# When visualizing images, set evaluation batch size to 40 to avoid
# potential OOM.
'num_images_to_visualize': 0,
},
'predict': {
'predict_batch_size': 8,
},
'architecture': {
'parser': 'retinanet_parser',
'backbone': 'resnet',
'multilevel_features': 'fpn',
'use_bfloat16': False,
},
'anchor': {
'min_level': 3,
'max_level': 7,
'num_scales': 3,
'aspect_ratios': [1.0, 2.0, 0.5],
'anchor_size': 4.0,
},
'retinanet_parser': {
'use_bfloat16': False,
'output_size': [640, 640],
'num_channels': 3,
'match_threshold': 0.5,
......@@ -104,68 +38,22 @@ RETINANET_CFG = {
'skip_crowd_during_training': True,
'max_num_instances': 100,
},
'resnet': {
'resnet_depth': 50,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
},
},
'fpn': {
'min_level': 3,
'max_level': 7,
'fpn_feat_dims': 256,
'use_separable_conv': False,
'use_batch_norm': True,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
},
},
'retinanet_head': {
'min_level': 3,
'max_level': 7,
# Note that `num_classes` is the total number of classes including
# one background class whose index is 0.
'num_classes': 91,
'anchors_per_location': 9,
'retinanet_head_num_convs': 4,
'retinanet_head_num_filters': 256,
'num_convs': 4,
'num_filters': 256,
'use_separable_conv': False,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
},
},
'retinanet_loss': {
'num_classes': 91,
'focal_loss_alpha': 0.25,
'focal_loss_gamma': 1.5,
'huber_loss_delta': 0.1,
'box_loss_weight': 50,
},
'postprocess': {
'use_batched_nms': False,
'min_level': 3,
'max_level': 7,
'max_total_size': 100,
'nms_iou_threshold': 0.5,
'score_threshold': 0.05,
'pre_nms_num_boxes': 5000,
},
'enable_summary': False,
}
'enable_summary': True,
}, is_strict=False)
RETINANET_RESTRICTIONS = [
'architecture.use_bfloat16 == retinanet_parser.use_bfloat16',
'anchor.min_level == retinanet_head.min_level',
'anchor.max_level == retinanet_head.max_level',
'anchor.min_level == postprocess.min_level',
'anchor.max_level == postprocess.max_level',
'retinanet_head.num_classes == retinanet_loss.num_classes',
]
# pylint: enable=line-too-long
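# Editor's sketch (hypothetical usage, not part of this change): this is
# roughly how the config factory consumes the pair defined above.
params = params_dict.ParamsDict(RETINANET_CFG, RETINANET_RESTRICTIONS)
params.override({'train': {'batch_size': 32}}, is_strict=True)
params.validate()  # raises if any RETINANET_RESTRICTIONS entry is violated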
......@@ -29,8 +29,8 @@ def parser_generator(params, mode):
parser_params = params.retinanet_parser
parser_fn = retinanet_parser.Parser(
output_size=parser_params.output_size,
min_level=anchor_params.min_level,
max_level=anchor_params.max_level,
min_level=params.architecture.min_level,
max_level=params.architecture.max_level,
num_scales=anchor_params.num_scales,
aspect_ratios=anchor_params.aspect_ratios,
anchor_size=anchor_params.anchor_size,
......@@ -43,15 +43,15 @@ def parser_generator(params, mode):
autoaugment_policy_name=parser_params.autoaugment_policy_name,
skip_crowd_during_training=parser_params.skip_crowd_during_training,
max_num_instances=parser_params.max_num_instances,
use_bfloat16=parser_params.use_bfloat16,
use_bfloat16=params.architecture.use_bfloat16,
mode=mode)
elif params.architecture.parser == 'maskrcnn_parser':
anchor_params = params.anchor
parser_params = params.maskrcnn_parser
parser_fn = maskrcnn_parser.Parser(
output_size=parser_params.output_size,
min_level=anchor_params.min_level,
max_level=anchor_params.max_level,
min_level=params.architecture.min_level,
max_level=params.architecture.max_level,
num_scales=anchor_params.num_scales,
aspect_ratios=anchor_params.aspect_ratios,
anchor_size=anchor_params.anchor_size,
......@@ -64,17 +64,17 @@ def parser_generator(params, mode):
aug_scale_max=parser_params.aug_scale_max,
skip_crowd_during_training=parser_params.skip_crowd_during_training,
max_num_instances=parser_params.max_num_instances,
include_mask=parser_params.include_mask,
include_mask=params.architecture.include_mask,
mask_crop_size=parser_params.mask_crop_size,
use_bfloat16=parser_params.use_bfloat16,
use_bfloat16=params.architecture.use_bfloat16,
mode=mode)
elif params.architecture.parser == 'shapemask_parser':
anchor_params = params.anchor
parser_params = params.shapemask_parser
parser_fn = shapemask_parser.Parser(
output_size=parser_params.output_size,
min_level=anchor_params.min_level,
max_level=anchor_params.max_level,
min_level=params.architecture.min_level,
max_level=params.architecture.max_level,
num_scales=anchor_params.num_scales,
aspect_ratios=anchor_params.aspect_ratios,
anchor_size=anchor_params.anchor_size,
......@@ -93,7 +93,7 @@ def parser_generator(params, mode):
aug_scale_max=parser_params.aug_scale_max,
skip_crowd_during_training=parser_params.skip_crowd_during_training,
max_num_instances=parser_params.max_num_instances,
use_bfloat16=parser_params.use_bfloat16,
use_bfloat16=params.architecture.use_bfloat16,
mask_train_class=parser_params.mask_train_class,
mode=mode)
else:
......
......@@ -85,13 +85,14 @@ class InputFn(object):
if self._input_sharding and ctx and ctx.num_input_pipelines > 1:
dataset = dataset.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)
dataset = dataset.cache()
if self._is_training:
dataset = dataset.repeat()
dataset = dataset.interleave(
map_func=lambda file_name: self._dataset_fn(file_name), cycle_length=32,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.cache()
if self._is_training:
# Large shuffle size is critical for 2vm input pipeline. Can use small
......
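# Editor's sketch (assumed names/paths): a generic illustration of the
# cache-after-interleave placement above, so that parsed records rather than
# just the file list are cached before `repeat()`.
import tensorflow as tf

files = tf.data.Dataset.list_files('/path/to/train-*.tfrecord', shuffle=False)
dataset = files.interleave(
    tf.data.TFRecordDataset,
    cycle_length=32,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.cache()
dataset = dataset.repeat()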
......@@ -35,10 +35,12 @@ from official.vision.detection.dataloader import input_reader
from official.vision.detection.dataloader import mode_keys as ModeKeys
from official.vision.detection.executor.detection_executor import DetectionDistributedExecutor
from official.vision.detection.modeling import factory as model_factory
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
hyperparams_flags.initialize_common_flags()
flags_core.define_log_steps()
flags.DEFINE_bool(
'enable_xla',
......@@ -67,10 +69,12 @@ FLAGS = flags.FLAGS
def run_executor(params,
mode,
checkpoint_path=None,
train_input_fn=None,
eval_input_fn=None,
callbacks=None,
strategy=None):
prebuilt_strategy=None):
"""Runs Retinanet model on distribution strategy defined by the user."""
if params.architecture.use_bfloat16:
......@@ -80,7 +84,9 @@ def run_executor(params,
model_builder = model_factory.model_generator(params)
if strategy is None:
if prebuilt_strategy is not None:
strategy = prebuilt_strategy
else:
strategy_config = params.strategy_config
distribution_utils.configure_cluster(strategy_config.worker_hosts,
strategy_config.task_index)
......@@ -94,7 +100,7 @@ def run_executor(params,
num_workers = int(strategy.num_replicas_in_sync + 7) // 8
is_multi_host = (int(num_workers) >= 2)
if FLAGS.mode == 'train':
if mode == 'train':
def _model_fn(params):
return model_builder.build_model(params, mode=ModeKeys.TRAIN)
......@@ -126,8 +132,7 @@ def run_executor(params,
init_checkpoint=model_builder.make_restore_checkpoint_fn(),
custom_callbacks=callbacks,
save_config=True)
elif FLAGS.mode == 'eval' or FLAGS.mode == 'eval_once':
elif mode == 'eval' or mode == 'eval_once':
def _model_fn(params):
return model_builder.build_model(params, mode=ModeKeys.PREDICT_WITH_GT)
......@@ -150,7 +155,7 @@ def run_executor(params,
trainable_variables_filter=model_builder
.make_filter_trainable_variables_fn())
if FLAGS.mode == 'eval':
if mode == 'eval':
results = dist_executor.evaluate_from_model_dir(
model_dir=params.model_dir,
eval_input_fn=eval_input_fn,
......@@ -160,9 +165,8 @@ def run_executor(params,
total_steps=params.train.total_steps)
else:
# Run evaluation once for a single checkpoint.
if not FLAGS.checkpoint_path:
raise ValueError('FLAGS.checkpoint_path cannot be empty.')
checkpoint_path = FLAGS.checkpoint_path
if not checkpoint_path:
raise ValueError('checkpoint_path cannot be empty.')
if tf.io.gfile.isdir(checkpoint_path):
checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
summary_writer = executor.SummaryWriter(params.model_dir, 'eval')
......@@ -175,7 +179,7 @@ def run_executor(params,
logging.info('Final eval metric %s: %f', k, v)
return results
else:
raise ValueError('Mode not found: %s.' % FLAGS.mode)
raise ValueError('Mode not found: %s.' % mode)
def run(callbacks=None):
......@@ -224,8 +228,21 @@ def run(callbacks=None):
mode=input_reader.ModeKeys.PREDICT_WITH_GT,
batch_size=params.eval.batch_size,
num_examples=params.eval.eval_samples)
if callbacks is None:
callbacks = []
if FLAGS.log_steps:
callbacks.append(
keras_utils.TimeHistory(
batch_size=params.train.batch_size,
log_steps=FLAGS.log_steps,
))
return run_executor(
params,
FLAGS.mode,
checkpoint_path=FLAGS.checkpoint_path,
train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn,
callbacks=callbacks)
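# Editor's sketch (hypothetical values): the new `prebuilt_strategy` argument
# lets a caller reuse an existing tf.distribute strategy instead of having
# run_executor build one from params.strategy_config.
strategy = tf.distribute.MirroredStrategy()
results = run_executor(
    params,
    'eval_once',
    checkpoint_path='/tmp/retinanet/ckpt',  # hypothetical path
    eval_input_fn=eval_input_fn,
    prebuilt_strategy=strategy)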
......@@ -238,6 +255,5 @@ def main(argv):
if __name__ == '__main__':
assert tf.version.VERSION.startswith('2.')
tf.config.set_soft_device_placement(True)
app.run(main)
......@@ -25,16 +25,12 @@ from official.vision.detection.modeling.architecture import nn_ops
from official.vision.detection.modeling.architecture import resnet
def batch_norm_relu_generator(params):
def _batch_norm_op(**kwargs):
return nn_ops.BatchNormRelu(
momentum=params.batch_norm_momentum,
epsilon=params.batch_norm_epsilon,
trainable=params.batch_norm_trainable,
**kwargs)
return _batch_norm_op
def norm_activation_generator(params):
return nn_ops.norm_activation_builder(
momentum=params.batch_norm_momentum,
epsilon=params.batch_norm_epsilon,
trainable=params.batch_norm_trainable,
activation=params.activation)
def backbone_generator(params):
......@@ -43,10 +39,12 @@ def backbone_generator(params):
resnet_params = params.resnet
backbone_fn = resnet.Resnet(
resnet_depth=resnet_params.resnet_depth,
batch_norm_relu=batch_norm_relu_generator(resnet_params.batch_norm))
activation=params.norm_activation.activation,
norm_activation=norm_activation_generator(
params.norm_activation))
else:
raise ValueError('Backbone model %s is not supported.' %
params.architecture.backbone)
raise ValueError('Backbone model `{}` is not supported.'
.format(params.architecture.backbone))
return backbone_fn
......@@ -56,81 +54,75 @@ def multilevel_features_generator(params):
if params.architecture.multilevel_features == 'fpn':
fpn_params = params.fpn
fpn_fn = fpn.Fpn(
min_level=fpn_params.min_level,
max_level=fpn_params.max_level,
min_level=params.architecture.min_level,
max_level=params.architecture.max_level,
fpn_feat_dims=fpn_params.fpn_feat_dims,
use_separable_conv=fpn_params.use_separable_conv,
activation=params.norm_activation.activation,
use_batch_norm=fpn_params.use_batch_norm,
batch_norm_relu=batch_norm_relu_generator(fpn_params.batch_norm))
norm_activation=norm_activation_generator(
params.norm_activation))
elif params.architecture.multilevel_features == 'identity':
fpn_fn = identity.Identity()
else:
raise ValueError('The multi-level feature model %s is not supported.'
% params.architecture.multilevel_features)
raise ValueError('The multi-level feature model `{}` is not supported.'
.format(params.architecture.multilevel_features))
return fpn_fn
def retinanet_head_generator(params):
"""Generator function for RetinaNet head architecture."""
head_params = params.retinanet_head
return heads.RetinanetHead(
params.min_level,
params.max_level,
params.num_classes,
params.anchors_per_location,
params.retinanet_head_num_convs,
params.retinanet_head_num_filters,
params.use_separable_conv,
batch_norm_relu=batch_norm_relu_generator(params.batch_norm))
params.architecture.min_level,
params.architecture.max_level,
params.architecture.num_classes,
head_params.anchors_per_location,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
norm_activation=norm_activation_generator(params.norm_activation))
def rpn_head_generator(params):
"""Generator function for RPN head architecture."""
head_params = params.rpn_head
return heads.RpnHead(params.min_level,
params.max_level,
params.anchors_per_location,
params.num_convs,
params.num_filters,
params.use_separable_conv,
params.use_batch_norm,
batch_norm_relu=batch_norm_relu_generator(
params.batch_norm))
return heads.RpnHead(
params.architecture.min_level,
params.architecture.max_level,
head_params.anchors_per_location,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def fast_rcnn_head_generator(params):
"""Generator function for Fast R-CNN head architecture."""
return heads.FastrcnnHead(params.num_classes,
params.num_convs,
params.num_filters,
params.use_separable_conv,
params.num_fcs,
params.fc_dims,
params.use_batch_norm,
batch_norm_relu=batch_norm_relu_generator(
params.batch_norm))
head_params = params.frcnn_head
return heads.FastrcnnHead(
params.architecture.num_classes,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
head_params.num_fcs,
head_params.fc_dims,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def mask_rcnn_head_generator(params):
"""Generator function for Mask R-CNN head architecture."""
return heads.MaskrcnnHead(params.num_classes,
params.mask_target_size,
params.num_convs,
params.num_filters,
params.use_separable_conv,
params.use_batch_norm,
batch_norm_relu=batch_norm_relu_generator(
params.batch_norm))
def shapeprior_head_generator(params):
"""Generator function for Shapemask head architecture."""
raise NotImplementedError('Unimplemented')
def coarsemask_head_generator(params):
"""Generator function for Shapemask head architecture."""
raise NotImplementedError('Unimplemented')
def finemask_head_generator(params):
"""Generator function for Shapemask head architecture."""
raise NotImplementedError('Unimplemented')
head_params = params.mrcnn_head
return heads.MaskrcnnHead(
params.architecture.num_classes,
params.architecture.mask_target_size,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
......@@ -41,8 +41,10 @@ class Fpn(object):
max_level=7,
fpn_feat_dims=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder(
activation='relu')):
"""FPN initialization function.
Args:
......@@ -52,8 +54,8 @@ class Fpn(object):
use_separable_conv: `bool`, if True use separable convolution for
convolution in FPN layers.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer(optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._min_level = min_level
self._max_level = max_level
......@@ -63,17 +65,23 @@ class Fpn(object):
tf.keras.layers.SeparableConv2D, depth_multiplier=1)
else:
self._conv2d_op = tf.keras.layers.Conv2D
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._batch_norm_relu = batch_norm_relu
self._norm_activation = norm_activation
self._batch_norm_relus = {}
self._norm_activations = {}
self._lateral_conv2d_op = {}
self._post_hoc_conv2d_op = {}
self._coarse_conv2d_op = {}
for level in range(self._min_level, self._max_level + 1):
if self._use_batch_norm:
self._batch_norm_relus[level] = batch_norm_relu(
relu=False, name='p%d-bn' % level)
self._norm_activations[level] = norm_activation(
use_activation=False, name='p%d-bn' % level)
self._lateral_conv2d_op[level] = self._conv2d_op(
filters=self._fpn_feat_dims,
kernel_size=(1, 1),
......@@ -133,11 +141,11 @@ class Fpn(object):
for level in range(backbone_max_level + 1, self._max_level + 1):
feats_in = feats[level - 1]
if level > backbone_max_level + 1:
feats_in = tf.nn.relu(feats_in)
feats_in = self._activation_op(feats_in)
feats[level] = self._coarse_conv2d_op[level](feats_in)
if self._use_batch_norm:
# Adds batch_norm layer.
for level in range(self._min_level, self._max_level + 1):
feats[level] = self._batch_norm_relus[level](
feats[level] = self._norm_activations[level](
feats[level], is_training=is_training)
return feats
......@@ -39,8 +39,10 @@ class RpnHead(tf.keras.layers.Layer):
num_convs=2,
num_filters=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder(
activation='relu')):
"""Initialize params to build Region Proposal Network head.
Args:
......@@ -55,12 +57,18 @@ class RpnHead(tf.keras.layers.Layer):
use_separable_conv: `bool`, indicating whether the separable conv layers
are used.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer(optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._min_level = min_level
self._max_level = max_level
self._anchors_per_location = anchors_per_location
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
if use_separable_conv:
......@@ -78,7 +86,7 @@ class RpnHead(tf.keras.layers.Layer):
num_filters,
kernel_size=(3, 3),
strides=(1, 1),
activation=(None if self._use_batch_norm else tf.nn.relu),
activation=(None if self._use_batch_norm else self._activation_op),
padding='same',
name='rpn')
self._rpn_class_conv = self._conv2d_op(
......@@ -94,10 +102,10 @@ class RpnHead(tf.keras.layers.Layer):
padding='valid',
name='rpn-box')
self._batch_norm_relus = {}
self._norm_activations = {}
if self._use_batch_norm:
for level in range(self._min_level, self._max_level + 1):
self._batch_norm_relus[level] = batch_norm_relu(name='rpn-l%d-bn' %
self._norm_activations[level] = norm_activation(name='rpn-l%d-bn' %
level)
def _shared_rpn_heads(self, features, anchors_per_location, level,
......@@ -106,7 +114,7 @@ class RpnHead(tf.keras.layers.Layer):
features = self._rpn_conv(features)
if self._use_batch_norm:
# The batch normalization layers are not shared between levels.
features = self._batch_norm_relus[level](
features = self._norm_activations[level](
features, is_training=is_training)
# Proposal classification scores
scores = self._rpn_class_conv(features)
......@@ -139,8 +147,10 @@ class FastrcnnHead(tf.keras.layers.Layer):
use_separable_conv=False,
num_fcs=2,
fc_dims=1024,
activation='relu',
use_batch_norm=True,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder(
activation='relu')):
"""Initialize params to build Fast R-CNN box head.
Args:
......@@ -156,8 +166,8 @@ class FastrcnnHead(tf.keras.layers.Layer):
fc_dims: `int` number that represents the number of dimension of the FC
layers.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer(optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._num_classes = num_classes
......@@ -177,9 +187,14 @@ class FastrcnnHead(tf.keras.layers.Layer):
self._num_fcs = num_fcs
self._fc_dims = fc_dims
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._batch_norm_relu = batch_norm_relu
self._norm_activation = norm_activation
self._conv_ops = []
self._conv_bn_ops = []
......@@ -191,10 +206,10 @@ class FastrcnnHead(tf.keras.layers.Layer):
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
activation=(None if self._use_batch_norm else tf.nn.relu),
activation=(None if self._use_batch_norm else self._activation_op),
name='conv_{}'.format(i)))
if self._use_batch_norm:
self._conv_bn_ops.append(self._batch_norm_relu())
self._conv_bn_ops.append(self._norm_activation())
self._fc_ops = []
self._fc_bn_ops = []
......@@ -202,10 +217,10 @@ class FastrcnnHead(tf.keras.layers.Layer):
self._fc_ops.append(
tf.keras.layers.Dense(
units=self._fc_dims,
activation=(None if self._use_batch_norm else tf.nn.relu),
activation=(None if self._use_batch_norm else self._activation_op),
name='fc{}'.format(i)))
if self._use_batch_norm:
self._fc_bn_ops.append(self._batch_norm_relu(fused=False))
self._fc_bn_ops.append(self._norm_activation(fused=False))
self._class_predict = tf.keras.layers.Dense(
self._num_classes,
......@@ -266,8 +281,10 @@ class MaskrcnnHead(tf.keras.layers.Layer):
num_convs=4,
num_filters=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder(
activation='relu')):
"""Initialize params to build Fast R-CNN head.
Args:
......@@ -280,8 +297,8 @@ class MaskrcnnHead(tf.keras.layers.Layer):
use_separable_conv: `bool`, indicating whether the separable conv layers
are used.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer(optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._num_classes = num_classes
self._mask_target_size = mask_target_size
......@@ -299,9 +316,14 @@ class MaskrcnnHead(tf.keras.layers.Layer):
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer())
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._batch_norm_relu = batch_norm_relu
self._norm_activation = norm_activation
self._conv2d_ops = []
for i in range(self._num_convs):
self._conv2d_ops.append(
......@@ -311,14 +333,14 @@ class MaskrcnnHead(tf.keras.layers.Layer):
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
activation=(None if self._use_batch_norm else tf.nn.relu),
activation=(None if self._use_batch_norm else self._activation_op),
name='mask-conv-l%d' % i))
self._mask_conv_transpose = tf.keras.layers.Conv2DTranspose(
self._num_filters,
kernel_size=(2, 2),
strides=(2, 2),
padding='valid',
activation=(None if self._use_batch_norm else tf.nn.relu),
activation=(None if self._use_batch_norm else self._activation_op),
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer(),
......@@ -353,11 +375,11 @@ class MaskrcnnHead(tf.keras.layers.Layer):
for i in range(self._num_convs):
net = self._conv2d_ops[i](net)
if self._use_batch_norm:
net = self._batch_norm_relu()(net, is_training=is_training)
net = self._norm_activation()(net, is_training=is_training)
net = self._mask_conv_transpose(net)
if self._use_batch_norm:
net = self._batch_norm_relu()(net, is_training=is_training)
net = self._norm_activation()(net, is_training=is_training)
mask_outputs = self._conv2d_op(
self._num_classes,
......@@ -398,7 +420,8 @@ class RetinanetHead(object):
num_convs=4,
num_filters=256,
use_separable_conv=False,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder(
activation='relu')):
"""Initialize params to build RetinaNet head.
Args:
......@@ -411,8 +434,8 @@ class RetinanetHead(object):
num_filters: `int` number of filters used in the head architecture.
use_separable_conv: `bool` to indicate whether to use separable
convolution.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer(optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._min_level = min_level
self._max_level = max_level
......@@ -423,13 +446,12 @@ class RetinanetHead(object):
self._num_convs = num_convs
self._num_filters = num_filters
self._use_separable_conv = use_separable_conv
with tf.name_scope('class_net') as scope_name:
self._class_name_scope = tf.name_scope(scope_name)
with tf.name_scope('box_net') as scope_name:
self._box_name_scope = tf.name_scope(scope_name)
self._build_class_net_layers(batch_norm_relu)
self._build_box_net_layers(batch_norm_relu)
self._build_class_net_layers(norm_activation)
self._build_box_net_layers(norm_activation)
def _class_net_batch_norm_name(self, i, level):
return 'class-%d-%d' % (i, level)
......@@ -437,7 +459,7 @@ class RetinanetHead(object):
def _box_net_batch_norm_name(self, i, level):
return 'box-%d-%d' % (i, level)
def _build_class_net_layers(self, batch_norm_relu):
def _build_class_net_layers(self, norm_activation):
"""Build re-usable layers for class prediction network."""
if self._use_separable_conv:
self._class_predict = tf.keras.layers.SeparableConv2D(
......@@ -455,7 +477,7 @@ class RetinanetHead(object):
padding='same',
name='class-predict')
self._class_conv = []
self._class_batch_norm_relu = {}
self._class_norm_activation = {}
for i in range(self._num_convs):
if self._use_separable_conv:
self._class_conv.append(
......@@ -479,9 +501,9 @@ class RetinanetHead(object):
name='class-' + str(i)))
for level in range(self._min_level, self._max_level + 1):
name = self._class_net_batch_norm_name(i, level)
self._class_batch_norm_relu[name] = batch_norm_relu(name=name)
self._class_norm_activation[name] = norm_activation(name=name)
def _build_box_net_layers(self, batch_norm_relu):
def _build_box_net_layers(self, norm_activation):
"""Build re-usable layers for box prediction network."""
if self._use_separable_conv:
self._box_predict = tf.keras.layers.SeparableConv2D(
......@@ -499,7 +521,7 @@ class RetinanetHead(object):
padding='same',
name='box-predict')
self._box_conv = []
self._box_batch_norm_relu = {}
self._box_norm_activation = {}
for i in range(self._num_convs):
if self._use_separable_conv:
self._box_conv.append(
......@@ -523,13 +545,13 @@ class RetinanetHead(object):
name='box-' + str(i)))
for level in range(self._min_level, self._max_level + 1):
name = self._box_net_batch_norm_name(i, level)
self._box_batch_norm_relu[name] = batch_norm_relu(name=name)
self._box_norm_activation[name] = norm_activation(name=name)
def __call__(self, fpn_features, is_training=None):
"""Returns outputs of RetinaNet head."""
class_outputs = {}
box_outputs = {}
with backend.get_graph().as_default(), tf.name_scope('retinanet'):
with backend.get_graph().as_default(), tf.name_scope('retinanet_head'):
for level in range(self._min_level, self._max_level + 1):
features = fpn_features[level]
......@@ -548,7 +570,7 @@ class RetinanetHead(object):
# each level has its own batch normalization to capture the statistical
# difference among different levels.
name = self._class_net_batch_norm_name(i, level)
features = self._class_batch_norm_relu[name](
features = self._class_norm_activation[name](
features, is_training=is_training)
classes = self._class_predict(features)
......@@ -563,7 +585,7 @@ class RetinanetHead(object):
# each level has its own batch normalization to capture the statistical
# difference among different levels.
name = self._box_net_batch_norm_name(i, level)
features = self._box_batch_norm_relu[name](
features = self._box_norm_activation[name](
features, is_training=is_training)
boxes = self._box_predict(features)
......@@ -953,13 +975,13 @@ class ShapemaskCoarsemaskHead(object):
def coarsemask_decoder_net(self,
images,
is_training=None,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder()):
"""Coarse mask decoder network architecture.
Args:
images: A tensor of size [batch, height_in, width_in, channels_in].
is_training: Whether batch_norm layers are in training mode.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer (optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
Returns:
images: A feature tensor of size [batch, output_size, output_size,
......@@ -975,7 +997,7 @@ class ShapemaskCoarsemaskHead(object):
padding='same',
name='coarse-class-%d' % i)(
images)
images = batch_norm_relu(name='coarse-class-%d-bn' % i)(
images = norm_activation(name='coarse-class-%d-bn' % i)(
images, is_training=is_training)
return images
......@@ -991,7 +1013,7 @@ class ShapemaskFinemaskHead(object):
num_convs,
coarse_mask_thr,
gt_upsample_scale,
batch_norm_relu=nn_ops.BatchNormRelu):
norm_activation=nn_ops.norm_activation_builder()):
"""Initialize params to build ShapeMask coarse and fine prediction head.
Args:
......@@ -1002,7 +1024,7 @@ class ShapemaskFinemaskHead(object):
layer.
coarse_mask_thr: the threshold for suppressing noisy coarse prediction.
gt_upsample_scale: scale for upsampling groundtruths.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer (optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._mask_num_classes = num_classes
......@@ -1038,7 +1060,7 @@ class ShapemaskFinemaskHead(object):
activation=None,
padding='same',
name='fine-class-%d' % i))
self._fine_class_bn.append(batch_norm_relu(name='fine-class-%d-bn' % i))
self._fine_class_bn.append(norm_activation(name='fine-class-%d-bn' % i))
def __call__(self, prior_conditioned_features, class_probs, is_training=None):
"""Generate instance masks from FPN features and detection priors.
......
......@@ -18,20 +18,21 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
from absl import logging
import tensorflow.compat.v2 as tf
from tensorflow.python.keras import backend
class BatchNormRelu(tf.keras.layers.Layer):
"""Combined Batch Normalization and ReLU layers."""
class NormActivation(tf.keras.layers.Layer):
"""Combined Normalization and Activation layers."""
def __init__(self,
momentum=0.997,
epsilon=1e-4,
trainable=True,
relu=True,
init_zero=False,
use_activation=True,
activation='relu',
fused=True,
name=None):
"""A class to construct layers for a batch normalization followed by a ReLU.
......@@ -39,22 +40,24 @@ class BatchNormRelu(tf.keras.layers.Layer):
Args:
momentum: momentum for the moving average.
epsilon: small float added to variance to avoid dividing by zero.
trainable: `boolean`, if True also add variables to the graph collection
trainable: `bool`, if True also add variables to the graph collection
GraphKeys.TRAINABLE_VARIABLES. If False, freeze batch normalization
layer.
relu: `bool` if False, omits the ReLU operation.
init_zero: `bool` if True, initializes scale parameter of batch
normalization with 0. If False, initialize it with 1.
fused: `bool` fused option in batch normalization.
use_activation: `bool`, whether to add the optional activation layer after
the batch normalization layer.
activation: 'string', the type of the activation layer. Currently support
`relu` and `swish`.
name: `str` name for the operation.
"""
super(BatchNormRelu, self).__init__(trainable=trainable)
self._use_relu = relu
super(NormActivation, self).__init__(trainable=trainable)
if init_zero:
gamma_initializer = tf.keras.initializers.Zeros()
else:
gamma_initializer = tf.keras.initializers.Ones()
self._batch_norm_op = tf.keras.layers.BatchNormalization(
self._normalization_op = tf.keras.layers.BatchNormalization(
momentum=momentum,
epsilon=epsilon,
center=True,
......@@ -63,9 +66,16 @@ class BatchNormRelu(tf.keras.layers.Layer):
fused=fused,
gamma_initializer=gamma_initializer,
name=name)
self._use_activation = use_activation
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
def __call__(self, inputs, is_training=None):
"""Builds layers for a batch normalization followed by a ReLU.
"""Builds the normalization layer followed by an optional activation layer.
Args:
inputs: `Tensor` of shape `[batch, channels, ...]`.
......@@ -78,9 +88,22 @@ class BatchNormRelu(tf.keras.layers.Layer):
# from keras.Model.training
if is_training and self.trainable:
is_training = True
inputs = self._batch_norm_op(inputs, training=is_training)
inputs = self._normalization_op(inputs, training=is_training)
if self._use_relu:
inputs = tf.nn.relu(inputs)
if self._use_activation:
inputs = self._activation_op(inputs)
return inputs
def norm_activation_builder(momentum=0.997,
epsilon=1e-4,
trainable=True,
activation='relu',
**kwargs):
return functools.partial(
NormActivation,
momentum=momentum,
epsilon=epsilon,
trainable=trainable,
activation=activation,
**kwargs)
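# Editor's sketch: how the builder above is consumed elsewhere in this change
# (mirrors the FPN usage, which disables the activation for its per-level
# batch norms).
norm_activation = norm_activation_builder(momentum=0.997, epsilon=1e-4)
p3_bn = norm_activation(use_activation=False, name='p3-bn')
# outputs = p3_bn(inputs, is_training=False)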
......@@ -34,21 +34,27 @@ class Resnet(object):
def __init__(self,
resnet_depth,
batch_norm_relu=nn_ops.BatchNormRelu,
activation='relu',
norm_activation=nn_ops.norm_activation_builder(
activation='relu'),
data_format='channels_last'):
"""ResNet initialization function.
Args:
resnet_depth: `int` depth of ResNet backbone model.
batch_norm_relu: an operation that includes a batch normalization layer
followed by a relu layer(optional).
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
data_format: `str` either "channels_first" for `[batch, channels, height,
width]` or "channels_last" for `[batch, height, width, channels]`.
"""
self._resnet_depth = resnet_depth
self._batch_norm_relu = batch_norm_relu
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._norm_activation = norm_activation
self._data_format = data_format
model_params = {
......@@ -170,19 +176,19 @@ class Resnet(object):
# Projection shortcut in first layer to match filters and strides
shortcut = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=1, strides=strides)
shortcut = self._batch_norm_relu(relu=False)(
shortcut = self._norm_activation(use_activation=False)(
shortcut, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=strides)
inputs = self._batch_norm_relu()(inputs, is_training=is_training)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=1)
inputs = self._batch_norm_relu()(
inputs, relu=False, init_zero=True, is_training=is_training)
inputs = self._norm_activation(use_activation=False, init_zero=True)(
inputs, is_training=is_training)
return tf.nn.relu(inputs + shortcut)
return self._activation_op(inputs + shortcut)
def bottleneck_block(self,
inputs,
......@@ -214,24 +220,23 @@ class Resnet(object):
filters_out = 4 * filters
shortcut = self.conv2d_fixed_padding(
inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
shortcut = self._batch_norm_relu(relu=False)(
shortcut = self._norm_activation(use_activation=False)(
shortcut, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=1, strides=1)
inputs = self._batch_norm_relu()(inputs, is_training=is_training)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=strides)
inputs = self._batch_norm_relu()(inputs, is_training=is_training)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
inputs = self._batch_norm_relu(
relu=False, init_zero=True)(
inputs, is_training=is_training)
inputs = self._norm_activation(use_activation=False, init_zero=True)(
inputs, is_training=is_training)
return tf.nn.relu(inputs + shortcut)
return self._activation_op(inputs + shortcut)
def block_group(self, inputs, filters, block_fn, blocks, strides, name,
is_training):
......@@ -279,7 +284,7 @@ class Resnet(object):
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=64, kernel_size=7, strides=2)
inputs = tf.identity(inputs, 'initial_conv')
inputs = self._batch_norm_relu()(inputs, is_training=is_training)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = tf.keras.layers.MaxPool2D(
pool_size=3, strides=2, padding='SAME',
......
......@@ -24,37 +24,7 @@ import re
import tensorflow.compat.v2 as tf
from official.vision.detection.modeling import checkpoint_utils
from official.vision.detection.modeling import learning_rates
class OptimizerFactory(object):
"""Class to generate optimizer function."""
def __init__(self, params):
"""Creates optimized based on the specified flags."""
if params.type == 'momentum':
nesterov = False
try:
nesterov = params.nesterov
except AttributeError:
pass
self._optimizer = functools.partial(
tf.keras.optimizers.SGD,
momentum=params.momentum,
nesterov=nesterov)
elif params.type == 'adam':
self._optimizer = tf.keras.optimizers.Adam
elif params.type == 'adadelta':
self._optimizer = tf.keras.optimizers.Adadelta
elif params.type == 'adagrad':
self._optimizer = tf.keras.optimizers.Adagrad
elif params.type == 'rmsprop':
self._optimizer = functools.partial(
tf.keras.optimizers.RMSprop, momentum=params.momentum)
else:
raise ValueError('Unsupported optimizer type %s.' % self._optimizer)
def __call__(self, learning_rate):
return self._optimizer(learning_rate=learning_rate)
from official.vision.detection.modeling import optimizers
def _make_filter_trainable_variables_fn(frozen_variable_prefix):
......@@ -73,7 +43,8 @@ def _make_filter_trainable_variables_fn(frozen_variable_prefix):
# the frozen variables' names.
filtered_variables = [
v for v in variables
if not re.match(frozen_variable_prefix, v.name)
if not frozen_variable_prefix or
not re.match(frozen_variable_prefix, v.name)
]
return filtered_variables
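# Editor's spot-check (hypothetical objects): with the new guard above, an
# empty `frozen_variable_prefix` keeps every variable, whereas previously
# re.match('') matched any name and froze everything.
import collections
FakeVar = collections.namedtuple('FakeVar', ['name'])
keep_all = _make_filter_trainable_variables_fn('')
assert len(keep_all([FakeVar('resnet50/conv2d/kernel:0')])) == 1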
......@@ -94,9 +65,9 @@ class Model(object):
tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
# Optimization.
self._optimizer_fn = OptimizerFactory(params.train.optimizer)
self._optimizer_fn = optimizers.OptimizerFactory(params.train.optimizer)
self._learning_rate = learning_rates.learning_rate_generator(
params.train.learning_rate)
params.train.total_steps, params.train.learning_rate)
self._frozen_variable_prefix = params.train.frozen_variable_prefix
self._regularization_var_regex = params.train.regularization_variable_regex
......
......@@ -28,9 +28,10 @@ from official.modeling.hyperparams import params_dict
class StepLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Class to generate learning rate tensor."""
def __init__(self, params):
def __init__(self, total_steps, params):
"""Creates the step learning rate tensor with linear warmup."""
super(StepLearningRateWithLinearWarmup, self).__init__()
self._total_steps = total_steps
assert isinstance(params, (dict, params_dict.ParamsDict))
if isinstance(params, dict):
params = params_dict.ParamsDict(params)
......@@ -59,9 +60,10 @@ class StepLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningRat
class CosineLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Class to generate learning rate tensor."""
def __init__(self, params):
def __init__(self, total_steps, params):
"""Creates the consine learning rate tensor with linear warmup."""
super(CosineLearningRateWithLinearWarmup, self).__init__()
self._total_steps = total_steps
assert isinstance(params, (dict, params_dict.ParamsDict))
if isinstance(params, dict):
params = params_dict.ParamsDict(params)
......@@ -72,7 +74,7 @@ class CosineLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningR
warmup_lr = self._params.warmup_learning_rate
warmup_steps = self._params.warmup_steps
init_lr = self._params.init_learning_rate
total_steps = self._params.total_steps
total_steps = self._total_steps
linear_warmup = (
warmup_lr + global_step / warmup_steps * (init_lr - warmup_lr))
cosine_learning_rate = (
......@@ -86,11 +88,11 @@ class CosineLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningR
return {'_params': self._params.as_dict()}
def learning_rate_generator(params):
def learning_rate_generator(total_steps, params):
"""The learning rate function generator."""
if params.type == 'step':
return StepLearningRateWithLinearWarmup(params)
return StepLearningRateWithLinearWarmup(total_steps, params)
elif params.type == 'cosine':
return CosineLearningRateWithLinearWarmup(params)
return CosineLearningRateWithLinearWarmup(total_steps, params)
else:
raise ValueError('Unsupported learning rate type: {}.'.format(params.type))
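# Editor's reference sketch in plain Python of the step schedule above, using
# the RETINANET_CFG values; `total_steps` is the value now threaded through
# learning_rate_generator (22500 in the config, consumed by the cosine
# schedule).
def step_lr_with_linear_warmup(step,
                               warmup_lr=0.0067,
                               warmup_steps=500,
                               init_lr=0.08,
                               lr_levels=(0.008, 0.0008),
                               lr_steps=(15000, 20000)):
  if step < warmup_steps:
    return warmup_lr + step / warmup_steps * (init_lr - warmup_lr)
  lr = init_lr
  for level, boundary in zip(lr_levels, lr_steps):
    if step >= boundary:
      lr = level
  return lr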
......@@ -371,8 +371,8 @@ class MaskrcnnLoss(object):
class RetinanetClassLoss(object):
"""RetinaNet class loss."""
def __init__(self, params):
self._num_classes = params.num_classes
def __init__(self, params, num_classes):
self._num_classes = num_classes
self._focal_loss_alpha = params.focal_loss_alpha
self._focal_loss_gamma = params.focal_loss_gamma
......
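# Editor's sketch of the focal loss that RetinanetClassLoss parameterizes
# (alpha=0.25, gamma=1.5 in RETINANET_CFG); this is the standard formulation,
# not necessarily line-for-line what losses.py implements.
import tensorflow.compat.v2 as tf

def focal_loss(logits, targets, alpha=0.25, gamma=1.5):
  ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
  probs = tf.sigmoid(logits)
  p_t = targets * probs + (1.0 - targets) * (1.0 - probs)
  alpha_t = targets * alpha + (1.0 - targets) * (1.0 - alpha)
  return alpha_t * tf.pow(1.0 - p_t, gamma) * ce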
......@@ -49,14 +49,16 @@ class MaskrcnnModel(base_model.Model):
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._rpn_head_fn = factory.rpn_head_generator(params.rpn_head)
self._rpn_head_fn = factory.rpn_head_generator(params)
self._generate_rois_fn = roi_ops.ROIGenerator(params.roi_proposal)
self._sample_rois_fn = sampling_ops.ROISampler(params.roi_sampling)
self._sample_masks_fn = sampling_ops.MaskSampler(params.mask_sampling)
self._sample_masks_fn = sampling_ops.MaskSampler(
params.architecture.mask_target_size,
params.mask_sampling.num_mask_samples_per_image)
self._frcnn_head_fn = factory.fast_rcnn_head_generator(params.frcnn_head)
self._frcnn_head_fn = factory.fast_rcnn_head_generator(params)
if self._include_mask:
self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params.mrcnn_head)
self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params)
# Loss function.
self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
......@@ -91,8 +93,8 @@ class MaskrcnnModel(base_model.Model):
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
rpn_box_outputs),
})
input_anchor = anchor.Anchor(self._params.anchor.min_level,
self._params.anchor.max_level,
input_anchor = anchor.Anchor(self._params.architecture.min_level,
self._params.architecture.max_level,
self._params.anchor.num_scales,
self._params.anchor.aspect_ratios,
self._params.anchor.anchor_size,
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow.compat.v2 as tf
class OptimizerFactory(object):
"""Class to generate optimizer function."""
def __init__(self, params):
"""Creates optimized based on the specified flags."""
if params.type == 'momentum':
self._optimizer = functools.partial(
tf.keras.optimizers.SGD,
momentum=params.momentum,
nesterov=params.nesterov)
elif params.type == 'adam':
self._optimizer = tf.keras.optimizers.Adam
elif params.type == 'adadelta':
self._optimizer = tf.keras.optimizers.Adadelta
elif params.type == 'adagrad':
self._optimizer = tf.keras.optimizers.Adagrad
elif params.type == 'rmsprop':
self._optimizer = functools.partial(
tf.keras.optimizers.RMSprop, momentum=params.momentum)
else:
raise ValueError('Unsupported optimizer type `{}`.'.format(params.type))
def __call__(self, learning_rate):
return self._optimizer(learning_rate=learning_rate)
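# Editor's sketch: the factory is built from params.train.optimizer (as in
# base_model.py above) and then called with a learning rate, which may be a
# scalar or a tf.keras LearningRateSchedule.
optimizer_fn = OptimizerFactory(params.train.optimizer)
optimizer = optimizer_fn(learning_rate=0.08)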
......@@ -44,16 +44,19 @@ class RetinanetModel(base_model.Model):
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._head_fn = factory.retinanet_head_generator(params.retinanet_head)
self._head_fn = factory.retinanet_head_generator(params)
# Loss function.
self._cls_loss_fn = losses.RetinanetClassLoss(params.retinanet_loss)
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
self._keras_model = None
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level,
params.architecture.max_level,
params.postprocess)
self._transpose_input = params.train.transpose_input
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
......@@ -294,10 +294,10 @@ def _generate_detections_batched(boxes,
class MultilevelDetectionGenerator(object):
"""Generates detected boxes with scores and classes for one-stage detector."""
def __init__(self, params):
def __init__(self, min_level, max_level, params):
self._min_level = min_level
self._max_level = max_level
self._generate_detections = generate_detections_factory(params)
self._min_level = params.min_level
self._max_level = params.max_level
def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape):
# Collects outputs from all levels into a list.
......
......@@ -346,9 +346,9 @@ class ROISampler(object):
class MaskSampler(object):
"""Samples and creates mask training targets."""
def __init__(self, params):
self._num_mask_samples_per_image = params.num_mask_samples_per_image
self._mask_target_size = params.mask_target_size
def __init__(self, mask_target_size, num_mask_samples_per_image):
self._mask_target_size = mask_target_size
self._num_mask_samples_per_image = num_mask_samples_per_image
def __call__(self,
candidate_rois,
......
......@@ -48,12 +48,143 @@ def nearest_upsampling(data, scale):
return tf.reshape(data, [bs, h * scale, w * scale, c])
def feature_bilinear_interpolation(features, kernel_y, kernel_x):
"""Feature bilinear interpolation.
The RoIAlign feature f can be computed by bilinear interpolation
of four neighboring feature points f0, f1, f2, and f3.
f(y, x) = [hy, ly] * [[f00, f01],  * [hx, lx]^T
                      [f10, f11]]
f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
kernel_y = [hy, ly]
kernel_x = [hx, lx]
Args:
features: The features are in shape of [batch_size, num_boxes, output_size *
2, output_size * 2, num_filters].
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
(batch_size, num_boxes, output_size, _,
num_filters) = features.get_shape().as_list()
output_size = output_size // 2
kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1])
kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2])
# Use implicit broadcast to generate the interpolation kernel. The
# multiplier `4` is for avg pooling.
interpolation_kernel = kernel_y * kernel_x * 4
# Interpolate the gathered features with computed interpolation kernels.
features *= tf.cast(
tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype)
features = tf.reshape(
features,
[batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
features = tf.reshape(
features, [batch_size, num_boxes, output_size, output_size, num_filters])
return features
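# Editor's worked example of the kernel weights above for one sample point:
import numpy as np
y = 2.3                 # grid coordinate between rows y0 = 2 and y1 = 3
y0 = np.floor(y)        # 2.0
ly = y - y0             # 0.3, weight toward y1
hy = 1.0 - ly           # 0.7, weight toward y0
kernel_y_point = np.array([hy, ly])  # one [hy, ly] entry of kernel_y;
# kernel_x is built the same way, and their outer product gives w00..w11.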
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
"""Compute the grid position w.r.t.
the corresponding feature map.
Args:
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the boundary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicates the subpixel sample offset
from grid point.
Returns:
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
"""
batch_size, num_boxes, _ = boxes.get_shape().as_list()
box_grid_x = []
box_grid_y = []
for i in range(output_size):
box_grid_x.append(boxes[:, :, 1] +
(i + sample_offset) * boxes[:, :, 3] / output_size)
box_grid_y.append(boxes[:, :, 0] +
(i + sample_offset) * boxes[:, :, 2] / output_size)
box_grid_x = tf.stack(box_grid_x, axis=2)
box_grid_y = tf.stack(box_grid_y, axis=2)
box_grid_y0 = tf.floor(box_grid_y)
box_grid_x0 = tf.floor(box_grid_x)
box_grid_x0 = tf.maximum(0., box_grid_x0)
box_grid_y0 = tf.maximum(0., box_grid_y0)
box_grid_x0 = tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_x1 = tf.minimum(box_grid_x0 + 1,
tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_y0 = tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1))
box_grid_y1 = tf.minimum(box_grid_y0 + 1,
tf.expand_dims(boundaries[:, :, 0], -1))
box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)
# The RoIAlign feature f can be computed by bilinear interpolation of four
# neighboring feature points f0, f1, f2, and f3.
# f(y, x) = [hy, ly] * [[f00, f01],  * [hx, lx]^T
#                       [f10, f11]]
# f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
# f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
ly = box_grid_y - box_grid_y0
lx = box_grid_x - box_grid_x0
hy = 1.0 - ly
hx = 1.0 - lx
kernel_y = tf.reshape(
tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1])
kernel_x = tf.reshape(
tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1])
return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
def get_grid_one_hot(box_gridy0y1, box_gridx0x1, feature_height, feature_width):
"""Get grid_one_hot from indices and feature_size."""
(batch_size, num_boxes, output_size, _) = box_gridx0x1.get_shape().as_list()
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
# shape is [batch_size, num_boxes, output_size, 2, height]
grid_y_one_hot = tf.one_hot(tf.cast(y_indices, tf.int32), feature_height)
# shape is [batch_size, num_boxes, output_size, 2, width]
grid_x_one_hot = tf.one_hot(tf.cast(x_indices, tf.int32), feature_width)
return grid_y_one_hot, grid_x_one_hot
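# Editor's sketch of the einsum-based gather that the one-hot grids above
# enable: multiplying by a one-hot matrix selects rows without tf.gather,
# which can be cheaper for small feature maps and partitions more cleanly.
import tensorflow.compat.v2 as tf
feature = tf.random.normal([8, 16])              # [height, channels]
one_hot = tf.one_hot([2, 5], depth=8)            # [num_points, height]
rows = tf.einsum('ph,hc->pc', one_hot, feature)  # == tf.gather(feature, [2, 5])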
def selective_crop_and_resize(features,
boxes,
box_levels,
boundaries,
output_size=7,
sample_offset=0.5):
sample_offset=0.5,
use_einsum_gather=False):
"""Crop and resize boxes on a set of feature maps.
Given multiple feature maps indexed by different levels, and a set of boxes
......@@ -67,7 +198,7 @@ def selective_crop_and_resize(features,
pixel.
For performance, we perform the gather and interpolation on all layers as a
single operation. This is op the multi-level features are first stacked and
single operation. In this op the multi-level features are first stacked and
gathered into [2*output_size, 2*output_size] feature points. Then bilinear
interpolation is performed on the gathered feature points to generate
[output_size, output_size] RoIAlign feature map.
......@@ -86,14 +217,13 @@ def selective_crop_and_resize(features,
output_size.
Args:
features: a 5-D tensor of shape
[batch_size, num_levels, max_height, max_width, num_filters] where
cropping and resizing are based.
features: a 5-D tensor of shape [batch_size, num_levels, max_height,
max_width, num_filters] where cropping and resizing are based.
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
in terms of the number of pixels of the corresponding feature map size.
box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
the 0-based corresponding feature level index of each box.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
......@@ -102,6 +232,10 @@ def selective_crop_and_resize(features,
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicates the subpixel sample offset
from grid point.
use_einsum_gather: whether to use einsum instead of gather. Replacing
gather with einsum can improve performance when the feature size is not
large; einsum is also friendly to model partitioning. Gather performs
better when the feature size is very large and there are multiple box
levels.
Returns:
features_per_box: a 5-D tensor of shape
@@ -112,93 +246,77 @@
num_filters) = features.get_shape().as_list()
_, num_boxes, _ = boxes.get_shape().as_list()
# Compute the grid position w.r.t. the corresponding feature map.
  kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
      boxes, boundaries, output_size, sample_offset)
  x_indices = tf.cast(
      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
      dtype=tf.int32)
  y_indices = tf.cast(
      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
      dtype=tf.int32)
if use_einsum_gather:
    # Bilinear interpolation is done during the last two gathers:
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# [[f00, f01],
# [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
# where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.
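    # Note: both einsum operands share their second axis, so this path assumes
    # the number of levels equals the number of boxes, i.e. box i gathers from
    # feature slice i (as in the mask-cropping path below).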
# shape is [batch_size, boxes, output_size, 2, 1]
grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(box_gridy0y1,
box_gridx0x1,
max_feature_height,
max_feature_width)
# shape is [batch_size, num_boxes, output_size, height]
grid_y_weight = tf.reduce_sum(
tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
# shape is [batch_size, num_boxes, output_size, width]
grid_x_weight = tf.reduce_sum(
tf.multiply(grid_x_one_hot, kernel_x), axis=-2)
# Gather for y_axis.
# shape is [batch_size, num_boxes, output_size, width, features]
features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
tf.cast(grid_y_weight, features.dtype))
# Gather for x_axis.
# shape is [batch_size, num_boxes, output_size, output_size, features]
features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
tf.cast(grid_x_weight, features.dtype))
else:
height_dim_offset = max_feature_width
level_dim_offset = max_feature_height * height_dim_offset
batch_dim_offset = num_levels * level_dim_offset
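    # Convert (batch, level, y, x) coordinates into flat indices into the
    # feature tensor reshaped to [-1, num_filters]:
    #   index = ((b * num_levels + l) * max_feature_height + y)
    #           * max_feature_width + x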
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
box_levels_offset = tf.tile(
tf.reshape(box_levels * level_dim_offset,
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(y_indices * height_dim_offset,
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + box_levels_offset + y_indices_offset +
x_indices_offset, [-1])
features = tf.reshape(features, [-1, num_filters])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
@@ -211,29 +329,52 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
  and resizing it using the corresponding feature map of that level.
Args:
    features: A dictionary with key as pyramid level and value as features. The
      features are in shape of [batch_size, height_l, width_l, num_filters].
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
      a box with [y1, x1, y2, x2] in un-normalized coordinates.
output_size: A scalar to indicate the output crop size.
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
with tf.name_scope('multilevel_crop_and_resize'):
    levels = list(features.keys())
min_level = min(levels)
max_level = max(levels)
    batch_size, max_feature_height, max_feature_width, num_filters = (
features[min_level].get_shape().as_list())
    _, num_boxes, _ = boxes.get_shape().as_list()
    # Stack feature pyramid into a features_all of shape
# [batch_size, levels, height, width, num_filters].
features_all = []
feature_heights = []
feature_widths = []
for level in range(min_level, max_level + 1):
      shape = features[level].get_shape().as_list()
      feature_heights.append(shape[1])
      feature_widths.append(shape[2])
      # Concat tensor of [batch_size, height_l * width_l, num_filters] for
      # each level.
features_all.append(
tf.reshape(features[level], [batch_size, -1, num_filters]))
features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])
# Calculate height_l * width_l for each level.
level_dim_sizes = [
feature_widths[i] * feature_heights[i]
for i in range(len(feature_widths))
]
    # level_dim_offsets is the prefix sum of level_dim_sizes: the offset of
    # each level's block within the flattened feature tensor.
level_dim_offsets = [0]
for i in range(len(feature_widths) - 1):
level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
height_dim_sizes = tf.constant(feature_widths, tf.int32)
# Assigns boxes to the right level.
box_width = boxes[:, :, 3] - boxes[:, :, 1]
@@ -241,8 +382,8 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
areas_sqrt = tf.sqrt(box_height * box_width)
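    # FPN heuristic (https://arxiv.org/abs/1612.03144, Eq. 1):
    #   level = floor(log2(sqrt(box_area) / 224)) + 4,
    # so a box of roughly 224x224 pixels maps to level 4.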
levels = tf.cast(
tf.math.floordiv(
            tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) +
        4.0,
dtype=tf.int32)
# Maps levels between [min_level, max_level].
levels = tf.minimum(max_level, tf.maximum(levels, min_level))
@@ -263,17 +404,58 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
boundary = tf.cast(
tf.concat([
tf.expand_dims(
[[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
axis=-1),
tf.expand_dims(
[[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
axis=-1),
],
axis=-1), boxes.dtype)
# Compute grid positions.
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundary, output_size, sample_offset=0.5)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
# Get level offset for each box. Each box belongs to one level.
levels_offset = tf.tile(
tf.reshape(
tf.gather(level_dim_offsets, levels),
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
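    # Each level is flattened to height_l * width_l points, so stepping one
    # row in y advances by that level's width (height_dim_sizes stores the
    # per-level widths).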
y_indices_offset = tf.tile(
tf.reshape(
y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
[-1])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features_r2, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
# Bilinear interpolation.
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
@@ -355,7 +537,8 @@ def crop_mask_in_target_box(masks,
boxes,
target_boxes,
output_size,
                            sample_offset=0,
                            use_einsum=True):
"""Crop masks in target boxes.
Args:
@@ -370,6 +553,7 @@ def crop_mask_in_target_box(masks,
      supports square output shapes.
    sample_offset: a float number in [0, 1] that indicates the subpixel sample
      offset from the grid point.
    use_einsum: whether to use einsum instead of gather in
      selective_crop_and_resize.
Returns:
A 4-D tensor representing feature crop of shape
@@ -417,7 +601,8 @@ def crop_mask_in_target_box(masks,
levels,
boundaries,
output_size,
        sample_offset=sample_offset,
        use_einsum_gather=use_einsum)
cropped_masks = tf.squeeze(cropped_masks, axis=-1)
return cropped_masks
@@ -19,21 +19,49 @@ installed and
### ImageNet preparation
#### Using TFDS
`classifier_trainer.py` supports ImageNet with
[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
for more information on how to use TFDS to download and prepare datasets, and
specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
for manual download instructions.
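As a rough sketch (assuming the archives have already been fetched manually per
the readme above), preparation with TFDS looks like:

```python
import tensorflow_datasets as tfds

# Builds ImageNet from the manually downloaded archives and converts it into
# TFDS's on-disk format; subsequent calls reuse the prepared data.
builder = tfds.builder("imagenet2012")
builder.download_and_prepare()
train_ds = builder.as_dataset(split="train")
```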
#### Legacy TFRecords
Download the ImageNet dataset and convert it to TFRecord format.
The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Note that the legacy ResNet runners, e.g.
[resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py),
require TFRecords, whereas `classifier_trainer.py` can use both by setting the
builder to 'records' or 'tfds' in the configurations.
### Running on Cloud TPUs
Note: These models will **not** work with TPUs on Colab.
You can train image classification models on Cloud TPUs using
[tf.distribute.experimental.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/TPUStrategy?version=nightly).
If you are not familiar with Cloud TPUs, it is strongly recommended that you go
through the
[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
create a TPU and GCE VM.
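For reference, a minimal sketch of constructing the strategy once the TPU and
VM exist (the TPU name below is a placeholder):

```python
import tensorflow as tf

# "my-tpu" is a placeholder for the TPU node name created in the quickstart.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="my-tpu")
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
```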
### Running on multiple GPU hosts
You can also train these models on multiple hosts, each with GPUs, using
[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
appropriately at each host. For example, to run using `MultiWorkerMirroredStrategy` on
2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
available GPUs at each host.
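As a minimal sketch, `TF_CONFIG` for a hypothetical 2-host cluster could be set
like this (host names and ports are placeholders):

```python
import json
import os

# Hypothetical two-worker cluster; replace the host:port entries with yours.
cluster = {"worker": ["host0.example.com:12345", "host1.example.com:12345"]}
# On host i, set "index" to i before launching the training script.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": cluster,
    "task": {"type": "worker", "index": 0},
})
```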
## MNIST
To download the data and run the MNIST sample model locally for the first time,
@@ -100,7 +128,7 @@ python3 classifier_trainer.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
  --config_file=configs/examples/resnet/imagenet/tpu.yaml
```
### EfficientNet
@@ -127,7 +155,7 @@ python3 classifier_trainer.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
  --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
```
Note that the number of GPU devices can be overridden in the command line using