ModelZoo / ResNet50_tensorflow · Commit e293e338

Authored Dec 03, 2021 by Yeqing Li; committed by A. Unique TensorFlower, Dec 03, 2021.

    Internal change

    PiperOrigin-RevId: 413981781

Parent: aa6aed37
Changes: 59 · Showing 20 changed files with 5582 additions and 0 deletions (+5582, -0)
official/legacy/detection/evaluation/coco_evaluator.py        +617   -0
official/legacy/detection/evaluation/coco_utils.py            +374   -0
official/legacy/detection/evaluation/factory.py               +52    -0
official/legacy/detection/executor/__init__.py                +14    -0
official/legacy/detection/executor/detection_executor.py      +159   -0
official/legacy/detection/executor/distributed_executor.py    +805   -0
official/legacy/detection/main.py                             +264   -0
official/legacy/detection/modeling/__init__.py                +14    -0
official/legacy/detection/modeling/architecture/__init__.py   +14    -0
official/legacy/detection/modeling/architecture/factory.py    +217   -0
official/legacy/detection/modeling/architecture/fpn.py        +151   -0
official/legacy/detection/modeling/architecture/heads.py      +1279  -0
official/legacy/detection/modeling/architecture/identity.py   +28    -0
official/legacy/detection/modeling/architecture/nn_blocks.py  +316   -0
official/legacy/detection/modeling/architecture/nn_ops.py     +109   -0
official/legacy/detection/modeling/architecture/resnet.py     +352   -0
official/legacy/detection/modeling/architecture/spinenet.py   +503   -0
official/legacy/detection/modeling/base_model.py              +135   -0
official/legacy/detection/modeling/checkpoint_utils.py        +142   -0
official/legacy/detection/modeling/factory.py                 +37    -0
official/legacy/detection/evaluation/coco_evaluator.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The COCO-style evaluator.
The following snippet demonstrates the use of interfaces:
evaluator = COCOEvaluator(...)
for _ in range(num_evals):
for _ in range(num_batches_per_eval):
predictions, groundtruth = predictor.predict(...) # pop a batch.
evaluator.update(predictions, groundtruths) # aggregate internal stats.
evaluator.evaluate() # finish one full eval.
See also: https://github.com/cocodataset/cocoapi/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import atexit
import copy
import tempfile

from absl import logging
import numpy as np
from pycocotools import cocoeval
import six
import tensorflow as tf

from official.legacy.detection.evaluation import coco_utils
from official.legacy.detection.utils import class_utils
class MetricWrapper(object):
  """Metric wrapper of the COCO evaluator."""
  # This is only a wrapper for the COCO metric and works on numpy arrays, so it
  # doesn't inherit from tf.keras.layers.Layer or tf.keras.metrics.Metric.

  def __init__(self, evaluator):
    self._evaluator = evaluator

  def update_state(self, y_true, y_pred):
    """Updates internal states."""
    labels = tf.nest.map_structure(lambda x: x.numpy(), y_true)
    outputs = tf.nest.map_structure(lambda x: x.numpy(), y_pred)
    groundtruths = {}
    predictions = {}
    for key, val in outputs.items():
      if isinstance(val, tuple):
        val = np.concatenate(val)
      predictions[key] = val
    for key, val in labels.items():
      if isinstance(val, tuple):
        val = np.concatenate(val)
      groundtruths[key] = val
    self._evaluator.update(predictions, groundtruths)

  def result(self):
    return self._evaluator.evaluate()

  def reset_states(self):
    return self._evaluator.reset()
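
A minimal sketch of the wrapper contract: any object exposing update(), evaluate(), and reset() can be wrapped, so a stub evaluator (illustrative, not part of the committed file) is enough to exercise the interface:

  import tensorflow as tf

  class _StubEvaluator(object):
    """Illustrative stand-in for COCOEvaluator."""

    def update(self, predictions, groundtruths):
      pass  # A real evaluator aggregates the batch here.

    def evaluate(self):
      return {'AP': 0.0}

    def reset(self):
      pass

  metric = MetricWrapper(_StubEvaluator())
  metric.update_state(
      y_true={'classes': tf.constant([[1, 2]])},
      y_pred={'detection_scores': tf.constant([[0.9, 0.8]])})
  print(metric.result())  # {'AP': 0.0}
  metric.reset_states()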
class COCOEvaluator(object):
  """COCO evaluation metric class."""

  def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      annotation_file: a JSON file that stores annotations of the eval dataset.
        If `annotation_file` is None, groundtruth annotations will be loaded
        from the dataloader.
      include_mask: a boolean to indicate whether or not to include the mask
        eval.
      need_rescale_bboxes: if True, bboxes in `predictions` will be rescaled
        back to absolute values (`image_info` is needed in this case).
    """
    if annotation_file:
      if annotation_file.startswith('gs://'):
        _, local_val_json = tempfile.mkstemp(suffix='.json')
        tf.io.gfile.remove(local_val_json)

        tf.io.gfile.copy(annotation_file, local_val_json)
        atexit.register(tf.io.gfile.remove, local_val_json)
      else:
        local_val_json = annotation_file
      self._coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if include_mask else 'box'),
          annotation_file=local_val_json)
    self._annotation_file = annotation_file
    self._include_mask = include_mask
    self._metric_names = [
        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
        'ARmax100', 'ARs', 'ARm', 'ARl'
    ]
    self._required_prediction_fields = [
        'source_id', 'num_detections', 'detection_classes', 'detection_scores',
        'detection_boxes'
    ]
    self._need_rescale_bboxes = need_rescale_bboxes
    if self._need_rescale_bboxes:
      self._required_prediction_fields.append('image_info')
    self._required_groundtruth_fields = [
        'source_id', 'height', 'width', 'classes', 'boxes'
    ]
    if self._include_mask:
      mask_metric_names = ['mask_' + x for x in self._metric_names]
      self._metric_names.extend(mask_metric_names)
      self._required_prediction_fields.extend(['detection_masks'])
      self._required_groundtruth_fields.extend(['masks'])

    self.reset()
  def reset(self):
    """Resets internal states for a fresh run."""
    self._predictions = {}
    if not self._annotation_file:
      self._groundtruths = {}
  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      logging.info('There is no annotation_file in COCOEvaluator.')
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      logging.info('Using annotation file: %s', self._annotation_file)
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]

    coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      mask_coco_metrics = mcoco_eval.stats

    if self._include_mask:
      metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
      metrics = coco_metrics

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
  def _process_predictions(self, predictions):
    image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
    predictions['detection_boxes'] = (
        predictions['detection_boxes'].astype(np.float32))
    predictions['detection_boxes'] /= image_scale
    if 'detection_outer_boxes' in predictions:
      predictions['detection_outer_boxes'] = (
          predictions['detection_outer_boxes'].astype(np.float32))
      predictions['detection_outer_boxes'] /= image_scale
  def update(self, predictions, groundtruths=None):
    """Updates and aggregates detection results and groundtruth data.

    Args:
      predictions: a dictionary of numpy arrays including the fields below. See
        different parsers under `../dataloader` for more details.
        Required fields:
          - source_id: a numpy array of int or string of shape [batch_size].
          - image_info [if `need_rescale_bboxes` is True]: a numpy array of
            float of shape [batch_size, 4, 2].
          - num_detections: a numpy array of int of shape [batch_size].
          - detection_boxes: a numpy array of float of shape
            [batch_size, K, 4].
          - detection_classes: a numpy array of int of shape [batch_size, K].
          - detection_scores: a numpy array of float of shape [batch_size, K].
        Optional fields:
          - detection_masks: a numpy array of float of shape [batch_size, K,
            mask_height, mask_width].
      groundtruths: a dictionary of numpy arrays including the fields below.
        See also different parsers under `../dataloader` for more details.
        Required fields:
          - source_id: a numpy array of int or string of shape [batch_size].
          - height: a numpy array of int of shape [batch_size].
          - width: a numpy array of int of shape [batch_size].
          - num_detections: a numpy array of int of shape [batch_size].
          - boxes: a numpy array of float of shape [batch_size, K, 4].
          - classes: a numpy array of int of shape [batch_size, K].
        Optional fields:
          - is_crowds: a numpy array of int of shape [batch_size, K]. If the
            field is absent, it is assumed that this instance is not crowd.
          - areas: a numpy array of float of shape [batch_size, K]. If the
            field is absent, the area is calculated using either boxes or
            masks depending on which one is available.
          - masks: a numpy array of float of shape [batch_size, K, mask_height,
            mask_width].

    Raises:
      ValueError: if the required prediction or groundtruth fields are not
        present in the incoming `predictions` or `groundtruths`.
    """
    for k in self._required_prediction_fields:
      if k not in predictions:
        raise ValueError(
            'Missing the required key `{}` in predictions!'.format(k))
    if self._need_rescale_bboxes:
      self._process_predictions(predictions)
    for k, v in six.iteritems(predictions):
      if k not in self._predictions:
        self._predictions[k] = [v]
      else:
        self._predictions[k].append(v)

    if not self._annotation_file:
      assert groundtruths
      for k in self._required_groundtruth_fields:
        if k not in groundtruths:
          raise ValueError(
              'Missing the required key `{}` in groundtruths!'.format(k))
      for k, v in six.iteritems(groundtruths):
        if k not in self._groundtruths:
          self._groundtruths[k] = [v]
        else:
          self._groundtruths[k].append(v)
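
A hedged, self-contained sketch of one eval round with groundtruth coming from the dataloader (`annotation_file=None`). The ids, sizes, and boxes below are made-up values chosen so the single detection matches its groundtruth exactly; boxes are yxyx in absolute pixel coordinates:

  import numpy as np

  evaluator = COCOEvaluator(
      annotation_file=None, include_mask=False, need_rescale_bboxes=False)

  # One batch of size 1 with a single detection.
  predictions = {
      'source_id': np.array([1]),
      'num_detections': np.array([1]),
      'detection_boxes': np.array([[[10., 10., 50., 50.]]], dtype=np.float32),
      'detection_classes': np.array([[1]]),
      'detection_scores': np.array([[0.9]], dtype=np.float32),
  }
  groundtruths = {
      'source_id': np.array([1]),
      'height': np.array([100]),
      'width': np.array([100]),
      'num_detections': np.array([1]),
      'boxes': np.array([[[10., 10., 50., 50.]]], dtype=np.float32),
      'classes': np.array([[1]]),
  }
  evaluator.update(predictions, groundtruths)
  metrics = evaluator.evaluate()  # dict with the 12 box metrics, e.g. metrics['AP']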
class OlnXclassEvaluator(COCOEvaluator):
  """COCO evaluation metric class."""

  def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True,
               use_category=True, seen_class='all'):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      annotation_file: a JSON file that stores annotations of the eval dataset.
        If `annotation_file` is None, groundtruth annotations will be loaded
        from the dataloader.
      include_mask: a boolean to indicate whether or not to include the mask
        eval.
      need_rescale_bboxes: if True, bboxes in `predictions` will be rescaled
        back to absolute values (`image_info` is needed in this case).
      use_category: if `False`, treat all objects in all classes as one
        foreground category.
      seen_class: 'all' or 'voc' or 'nonvoc'.
    """
    super(OlnXclassEvaluator, self).__init__(
        annotation_file=annotation_file,
        include_mask=include_mask,
        need_rescale_bboxes=need_rescale_bboxes)
    self._use_category = use_category
    self._seen_class = seen_class
    self._seen_class_ids = class_utils.coco_split_class_ids(seen_class)
    self._metric_names = [
        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax10', 'ARmax20',
        'ARmax50', 'ARmax100', 'ARmax200', 'ARmax10s', 'ARmax10m', 'ARmax10l'
    ]
    if self._seen_class != 'all':
      self._metric_names.extend([
          'AP_seen', 'AP50_seen', 'AP75_seen', 'APs_seen', 'APm_seen',
          'APl_seen', 'ARmax10_seen', 'ARmax20_seen', 'ARmax50_seen',
          'ARmax100_seen', 'ARmax200_seen', 'ARmax10s_seen', 'ARmax10m_seen',
          'ARmax10l_seen', 'AP_novel', 'AP50_novel', 'AP75_novel',
          'APs_novel', 'APm_novel', 'APl_novel', 'ARmax10_novel',
          'ARmax20_novel', 'ARmax50_novel', 'ARmax100_novel', 'ARmax200_novel',
          'ARmax10s_novel', 'ARmax10m_novel', 'ARmax10l_novel',
      ])
    if self._include_mask:
      mask_metric_names = ['mask_' + x for x in self._metric_names]
      self._metric_names.extend(mask_metric_names)
      self._required_prediction_fields.extend(['detection_masks'])
      self._required_groundtruth_fields.extend(['masks'])

    self.reset()
  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      logging.info('There is no annotation_file in COCOEvaluator.')
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      logging.info('Using annotation file: %s', self._annotation_file)
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]
    # Class manipulation: 'all' split samples -> ignored_split = 0.
    for idx, ann in enumerate(coco_gt.dataset['annotations']):
      coco_gt.dataset['annotations'][idx]['ignored_split'] = 0

    coco_eval = cocoeval.OlnCOCOevalXclassWrapper(
        coco_gt, coco_dt, iou_type='bbox')
    coco_eval.params.maxDets = [10, 20, 50, 100, 200]
    coco_eval.params.imgIds = image_ids
    coco_eval.params.useCats = 0 if not self._use_category else 1
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.OlnCOCOevalXclassWrapper(
          coco_gt, coco_dt, iou_type='segm')
      mcoco_eval.params.maxDets = [10, 20, 50, 100, 200]
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.params.useCats = 0 if not self._use_category else 1
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      mask_coco_metrics = mcoco_eval.stats

    if self._include_mask:
      metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
      metrics = coco_metrics

    if self._seen_class != 'all':
      # For the seen-class eval, samples of novel classes are ignored.
      coco_gt_seen = copy.deepcopy(coco_gt)
      for idx, ann in enumerate(coco_gt.dataset['annotations']):
        if ann['category_id'] in self._seen_class_ids:
          coco_gt_seen.dataset['annotations'][idx]['ignored_split'] = 0
        else:
          coco_gt_seen.dataset['annotations'][idx]['ignored_split'] = 1
      coco_eval_seen = cocoeval.OlnCOCOevalXclassWrapper(
          coco_gt_seen, coco_dt, iou_type='bbox')
      coco_eval_seen.params.maxDets = [10, 20, 50, 100, 200]
      coco_eval_seen.params.imgIds = image_ids
      coco_eval_seen.params.useCats = 0 if not self._use_category else 1
      coco_eval_seen.evaluate()
      coco_eval_seen.accumulate()
      coco_eval_seen.summarize()
      coco_metrics_seen = coco_eval_seen.stats
      if self._include_mask:
        mcoco_eval_seen = cocoeval.OlnCOCOevalXclassWrapper(
            coco_gt_seen, coco_dt, iou_type='segm')
        mcoco_eval_seen.params.maxDets = [10, 20, 50, 100, 200]
        mcoco_eval_seen.params.imgIds = image_ids
        mcoco_eval_seen.params.useCats = 0 if not self._use_category else 1
        mcoco_eval_seen.evaluate()
        mcoco_eval_seen.accumulate()
        mcoco_eval_seen.summarize()
        mask_coco_metrics_seen = mcoco_eval_seen.stats

      # For the novel-class eval, samples of seen classes are ignored.
      coco_gt_novel = copy.deepcopy(coco_gt)
      for idx, ann in enumerate(coco_gt.dataset['annotations']):
        if ann['category_id'] in self._seen_class_ids:
          coco_gt_novel.dataset['annotations'][idx]['ignored_split'] = 1
        else:
          coco_gt_novel.dataset['annotations'][idx]['ignored_split'] = 0
      coco_eval_novel = cocoeval.OlnCOCOevalXclassWrapper(
          coco_gt_novel, coco_dt, iou_type='bbox')
      coco_eval_novel.params.maxDets = [10, 20, 50, 100, 200]
      coco_eval_novel.params.imgIds = image_ids
      coco_eval_novel.params.useCats = 0 if not self._use_category else 1
      coco_eval_novel.evaluate()
      coco_eval_novel.accumulate()
      coco_eval_novel.summarize()
      coco_metrics_novel = coco_eval_novel.stats
      if self._include_mask:
        mcoco_eval_novel = cocoeval.OlnCOCOevalXclassWrapper(
            coco_gt_novel, coco_dt, iou_type='segm')
        mcoco_eval_novel.params.maxDets = [10, 20, 50, 100, 200]
        mcoco_eval_novel.params.imgIds = image_ids
        mcoco_eval_novel.params.useCats = 0 if not self._use_category else 1
        mcoco_eval_novel.evaluate()
        mcoco_eval_novel.accumulate()
        mcoco_eval_novel.summarize()
        mask_coco_metrics_novel = mcoco_eval_novel.stats

      # Combine all splits.
      if self._include_mask:
        metrics = np.hstack(
            (coco_metrics, coco_metrics_seen, coco_metrics_novel,
             mask_coco_metrics, mask_coco_metrics_seen,
             mask_coco_metrics_novel))
      else:
        metrics = np.hstack(
            (coco_metrics, coco_metrics_seen, coco_metrics_novel))

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
class OlnXdataEvaluator(OlnXclassEvaluator):
  """COCO evaluation metric class."""

  def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True,
               use_category=True, seen_class='all'):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      annotation_file: a JSON file that stores annotations of the eval dataset.
        If `annotation_file` is None, groundtruth annotations will be loaded
        from the dataloader.
      include_mask: a boolean to indicate whether or not to include the mask
        eval.
      need_rescale_bboxes: if True, bboxes in `predictions` will be rescaled
        back to absolute values (`image_info` is needed in this case).
      use_category: if `False`, treat all objects in all classes as one
        foreground category.
      seen_class: 'all' or 'voc' or 'nonvoc'.
    """
    super(OlnXdataEvaluator, self).__init__(
        annotation_file=annotation_file,
        include_mask=include_mask,
        need_rescale_bboxes=need_rescale_bboxes,
        use_category=False,
        seen_class='all')
  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      logging.info('There is no annotation_file in COCOEvaluator.')
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      logging.info('Using annotation file: %s', self._annotation_file)
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]
    # Class manipulation: 'all' split samples -> ignored_split = 0.
    for idx, _ in enumerate(coco_gt.dataset['annotations']):
      coco_gt.dataset['annotations'][idx]['ignored_split'] = 0

    coco_eval = cocoeval.OlnCOCOevalWrapper(coco_gt, coco_dt, iou_type='bbox')
    coco_eval.params.maxDets = [10, 20, 50, 100, 200]
    coco_eval.params.imgIds = image_ids
    coco_eval.params.useCats = 0 if not self._use_category else 1
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.OlnCOCOevalWrapper(coco_gt, coco_dt,
                                               iou_type='segm')
      mcoco_eval.params.maxDets = [10, 20, 50, 100, 200]
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.params.useCats = 0 if not self._use_category else 1
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      mask_coco_metrics = mcoco_eval.stats

    if self._include_mask:
      metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
      metrics = coco_metrics

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
class ShapeMaskCOCOEvaluator(COCOEvaluator):
  """COCO evaluation metric class for ShapeMask."""

  def __init__(self, mask_eval_class, **kwargs):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      mask_eval_class: the set of classes for mask evaluation.
      **kwargs: other keyword arguments passed to the parent class initializer.
    """
    super(ShapeMaskCOCOEvaluator, self).__init__(**kwargs)
    self._mask_eval_class = mask_eval_class
    self._eval_categories = class_utils.coco_split_class_ids(mask_eval_class)
    if mask_eval_class != 'all':
      self._metric_names = [
          x.replace('mask', 'novel_mask') for x in self._metric_names
      ]

  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]

    coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      if self._mask_eval_class == 'all':
        metrics = np.hstack((coco_metrics, mcoco_eval.stats))
      else:
        mask_coco_metrics = mcoco_eval.category_stats
        val_catg_idx = np.isin(mcoco_eval.params.catIds, self._eval_categories)
        # Gather the valid evaluation of the eval categories.
        if np.any(val_catg_idx):
          mean_val_metrics = []
          for mid in range(len(self._metric_names) // 2):
            mean_val_metrics.append(
                np.nanmean(mask_coco_metrics[mid][val_catg_idx]))
          mean_val_metrics = np.array(mean_val_metrics)
        else:
          mean_val_metrics = np.zeros(len(self._metric_names) // 2)
        metrics = np.hstack((coco_metrics, mean_val_metrics))
    else:
      metrics = coco_metrics

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
official/legacy/detection/evaluation/coco_utils.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Util functions related to pycocotools and COCO eval."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
copy
import
json
from
absl
import
logging
import
numpy
as
np
from
PIL
import
Image
from
pycocotools
import
coco
from
pycocotools
import
mask
as
mask_api
import
six
import
tensorflow
as
tf
from
official.legacy.detection.dataloader
import
tf_example_decoder
from
official.legacy.detection.utils
import
box_utils
from
official.legacy.detection.utils
import
mask_utils
class COCOWrapper(coco.COCO):
  """COCO wrapper class.

  This class wraps the COCO API object and provides the following additional
  functionalities:
    1. Support string-type image ids.
    2. Support loading the groundtruth dataset using an external annotation
       dictionary.
    3. Support loading the prediction results using an external annotation
       dictionary.
  """

  def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
    """Instantiates a COCO-style API object.

    Args:
      eval_type: either 'box' or 'mask'.
      annotation_file: a JSON file that stores annotations of the eval dataset.
        This is required if `gt_dataset` is not provided.
      gt_dataset: the groundtruth eval dataset in COCO API format.
    """
    if ((annotation_file and gt_dataset) or
        ((not annotation_file) and (not gt_dataset))):
      raise ValueError('One and only one of `annotation_file` and '
                       '`gt_dataset` needs to be specified.')

    if eval_type not in ['box', 'mask']:
      raise ValueError('The `eval_type` can only be either `box` or `mask`.')

    coco.COCO.__init__(self, annotation_file=annotation_file)
    self._eval_type = eval_type
    if gt_dataset:
      self.dataset = gt_dataset
      self.createIndex()
  def loadRes(self, predictions):
    """Loads result file and returns a result api object.

    Args:
      predictions: a list of dictionaries, each representing an annotation in
        COCO format. The required fields are `image_id`, `category_id`,
        `score`, `bbox`, `segmentation`.

    Returns:
      res: result COCO api object.

    Raises:
      ValueError: if the set of image ids from predictions is not a subset of
        the set of image ids of the groundtruth dataset.
    """
    res = coco.COCO()
    res.dataset['images'] = copy.deepcopy(self.dataset['images'])
    res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])

    image_ids = [ann['image_id'] for ann in predictions]
    if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
      raise ValueError('Results do not correspond to the current dataset!')
    for ann in predictions:
      x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
                        ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
      if self._eval_type == 'box':
        ann['area'] = ann['bbox'][2] * ann['bbox'][3]
        ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
      elif self._eval_type == 'mask':
        ann['area'] = mask_api.area(ann['segmentation'])

    res.dataset['annotations'] = copy.deepcopy(predictions)
    res.createIndex()
    return res
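
A quick worked example of the box-to-polygon conversion above, with illustrative numbers. A COCO bbox [x, y, w, h] is expanded into the four corners of the rectangle so box-only predictions still carry a valid segmentation:

  bbox = [10., 20., 30., 40.]  # COCO format: [x, y, width, height]
  x1, x2, y1, y2 = bbox[0], bbox[0] + bbox[2], bbox[1], bbox[1] + bbox[3]
  segmentation = [[x1, y1, x1, y2, x2, y2, x2, y1]]
  area = bbox[2] * bbox[3]
  print(segmentation)  # [[10.0, 20.0, 10.0, 60.0, 40.0, 60.0, 40.0, 20.0]]
  print(area)          # 1200.0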
def convert_predictions_to_coco_annotations(predictions):
  """Converts a batch of predictions to annotations in COCO format.

  Args:
    predictions: a dictionary of lists of numpy arrays including the following
      fields. K below denotes the maximum number of instances per image.
      Required fields:
        - source_id: a list of numpy arrays of int or string of shape
          [batch_size].
        - num_detections: a list of numpy arrays of int of shape [batch_size].
        - detection_boxes: a list of numpy arrays of float of shape
          [batch_size, K, 4], where coordinates are in the original image
          space (not the scaled image space).
        - detection_classes: a list of numpy arrays of int of shape
          [batch_size, K].
        - detection_scores: a list of numpy arrays of float of shape
          [batch_size, K].
      Optional fields:
        - detection_masks: a list of numpy arrays of float of shape
          [batch_size, K, mask_height, mask_width].

  Returns:
    coco_predictions: prediction in COCO annotation format.
  """
  coco_predictions = []
  num_batches = len(predictions['source_id'])
  batch_size = predictions['source_id'][0].shape[0]
  max_num_detections = predictions['detection_classes'][0].shape[1]
  use_outer_box = 'detection_outer_boxes' in predictions
  for i in range(num_batches):
    predictions['detection_boxes'][i] = box_utils.yxyx_to_xywh(
        predictions['detection_boxes'][i])
    if use_outer_box:
      predictions['detection_outer_boxes'][i] = box_utils.yxyx_to_xywh(
          predictions['detection_outer_boxes'][i])
      mask_boxes = predictions['detection_outer_boxes']
    else:
      mask_boxes = predictions['detection_boxes']

    for j in range(batch_size):
      if 'detection_masks' in predictions:
        image_masks = mask_utils.paste_instance_masks(
            predictions['detection_masks'][i][j],
            mask_boxes[i][j],
            int(predictions['image_info'][i][j, 0, 0]),
            int(predictions['image_info'][i][j, 0, 1]))
        binary_masks = (image_masks > 0.0).astype(np.uint8)
        encoded_masks = [
            mask_api.encode(np.asfortranarray(binary_mask))
            for binary_mask in list(binary_masks)
        ]
      for k in range(max_num_detections):
        ann = {}
        ann['image_id'] = predictions['source_id'][i][j]
        ann['category_id'] = predictions['detection_classes'][i][j, k]
        ann['bbox'] = predictions['detection_boxes'][i][j, k]
        ann['score'] = predictions['detection_scores'][i][j, k]
        if 'detection_masks' in predictions:
          ann['segmentation'] = encoded_masks[k]
        coco_predictions.append(ann)

  for i, ann in enumerate(coco_predictions):
    ann['id'] = i + 1

  return coco_predictions
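
A hedged sketch of the expected input layout: two batches of size 1, one detection each. All values below are made up; note the boxes are converted to COCO xywh in place:

  import numpy as np

  preds = {
      'source_id': [np.array([1]), np.array([2])],
      'num_detections': [np.array([1]), np.array([1])],
      'detection_boxes': [
          np.array([[[5., 5., 25., 45.]]], dtype=np.float32),    # batch 1, yxyx
          np.array([[[10., 10., 30., 30.]]], dtype=np.float32),  # batch 2, yxyx
      ],
      'detection_classes': [np.array([[1]]), np.array([[2]])],
      'detection_scores': [np.array([[0.9]]), np.array([[0.7]])],
  }
  anns = convert_predictions_to_coco_annotations(preds)
  print(anns[0]['bbox'])          # [ 5.  5. 40. 20.] -- now COCO xywh
  print([a['id'] for a in anns])  # [1, 2]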
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
  """Converts groundtruths to the dataset in COCO format.

  Args:
    groundtruths: a dictionary of lists of numpy arrays including the fields
      below. Note that each element in the list represents the values for a
      single example without the batch dimension. K below denotes the actual
      number of instances for each image.
      Required fields:
        - source_id: a list of numpy arrays of int or string of shape
          [batch_size].
        - height: a list of numpy arrays of int of shape [batch_size].
        - width: a list of numpy arrays of int of shape [batch_size].
        - num_detections: a list of numpy arrays of int of shape [batch_size].
        - boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
          where coordinates are in the original image space (not the
          normalized coordinates).
        - classes: a list of numpy arrays of int of shape [batch_size, K].
      Optional fields:
        - is_crowds: a list of numpy arrays of int of shape [batch_size, K].
          If the field is absent, it is assumed that this instance is not
          crowd.
        - areas: a list of numpy arrays of float of shape [batch_size, K]. If
          the field is absent, the area is calculated using either boxes or
          masks depending on which one is available.
        - masks: a list of numpy arrays of string of shape [batch_size, K].
    label_map: (optional) a dictionary that maps the category id to the
      category name. If `None`, collects the category mapping from the
      `groundtruths`.

  Returns:
    coco_groundtruths: the groundtruth dataset in COCO format.
  """
  source_ids = np.concatenate(groundtruths['source_id'], axis=0)
  heights = np.concatenate(groundtruths['height'], axis=0)
  widths = np.concatenate(groundtruths['width'], axis=0)
  gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)}
               for i, h, w in zip(source_ids, heights, widths)]

  gt_annotations = []
  num_batches = len(groundtruths['source_id'])
  batch_size = groundtruths['source_id'][0].shape[0]
  for i in range(num_batches):
    for j in range(batch_size):
      num_instances = groundtruths['num_detections'][i][j]
      for k in range(num_instances):
        ann = {}
        ann['image_id'] = int(groundtruths['source_id'][i][j])
        if 'is_crowds' in groundtruths:
          ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
        else:
          ann['iscrowd'] = 0
        ann['category_id'] = int(groundtruths['classes'][i][j, k])
        boxes = groundtruths['boxes'][i]
        ann['bbox'] = [
            float(boxes[j, k, 1]),
            float(boxes[j, k, 0]),
            float(boxes[j, k, 3] - boxes[j, k, 1]),
            float(boxes[j, k, 2] - boxes[j, k, 0])
        ]
        if 'areas' in groundtruths:
          ann['area'] = float(groundtruths['areas'][i][j, k])
        else:
          ann['area'] = float((boxes[j, k, 3] - boxes[j, k, 1]) *
                              (boxes[j, k, 2] - boxes[j, k, 0]))
        if 'masks' in groundtruths:
          mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
          width, height = mask.size
          np_mask = (
              np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
          np_mask[np_mask > 0] = 255
          encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
          ann['segmentation'] = encoded_mask
          if 'areas' not in groundtruths:
            ann['area'] = mask_api.area(encoded_mask)
        gt_annotations.append(ann)

  for i, ann in enumerate(gt_annotations):
    ann['id'] = i + 1

  if label_map:
    gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
  else:
    category_ids = [gt['category_id'] for gt in gt_annotations]
    gt_categories = [{'id': i} for i in set(category_ids)]

  gt_dataset = {
      'images': gt_images,
      'categories': gt_categories,
      'annotations': copy.deepcopy(gt_annotations),
  }
  return gt_dataset
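
A hedged sketch with a single image and one groundtruth box (made-up values); it shows the output structure and the yxyx-to-xywh bbox conversion:

  import numpy as np

  gts = {
      'source_id': [np.array([1])],
      'height': [np.array([100])],
      'width': [np.array([100])],
      'num_detections': [np.array([1])],
      'boxes': [np.array([[[10., 10., 50., 50.]]], np.float32)],  # yxyx, absolute
      'classes': [np.array([[1]])],
  }
  dataset = convert_groundtruths_to_coco_dataset(gts)
  print(sorted(dataset.keys()))             # ['annotations', 'categories', 'images']
  print(dataset['annotations'][0]['bbox'])  # [10.0, 10.0, 40.0, 40.0] (xywh)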
class COCOGroundtruthGenerator(object):
  """Generates the groundtruth annotations from a single example."""

  def __init__(self, file_pattern, num_examples, include_mask):
    self._file_pattern = file_pattern
    self._num_examples = num_examples
    self._include_mask = include_mask
    self._dataset_fn = tf.data.TFRecordDataset

  def _parse_single_example(self, example):
    """Parses a single serialized tf.Example proto.

    Args:
      example: a serialized tf.Example proto string.

    Returns:
      A dictionary of groundtruth with the following fields:
        source_id: a scalar tensor of int64 representing the image source_id.
        height: a scalar tensor of int64 representing the image height.
        width: a scalar tensor of int64 representing the image width.
        boxes: a float tensor of shape [K, 4], representing the groundtruth
          boxes in absolute coordinates with respect to the original image
          size.
        classes: an int64 tensor of shape [K], representing the class labels
          of each instance.
        is_crowds: a bool tensor of shape [K], indicating whether the instance
          is crowd.
        areas: a float tensor of shape [K], indicating the area of each
          instance.
        masks: a string tensor of shape [K], containing the bytes of the png
          mask of each instance.
    """
    decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=self._include_mask)
    decoded_tensors = decoder.decode(example)

    image = decoded_tensors['image']
    image_size = tf.shape(image)[0:2]
    boxes = box_utils.denormalize_boxes(
        decoded_tensors['groundtruth_boxes'], image_size)
    groundtruths = {
        'source_id': tf.string_to_number(
            decoded_tensors['source_id'], out_type=tf.int64),
        'height': decoded_tensors['height'],
        'width': decoded_tensors['width'],
        'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': decoded_tensors['groundtruth_classes'],
        'is_crowds': decoded_tensors['groundtruth_is_crowd'],
        'areas': decoded_tensors['groundtruth_area'],
    }
    if self._include_mask:
      groundtruths.update({
          'masks': decoded_tensors['groundtruth_instance_masks_png'],
      })
    return groundtruths
  def _build_pipeline(self):
    """Builds a data pipeline to generate groundtruth annotations."""
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            lambda filename: self._dataset_fn(filename).prefetch(1),
            cycle_length=32,
            sloppy=False))
    dataset = dataset.map(self._parse_single_example, num_parallel_calls=64)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(1, drop_remainder=False)
    return dataset

  def __call__(self):
    with tf.Graph().as_default():
      dataset = self._build_pipeline()
      groundtruth = dataset.make_one_shot_iterator().get_next()

      with tf.Session() as sess:
        for _ in range(self._num_examples):
          groundtruth_result = sess.run(groundtruth)
          yield groundtruth_result
def scan_and_generator_annotation_file(file_pattern, num_samples, include_mask,
                                       annotation_file):
  """Scans the dataset and generates the COCO-style annotation JSON file."""
  groundtruth_generator = COCOGroundtruthGenerator(
      file_pattern, num_samples, include_mask)
  generate_annotation_file(groundtruth_generator, annotation_file)


def generate_annotation_file(groundtruth_generator, annotation_file):
  """Generates a COCO-style annotation JSON file given a groundtruth generator."""
  groundtruths = {}
  logging.info('Loading groundtruth annotations from dataset to memory...')
  for groundtruth in groundtruth_generator():
    for k, v in six.iteritems(groundtruth):
      if k not in groundtruths:
        groundtruths[k] = [v]
      else:
        groundtruths[k].append(v)
  gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)

  logging.info('Saving groundtruth annotations to the JSON file...')
  with tf.io.gfile.GFile(annotation_file, 'w') as f:
    f.write(json.dumps(gt_dataset))
  logging.info('Done saving the JSON file...')
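
A hedged sketch: any zero-argument callable that yields per-image groundtruth dicts can drive generate_annotation_file. The toy generator and the output path below are illustrative, not part of the committed file:

  import numpy as np

  def toy_generator():
    for sid in (1, 2):
      yield {
          'source_id': np.array([sid]),
          'height': np.array([100]),
          'width': np.array([100]),
          'num_detections': np.array([1]),
          'boxes': np.array([[[10., 10., 50., 50.]]], np.float32),
          'classes': np.array([[1]]),
      }

  generate_annotation_file(toy_generator, '/tmp/toy_annotations.json')  # path is an example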
official/legacy/detection/evaluation/factory.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluator factory."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
official.legacy.detection.evaluation
import
coco_evaluator
def evaluator_generator(params):
  """Generator function for various evaluators."""
  if params.type == 'box':
    evaluator = coco_evaluator.COCOEvaluator(
        annotation_file=params.val_json_file, include_mask=False)
  elif params.type == 'box_and_mask':
    evaluator = coco_evaluator.COCOEvaluator(
        annotation_file=params.val_json_file, include_mask=True)
  elif params.type == 'oln_xclass_box':
    evaluator = coco_evaluator.OlnXclassEvaluator(
        annotation_file=params.val_json_file,
        include_mask=False,
        use_category=False,
        seen_class=params.seen_class)
  elif params.type == 'oln_xclass_box_and_mask':
    evaluator = coco_evaluator.OlnXclassEvaluator(
        annotation_file=params.val_json_file,
        include_mask=True,
        use_category=False,
        seen_class=params.seen_class)
  elif params.type == 'oln_xdata_box':
    evaluator = coco_evaluator.OlnXdataEvaluator(
        annotation_file=params.val_json_file,
        include_mask=False,
        use_category=False,
        seen_class='all')
  elif params.type == 'shapemask_box_and_mask':
    evaluator = coco_evaluator.ShapeMaskCOCOEvaluator(
        mask_eval_class=params.mask_eval_class,
        annotation_file=params.val_json_file,
        include_mask=True)
  else:
    raise ValueError('Evaluator %s is not supported.' % params.type)

  return coco_evaluator.MetricWrapper(evaluator)
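
A hedged usage sketch: `params` only needs attribute access to `.type` and the evaluator-specific fields, so a SimpleNamespace stands in for the real config object here. The annotation path is a placeholder and must point at a real COCO JSON file for construction to succeed:

  from types import SimpleNamespace

  params = SimpleNamespace(
      type='box', val_json_file='/path/to/instances_val2017.json')
  metric = evaluator_generator(params)  # a MetricWrapper around a COCOEvaluator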
official/legacy/detection/executor/__init__.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/legacy/detection/executor/detection_executor.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An executor class for running model on TensorFlow 2.0."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
absl
import
logging
import
tensorflow
as
tf
from
official.legacy.detection.executor
import
distributed_executor
as
executor
from
official.vision.utils.object_detection
import
visualization_utils
class DetectionDistributedExecutor(executor.DistributedExecutor):
  """Detection-specific custom training loop executor.

  Subclasses the DistributedExecutor and adds support for numpy based metrics.
  """

  def __init__(self,
               predict_post_process_fn=None,
               trainable_variables_filter=None,
               **kwargs):
    super(DetectionDistributedExecutor, self).__init__(**kwargs)
    if predict_post_process_fn:
      assert callable(predict_post_process_fn)
    if trainable_variables_filter:
      assert callable(trainable_variables_filter)
    self._predict_post_process_fn = predict_post_process_fn
    self._trainable_variables_filter = trainable_variables_filter
    self.eval_steps = tf.Variable(
        0,
        trainable=False,
        dtype=tf.int32,
        synchronization=tf.VariableSynchronization.ON_READ,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        shape=[])
  def _create_replicated_step(self,
                              strategy,
                              model,
                              loss_fn,
                              optimizer,
                              metric=None):
    trainable_variables = model.trainable_variables
    if self._trainable_variables_filter:
      trainable_variables = self._trainable_variables_filter(
          trainable_variables)
    logging.info('Filter trainable variables from %d to %d',
                 len(model.trainable_variables), len(trainable_variables))
    update_state_fn = lambda labels, outputs: None
    if isinstance(metric, tf.keras.metrics.Metric):
      update_state_fn = metric.update_state
    else:
      logging.error('Detection: train metric is not an instance of '
                    'tf.keras.metrics.Metric.')

    def _replicated_step(inputs):
      """Replicated training step."""
      inputs, labels = inputs

      with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        all_losses = loss_fn(labels, outputs)
        losses = {}
        for k, v in all_losses.items():
          losses[k] = tf.reduce_mean(v)
        per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync
        update_state_fn(labels, outputs)

      grads = tape.gradient(per_replica_loss, trainable_variables)
      clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
      optimizer.apply_gradients(zip(clipped_grads, trainable_variables))
      return losses

    return _replicated_step
  def _create_test_step(self, strategy, model, metric):
    """Creates a distributed test step."""

    @tf.function
    def test_step(iterator, eval_steps):
      """Calculates evaluation metrics on distributed devices."""

      def _test_step_fn(inputs, eval_steps):
        """Replicated accuracy calculation."""
        inputs, labels = inputs
        model_outputs = model(inputs, training=False)
        if self._predict_post_process_fn:
          labels, prediction_outputs = self._predict_post_process_fn(
              labels, model_outputs)
          num_remaining_visualizations = (
              self._params.eval.num_images_to_visualize - eval_steps)
          # If there is a remaining number of visualizations that needs to be
          # done, add the next batch outputs for visualization.
          #
          # TODO(hongjunchoi): Once dynamic slicing is supported on TPU, only
          # write the correct slice of outputs to the summary file.
          if num_remaining_visualizations > 0:
            visualization_utils.visualize_images_with_bounding_boxes(
                inputs, prediction_outputs['detection_boxes'],
                self.global_train_step, self.eval_summary_writer)

        return labels, prediction_outputs

      labels, outputs = strategy.run(
          _test_step_fn, args=(
              next(iterator),
              eval_steps,
          ))
      outputs = tf.nest.map_structure(strategy.experimental_local_results,
                                      outputs)
      labels = tf.nest.map_structure(strategy.experimental_local_results,
                                     labels)
      eval_steps.assign_add(self._params.eval.batch_size)
      return labels, outputs

    return test_step
  def _run_evaluation(self, test_step, current_training_step, metric,
                      test_iterator):
    """Runs validation steps and aggregates metrics."""
    self.eval_steps.assign(0)
    if not test_iterator or not metric:
      logging.warning(
          'Both test_iterator (%s) and metrics (%s) must not be None.',
          test_iterator, metric)
      return None
    logging.info('Running evaluation after step: %s.', current_training_step)
    while True:
      try:
        labels, outputs = test_step(test_iterator, self.eval_steps)
        if metric:
          metric.update_state(labels, outputs)
      except (StopIteration, tf.errors.OutOfRangeError):
        break

    metric_result = metric.result()
    if isinstance(metric, tf.keras.metrics.Metric):
      metric_result = tf.nest.map_structure(
          lambda x: x.numpy().astype(float), metric_result)
    logging.info('Step: [%d] Validation metric = %s', current_training_step,
                 metric_result)
    return metric_result
official/legacy/detection/executor/distributed_executor.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Custom training loop for running TensorFlow 2.0 models."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
from
typing
import
Optional
,
Dict
,
List
,
Text
,
Callable
,
Union
,
Iterator
,
Any
from
absl
import
flags
from
absl
import
logging
import
numpy
as
np
import
tensorflow
as
tf
# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
from
official.common
import
distribute_utils
from
official.modeling.hyperparams
import
params_dict
from
official.utils
import
hyperparams_flags
from
official.utils.misc
import
keras_utils
FLAGS
=
flags
.
FLAGS
strategy_flags_dict
=
hyperparams_flags
.
strategy_flags_dict
hparam_flags_dict
=
hyperparams_flags
.
hparam_flags_dict
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
  """Saves the model to model_dir with the provided checkpoint prefix."""
  checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
  saved_path = checkpoint.save(checkpoint_path)
  logging.info('Saving model as TF checkpoint: %s', saved_path)
def _steps_to_run(current_step, total_steps, steps_per_loop):
  """Calculates the number of steps to run on device."""
  if steps_per_loop <= 0:
    raise ValueError('steps_per_loop should be a positive integer.')
  return min(total_steps - current_step, steps_per_loop)
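
A quick illustration of the clipping behavior (values are examples):

  # With 1000 total steps, a 64-step loop at step 990 is clipped to the
  # 10 remaining steps; earlier in training the full loop length is used.
  assert _steps_to_run(current_step=990, total_steps=1000, steps_per_loop=64) == 10
  assert _steps_to_run(current_step=0, total_steps=1000, steps_per_loop=64) == 64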
def _no_metric():
  return None
def metrics_as_dict(metric):
  """Puts the input metric(s) into a dictionary.

  Args:
    metric: metric(s) to be put into the dictionary. `metric` could be an
      object, a list, or a dict of tf.keras.metrics.Metric or has the
      `required_method`.

  Returns:
    A dictionary of valid metrics.
  """
  if isinstance(metric, tf.keras.metrics.Metric):
    metrics = {metric.name: metric}
  elif isinstance(metric, list):
    metrics = {m.name: m for m in metric}
  elif isinstance(metric, dict):
    metrics = metric
  elif not metric:
    return {}
  else:
    metrics = {'metric': metric}
  return metrics
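
A quick illustration of the accepted forms (single metric, list, dict, and None):

  import tensorflow as tf

  acc = tf.keras.metrics.SparseCategoricalAccuracy(name='acc')
  loss = tf.keras.metrics.Mean(name='loss')

  print(metrics_as_dict(acc).keys())          # dict_keys(['acc'])
  print(metrics_as_dict([acc, loss]).keys())  # dict_keys(['acc', 'loss'])
  print(metrics_as_dict({'a': acc}).keys())   # dict_keys(['a'])
  print(metrics_as_dict(None))                # {}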
def metric_results(metric):
  """Collects results from the given metric(s)."""
  metrics = metrics_as_dict(metric)
  metric_result = {
      name: m.result().numpy().astype(float) for name, m in metrics.items()
  }
  return metric_result


def reset_states(metric):
  """Resets states of the given metric(s)."""
  metrics = metrics_as_dict(metric)
  for m in metrics.values():
    m.reset_states()
class SummaryWriter(object):
  """Simple SummaryWriter for writing a dictionary of metrics.

  Attributes:
    writer: The tf.SummaryWriter.
  """

  def __init__(self, model_dir: Text, name: Text):
    """Inits SummaryWriter with paths.

    Args:
      model_dir: the model folder path.
      name: the summary subfolder name.
    """
    self.writer = tf.summary.create_file_writer(os.path.join(model_dir, name))

  def __call__(self, metrics: Union[Dict[Text, float], float], step: int):
    """Writes metrics to summary with the given writer.

    Args:
      metrics: a dictionary of metric values. A dictionary is preferred.
      step: integer. The training step.
    """
    if not isinstance(metrics, dict):
      # Support a scalar metric without a name.
      logging.warning('Warning: the summary writer prefers metrics as a '
                      'dictionary.')
      metrics = {'metric': metrics}

    with self.writer.as_default():
      for k, v in metrics.items():
        tf.summary.scalar(k, v, step=step)
      self.writer.flush()
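
A hedged usage sketch; the directory and metric values are illustrative. Scalars land under `<model_dir>/<name>` and show up in TensorBoard:

  writer = SummaryWriter('/tmp/model_dir', 'eval')  # paths are examples
  writer({'loss': 0.31, 'AP': 0.42}, step=1000)
  writer(0.27, step=2000)  # a bare scalar is stored under the name 'metric'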
class DistributedExecutor(object):
  """Interface to train and eval models with tf.distribute.Strategy."""

  def __init__(self, strategy, params, model_fn, loss_fn, is_multi_host=False):
    """Constructor.

    Args:
      strategy: an instance of tf.distribute.Strategy.
      params: Model configuration needed to run distribution strategy.
      model_fn: Keras model function. Signature:
        (params: ParamsDict) -> tf.keras.models.Model.
      loss_fn: loss function. Signature:
        (y_true: Tensor, y_pred: Tensor) -> Tensor.
      is_multi_host: Set to True when using multiple hosts for training, like
        multi-worker GPU or a TPU pod (slice). Otherwise, False.
    """
    self._params = params
    self._model_fn = model_fn
    self._loss_fn = loss_fn
    self._strategy = strategy
    self._checkpoint_name = 'ctl_step_{step}.ckpt'
    self._is_multi_host = is_multi_host
    self.train_summary_writer = None
    self.eval_summary_writer = None
    self.global_train_step = None

  @property
  def checkpoint_name(self):
    """Returns the default checkpoint name."""
    return self._checkpoint_name

  @checkpoint_name.setter
  def checkpoint_name(self, name):
    """Sets the default checkpoint name."""
    self._checkpoint_name = name

  def loss_fn(self):
    return self._loss_fn()

  def model_fn(self, params):
    return self._model_fn(params)

  def _save_config(self, model_dir):
    """Saves parameters to config files if model_dir is defined."""
    logging.info('Save config to model_dir %s.', model_dir)
    if model_dir:
      if not tf.io.gfile.exists(model_dir):
        tf.io.gfile.makedirs(model_dir)
      self._params.lock()
      params_dict.save_params_dict_to_yaml(self._params,
                                           model_dir + '/params.yaml')
    else:
      logging.warning('model_dir is empty, so skip saving the config.')

  def _get_input_iterator(
      self, input_fn: Callable[..., tf.data.Dataset],
      strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
    """Returns a distributed dataset iterator.

    Args:
      input_fn: (params: dict) -> tf.data.Dataset.
      strategy: an instance of tf.distribute.Strategy.

    Returns:
      An iterator that yields input tensors.
    """
    if input_fn is None:
      return None
    # When training with multiple TPU workers, datasets need to be cloned
    # across workers. Since a Dataset instance cannot be cloned in eager mode,
    # we instead pass a callable that returns a dataset.
    if self._is_multi_host:
      return iter(strategy.distribute_datasets_from_function(input_fn))
    else:
      input_data = input_fn()
      return iter(strategy.experimental_distribute_dataset(input_data))
  def _create_replicated_step(self,
                              strategy,
                              model,
                              loss_fn,
                              optimizer,
                              metric=None):
    """Creates a single training step.

    Args:
      strategy: an instance of tf.distribute.Strategy.
      model: (Tensor, bool) -> Tensor. model function.
      loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
      optimizer: tf.keras.optimizers.Optimizer.
      metric: tf.keras.metrics.Metric subclass.

    Returns:
      The training step callable.
    """
    metrics = metrics_as_dict(metric)

    def _replicated_step(inputs):
      """Replicated training step."""
      inputs, labels = inputs

      with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        prediction_loss = loss_fn(labels, outputs)
        loss = tf.reduce_mean(prediction_loss)
        loss = loss / strategy.num_replicas_in_sync
        for m in metrics.values():
          m.update_state(labels, outputs)

      grads = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      return loss

    return _replicated_step
  def _create_train_step(self,
                         strategy,
                         model,
                         loss_fn,
                         optimizer,
                         metric=None):
    """Creates a distributed training step.

    Args:
      strategy: an instance of tf.distribute.Strategy.
      model: (Tensor, bool) -> Tensor. model function.
      loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
      optimizer: tf.keras.optimizers.Optimizer.
      metric: tf.keras.metrics.Metric subclass.

    Returns:
      The training step callable.
    """
    replicated_step = self._create_replicated_step(strategy, model, loss_fn,
                                                   optimizer, metric)

    @tf.function
    def train_step(iterator, num_steps):
      """Performs a distributed training step.

      Args:
        iterator: an iterator that yields input tensors.
        num_steps: the number of steps in the loop.

      Returns:
        The loss tensor.
      """
      if not isinstance(num_steps, tf.Tensor):
        raise ValueError('`num_steps` should be a Tensor. A Python object '
                         'may cause retracing.')

      per_replica_losses = strategy.run(
          replicated_step, args=(next(iterator),))
      for _ in tf.range(num_steps - 1):
        per_replica_losses = strategy.run(
            replicated_step, args=(next(iterator),))

      # For reporting, we return the mean of the per-replica losses.
      losses = tf.nest.map_structure(
          lambda x: strategy.reduce(tf.distribute.ReduceOp.MEAN, x, axis=None),
          per_replica_losses)
      return losses

    return train_step
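The Tensor check above matters because `train_step` is a `tf.function`: a plain Python `int` becomes part of the trace signature, so every distinct value would trigger a retrace. A minimal sketch, independent of this class, of the pattern:

import tensorflow as tf

@tf.function
def run_n_steps(n):
  total = tf.constant(0)
  for _ in tf.range(n):  # tf.range keeps the loop inside the graph.
    total += 1
  return total

# Passing Python ints would retrace run_n_steps for every distinct value;
# passing a Tensor traces it once and reuses the concrete function.
for steps in (10, 20, 30):
  run_n_steps(tf.convert_to_tensor(steps, dtype=tf.int32))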
  def _create_test_step(self, strategy, model, metric):
    """Creates a distributed test step."""
    metrics = metrics_as_dict(metric)

    @tf.function
    def test_step(iterator):
      """Calculates evaluation metrics on distributed devices."""
      if not metric:
        logging.info('Skipping test_step because metric is None (%s)', metric)
        return None, None

      def _test_step_fn(inputs):
        """Replicated accuracy calculation."""
        inputs, labels = inputs
        model_outputs = model(inputs, training=False)
        for m in metrics.values():
          m.update_state(labels, model_outputs)
        return labels, model_outputs

      return strategy.run(_test_step_fn, args=(next(iterator),))

    return test_step
  def train(self,
            train_input_fn: Callable[[params_dict.ParamsDict],
                                     tf.data.Dataset],
            eval_input_fn: Optional[Callable[[params_dict.ParamsDict],
                                             tf.data.Dataset]] = None,
            model_dir: Optional[Text] = None,
            total_steps: int = 1,
            iterations_per_loop: int = 1,
            train_metric_fn: Optional[Callable[[], Any]] = None,
            eval_metric_fn: Optional[Callable[[], Any]] = None,
            summary_writer_fn: Callable[[Text, Text],
                                        SummaryWriter] = SummaryWriter,
            init_checkpoint: Optional[Callable[[tf.keras.Model], Any]] = None,
            custom_callbacks: Optional[List[
                tf.keras.callbacks.Callback]] = None,
            continuous_eval: bool = False,
            save_config: bool = True):
    """Runs distributed training.

    Args:
      train_input_fn: (params: dict) -> tf.data.Dataset training data input
        function.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluating metrics on eval data. If None, will not run the
        eval step.
      model_dir: the folder path for model checkpoints.
      total_steps: total training steps.
      iterations_per_loop: train steps per loop. After each loop, this job
        updates metrics like loss and saves a checkpoint.
      train_metric_fn: metric_fn for evaluation in train_step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      summary_writer_fn: function to create summary writer.
      init_checkpoint: function to load checkpoint.
      custom_callbacks: A list of Keras Callback objects to run during
        training. More specifically, the `on_batch_begin()` and
        `on_batch_end()` methods are invoked during training.
      continuous_eval: If `True`, will continuously run evaluation on every
        available checkpoint. If `False`, will do the evaluation once after
        the final step.
      save_config: bool. Whether to save params to model_dir.

    Returns:
      The training loss and eval metrics.
    """
    assert train_input_fn is not None
    if train_metric_fn and not callable(train_metric_fn):
      raise ValueError('if `train_metric_fn` is specified, '
                       'train_metric_fn must be a callable.')
    if eval_metric_fn and not callable(eval_metric_fn):
      raise ValueError('if `eval_metric_fn` is specified, '
                       'eval_metric_fn must be a callable.')
    train_metric_fn = train_metric_fn or _no_metric
    eval_metric_fn = eval_metric_fn or _no_metric

    if custom_callbacks and iterations_per_loop != 1:
      logging.warning(
          'It is semantically wrong to run callbacks when '
          'iterations_per_loop is not one (%s)', iterations_per_loop)

    custom_callbacks = custom_callbacks or []

    def _run_callbacks_on_batch_begin(batch):
      """Runs custom callbacks at the start of every step."""
      if not custom_callbacks:
        return
      for callback in custom_callbacks:
        if callback:
          callback.on_batch_begin(batch)

    def _run_callbacks_on_batch_end(batch):
      """Runs custom callbacks at the end of every step."""
      if not custom_callbacks:
        return
      for callback in custom_callbacks:
        if callback:
          callback.on_batch_end(batch)

    if save_config:
      self._save_config(model_dir)

    if FLAGS.save_checkpoint_freq:
      save_freq = FLAGS.save_checkpoint_freq
    else:
      save_freq = iterations_per_loop

    params = self._params
    strategy = self._strategy
    # To reduce unnecessary send/receive input pipeline operations, we place
    # input pipeline ops on the worker task.
    train_iterator = self._get_input_iterator(train_input_fn, strategy)
    train_loss = None
    train_metric_result = None
    eval_metric_result = None
    tf.keras.backend.set_learning_phase(1)
    with strategy.scope():
      # To correctly place the model weights on accelerators,
      # model and optimizer should be created in scope.
      model = self.model_fn(params.as_dict())
      if not hasattr(model, 'optimizer'):
        raise ValueError('User should set optimizer attribute to model '
                         'inside `model_fn`.')
      optimizer = model.optimizer

      # Training loop starts here.
      checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
      latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
      initial_step = 0
      if latest_checkpoint_file:
        logging.info('Checkpoint file %s found and restoring from '
                     'checkpoint', latest_checkpoint_file)
        checkpoint.restore(latest_checkpoint_file)
        initial_step = optimizer.iterations.numpy()
        logging.info('Loading from checkpoint file completed. Init step %d',
                     initial_step)
      elif init_checkpoint:
        logging.info('Restoring from init checkpoint function')
        init_checkpoint(model)
        logging.info('Loading from init checkpoint file completed')

      current_step = optimizer.iterations.numpy()
      checkpoint_name = self.checkpoint_name

      eval_metric = eval_metric_fn()
      train_metric = train_metric_fn()
      train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
      self.train_summary_writer = train_summary_writer.writer

      test_summary_writer = summary_writer_fn(model_dir, 'eval_test')
      self.eval_summary_writer = test_summary_writer.writer

      # Use the training summary writer in TimeHistory if it's in use.
      for cb in custom_callbacks:
        if isinstance(cb, keras_utils.TimeHistory):
          cb.summary_writer = self.train_summary_writer

      # Continue training loop.
      train_step = self._create_train_step(
          strategy=strategy,
          model=model,
          loss_fn=self.loss_fn(),
          optimizer=optimizer,
          metric=train_metric)
      test_step = None
      if eval_input_fn and eval_metric:
        self.global_train_step = model.optimizer.iterations
        test_step = self._create_test_step(strategy, model, metric=eval_metric)

      # Step-0 operations.
      if current_step == 0 and not latest_checkpoint_file:
        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))
      if test_step:
        eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
        eval_metric_result = self._run_evaluation(test_step, current_step,
                                                  eval_metric, eval_iterator)
        logging.info('Step: %s evaluation metric = %s.', current_step,
                     eval_metric_result)
        test_summary_writer(
            metrics=eval_metric_result, step=optimizer.iterations)
        reset_states(eval_metric)

      logging.info('Training started')
      last_save_checkpoint_step = current_step
      while current_step < total_steps:

        num_steps = _steps_to_run(current_step, total_steps,
                                  iterations_per_loop)
        _run_callbacks_on_batch_begin(current_step)
        train_loss = train_step(
            train_iterator, tf.convert_to_tensor(num_steps, dtype=tf.int32))
        current_step += num_steps

        train_loss = tf.nest.map_structure(lambda x: x.numpy().astype(float),
                                           train_loss)
        _run_callbacks_on_batch_end(current_step - 1)
        if not isinstance(train_loss, dict):
          train_loss = {'total_loss': train_loss}
        if np.isnan(train_loss['total_loss']):
          raise ValueError('total loss is NaN.')

        if train_metric:
          train_metric_result = metric_results(train_metric)
          train_metric_result.update(train_loss)
        else:
          train_metric_result = train_loss
        if callable(optimizer.lr):
          train_metric_result.update(
              {'learning_rate': optimizer.lr(current_step).numpy()})
        else:
          train_metric_result.update({'learning_rate': optimizer.lr.numpy()})
        logging.info('Train Step: %d/%d  / loss = %s / training metric = %s',
                     current_step, total_steps, train_loss,
                     train_metric_result)

        train_summary_writer(
            metrics=train_metric_result, step=optimizer.iterations)

        # Saves model checkpoints and runs validation steps at every
        # iterations_per_loop steps.
        # To avoid repeated model saving, we do not save after the last
        # step of training.
        if save_freq > 0 and current_step < total_steps and (
            current_step - last_save_checkpoint_step) >= save_freq:
          _save_checkpoint(checkpoint, model_dir,
                           checkpoint_name.format(step=current_step))
          last_save_checkpoint_step = current_step

        if continuous_eval and current_step < total_steps and test_step:
          eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
          eval_metric_result = self._run_evaluation(test_step, current_step,
                                                    eval_metric, eval_iterator)
          logging.info('Step: %s evaluation metric = %s.', current_step,
                       eval_metric_result)
          test_summary_writer(
              metrics=eval_metric_result, step=optimizer.iterations)

        # Re-initialize the evaluation metric, except at the last step.
        if eval_metric and current_step < total_steps:
          reset_states(eval_metric)
        if train_metric and current_step < total_steps:
          reset_states(train_metric)

      # Reaches the end of training and saves the last checkpoint.
      if last_save_checkpoint_step < total_steps:
        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))

      if test_step:
        logging.info('Running final evaluation after training is complete.')
        eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
        eval_metric_result = self._run_evaluation(test_step, current_step,
                                                  eval_metric, eval_iterator)
        logging.info('Final evaluation metric = %s.', eval_metric_result)
        test_summary_writer(
            metrics=eval_metric_result, step=optimizer.iterations)

      self.train_summary_writer.close()
      self.eval_summary_writer.close()

    return train_metric_result, eval_metric_result
  def _run_evaluation(self, test_step, current_training_step, metric,
                      test_iterator):
    """Runs validation steps and aggregates metrics."""
    if not test_iterator or not metric:
      logging.warning(
          'Both test_iterator (%s) and metrics (%s) must not be None.',
          test_iterator, metric)
      return None
    logging.info('Running evaluation after step: %s.', current_training_step)
    eval_step = 0
    while True:
      try:
        with tf.experimental.async_scope():
          test_step(test_iterator)
          eval_step += 1
      except (StopIteration, tf.errors.OutOfRangeError):
        tf.experimental.async_clear_error()
        break

    metric_result = metric_results(metric)
    logging.info('Total eval steps: [%d]', eval_step)
    logging.info('At training step: [%r] Validation metric = %r',
                 current_training_step, metric_result)
    return metric_result
  def evaluate_from_model_dir(
      self,
      model_dir: Text,
      eval_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
      eval_metric_fn: Callable[[], Any],
      total_steps: int = -1,
      eval_timeout: Optional[int] = None,
      min_eval_interval: int = 180,
      summary_writer_fn: Callable[[Text, Text],
                                  SummaryWriter] = SummaryWriter):
    """Runs distributed evaluation on a model folder.

    Args:
      model_dir: the folder for storing model checkpoints.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluating metrics on eval data. If None, will not run the
        eval step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      total_steps: total training steps. If the current step reaches
        total_steps, the evaluation loop stops.
      eval_timeout: the maximum number of seconds to wait between checkpoints.
        If left as None, the process will wait indefinitely. Used by
        tf.train.checkpoints_iterator.
      min_eval_interval: the minimum number of seconds between yielding
        checkpoints. Used by tf.train.checkpoints_iterator.
      summary_writer_fn: function to create summary writer.

    Returns:
      The eval metrics dictionary of the last checkpoint.
    """
    if not model_dir:
      raise ValueError('model_dir must be set.')

    def terminate_eval():
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   eval_timeout)
      return True

    summary_writer = summary_writer_fn(model_dir, 'eval')
    self.eval_summary_writer = summary_writer.writer

    # Read checkpoints from the given model directory
    # until `eval_timeout` seconds elapse.
    for checkpoint_path in tf.train.checkpoints_iterator(
        model_dir,
        min_interval_secs=min_eval_interval,
        timeout=eval_timeout,
        timeout_fn=terminate_eval):
      eval_metric_result, current_step = self.evaluate_checkpoint(
          checkpoint_path=checkpoint_path,
          eval_input_fn=eval_input_fn,
          eval_metric_fn=eval_metric_fn,
          summary_writer=summary_writer)
      if total_steps > 0 and current_step >= total_steps:
        logging.info('Evaluation finished after training step %d',
                     current_step)
        break
    return eval_metric_result
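For reference, a stripped-down sketch of the polling loop that `tf.train.checkpoints_iterator` drives; the directory and timeout values here are hypothetical:

import tensorflow as tf

# Yields each new checkpoint path appearing in the directory, waiting at
# least 180 seconds between yields; after 3600 seconds with no new
# checkpoint, timeout_fn runs, and returning True ends the iteration.
for ckpt_path in tf.train.checkpoints_iterator(
    '/tmp/model_dir',  # hypothetical model directory
    min_interval_secs=180,
    timeout=3600,
    timeout_fn=lambda: True):
  print('New checkpoint:', ckpt_path)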
  def evaluate_checkpoint(self,
                          checkpoint_path: Text,
                          eval_input_fn: Callable[[params_dict.ParamsDict],
                                                  tf.data.Dataset],
                          eval_metric_fn: Callable[[], Any],
                          summary_writer: Optional[SummaryWriter] = None):
    """Runs distributed evaluation on one checkpoint.

    Args:
      checkpoint_path: the checkpoint to evaluate.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluating metrics on eval data. If None, will not run the
        eval step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      summary_writer: function to create summary writer.

    Returns:
      The eval metrics dictionary of the last checkpoint.
    """
    if not callable(eval_metric_fn):
      raise ValueError('if `eval_metric_fn` is specified, '
                       'eval_metric_fn must be a callable.')

    old_phase = tf.keras.backend.learning_phase()
    tf.keras.backend.set_learning_phase(0)

    params = self._params
    strategy = self._strategy
    # To reduce unnecessary send/receive input pipeline operations, we place
    # input pipeline ops on the worker task.
    with strategy.scope():

      # To correctly place the model weights on accelerators,
      # model and optimizer should be created in scope.
      model = self.model_fn(params.as_dict())
      checkpoint = tf.train.Checkpoint(model=model)

      eval_metric = eval_metric_fn()
      assert eval_metric, 'eval_metric does not exist'
      test_step = self._create_test_step(strategy, model, metric=eval_metric)

      logging.info('Starting to evaluate.')
      if not checkpoint_path:
        raise ValueError('checkpoint path is empty')
      reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
      current_step = reader.get_tensor(
          'optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE')
      logging.info('Checkpoint file %s found and restoring from '
                   'checkpoint', checkpoint_path)
      status = checkpoint.restore(checkpoint_path)
      status.expect_partial().assert_existing_objects_matched()

      self.global_train_step = model.optimizer.iterations
      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
      eval_metric_result = self._run_evaluation(test_step, current_step,
                                                eval_metric, eval_iterator)
      logging.info('Step: %s evaluation metric = %s.', current_step,
                   eval_metric_result)
      summary_writer(metrics=eval_metric_result, step=current_step)
      reset_states(eval_metric)

    tf.keras.backend.set_learning_phase(old_phase)
    return eval_metric_result, current_step
  def predict(self):
    raise NotImplementedError('Unimplemented function.')
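A minimal usage sketch of the executor, assuming a locked ParamsDict `my_params` and an input function `my_train_input_fn` that follow the signatures documented above; note that `model_fn` must attach an `optimizer` attribute to the returned Keras model, and that the `loss_fn` constructor argument is a factory returning the actual loss callable:

import tensorflow as tf

def my_model_fn(params):
  model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
  # train() reads the optimizer off the model.
  model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
  return model

def my_loss_fn():
  # Returns a (y_true, y_pred) -> Tensor callable.
  return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

executor = DistributedExecutor(
    strategy=tf.distribute.MirroredStrategy(),
    params=my_params,  # hypothetical ParamsDict
    model_fn=my_model_fn,
    loss_fn=my_loss_fn,
    is_multi_host=False)

train_metrics, eval_metrics = executor.train(
    train_input_fn=my_train_input_fn,  # hypothetical input function
    model_dir='/tmp/model_dir',
    total_steps=1000,
    iterations_per_loop=100)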
class ExecutorBuilder(object):
  """Builder of DistributedExecutor.

  Example 1: Builds an executor with a supported Strategy.
    builder = ExecutorBuilder(
        strategy_type='tpu',
        strategy_config={'tpu': '/bns/xxx'})
    dist_executor = builder.build_executor(
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Example 2: Builds an executor with a customized Strategy.
    builder = ExecutorBuilder()
    builder.strategy = <some customized Strategy>
    dist_executor = builder.build_executor(
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Example 3: Builds a customized executor with a customized Strategy.
    class MyDistributedExecutor(DistributedExecutor):
      # implementation ...

    builder = ExecutorBuilder()
    builder.strategy = <some customized Strategy>
    dist_executor = builder.build_executor(
        class_ctor=MyDistributedExecutor,
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)
  """

  def __init__(self, strategy_type=None, strategy_config=None):
    """Constructor.

    Args:
      strategy_type: string. One of 'tpu', 'mirrored', or
        'multi_worker_mirrored'. If None, the user is responsible for setting
        the strategy before calling build_executor(...).
      strategy_config: necessary config for constructing the proper Strategy.
        Check strategy_flags_dict() for examples of the structure.
    """
    _ = distribute_utils.configure_cluster(strategy_config.worker_hosts,
                                           strategy_config.task_index)
    self._strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=strategy_type,
        num_gpus=strategy_config.num_gpus,
        all_reduce_alg=strategy_config.all_reduce_alg,
        num_packs=strategy_config.num_packs,
        tpu_address=strategy_config.tpu)

  @property
  def strategy(self):
    """Returns the distribution strategy."""
    return self._strategy

  @strategy.setter
  def strategy(self, new_strategy):
    """Sets the distribution strategy."""
    self._strategy = new_strategy

  def build_executor(self,
                     class_ctor=DistributedExecutor,
                     params=None,
                     model_fn=None,
                     loss_fn=None,
                     **kwargs):
    """Creates an executor according to strategy type.

    See the docstring of DistributedExecutor.__init__ for more information on
    the input arguments.

    Args:
      class_ctor: A constructor of executor (default: DistributedExecutor).
      params: ParamsDict, all the model parameters and runtime parameters.
      model_fn: Keras model function.
      loss_fn: loss function.
      **kwargs: other arguments to the executor constructor.

    Returns:
      An instance of DistributedExecutor or its subclass.
    """
    if self._strategy is None:
      raise ValueError('`strategy` should not be None. You need to specify '
                       '`strategy_type` in the builder constructor or '
                       'directly set the `strategy` property of the builder.')
    return class_ctor(
        strategy=self._strategy,
        params=params,
        model_fn=model_fn,
        loss_fn=loss_fn,
        **kwargs)
official/legacy/detection/main.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Main function to train various object detection models."""
import functools
import pprint

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf

from official.common import distribute_utils
from official.legacy.detection.configs import factory as config_factory
from official.legacy.detection.dataloader import input_reader
from official.legacy.detection.dataloader import mode_keys as ModeKeys
from official.legacy.detection.executor import distributed_executor as executor
from official.legacy.detection.executor.detection_executor import DetectionDistributedExecutor
from official.legacy.detection.modeling import factory as model_factory
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils

hyperparams_flags.initialize_common_flags()
flags_core.define_log_steps()

flags.DEFINE_bool('enable_xla', default=False, help='Enable XLA for GPU')

flags.DEFINE_string(
    'mode', default='train',
    help='Mode to run: `train`, `eval` or `eval_once`.')

flags.DEFINE_string(
    'model', default='retinanet',
    help='Model to run: `retinanet`, `mask_rcnn` or `shapemask`.')

flags.DEFINE_string('training_file_pattern', None,
                    'Location of the train data.')

flags.DEFINE_string('eval_file_pattern', None, 'Location of the eval data.')

flags.DEFINE_string(
    'checkpoint_path', None,
    'The checkpoint path to eval. Only used in eval_once mode.')

FLAGS = flags.FLAGS


def run_executor(params,
                 mode,
                 checkpoint_path=None,
                 train_input_fn=None,
                 eval_input_fn=None,
                 callbacks=None,
                 prebuilt_strategy=None):
  """Runs the object detection model on the distribution strategy defined by the user."""

  if params.architecture.use_bfloat16:
    tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

  model_builder = model_factory.model_generator(params)

  if prebuilt_strategy is not None:
    strategy = prebuilt_strategy
  else:
    strategy_config = params.strategy_config
    distribute_utils.configure_cluster(strategy_config.worker_hosts,
                                       strategy_config.task_index)
    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.strategy_type,
        num_gpus=strategy_config.num_gpus,
        all_reduce_alg=strategy_config.all_reduce_alg,
        num_packs=strategy_config.num_packs,
        tpu_address=strategy_config.tpu)

  # Each host is assumed to drive up to 8 replicas, so round up.
  num_workers = int(strategy.num_replicas_in_sync + 7) // 8
  is_multi_host = (int(num_workers) >= 2)

  if mode == 'train':

    def _model_fn(params):
      return model_builder.build_model(params, mode=ModeKeys.TRAIN)

    logging.info(
        'Train num_replicas_in_sync %d num_workers %d is_multi_host %s',
        strategy.num_replicas_in_sync, num_workers, is_multi_host)

    dist_executor = DetectionDistributedExecutor(
        strategy=strategy,
        params=params,
        model_fn=_model_fn,
        loss_fn=model_builder.build_loss_fn,
        is_multi_host=is_multi_host,
        predict_post_process_fn=model_builder.post_processing,
        trainable_variables_filter=model_builder
        .make_filter_trainable_variables_fn())

    if is_multi_host:
      train_input_fn = functools.partial(
          train_input_fn,
          batch_size=params.train.batch_size // strategy.num_replicas_in_sync)

    return dist_executor.train(
        train_input_fn=train_input_fn,
        model_dir=params.model_dir,
        iterations_per_loop=params.train.iterations_per_loop,
        total_steps=params.train.total_steps,
        init_checkpoint=model_builder.make_restore_checkpoint_fn(),
        custom_callbacks=callbacks,
        save_config=True)
  elif mode == 'eval' or mode == 'eval_once':

    def _model_fn(params):
      return model_builder.build_model(params, mode=ModeKeys.PREDICT_WITH_GT)

    logging.info(
        'Eval num_replicas_in_sync %d num_workers %d is_multi_host %s',
        strategy.num_replicas_in_sync, num_workers, is_multi_host)

    if is_multi_host:
      eval_input_fn = functools.partial(
          eval_input_fn,
          batch_size=params.eval.batch_size // strategy.num_replicas_in_sync)

    dist_executor = DetectionDistributedExecutor(
        strategy=strategy,
        params=params,
        model_fn=_model_fn,
        loss_fn=model_builder.build_loss_fn,
        is_multi_host=is_multi_host,
        predict_post_process_fn=model_builder.post_processing,
        trainable_variables_filter=model_builder
        .make_filter_trainable_variables_fn())

    if mode == 'eval':
      results = dist_executor.evaluate_from_model_dir(
          model_dir=params.model_dir,
          eval_input_fn=eval_input_fn,
          eval_metric_fn=model_builder.eval_metrics,
          eval_timeout=params.eval.eval_timeout,
          min_eval_interval=params.eval.min_eval_interval,
          total_steps=params.train.total_steps)
    else:
      # Run evaluation once for a single checkpoint.
      if not checkpoint_path:
        raise ValueError('checkpoint_path cannot be empty.')
      if tf.io.gfile.isdir(checkpoint_path):
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
      summary_writer = executor.SummaryWriter(params.model_dir, 'eval')
      results, _ = dist_executor.evaluate_checkpoint(
          checkpoint_path=checkpoint_path,
          eval_input_fn=eval_input_fn,
          eval_metric_fn=model_builder.eval_metrics,
          summary_writer=summary_writer)
    for k, v in results.items():
      logging.info('Final eval metric %s: %f', k, v)
    return results
  else:
    raise ValueError('Mode not found: %s.' % mode)


def run(callbacks=None):
  """Runs the experiment."""
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  params = config_factory.config_generator(FLAGS.model)

  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)

  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  params.override(
      {
          'strategy_type': FLAGS.strategy_type,
          'model_dir': FLAGS.model_dir,
          'strategy_config': executor.strategy_flags_dict(),
      },
      is_strict=False)

  # Make sure use_tpu and strategy_type are in sync.
  params.use_tpu = (params.strategy_type == 'tpu')

  if not params.use_tpu:
    params.override(
        {
            'architecture': {
                'use_bfloat16': False,
            },
            'norm_activation': {
                'use_sync_bn': False,
            },
        },
        is_strict=True)

  params.validate()
  params.lock()
  pp = pprint.PrettyPrinter()
  params_str = pp.pformat(params.as_dict())
  logging.info('Model Parameters: %s', params_str)

  train_input_fn = None
  eval_input_fn = None
  training_file_pattern = (
      FLAGS.training_file_pattern or params.train.train_file_pattern)
  eval_file_pattern = FLAGS.eval_file_pattern or params.eval.eval_file_pattern
  if not training_file_pattern and not eval_file_pattern:
    raise ValueError('Must provide at least one of training_file_pattern and '
                     'eval_file_pattern.')

  if training_file_pattern:
    # Use global batch size for single host.
    train_input_fn = input_reader.InputFn(
        file_pattern=training_file_pattern,
        params=params,
        mode=input_reader.ModeKeys.TRAIN,
        batch_size=params.train.batch_size)

  if eval_file_pattern:
    eval_input_fn = input_reader.InputFn(
        file_pattern=eval_file_pattern,
        params=params,
        mode=input_reader.ModeKeys.PREDICT_WITH_GT,
        batch_size=params.eval.batch_size,
        num_examples=params.eval.eval_samples)

  if callbacks is None:
    callbacks = []

  if FLAGS.log_steps:
    callbacks.append(
        keras_utils.TimeHistory(
            batch_size=params.train.batch_size,
            log_steps=FLAGS.log_steps,
        ))

  return run_executor(
      params,
      FLAGS.mode,
      checkpoint_path=FLAGS.checkpoint_path,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      callbacks=callbacks)


def main(argv):
  del argv  # Unused.
  run()


if __name__ == '__main__':
  tf.config.set_soft_device_placement(True)
  app.run(main)
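A small sketch (not part of this file) of the ParamsDict override flow that run() relies on: start from a ParamsDict, layer overrides on top, then lock it against further changes. The parameter names below are illustrative assumptions.

from official.modeling.hyperparams import params_dict

params = params_dict.ParamsDict({'train': {'batch_size': 64}})
params.override({'train': {'batch_size': 256}}, is_strict=True)
params.lock()            # subsequent overrides now raise an error
print(params.as_dict())  # {'train': {'batch_size': 256}}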
official/legacy/detection/modeling/__init__.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/legacy/detection/modeling/architecture/__init__.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/legacy/detection/modeling/architecture/factory.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model architecture factory."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from official.legacy.detection.modeling.architecture import fpn
from official.legacy.detection.modeling.architecture import heads
from official.legacy.detection.modeling.architecture import identity
from official.legacy.detection.modeling.architecture import nn_ops
from official.legacy.detection.modeling.architecture import resnet
from official.legacy.detection.modeling.architecture import spinenet


def norm_activation_generator(params):
  return nn_ops.norm_activation_builder(
      momentum=params.batch_norm_momentum,
      epsilon=params.batch_norm_epsilon,
      trainable=params.batch_norm_trainable,
      activation=params.activation)


def backbone_generator(params):
  """Generator function for various backbone models."""
  if params.architecture.backbone == 'resnet':
    resnet_params = params.resnet
    backbone_fn = resnet.Resnet(
        resnet_depth=resnet_params.resnet_depth,
        activation=params.norm_activation.activation,
        norm_activation=norm_activation_generator(params.norm_activation))
  elif params.architecture.backbone == 'spinenet':
    spinenet_params = params.spinenet
    backbone_fn = spinenet.SpineNetBuilder(model_id=spinenet_params.model_id)
  else:
    raise ValueError('Backbone model `{}` is not supported.'.format(
        params.architecture.backbone))

  return backbone_fn


def multilevel_features_generator(params):
  """Generator function for various FPN models."""
  if params.architecture.multilevel_features == 'fpn':
    fpn_params = params.fpn
    fpn_fn = fpn.Fpn(
        min_level=params.architecture.min_level,
        max_level=params.architecture.max_level,
        fpn_feat_dims=fpn_params.fpn_feat_dims,
        use_separable_conv=fpn_params.use_separable_conv,
        activation=params.norm_activation.activation,
        use_batch_norm=fpn_params.use_batch_norm,
        norm_activation=norm_activation_generator(params.norm_activation))
  elif params.architecture.multilevel_features == 'identity':
    fpn_fn = identity.Identity()
  else:
    raise ValueError('The multi-level feature model `{}` is not supported.'
                     .format(params.architecture.multilevel_features))
  return fpn_fn


def retinanet_head_generator(params):
  """Generator function for RetinaNet head architecture."""
  head_params = params.retinanet_head
  anchors_per_location = params.anchor.num_scales * len(
      params.anchor.aspect_ratios)
  return heads.RetinanetHead(
      params.architecture.min_level,
      params.architecture.max_level,
      params.architecture.num_classes,
      anchors_per_location,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      norm_activation=norm_activation_generator(params.norm_activation))


def rpn_head_generator(params):
  """Generator function for RPN head architecture."""
  head_params = params.rpn_head
  anchors_per_location = params.anchor.num_scales * len(
      params.anchor.aspect_ratios)
  return heads.RpnHead(
      params.architecture.min_level,
      params.architecture.max_level,
      anchors_per_location,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      params.norm_activation.activation,
      head_params.use_batch_norm,
      norm_activation=norm_activation_generator(params.norm_activation))


def oln_rpn_head_generator(params):
  """Generator function for OLN-proposal (OLN-RPN) head architecture."""
  head_params = params.rpn_head
  anchors_per_location = params.anchor.num_scales * len(
      params.anchor.aspect_ratios)
  return heads.OlnRpnHead(
      params.architecture.min_level,
      params.architecture.max_level,
      anchors_per_location,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      params.norm_activation.activation,
      head_params.use_batch_norm,
      norm_activation=norm_activation_generator(params.norm_activation))


def fast_rcnn_head_generator(params):
  """Generator function for Fast R-CNN head architecture."""
  head_params = params.frcnn_head
  return heads.FastrcnnHead(
      params.architecture.num_classes,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      head_params.num_fcs,
      head_params.fc_dims,
      params.norm_activation.activation,
      head_params.use_batch_norm,
      norm_activation=norm_activation_generator(params.norm_activation))


def oln_box_score_head_generator(params):
  """Generator function for Scoring Fast R-CNN head architecture."""
  head_params = params.frcnn_head
  return heads.OlnBoxScoreHead(
      params.architecture.num_classes,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      head_params.num_fcs,
      head_params.fc_dims,
      params.norm_activation.activation,
      head_params.use_batch_norm,
      norm_activation=norm_activation_generator(params.norm_activation))


def mask_rcnn_head_generator(params):
  """Generator function for Mask R-CNN head architecture."""
  head_params = params.mrcnn_head
  return heads.MaskrcnnHead(
      params.architecture.num_classes,
      params.architecture.mask_target_size,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      params.norm_activation.activation,
      head_params.use_batch_norm,
      norm_activation=norm_activation_generator(params.norm_activation))


def oln_mask_score_head_generator(params):
  """Generator function for Scoring Mask R-CNN head architecture."""
  head_params = params.mrcnn_head
  return heads.OlnMaskScoreHead(
      params.architecture.num_classes,
      params.architecture.mask_target_size,
      head_params.num_convs,
      head_params.num_filters,
      head_params.use_separable_conv,
      params.norm_activation.activation,
      head_params.use_batch_norm,
      norm_activation=norm_activation_generator(params.norm_activation))


def shapeprior_head_generator(params):
  """Generator function for shape prior head architecture."""
  head_params = params.shapemask_head
  return heads.ShapemaskPriorHead(
      params.architecture.num_classes,
      head_params.num_downsample_channels,
      head_params.mask_crop_size,
      head_params.use_category_for_mask,
      head_params.shape_prior_path)


def coarsemask_head_generator(params):
  """Generator function for ShapeMask coarse mask head architecture."""
  head_params = params.shapemask_head
  return heads.ShapemaskCoarsemaskHead(
      params.architecture.num_classes,
      head_params.num_downsample_channels,
      head_params.mask_crop_size,
      head_params.use_category_for_mask,
      head_params.num_convs,
      norm_activation=norm_activation_generator(params.norm_activation))


def finemask_head_generator(params):
  """Generator function for ShapeMask fine mask head architecture."""
  head_params = params.shapemask_head
  return heads.ShapemaskFinemaskHead(
      params.architecture.num_classes,
      head_params.num_downsample_channels,
      head_params.mask_crop_size,
      head_params.use_category_for_mask,
      head_params.num_convs,
      head_params.upsample_factor)
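A quick sketch of the anchors_per_location arithmetic shared by the head generators above; with the common RetinaNet defaults of 3 octave scales and 3 aspect ratios, every feature-map position carries 9 anchors:

num_scales = 3
aspect_ratios = [0.5, 1.0, 2.0]
anchors_per_location = num_scales * len(aspect_ratios)
assert anchors_per_location == 9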
official/legacy/detection/modeling/architecture/fpn.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature Pyramid Networks.
Feature Pyramid Networks were proposed in:
[1] Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan,
    and Serge Belongie
    Feature Pyramid Networks for Object Detection. CVPR 2017.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import tensorflow as tf

from official.legacy.detection.modeling.architecture import nn_ops
from official.legacy.detection.ops import spatial_transform_ops


class Fpn(object):
  """Feature pyramid networks."""

  def __init__(self,
               min_level=3,
               max_level=7,
               fpn_feat_dims=256,
               use_separable_conv=False,
               activation='relu',
               use_batch_norm=True,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """FPN initialization function.

    Args:
      min_level: `int` minimum level in FPN output feature maps.
      max_level: `int` maximum level in FPN output feature maps.
      fpn_feat_dims: `int` number of filters in FPN layers.
      use_separable_conv: `bool`, if True use separable convolution for
        convolution in FPN layers.
      activation: the activation function.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._min_level = min_level
    self._max_level = max_level
    self._fpn_feat_dims = fpn_feat_dims
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D, depth_multiplier=1)
    else:
      self._conv2d_op = tf.keras.layers.Conv2D
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    self._norm_activations = {}
    self._lateral_conv2d_op = {}
    self._post_hoc_conv2d_op = {}
    self._coarse_conv2d_op = {}
    for level in range(self._min_level, self._max_level + 1):
      if self._use_batch_norm:
        self._norm_activations[level] = norm_activation(
            use_activation=False, name='p%d-bn' % level)
      self._lateral_conv2d_op[level] = self._conv2d_op(
          filters=self._fpn_feat_dims,
          kernel_size=(1, 1),
          padding='same',
          name='l%d' % level)
      self._post_hoc_conv2d_op[level] = self._conv2d_op(
          filters=self._fpn_feat_dims,
          strides=(1, 1),
          kernel_size=(3, 3),
          padding='same',
          name='post_hoc_d%d' % level)
      self._coarse_conv2d_op[level] = self._conv2d_op(
          filters=self._fpn_feat_dims,
          strides=(2, 2),
          kernel_size=(3, 3),
          padding='same',
          name='p%d' % level)

  def __call__(self, multilevel_features, is_training=None):
    """Returns the FPN features for the given multilevel features.

    Args:
      multilevel_features: a `dict` containing `int` keys for continuous
        feature levels, e.g., [2, 3, 4, 5]. The values are the corresponding
        features with shape [batch_size, height_l, width_l, num_filters].
      is_training: `bool`. If True, the model is in training mode.

    Returns:
      a `dict` containing `int` keys for continuous feature levels
      [min_level, min_level + 1, ..., max_level]. The values are the
      corresponding FPN features with shape
      [batch_size, height_l, width_l, fpn_feat_dims].
    """
    input_levels = list(multilevel_features.keys())
    if min(input_levels) > self._min_level:
      raise ValueError(
          'The minimum backbone level %d should be less than or equal to '
          'the FPN minimum level %d.' % (min(input_levels), self._min_level))
    backbone_max_level = min(max(input_levels), self._max_level)
    with tf.name_scope('fpn'):
      # Adds lateral connections.
      feats_lateral = {}
      for level in range(self._min_level, backbone_max_level + 1):
        feats_lateral[level] = self._lateral_conv2d_op[level](
            multilevel_features[level])

      # Adds the top-down path.
      feats = {backbone_max_level: feats_lateral[backbone_max_level]}
      for level in range(backbone_max_level - 1, self._min_level - 1, -1):
        feats[level] = spatial_transform_ops.nearest_upsampling(
            feats[level + 1], 2) + feats_lateral[level]

      # Adds post-hoc 3x3 convolutions.
      for level in range(self._min_level, backbone_max_level + 1):
        feats[level] = self._post_hoc_conv2d_op[level](feats[level])

      # Adds the coarser FPN levels introduced for RetinaNet.
      for level in range(backbone_max_level + 1, self._max_level + 1):
        feats_in = feats[level - 1]
        if level > backbone_max_level + 1:
          feats_in = self._activation_op(feats_in)
        feats[level] = self._coarse_conv2d_op[level](feats_in)
      if self._use_batch_norm:
        # Adds batch_norm layers.
        for level in range(self._min_level, self._max_level + 1):
          feats[level] = self._norm_activations[level](
              feats[level], is_training=is_training)
    return feats
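A self-contained sketch of the top-down merge performed in __call__, using tf.keras.layers.UpSampling2D as a stand-in for spatial_transform_ops.nearest_upsampling; the shapes below are illustrative assumptions:

import tensorflow as tf

# Stand-in feature maps for backbone levels 4 (16x16) and 3 (32x32), already
# projected to the FPN dimension by the 1x1 lateral convolutions.
p4 = tf.random.normal([1, 16, 16, 256])
p3_lateral = tf.random.normal([1, 32, 32, 256])

# Nearest-neighbor upsample the coarser level and add the lateral feature,
# mirroring `nearest_upsampling(feats[level + 1], 2) + feats_lateral[level]`.
upsample = tf.keras.layers.UpSampling2D(size=2, interpolation='nearest')
p3 = upsample(p4) + p3_lateral
print(p3.shape)  # (1, 32, 32, 256)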
official/legacy/detection/modeling/architecture/heads.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes to build various prediction heads in all supported models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import numpy as np
import tensorflow as tf

from official.legacy.detection.modeling.architecture import nn_ops
from official.legacy.detection.ops import spatial_transform_ops


class RpnHead(tf.keras.layers.Layer):
  """Region Proposal Network head."""

  def __init__(self,
               min_level,
               max_level,
               anchors_per_location,
               num_convs=2,
               num_filters=256,
               use_separable_conv=False,
               activation='relu',
               use_batch_norm=True,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """Initialize params to build the Region Proposal Network head.

    Args:
      min_level: `int` number of the minimum feature level.
      max_level: `int` number of the maximum feature level.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number of intermediate conv layers before the
        prediction.
      num_filters: `int` number of filters of the intermediate conv layers.
      use_separable_conv: `bool`, indicating whether separable conv layers
        are used.
      activation: activation function. Supports 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    super().__init__(autocast=False)
    self._min_level = min_level
    self._max_level = max_level
    self._anchors_per_location = anchors_per_location
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm

    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
          bias_initializer=tf.zeros_initializer())

    self._rpn_conv = self._conv2d_op(
        num_filters,
        kernel_size=(3, 3),
        strides=(1, 1),
        activation=(None if self._use_batch_norm else self._activation_op),
        padding='same',
        name='rpn')
    self._rpn_class_conv = self._conv2d_op(
        anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-class')
    self._rpn_box_conv = self._conv2d_op(
        4 * anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-box')

    self._norm_activations = {}
    if self._use_batch_norm:
      for level in range(self._min_level, self._max_level + 1):
        self._norm_activations[level] = norm_activation(
            name='rpn-l%d-bn' % level)

  def _shared_rpn_heads(self, features, anchors_per_location, level,
                        is_training):
    """Shared RPN heads."""
    features = self._rpn_conv(features)
    if self._use_batch_norm:
      # The batch normalization layers are not shared between levels.
      features = self._norm_activations[level](
          features, is_training=is_training)
    # Proposal classification scores.
    scores = self._rpn_class_conv(features)
    # Proposal bbox regression deltas.
    bboxes = self._rpn_box_conv(features)

    return scores, bboxes

  def call(self, features, is_training=None):

    scores_outputs = {}
    box_outputs = {}

    with tf.name_scope('rpn_head'):
      for level in range(self._min_level, self._max_level + 1):
        scores_output, box_output = self._shared_rpn_heads(
            features[level], self._anchors_per_location, level, is_training)
        scores_outputs[level] = scores_output
        box_outputs[level] = box_output
      return scores_outputs, box_outputs
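To make the per-level output shapes concrete, here is a sketch using plain Conv2D stand-ins for the head's 1x1 prediction layers (the feature-map size is an illustrative assumption): scores carry one channel per anchor, and boxes carry four regression-delta channels per anchor.

import tensorflow as tf

anchors_per_location = 9
features = tf.random.normal([1, 32, 32, 256])  # one pyramid level
scores = tf.keras.layers.Conv2D(anchors_per_location, 1)(features)
boxes = tf.keras.layers.Conv2D(4 * anchors_per_location, 1)(features)
print(scores.shape, boxes.shape)  # (1, 32, 32, 9) (1, 32, 32, 36)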
class OlnRpnHead(tf.keras.layers.Layer):
  """Region Proposal Network for Object Localization Network (OLN)."""

  def __init__(self,
               min_level,
               max_level,
               anchors_per_location,
               num_convs=2,
               num_filters=256,
               use_separable_conv=False,
               activation='relu',
               use_batch_norm=True,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """Initialize params to build the Region Proposal Network head.

    Args:
      min_level: `int` number of the minimum feature level.
      max_level: `int` number of the maximum feature level.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number of intermediate conv layers before the
        prediction.
      num_filters: `int` number of filters of the intermediate conv layers.
      use_separable_conv: `bool`, indicating whether separable conv layers
        are used.
      activation: activation function. Supports 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    # Initialize the Keras base layer before assigning attributes.
    super().__init__(autocast=False)
    self._min_level = min_level
    self._max_level = max_level
    self._anchors_per_location = anchors_per_location
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm

    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
          bias_initializer=tf.zeros_initializer())

    self._rpn_conv = self._conv2d_op(
        num_filters,
        kernel_size=(3, 3),
        strides=(1, 1),
        activation=(None if self._use_batch_norm else self._activation_op),
        padding='same',
        name='rpn')
    self._rpn_class_conv = self._conv2d_op(
        anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-class')
    self._rpn_box_conv = self._conv2d_op(
        4 * anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-box-lrtb')
    self._rpn_center_conv = self._conv2d_op(
        anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-centerness')

    self._norm_activations = {}
    if self._use_batch_norm:
      for level in range(self._min_level, self._max_level + 1):
        self._norm_activations[level] = norm_activation(
            name='rpn-l%d-bn' % level)

  def _shared_rpn_heads(self, features, anchors_per_location, level,
                        is_training):
    """Shared RPN heads."""
    features = self._rpn_conv(features)
    if self._use_batch_norm:
      # The batch normalization layers are not shared between levels.
      features = self._norm_activations[level](
          features, is_training=is_training)
    # L2-normalize the features for training stability.
    features = tf.math.l2_normalize(features, axis=-1, name='rpn-norm')
    # Proposal classification scores.
    scores = self._rpn_class_conv(features)
    # Proposal bbox regression deltas.
    bboxes = self._rpn_box_conv(features)
    # Proposal centerness scores.
    centers = self._rpn_center_conv(features)

    return scores, bboxes, centers

  def __call__(self, features, is_training=None):

    scores_outputs = {}
    box_outputs = {}
    center_outputs = {}

    with tf.name_scope('rpn_head'):
      for level in range(self._min_level, self._max_level + 1):
        scores_output, box_output, center_output = self._shared_rpn_heads(
            features[level], self._anchors_per_location, level, is_training)
        scores_outputs[level] = scores_output
        box_outputs[level] = box_output
        center_outputs[level] = center_output
      return scores_outputs, box_outputs, center_outputs
class FastrcnnHead(tf.keras.layers.Layer):
  """Fast R-CNN box head."""

  def __init__(self,
               num_classes,
               num_convs=0,
               num_filters=256,
               use_separable_conv=False,
               num_fcs=2,
               fc_dims=1024,
               activation='relu',
               use_batch_norm=True,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """Initialize params to build the Fast R-CNN box head.

    Args:
      num_classes: an integer for the number of classes.
      num_convs: `int` number of intermediate conv layers before the FC
        layers.
      num_filters: `int` number of filters of the intermediate conv layers.
      use_separable_conv: `bool`, indicating whether separable conv layers
        are used.
      num_fcs: `int` number of FC layers before the predictions.
      fc_dims: `int` dimension of the FC layers.
      activation: activation function. Supports 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    super(FastrcnnHead, self).__init__(autocast=False)
    self._num_classes = num_classes
    self._num_convs = num_convs
    self._num_filters = num_filters
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())
    self._num_fcs = num_fcs
    self._fc_dims = fc_dims
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    self._conv_ops = []
    self._conv_bn_ops = []
    for i in range(self._num_convs):
      self._conv_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='conv_{}'.format(i)))
      if self._use_batch_norm:
        self._conv_bn_ops.append(self._norm_activation())

    self._fc_ops = []
    self._fc_bn_ops = []
    for i in range(self._num_fcs):
      self._fc_ops.append(
          tf.keras.layers.Dense(
              units=self._fc_dims,
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='fc{}'.format(i)))
      if self._use_batch_norm:
        self._fc_bn_ops.append(self._norm_activation(fused=False))

    self._class_predict = tf.keras.layers.Dense(
        self._num_classes,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='class-predict')
    self._box_predict = tf.keras.layers.Dense(
        self._num_classes * 4,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
        bias_initializer=tf.zeros_initializer(),
        name='box-predict')

  def call(self, roi_features, is_training=None):
    """Box and class branches for the Mask R-CNN model.

    Args:
      roi_features: an ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      is_training: `bool`. If True, the model is in training mode.

    Returns:
      class_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes], representing the class
        predictions.
      box_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes * 4], representing the box
        predictions.
    """
    with tf.name_scope('fast_rcnn_head'):
      # Reshape inputs before the FC layers.
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()
      net = tf.reshape(roi_features, [-1, height, width, filters])
      for i in range(self._num_convs):
        net = self._conv_ops[i](net)
        if self._use_batch_norm:
          net = self._conv_bn_ops[i](net, is_training=is_training)

      filters = self._num_filters if self._num_convs > 0 else filters
      net = tf.reshape(net, [-1, num_rois, height * width * filters])

      for i in range(self._num_fcs):
        net = self._fc_ops[i](net)
        if self._use_batch_norm:
          net = self._fc_bn_ops[i](net, is_training=is_training)

      class_outputs = self._class_predict(net)
      box_outputs = self._box_predict(net)
      return class_outputs, box_outputs
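The reshape bookkeeping in call() is worth seeing in isolation: the batch and ROI dimensions are folded together so the conv layers see an ordinary 4-D tensor, then restored (with the spatial dims flattened) before the FC layers. A sketch with assumed shapes:

import tensorflow as tf

batch, num_rois, h, w, c = 2, 100, 7, 7, 256
roi_features = tf.random.normal([batch, num_rois, h, w, c])

net = tf.reshape(roi_features, [-1, h, w, c])       # (200, 7, 7, 256)
net = tf.keras.layers.Conv2D(256, 3, padding='same')(net)
net = tf.reshape(net, [-1, num_rois, h * w * 256])  # (2, 100, 12544)
print(net.shape)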

class OlnBoxScoreHead(tf.keras.layers.Layer):
  """Box head of Object Localization Network (OLN)."""

  def __init__(self,
               num_classes,
               num_convs=0,
               num_filters=256,
               use_separable_conv=False,
               num_fcs=2,
               fc_dims=1024,
               activation='relu',
               use_batch_norm=True,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """Initialize params to build the OLN box head.

    Args:
      num_classes: an integer for the number of classes.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the FC layers.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether separable conv layers
        are used.
      num_fcs: `int` number that represents the number of FC layers before the
        predictions.
      fc_dims: `int` number that represents the dimension of the FC layers.
      activation: activation function. Support 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._num_classes = num_classes
    self._num_convs = num_convs
    self._num_filters = num_filters
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())
    self._num_fcs = num_fcs
    self._fc_dims = fc_dims
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    self._conv_ops = []
    self._conv_bn_ops = []
    for i in range(self._num_convs):
      self._conv_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='conv_{}'.format(i)))
      if self._use_batch_norm:
        self._conv_bn_ops.append(self._norm_activation())

    self._fc_ops = []
    self._fc_bn_ops = []
    for i in range(self._num_fcs):
      self._fc_ops.append(
          tf.keras.layers.Dense(
              units=self._fc_dims,
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='fc{}'.format(i)))
      if self._use_batch_norm:
        self._fc_bn_ops.append(self._norm_activation(fused=False))

    self._class_predict = tf.keras.layers.Dense(
        self._num_classes,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='class-predict')
    self._box_predict = tf.keras.layers.Dense(
        self._num_classes * 4,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
        bias_initializer=tf.zeros_initializer(),
        name='box-predict')
    self._score_predict = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='score-predict')

  def __call__(self, roi_features, is_training=None):
    """Box, class, and localization-score branches for the OLN model.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      is_training: `boolean`, True if the model is in training mode.

    Returns:
      class_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes], representing the class
        predictions.
      box_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes * 4], representing the box
        predictions.
      score_outputs: a tensor with a shape of [batch_size, num_rois, 1],
        representing the localization score predictions.
    """
    with tf.name_scope('fast_rcnn_head'):
      # Reshape inputs before the FC layers.
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()

      net = tf.reshape(roi_features, [-1, height, width, filters])
      for i in range(self._num_convs):
        net = self._conv_ops[i](net)
        if self._use_batch_norm:
          net = self._conv_bn_ops[i](net, is_training=is_training)

      filters = self._num_filters if self._num_convs > 0 else filters
      net = tf.reshape(net, [-1, num_rois, height * width * filters])

      for i in range(self._num_fcs):
        net = self._fc_ops[i](net)
        if self._use_batch_norm:
          net = self._fc_bn_ops[i](net, is_training=is_training)

      class_outputs = self._class_predict(net)
      box_outputs = self._box_predict(net)
      score_outputs = self._score_predict(net)
      return class_outputs, box_outputs, score_outputs
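
# Illustrative usage (a sketch, not in the original file): compared to the
# Fast R-CNN head, this head adds a third output, e.g.
#   head = OlnBoxScoreHead(num_classes=2)
#   class_out, box_out, score_out = head(
#       tf.random.normal([2, 100, 7, 7, 256]), is_training=False)
#   # score_out: [2, 100, 1], one class-agnostic localization score per ROI.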

class MaskrcnnHead(tf.keras.layers.Layer):
  """Mask R-CNN head."""

  def __init__(self,
               num_classes,
               mask_target_size,
               num_convs=4,
               num_filters=256,
               use_separable_conv=False,
               activation='relu',
               use_batch_norm=True,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """Initialize params to build the Mask R-CNN head.

    Args:
      num_classes: an integer for the number of classes.
      mask_target_size: an integer that is the resolution of masks.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the prediction.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether separable conv layers
        are used.
      activation: activation function. Support 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    super(MaskrcnnHead, self).__init__(autocast=False)
    self._num_classes = num_classes
    self._mask_target_size = mask_target_size

    self._num_convs = num_convs
    self._num_filters = num_filters
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation
    self._conv2d_ops = []
    for i in range(self._num_convs):
      self._conv2d_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='mask-conv-l%d' % i))
    self._mask_conv_transpose = tf.keras.layers.Conv2DTranspose(
        self._num_filters,
        kernel_size=(2, 2),
        strides=(2, 2),
        padding='valid',
        activation=(None if self._use_batch_norm else self._activation_op),
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2, mode='fan_out', distribution='untruncated_normal'),
        bias_initializer=tf.zeros_initializer(),
        name='conv5-mask')

    with tf.name_scope('mask_head'):
      self._mask_conv2d_op = self._conv2d_op(
          self._num_classes,
          kernel_size=(1, 1),
          strides=(1, 1),
          padding='valid',
          name='mask_fcn_logits')

  def call(self, roi_features, class_indices, is_training=None):
    """Mask branch for the Mask-RCNN model.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      class_indices: a Tensor of shape [batch_size, num_rois], indicating
        which class the ROI is.
      is_training: `boolean`, True if the model is in training mode.

    Returns:
      mask_outputs: a tensor with a shape of
        [batch_size, num_masks, mask_height, mask_width, num_classes],
        representing the mask predictions.
      fg_gather_indices: a tensor with a shape of [batch_size, num_masks, 2],
        representing the fg mask targets.

    Raises:
      ValueError: If boxes is not a rank-3 tensor or the last dimension of
        boxes is not 4.
    """
    with tf.name_scope('mask_head'):
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()
      net = tf.reshape(roi_features, [-1, height, width, filters])

      for i in range(self._num_convs):
        net = self._conv2d_ops[i](net)
        if self._use_batch_norm:
          net = self._norm_activation()(net, is_training=is_training)

      net = self._mask_conv_transpose(net)
      if self._use_batch_norm:
        net = self._norm_activation()(net, is_training=is_training)

      mask_outputs = self._mask_conv2d_op(net)
      mask_outputs = tf.reshape(mask_outputs, [
          -1, num_rois, self._mask_target_size, self._mask_target_size,
          self._num_classes
      ])

      with tf.name_scope('masks_post_processing'):
        # TODO(pengchong): Figure out the way not to use the static inferred
        # batch size.
        batch_size, num_masks = class_indices.get_shape().as_list()
        mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])
        # Constructs indices for gather.
        batch_indices = tf.tile(
            tf.expand_dims(tf.range(batch_size), axis=1), [1, num_masks])
        mask_indices = tf.tile(
            tf.expand_dims(tf.range(num_masks), axis=0), [batch_size, 1])
        gather_indices = tf.stack(
            [batch_indices, mask_indices, class_indices], axis=2)
        mask_outputs = tf.gather_nd(mask_outputs, gather_indices)

    return mask_outputs
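
# Illustrative usage (a sketch, not in the original file): the transposed
# convolution above upsamples the ROI crop 2x, so 14x14 features produce
# masks at mask_target_size=28, and the gather in `call` keeps only the mask
# for each ROI's class. All concrete numbers below are assumptions.
def _example_maskrcnn_head():
  head = MaskrcnnHead(num_classes=91, mask_target_size=28)
  roi_features = tf.random.normal([2, 8, 14, 14, 256])
  class_indices = tf.random.uniform([2, 8], maxval=91, dtype=tf.int32)
  mask_outputs = head(roi_features, class_indices, is_training=False)
  # mask_outputs: [2, 8, 28, 28], one mask per ROI for its class.
  return mask_outputs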

class RetinanetHead(object):
  """RetinaNet head."""

  def __init__(self,
               min_level,
               max_level,
               num_classes,
               anchors_per_location,
               num_convs=4,
               num_filters=256,
               use_separable_conv=False,
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu')):
    """Initialize params to build RetinaNet head.

    Args:
      min_level: `int` number of minimum feature level.
      max_level: `int` number of maximum feature level.
      num_classes: `int` number of classification categories.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number of stacked convolutions before the last
        prediction layer.
      num_filters: `int` number of filters used in the head architecture.
      use_separable_conv: `bool` to indicate whether to use separable
        convolution.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._min_level = min_level
    self._max_level = max_level

    self._num_classes = num_classes
    self._anchors_per_location = anchors_per_location

    self._num_convs = num_convs
    self._num_filters = num_filters
    self._use_separable_conv = use_separable_conv
    with tf.name_scope('class_net') as scope_name:
      self._class_name_scope = tf.name_scope(scope_name)
    with tf.name_scope('box_net') as scope_name:
      self._box_name_scope = tf.name_scope(scope_name)
    self._build_class_net_layers(norm_activation)
    self._build_box_net_layers(norm_activation)

  def _class_net_batch_norm_name(self, i, level):
    return 'class-%d-%d' % (i, level)

  def _box_net_batch_norm_name(self, i, level):
    return 'box-%d-%d' % (i, level)

  def _build_class_net_layers(self, norm_activation):
    """Build re-usable layers for the class prediction network."""
    if self._use_separable_conv:
      self._class_predict = tf.keras.layers.SeparableConv2D(
          self._num_classes * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.constant_initializer(
              -np.log((1 - 0.01) / 0.01)),
          padding='same',
          name='class-predict')
    else:
      self._class_predict = tf.keras.layers.Conv2D(
          self._num_classes * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.constant_initializer(
              -np.log((1 - 0.01) / 0.01)),
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-5),
          padding='same',
          name='class-predict')
    self._class_conv = []
    self._class_norm_activation = {}
    for i in range(self._num_convs):
      if self._use_separable_conv:
        self._class_conv.append(
            tf.keras.layers.SeparableConv2D(
                self._num_filters,
                kernel_size=(3, 3),
                bias_initializer=tf.zeros_initializer(),
                activation=None,
                padding='same',
                name='class-' + str(i)))
      else:
        self._class_conv.append(
            tf.keras.layers.Conv2D(
                self._num_filters,
                kernel_size=(3, 3),
                bias_initializer=tf.zeros_initializer(),
                kernel_initializer=tf.keras.initializers.RandomNormal(
                    stddev=0.01),
                activation=None,
                padding='same',
                name='class-' + str(i)))
      for level in range(self._min_level, self._max_level + 1):
        name = self._class_net_batch_norm_name(i, level)
        self._class_norm_activation[name] = norm_activation(name=name)

  def _build_box_net_layers(self, norm_activation):
    """Build re-usable layers for the box prediction network."""
    if self._use_separable_conv:
      self._box_predict = tf.keras.layers.SeparableConv2D(
          4 * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.zeros_initializer(),
          padding='same',
          name='box-predict')
    else:
      self._box_predict = tf.keras.layers.Conv2D(
          4 * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.zeros_initializer(),
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-5),
          padding='same',
          name='box-predict')
    self._box_conv = []
    self._box_norm_activation = {}
    for i in range(self._num_convs):
      if self._use_separable_conv:
        self._box_conv.append(
            tf.keras.layers.SeparableConv2D(
                self._num_filters,
                kernel_size=(3, 3),
                activation=None,
                bias_initializer=tf.zeros_initializer(),
                padding='same',
                name='box-' + str(i)))
      else:
        self._box_conv.append(
            tf.keras.layers.Conv2D(
                self._num_filters,
                kernel_size=(3, 3),
                activation=None,
                bias_initializer=tf.zeros_initializer(),
                kernel_initializer=tf.keras.initializers.RandomNormal(
                    stddev=0.01),
                padding='same',
                name='box-' + str(i)))
      for level in range(self._min_level, self._max_level + 1):
        name = self._box_net_batch_norm_name(i, level)
        self._box_norm_activation[name] = norm_activation(name=name)

  def __call__(self, fpn_features, is_training=None):
    """Returns outputs of RetinaNet head."""
    class_outputs = {}
    box_outputs = {}
    with tf.name_scope('retinanet_head'):
      for level in range(self._min_level, self._max_level + 1):
        features = fpn_features[level]
        class_outputs[level] = self.class_net(
            features, level, is_training=is_training)
        box_outputs[level] = self.box_net(
            features, level, is_training=is_training)
    return class_outputs, box_outputs

  def class_net(self, features, level, is_training):
    """Class prediction network for RetinaNet."""
    with self._class_name_scope:
      for i in range(self._num_convs):
        features = self._class_conv[i](features)
        # The convolution layers in the class net are shared among all levels,
        # but each level has its own batch normalization to capture the
        # statistical difference among different levels.
        name = self._class_net_batch_norm_name(i, level)
        features = self._class_norm_activation[name](
            features, is_training=is_training)

      classes = self._class_predict(features)
    return classes

  def box_net(self, features, level, is_training=None):
    """Box regression network for RetinaNet."""
    with self._box_name_scope:
      for i in range(self._num_convs):
        features = self._box_conv[i](features)
        # The convolution layers in the box net are shared among all levels,
        # but each level has its own batch normalization to capture the
        # statistical difference among different levels.
        name = self._box_net_batch_norm_name(i, level)
        features = self._box_norm_activation[name](
            features, is_training=is_training)

      boxes = self._box_predict(features)
    return boxes
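
# Illustrative usage (a sketch, not in the original file): the head consumes
# a dict of FPN features keyed by level and emits per-level class and box
# maps. The feature shapes below assume a 512x512 input image.
def _example_retinanet_head():
  head = RetinanetHead(
      min_level=3, max_level=7, num_classes=91, anchors_per_location=9)
  fpn_features = {
      level: tf.random.normal([1, 512 // 2**level, 512 // 2**level, 256])
      for level in range(3, 8)
  }
  class_outputs, box_outputs = head(fpn_features, is_training=False)
  # class_outputs[3]: [1, 64, 64, 91 * 9]; box_outputs[3]: [1, 64, 64, 4 * 9].
  return class_outputs, box_outputs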
# TODO(yeqing): Refactor this class when it is ready for var_scope reuse.
class ShapemaskPriorHead(object):
  """ShapeMask Prior head."""

  def __init__(self, num_classes, num_downsample_channels, mask_crop_size,
               use_category_for_mask, shape_prior_path):
    """Initialize params to build the ShapeMask prior head.

    Args:
      num_classes: Number of output classes.
      num_downsample_channels: number of channels in the mask branch.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in the mask branch.
      shape_prior_path: the path to load shape priors.
    """
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._shape_prior_path = shape_prior_path
    self._use_category_for_mask = use_category_for_mask

    self._shape_prior_fc = tf.keras.layers.Dense(
        self._num_downsample_channels, name='shape-prior-fc')

  def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training):
    """Generate the detection priors from the box detections and FPN features.

    This corresponds to the Fig. 4 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      fpn_features: a dictionary of FPN features.
      boxes: a float tensor of shape [batch_size, num_instances, 4]
        representing the tight gt boxes from dataloader/detection.
      outer_boxes: a float tensor of shape [batch_size, num_instances, 4]
        representing the loose gt boxes from dataloader/detection.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: training mode or not.

    Returns:
      instance_features: a float Tensor of shape [batch_size * num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      detection_priors: A float Tensor of shape [batch_size * num_instances,
        mask_size, mask_size, 1].
    """
    with tf.name_scope('prior_mask'):
      batch_size, num_instances, _ = boxes.get_shape().as_list()
      outer_boxes = tf.cast(outer_boxes, tf.float32)
      boxes = tf.cast(boxes, tf.float32)
      instance_features = spatial_transform_ops.multilevel_crop_and_resize(
          fpn_features, outer_boxes, output_size=self._mask_crop_size)
      instance_features = self._shape_prior_fc(instance_features)

      shape_priors = self._get_priors()

      # Get uniform priors for each outer box.
      uniform_priors = tf.ones([
          batch_size, num_instances, self._mask_crop_size,
          self._mask_crop_size
      ])
      uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
          uniform_priors, boxes, outer_boxes, self._mask_crop_size)

      # Classify shape priors using uniform priors + instance features.
      prior_distribution = self._classify_shape_priors(
          tf.cast(instance_features, tf.float32), uniform_priors, classes)

      instance_priors = tf.gather(shape_priors, classes)
      instance_priors *= tf.expand_dims(
          tf.expand_dims(tf.cast(prior_distribution, tf.float32), axis=-1),
          axis=-1)
      instance_priors = tf.reduce_sum(instance_priors, axis=2)
      detection_priors = spatial_transform_ops.crop_mask_in_target_box(
          instance_priors, boxes, outer_boxes, self._mask_crop_size)

      return instance_features, detection_priors

  def _get_priors(self):
    """Load shape priors from file."""
    # Loads class-specific or class-agnostic shape priors.
    if self._shape_prior_path:
      # Priors are loaded into shape [mask_num_classes, num_clusters, 32, 32].
      priors = np.load(tf.io.gfile.GFile(self._shape_prior_path, 'rb'))
      priors = tf.convert_to_tensor(priors, dtype=tf.float32)
      self._num_clusters = priors.get_shape().as_list()[1]
    else:
      # If the prior path does not exist, do not use priors, i.e., priors
      # equal a uniform empty 32x32 patch.
      self._num_clusters = 1
      priors = tf.zeros([
          self._mask_num_classes, self._num_clusters, self._mask_crop_size,
          self._mask_crop_size
      ])
    return priors

  def _classify_shape_priors(self, features, uniform_priors, classes):
    """Classify the uniform prior by predicting the shape modes.

    Classify the object crop features into K modes of the clusters for each
    category.

    Args:
      features: A float Tensor of shape [batch_size, num_instances, mask_size,
        mask_size, num_channels].
      uniform_priors: A float Tensor of shape [batch_size, num_instances,
        mask_size, mask_size] representing the uniform detection priors.
      classes: An int Tensor of shape [batch_size, num_instances] of detection
        class ids.

    Returns:
      prior_distribution: A float Tensor of shape
        [batch_size, num_instances, num_clusters] representing the classifier
        output probability over all possible shapes.
    """
    batch_size, num_instances, _, _, _ = features.get_shape().as_list()
    features *= tf.expand_dims(uniform_priors, axis=-1)
    # Reduce spatial dimension of features. The features have shape
    # [batch_size, num_instances, num_channels].
    features = tf.reduce_mean(features, axis=(2, 3))
    logits = tf.keras.layers.Dense(
        self._mask_num_classes * self._num_clusters,
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        name='classify-shape-prior-fc')(features)
    logits = tf.reshape(logits, [
        batch_size, num_instances, self._mask_num_classes, self._num_clusters
    ])
    if self._use_category_for_mask:
      logits = tf.gather(
          logits, tf.expand_dims(classes, axis=-1), batch_dims=2)
      logits = tf.squeeze(logits, axis=2)
    else:
      logits = logits[:, :, 0, :]

    distribution = tf.nn.softmax(logits, name='shape_prior_weights')
    return distribution
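
# Illustrative note (not in the original file): with shape_prior_path=None the
# head falls back to all-zero priors with a single cluster, so it can be
# constructed without a priors file, e.g.
#   prior_head = ShapemaskPriorHead(
#       num_classes=91, num_downsample_channels=128, mask_crop_size=32,
#       use_category_for_mask=True, shape_prior_path=None)
# Calling it still needs real FPN features and boxes, since it crops instance
# features through spatial_transform_ops.multilevel_crop_and_resize.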

class ShapemaskCoarsemaskHead(object):
  """ShapemaskCoarsemaskHead head."""

  def __init__(self,
               num_classes,
               num_downsample_channels,
               mask_crop_size,
               use_category_for_mask,
               num_convs,
               norm_activation=nn_ops.norm_activation_builder()):
    """Initialize params to build the ShapeMask coarse mask prediction head.

    Args:
      num_classes: `int` number of mask classification categories.
      num_downsample_channels: `int` number of filters at the mask head.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in the mask branch.
      num_convs: `int` number of stacked convolutions before the last
        prediction layer.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._use_category_for_mask = use_category_for_mask
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._num_convs = num_convs
    self._norm_activation = norm_activation

    self._coarse_mask_fc = tf.keras.layers.Dense(
        self._num_downsample_channels, name='coarse-mask-fc')

    self._class_conv = []
    self._class_norm_activation = []

    for i in range(self._num_convs):
      self._class_conv.append(
          tf.keras.layers.Conv2D(
              self._num_downsample_channels,
              kernel_size=(3, 3),
              bias_initializer=tf.zeros_initializer(),
              kernel_initializer=tf.keras.initializers.RandomNormal(
                  stddev=0.01),
              padding='same',
              name='coarse-mask-class-%d' % i))

      self._class_norm_activation.append(
          norm_activation(name='coarse-mask-class-%d-bn' % i))

    self._class_predict = tf.keras.layers.Conv2D(
        self._mask_num_classes,
        kernel_size=(1, 1),
        # Focal loss bias initialization to have foreground 0.01 probability.
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        padding='same',
        name='coarse-mask-class-predict')

  def __call__(self, features, detection_priors, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      detection_priors: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, 1]. This is the detection prior for
        the instance.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_size, mask_size].
    """
    with tf.name_scope('coarse_mask'):
      # Transform detection priors to have the same dimension as features.
      detection_priors = tf.expand_dims(detection_priors, axis=-1)
      detection_priors = self._coarse_mask_fc(detection_priors)

      features += detection_priors
      mask_logits = self.decoder_net(features, is_training)
      # Gather the logits with the right input class.
      if self._use_category_for_mask:
        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        mask_logits = tf.gather(
            mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
        mask_logits = tf.squeeze(mask_logits, axis=2)
      else:
        mask_logits = mask_logits[..., 0]

      return mask_logits

  def decoder_net(self, features, is_training=False):
    """Coarse mask decoder network architecture.

    Args:
      features: A tensor of size [batch, height_in, width_in, channels_in].
      is_training: Whether batch_norm layers are in training mode.

    Returns:
      images: A feature tensor of size [batch, output_size, output_size,
        num_channels].
    """
    (batch_size, num_instances, height, width,
     num_channels) = features.get_shape().as_list()
    features = tf.reshape(
        features, [batch_size * num_instances, height, width, num_channels])
    for i in range(self._num_convs):
      features = self._class_conv[i](features)
      features = self._class_norm_activation[i](
          features, is_training=is_training)

    mask_logits = self._class_predict(features)
    mask_logits = tf.reshape(
        mask_logits,
        [batch_size, num_instances, height, width, self._mask_num_classes])
    return mask_logits
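
# Illustrative usage (a sketch, not in the original file): the coarse head's
# decoder keeps the crop resolution, so mask logits come back at
# mask_crop_size. The inputs below are random stand-ins with assumed shapes.
def _example_coarse_mask_head():
  head = ShapemaskCoarsemaskHead(
      num_classes=91, num_downsample_channels=128, mask_crop_size=32,
      use_category_for_mask=True, num_convs=2)
  features = tf.random.normal([2, 4, 32, 32, 128])
  detection_priors = tf.random.uniform([2, 4, 32, 32])
  classes = tf.random.uniform([2, 4], maxval=91, dtype=tf.int32)
  mask_logits = head(features, detection_priors, classes, is_training=False)
  # mask_logits: [2, 4, 32, 32].
  return mask_logits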

class ShapemaskFinemaskHead(object):
  """ShapemaskFinemaskHead head."""

  def __init__(self,
               num_classes,
               num_downsample_channels,
               mask_crop_size,
               use_category_for_mask,
               num_convs,
               upsample_factor,
               norm_activation=nn_ops.norm_activation_builder()):
    """Initialize params to build the ShapeMask fine mask prediction head.

    Args:
      num_classes: `int` number of mask classification categories.
      num_downsample_channels: `int` number of filters at the mask head.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in the mask branch.
      num_convs: `int` number of stacked convolutions before the last
        prediction layer.
      upsample_factor: `int` number of fine mask upsampling factor.
      norm_activation: an operation that includes a batch normalization layer
        followed by an optional relu layer.
    """
    self._use_category_for_mask = use_category_for_mask
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._num_convs = num_convs
    self.up_sample_factor = upsample_factor

    self._fine_mask_fc = tf.keras.layers.Dense(
        self._num_downsample_channels, name='fine-mask-fc')

    self._upsample_conv = tf.keras.layers.Conv2DTranspose(
        self._num_downsample_channels,
        (self.up_sample_factor, self.up_sample_factor),
        (self.up_sample_factor, self.up_sample_factor),
        name='fine-mask-conv2d-tran')

    self._fine_class_conv = []
    self._fine_class_bn = []
    for i in range(self._num_convs):
      self._fine_class_conv.append(
          tf.keras.layers.Conv2D(
              self._num_downsample_channels,
              kernel_size=(3, 3),
              bias_initializer=tf.zeros_initializer(),
              kernel_initializer=tf.keras.initializers.RandomNormal(
                  stddev=0.01),
              activation=None,
              padding='same',
              name='fine-mask-class-%d' % i))
      self._fine_class_bn.append(
          norm_activation(name='fine-mask-class-%d-bn' % i))

    self._class_predict_conv = tf.keras.layers.Conv2D(
        self._mask_num_classes,
        kernel_size=(1, 1),
        # Focal loss bias initialization to have foreground 0.01 probability.
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        padding='same',
        name='fine-mask-class-predict')

  def __call__(self, features, mask_logits, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      mask_logits: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size] indicating predicted mask logits.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_size, mask_size].
    """
    # Extract the foreground mean features.
    # with tf.variable_scope('fine_mask', reuse=tf.AUTO_REUSE):
    with tf.name_scope('fine_mask'):
      mask_probs = tf.nn.sigmoid(mask_logits)
      # Compute instance embedding for hard average.
      binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype)
      instance_embedding = tf.reduce_sum(
          features * tf.expand_dims(binary_mask, axis=-1), axis=(2, 3))
      instance_embedding /= tf.expand_dims(
          tf.reduce_sum(binary_mask, axis=(2, 3)) + 1e-20, axis=-1)
      # Take the difference between crop features and mean instance features.
      features -= tf.expand_dims(
          tf.expand_dims(instance_embedding, axis=2), axis=2)

      features += self._fine_mask_fc(tf.expand_dims(mask_probs, axis=-1))

      # Decoder to generate upsampled segmentation mask.
      mask_logits = self.decoder_net(features, is_training)
      if self._use_category_for_mask:
        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        mask_logits = tf.gather(
            mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
        mask_logits = tf.squeeze(mask_logits, axis=2)
      else:
        mask_logits = mask_logits[..., 0]

    return mask_logits

  def decoder_net(self, features, is_training=False):
    """Fine mask decoder network architecture.

    Args:
      features: A tensor of size [batch, height_in, width_in, channels_in].
      is_training: Whether batch_norm layers are in training mode.

    Returns:
      images: A feature tensor of size [batch, output_size, output_size,
        num_channels], where output size is self._gt_upsample_scale times
        that of input.
    """
    (batch_size, num_instances, height, width,
     num_channels) = features.get_shape().as_list()
    features = tf.reshape(
        features, [batch_size * num_instances, height, width, num_channels])
    for i in range(self._num_convs):
      features = self._fine_class_conv[i](features)
      features = self._fine_class_bn[i](features, is_training=is_training)

    if self.up_sample_factor > 1:
      features = self._upsample_conv(features)

    # Predict per-class instance masks.
    mask_logits = self._class_predict_conv(features)

    mask_logits = tf.reshape(mask_logits, [
        batch_size, num_instances, height * self.up_sample_factor,
        width * self.up_sample_factor, self._mask_num_classes
    ])
    return mask_logits
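
# Illustrative usage (a sketch, not in the original file): the fine head
# refines coarse mask logits and upsamples by `upsample_factor`, so 32x32
# crops with upsample_factor=2 produce 64x64 logits. Shapes are assumptions.
def _example_fine_mask_head():
  head = ShapemaskFinemaskHead(
      num_classes=91, num_downsample_channels=128, mask_crop_size=32,
      use_category_for_mask=True, num_convs=2, upsample_factor=2)
  features = tf.random.normal([2, 4, 32, 32, 128])
  coarse_logits = tf.random.normal([2, 4, 32, 32])
  classes = tf.random.uniform([2, 4], maxval=91, dtype=tf.int32)
  mask_logits = head(features, coarse_logits, classes, is_training=False)
  # mask_logits: [2, 4, 64, 64].
  return mask_logits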
official/legacy/detection/modeling/architecture/identity.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Identity Fn that forwards the input features."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


class Identity(object):
  """Identity function that forwards the input features."""

  def __call__(self, features, is_training=False):
    """Only forwards the input features."""
    return features
official/legacy/detection/modeling/architecture/nn_blocks.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from official.modeling import tf_utils


class ResidualBlock(tf.keras.layers.Layer):
  """A residual block."""

  def __init__(self,
               filters,
               strides,
               use_projection=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
    """A residual block with BN after convolutions.

    Args:
      filters: `int` number of filters for the two convolutions in this
        block.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
        Default to None.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
        Default to None.
      activation: `str` name of the activation function.
      use_sync_bn: if True, use synchronized batch normalization.
      norm_momentum: `float` normalization momentum for the moving average.
      norm_epsilon: `float` small float added to variance to avoid dividing by
        zero.
      **kwargs: keyword arguments to be passed.
    """
    super(ResidualBlock, self).__init__(**kwargs)

    self._filters = filters
    self._strides = strides
    self._use_projection = use_projection
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization

    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape):
    if self._use_projection:
      self._shortcut = tf.keras.layers.Conv2D(
          filters=self._filters,
          kernel_size=1,
          strides=self._strides,
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm0 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)

    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=self._strides,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=1,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    super(ResidualBlock, self).build(input_shape)

  def get_config(self):
    config = {
        'filters': self._filters,
        'strides': self._strides,
        'use_projection': self._use_projection,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    base_config = super(ResidualBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    shortcut = inputs
    if self._use_projection:
      shortcut = self._shortcut(shortcut)
      shortcut = self._norm0(shortcut)

    x = self._conv1(inputs)
    x = self._norm1(x)
    x = self._activation_fn(x)

    x = self._conv2(x)
    x = self._norm2(x)

    return self._activation_fn(x + shortcut)
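
# Illustrative usage (a sketch, not in the original file): a projection block
# that halves the spatial resolution. The input shape is an assumption.
def _example_residual_block():
  block = ResidualBlock(filters=64, strides=2, use_projection=True)
  x = tf.random.normal([1, 32, 32, 32])
  y = block(x)
  # y: [1, 16, 16, 64]; the 1x1 projection matches channels and stride.
  return y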

class BottleneckBlock(tf.keras.layers.Layer):
  """A standard bottleneck block."""

  def __init__(self,
               filters,
               strides,
               use_projection=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
    """A standard bottleneck block with BN after convolutions.

    Args:
      filters: `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
        Default to None.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
        Default to None.
      activation: `str` name of the activation function.
      use_sync_bn: if True, use synchronized batch normalization.
      norm_momentum: `float` normalization momentum for the moving average.
      norm_epsilon: `float` small float added to variance to avoid dividing by
        zero.
      **kwargs: keyword arguments to be passed.
    """
    super(BottleneckBlock, self).__init__(**kwargs)

    self._filters = filters
    self._strides = strides
    self._use_projection = use_projection
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization

    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape):
    if self._use_projection:
      self._shortcut = tf.keras.layers.Conv2D(
          filters=self._filters * 4,
          kernel_size=1,
          strides=self._strides,
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm0 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)

    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=self._strides,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    self._conv3 = tf.keras.layers.Conv2D(
        filters=self._filters * 4,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm3 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    super(BottleneckBlock, self).build(input_shape)

  def get_config(self):
    config = {
        'filters': self._filters,
        'strides': self._strides,
        'use_projection': self._use_projection,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    base_config = super(BottleneckBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    shortcut = inputs
    if self._use_projection:
      shortcut = self._shortcut(shortcut)
      shortcut = self._norm0(shortcut)

    x = self._conv1(inputs)
    x = self._norm1(x)
    x = self._activation_fn(x)

    x = self._conv2(x)
    x = self._norm2(x)
    x = self._activation_fn(x)

    x = self._conv3(x)
    x = self._norm3(x)

    return self._activation_fn(x + shortcut)
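
# Illustrative usage (a sketch, not in the original file): unlike
# ResidualBlock, the bottleneck expands to 4x `filters` on output, and the
# block round-trips through `get_config` like any Keras layer.
def _example_bottleneck_block():
  block = BottleneckBlock(filters=64, strides=1, use_projection=True)
  y = block(tf.random.normal([1, 16, 16, 128]))
  # y: [1, 16, 16, 256] (= 4 * filters).
  rebuilt = BottleneckBlock.from_config(block.get_config())
  return y, rebuilt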
official/legacy/detection/modeling/architecture/nn_ops.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Neural network operations commonly shared by the architectures."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import tensorflow as tf


class NormActivation(tf.keras.layers.Layer):
  """Combined Normalization and Activation layers."""

  def __init__(self,
               momentum=0.997,
               epsilon=1e-4,
               trainable=True,
               init_zero=False,
               use_activation=True,
               activation='relu',
               fused=True,
               name=None):
    """A class to construct layers for a batch normalization followed by a ReLU.

    Args:
      momentum: momentum for the moving average.
      epsilon: small float added to variance to avoid dividing by zero.
      trainable: `bool`, if True also add variables to the graph collection
        GraphKeys.TRAINABLE_VARIABLES. If False, freeze the batch
        normalization layer.
      init_zero: `bool` if True, initializes the scale parameter of batch
        normalization with 0. If False, initializes it with 1.
      use_activation: `bool`, whether to add the optional activation layer
        after the batch normalization layer.
      activation: `string`, the type of the activation layer. Currently
        support `relu` and `swish`.
      fused: `bool` fused option in batch normalization.
      name: `str` name for the operation.
    """
    super(NormActivation, self).__init__(trainable=trainable)
    if init_zero:
      gamma_initializer = tf.keras.initializers.Zeros()
    else:
      gamma_initializer = tf.keras.initializers.Ones()
    self._normalization_op = tf.keras.layers.BatchNormalization(
        momentum=momentum,
        epsilon=epsilon,
        center=True,
        scale=True,
        trainable=trainable,
        fused=fused,
        gamma_initializer=gamma_initializer,
        name=name)
    self._use_activation = use_activation
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))

  def __call__(self, inputs, is_training=None):
    """Builds the normalization layer followed by an optional activation layer.

    Args:
      inputs: `Tensor` of shape `[batch, channels, ...]`.
      is_training: `boolean`, True if the model is in training mode.

    Returns:
      A normalized `Tensor` with the same `data_format`.
    """
    # We need to keep training=None by default, so that it can be inherited
    # from keras.Model.training.
    if is_training and self.trainable:
      is_training = True
    inputs = self._normalization_op(inputs, training=is_training)

    if self._use_activation:
      inputs = self._activation_op(inputs)
    return inputs

def norm_activation_builder(momentum=0.997,
                            epsilon=1e-4,
                            trainable=True,
                            activation='relu',
                            **kwargs):
  return functools.partial(
      NormActivation,
      momentum=momentum,
      epsilon=epsilon,
      trainable=trainable,
      activation=activation,
      **kwargs)
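
# Illustrative usage (a sketch, not in the original file): the builder bakes
# in BN hyperparameters once, then each call of the returned partial creates
# a fresh NormActivation layer, which is how callers instantiate one BN per
# conv or per level.
def _example_norm_activation_builder():
  builder = norm_activation_builder(momentum=0.9, activation='relu')
  bn_relu = builder(name='example-bn')  # a new NormActivation instance
  y = bn_relu(tf.random.normal([1, 8, 8, 16]), is_training=False)
  return y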
official/legacy/detection/modeling/architecture/resnet.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions for the post-activation form of Residual Networks.
Residual networks (ResNets) were proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from official.legacy.detection.modeling.architecture import nn_ops

# TODO(b/140112644): Refactor the code with Keras style, i.e. build and call.
class
Resnet
(
object
):
"""Class to build ResNet family model."""
def
__init__
(
self
,
resnet_depth
,
activation
=
'relu'
,
norm_activation
=
nn_ops
.
norm_activation_builder
(
activation
=
'relu'
),
data_format
=
'channels_last'
):
"""ResNet initialization function.
Args:
resnet_depth: `int` depth of ResNet backbone model.
activation: the activation function.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
data_format: `str` either "channels_first" for `[batch, channels, height,
width]` or "channels_last for `[batch, height, width, channels]`.
"""
self
.
_resnet_depth
=
resnet_depth
if
activation
==
'relu'
:
self
.
_activation_op
=
tf
.
nn
.
relu
elif
activation
==
'swish'
:
self
.
_activation_op
=
tf
.
nn
.
swish
else
:
raise
ValueError
(
'Unsupported activation `{}`.'
.
format
(
activation
))
self
.
_norm_activation
=
norm_activation
self
.
_data_format
=
data_format
model_params
=
{
10
:
{
'block'
:
self
.
residual_block
,
'layers'
:
[
1
,
1
,
1
,
1
]
},
18
:
{
'block'
:
self
.
residual_block
,
'layers'
:
[
2
,
2
,
2
,
2
]
},
34
:
{
'block'
:
self
.
residual_block
,
'layers'
:
[
3
,
4
,
6
,
3
]
},
50
:
{
'block'
:
self
.
bottleneck_block
,
'layers'
:
[
3
,
4
,
6
,
3
]
},
101
:
{
'block'
:
self
.
bottleneck_block
,
'layers'
:
[
3
,
4
,
23
,
3
]
},
152
:
{
'block'
:
self
.
bottleneck_block
,
'layers'
:
[
3
,
8
,
36
,
3
]
},
200
:
{
'block'
:
self
.
bottleneck_block
,
'layers'
:
[
3
,
24
,
36
,
3
]
}
}
if
resnet_depth
not
in
model_params
:
valid_resnet_depths
=
', '
.
join
(
[
str
(
depth
)
for
depth
in
sorted
(
model_params
.
keys
())])
raise
ValueError
(
'The resnet_depth should be in [%s]. Not a valid resnet_depth:'
%
(
valid_resnet_depths
),
self
.
_resnet_depth
)
params
=
model_params
[
resnet_depth
]
self
.
_resnet_fn
=
self
.
resnet_v1_generator
(
params
[
'block'
],
params
[
'layers'
])
def
__call__
(
self
,
inputs
,
is_training
=
None
):
"""Returns the ResNet model for a given size and number of output classes.
Args:
inputs: a `Tesnor` with shape [batch_size, height, width, 3] representing
a batch of images.
is_training: `bool` if True, the model is in training mode.
Returns:
a `dict` containing `int` keys for continuous feature levels [2, 3, 4, 5].
The values are corresponding feature hierarchy in ResNet with shape
[batch_size, height_l, width_l, num_filters].
"""
with
tf
.
name_scope
(
'resnet%s'
%
self
.
_resnet_depth
):
return
self
.
_resnet_fn
(
inputs
,
is_training
)
def
fixed_padding
(
self
,
inputs
,
kernel_size
):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]` or `[batch,
height, width, channels]` depending on `data_format`.
kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
operations. Should be a positive integer.
Returns:
A padded `Tensor` of the same `data_format` with size either intact
(if `kernel_size == 1`) or padded (if `kernel_size > 1`).
"""
pad_total
=
kernel_size
-
1
pad_beg
=
pad_total
//
2
pad_end
=
pad_total
-
pad_beg
if
self
.
_data_format
==
'channels_first'
:
padded_inputs
=
tf
.
pad
(
tensor
=
inputs
,
paddings
=
[[
0
,
0
],
[
0
,
0
],
[
pad_beg
,
pad_end
],
[
pad_beg
,
pad_end
]])
else
:
padded_inputs
=
tf
.
pad
(
tensor
=
inputs
,
paddings
=
[[
0
,
0
],
[
pad_beg
,
pad_end
],
[
pad_beg
,
pad_end
],
[
0
,
0
]])
return
padded_inputs
def
conv2d_fixed_padding
(
self
,
inputs
,
filters
,
kernel_size
,
strides
):
"""Strided 2-D convolution with explicit padding.
The padding is consistent and is based only on `kernel_size`, not on the
dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
Args:
inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
filters: `int` number of filters in the convolution.
kernel_size: `int` size of the kernel to be used in the convolution.
strides: `int` strides of the convolution.
Returns:
A `Tensor` of shape `[batch, filters, height_out, width_out]`.
"""
if
strides
>
1
:
inputs
=
self
.
fixed_padding
(
inputs
,
kernel_size
)
return
tf
.
keras
.
layers
.
Conv2D
(
filters
=
filters
,
kernel_size
=
kernel_size
,
strides
=
strides
,
padding
=
(
'SAME'
if
strides
==
1
else
'VALID'
),
use_bias
=
False
,
kernel_initializer
=
tf
.
initializers
.
VarianceScaling
(),
data_format
=
self
.
_data_format
)(
inputs
=
inputs
)
def
residual_block
(
self
,
inputs
,
filters
,
strides
,
use_projection
=
False
,
is_training
=
None
):
"""Standard building block for residual networks with BN after convolutions.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
is_training: `bool` if True, the model is in training mode.
Returns:
The output `Tensor` of the block.
"""
shortcut
=
inputs
if
use_projection
:
# Projection shortcut in first layer to match filters and strides
shortcut
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
filters
,
kernel_size
=
1
,
strides
=
strides
)
shortcut
=
self
.
_norm_activation
(
use_activation
=
False
)(
shortcut
,
is_training
=
is_training
)
inputs
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
filters
,
kernel_size
=
3
,
strides
=
strides
)
inputs
=
self
.
_norm_activation
()(
inputs
,
is_training
=
is_training
)
inputs
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
filters
,
kernel_size
=
3
,
strides
=
1
)
inputs
=
self
.
_norm_activation
(
use_activation
=
False
,
init_zero
=
True
)(
inputs
,
is_training
=
is_training
)
return
self
.
_activation_op
(
inputs
+
shortcut
)
def
bottleneck_block
(
self
,
inputs
,
filters
,
strides
,
use_projection
=
False
,
is_training
=
None
):
"""Bottleneck block variant for residual networks with BN after convolutions.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
is_training: `bool` if True, the model is in training mode.
Returns:
The output `Tensor` of the block.
"""
shortcut
=
inputs
if
use_projection
:
# Projection shortcut only in first block within a group. Bottleneck
# blocks end with 4 times the number of filters.
filters_out
=
4
*
filters
shortcut
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
filters_out
,
kernel_size
=
1
,
strides
=
strides
)
shortcut
=
self
.
_norm_activation
(
use_activation
=
False
)(
shortcut
,
is_training
=
is_training
)
inputs
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
filters
,
kernel_size
=
1
,
strides
=
1
)
inputs
=
self
.
_norm_activation
()(
inputs
,
is_training
=
is_training
)
inputs
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
filters
,
kernel_size
=
3
,
strides
=
strides
)
inputs
=
self
.
_norm_activation
()(
inputs
,
is_training
=
is_training
)
inputs
=
self
.
conv2d_fixed_padding
(
inputs
=
inputs
,
filters
=
4
*
filters
,
kernel_size
=
1
,
strides
=
1
)
inputs
=
self
.
_norm_activation
(
use_activation
=
False
,
init_zero
=
True
)(
inputs
,
is_training
=
is_training
)
return
self
.
_activation_op
(
inputs
+
shortcut
)
def
block_group
(
self
,
inputs
,
filters
,
block_fn
,
blocks
,
strides
,
name
,
is_training
):
"""Creates one group of blocks for the ResNet model.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first convolution of the layer.
block_fn: `function` for the block to use within the model
blocks: `int` number of blocks contained in the layer.
strides: `int` stride to use for the first convolution of the layer. If
greater than 1, this layer will downsample the input.
name: `str`name for the Tensor output of the block layer.
is_training: `bool` if True, the model is in training mode.
Returns:
The output `Tensor` of the block layer.
"""
# Only the first block per block_group uses projection shortcut and strides.
inputs
=
block_fn
(
inputs
,
filters
,
strides
,
use_projection
=
True
,
is_training
=
is_training
)
for
_
in
range
(
1
,
blocks
):
inputs
=
block_fn
(
inputs
,
filters
,
1
,
is_training
=
is_training
)
return
tf
.
identity
(
inputs
,
name
)
  def resnet_v1_generator(self, block_fn, layers):
    """Generator for ResNet v1 models.

    Args:
      block_fn: `function` for the block to use within the model. Either
        `residual_block` or `bottleneck_block`.
      layers: list of 4 `int`s denoting the number of blocks to include in
        each of the 4 block groups. Each group consists of blocks that take
        inputs of the same resolution.

    Returns:
      Model `function` that takes in `inputs` and `is_training` and returns
      the output `Tensor` of the ResNet model.
    """

    def model(inputs, is_training=None):
      """Creation of the model graph."""
      inputs = self.conv2d_fixed_padding(
          inputs=inputs, filters=64, kernel_size=7, strides=2)
      inputs = tf.identity(inputs, 'initial_conv')
      inputs = self._norm_activation()(inputs, is_training=is_training)

      inputs = tf.keras.layers.MaxPool2D(
          pool_size=3, strides=2, padding='SAME',
          data_format=self._data_format)(
              inputs)
      inputs = tf.identity(inputs, 'initial_max_pool')

      c2 = self.block_group(
          inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
          strides=1, name='block_group1', is_training=is_training)
      c3 = self.block_group(
          inputs=c2, filters=128, block_fn=block_fn, blocks=layers[1],
          strides=2, name='block_group2', is_training=is_training)
      c4 = self.block_group(
          inputs=c3, filters=256, block_fn=block_fn, blocks=layers[2],
          strides=2, name='block_group3', is_training=is_training)
      c5 = self.block_group(
          inputs=c4, filters=512, block_fn=block_fn, blocks=layers[3],
          strides=2, name='block_group4', is_training=is_training)
      return {2: c2, 3: c3, 4: c4, 5: c5}

    return model
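# --- Illustrative usage sketch (not part of the original file) ---
# The standard depth-to-layers mappings used with this generator; ResNet-50,
# for example, stacks [3, 4, 6, 3] bottleneck blocks across the four groups
# (ResNet-101 uses [3, 4, 23, 3], ResNet-152 uses [3, 8, 36, 3]). The
# `Resnet` wrapper class hosting these methods is defined earlier in this
# file.
#
#   model_fn = self.resnet_v1_generator(self.bottleneck_block, [3, 4, 6, 3])
#   features = model_fn(images, is_training=True)
#   # features is {2: c2, 3: c3, 4: c4, 5: c5}; level l has stride 2**l.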
official/legacy/detection/modeling/architecture/spinenet.py
0 → 100644
View file @
e293e338
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
# ==============================================================================
"""Implementation of SpineNet model.
X. Du, T-Y. Lin, P. Jin, G. Ghiasi, M. Tan, Y. Cui, Q. V. Le, X. Song
SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization
https://arxiv.org/abs/1912.05027
"""
import math

from absl import logging
import tensorflow as tf

from official.legacy.detection.modeling.architecture import nn_blocks
from official.modeling import tf_utils

layers = tf.keras.layers

FILTER_SIZE_MAP = {
    1: 32,
    2: 64,
    3: 128,
    4: 256,
    5: 256,
    6: 256,
    7: 256,
}
# The fixed SpineNet architecture discovered by NAS.
# Each element represents a specification of a building block:
# (block_level, block_fn, (input_offset0, input_offset1), is_output).
SPINENET_BLOCK_SPECS = [
    (2, 'bottleneck', (0, 1), False),
    (4, 'residual', (0, 1), False),
    (3, 'bottleneck', (2, 3), False),
    (4, 'bottleneck', (2, 4), False),
    (6, 'residual', (3, 5), False),
    (4, 'bottleneck', (3, 5), False),
    (5, 'residual', (6, 7), False),
    (7, 'residual', (6, 8), False),
    (5, 'bottleneck', (8, 9), False),
    (5, 'bottleneck', (8, 10), False),
    (4, 'bottleneck', (5, 10), True),
    (3, 'bottleneck', (4, 10), True),
    (5, 'bottleneck', (7, 12), True),
    (7, 'bottleneck', (5, 14), True),
    (6, 'bottleneck', (12, 14), True),
]
SCALING_MAP = {
    '49S': {
        'endpoints_num_filters': 128,
        'filter_size_scale': 0.65,
        'resample_alpha': 0.5,
        'block_repeats': 1,
    },
    '49': {
        'endpoints_num_filters': 256,
        'filter_size_scale': 1.0,
        'resample_alpha': 0.5,
        'block_repeats': 1,
    },
    '96': {
        'endpoints_num_filters': 256,
        'filter_size_scale': 1.0,
        'resample_alpha': 0.5,
        'block_repeats': 2,
    },
    '143': {
        'endpoints_num_filters': 256,
        'filter_size_scale': 1.0,
        'resample_alpha': 1.0,
        'block_repeats': 3,
    },
    '190': {
        'endpoints_num_filters': 512,
        'filter_size_scale': 1.3,
        'resample_alpha': 1.0,
        'block_repeats': 4,
    },
}
class BlockSpec(object):
  """A container class that specifies the block configuration for SpineNet."""

  def __init__(self, level, block_fn, input_offsets, is_output):
    self.level = level
    self.block_fn = block_fn
    self.input_offsets = input_offsets
    self.is_output = is_output


def build_block_specs(block_specs=None):
  """Builds the list of BlockSpec objects for SpineNet."""
  if not block_specs:
    block_specs = SPINENET_BLOCK_SPECS
  logging.info('Building SpineNet block specs: %s', block_specs)
  return [BlockSpec(*b) for b in block_specs]
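# Example (illustrative, not part of the original file): the first entry of
# SPINENET_BLOCK_SPECS, (2, 'bottleneck', (0, 1), False), becomes a BlockSpec
# that builds a level-2 bottleneck block from the two stem blocks (offsets 0
# and 1 in the growing list of built blocks) and is not an output block.
#
#   spec = build_block_specs()[0]
#   assert spec.level == 2 and spec.input_offsets == (0, 1)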
class SpineNet(tf.keras.Model):
  """Class to build SpineNet models."""

  def __init__(self,
               input_specs=tf.keras.layers.InputSpec(
                   shape=[None, 640, 640, 3]),
               min_level=3,
               max_level=7,
               block_specs=build_block_specs(),
               endpoints_num_filters=256,
               resample_alpha=0.5,
               block_repeats=1,
               filter_size_scale=1.0,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
    """SpineNet model."""
    self._min_level = min_level
    self._max_level = max_level
    self._block_specs = block_specs
    self._endpoints_num_filters = endpoints_num_filters
    self._resample_alpha = resample_alpha
    self._block_repeats = block_repeats
    self._filter_size_scale = filter_size_scale
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    if activation == 'relu':
      self._activation = tf.nn.relu
    elif activation == 'swish':
      self._activation = tf.nn.swish
    else:
      raise ValueError('Activation {} not implemented.'.format(activation))
    self._init_block_fn = 'bottleneck'
    self._num_init_blocks = 2

    if use_sync_bn:
      self._norm = layers.experimental.SyncBatchNormalization
    else:
      self._norm = layers.BatchNormalization

    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1

    # Build SpineNet.
    inputs = tf.keras.Input(shape=input_specs.shape[1:])
    net = self._build_stem(inputs=inputs)
    net = self._build_scale_permuted_network(
        net=net, input_width=input_specs.shape[1])
    net = self._build_endpoints(net=net)

    super(SpineNet, self).__init__(inputs=inputs, outputs=net)
  def _block_group(self,
                   inputs,
                   filters,
                   strides,
                   block_fn_cand,
                   block_repeats=1,
                   name='block_group'):
    """Creates one group of blocks for the SpineNet model."""
    block_fn_candidates = {
        'bottleneck': nn_blocks.BottleneckBlock,
        'residual': nn_blocks.ResidualBlock,
    }
    block_fn = block_fn_candidates[block_fn_cand]
    _, _, _, num_filters = inputs.get_shape().as_list()

    if block_fn_cand == 'bottleneck':
      use_projection = not (num_filters == (filters * 4) and strides == 1)
    else:
      use_projection = not (num_filters == filters and strides == 1)

    x = block_fn(
        filters=filters,
        strides=strides,
        use_projection=use_projection,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon)(
            inputs)
    for _ in range(1, block_repeats):
      x = block_fn(
          filters=filters,
          strides=1,
          use_projection=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer,
          activation=self._activation,
          use_sync_bn=self._use_sync_bn,
          norm_momentum=self._norm_momentum,
          norm_epsilon=self._norm_epsilon)(
              x)
    return tf.identity(x, name=name)
  def _build_stem(self, inputs):
    """Build SpineNet stem."""
    x = layers.Conv2D(
        filters=64,
        kernel_size=7,
        strides=2,
        use_bias=False,
        padding='same',
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)(
            inputs)
    x = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)(
            x)
    x = tf_utils.get_activation(self._activation)(x)
    x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x)

    net = []
    # Build the initial level 2 blocks.
    for i in range(self._num_init_blocks):
      x = self._block_group(
          inputs=x,
          filters=int(FILTER_SIZE_MAP[2] * self._filter_size_scale),
          strides=1,
          block_fn_cand=self._init_block_fn,
          block_repeats=self._block_repeats,
          name='stem_block_{}'.format(i + 1))
      net.append(x)
    return net
  def _build_scale_permuted_network(self, net, input_width,
                                    weighted_fusion=False):
    """Build scale-permuted network."""
    net_sizes = [int(math.ceil(input_width / 2**2))] * len(net)
    net_block_fns = [self._init_block_fn] * len(net)
    num_outgoing_connections = [0] * len(net)

    endpoints = {}
    for i, block_spec in enumerate(self._block_specs):
      # Find out specs for the target block.
      target_width = int(math.ceil(input_width / 2**block_spec.level))
      target_num_filters = int(FILTER_SIZE_MAP[block_spec.level] *
                               self._filter_size_scale)
      target_block_fn = block_spec.block_fn

      # Resample then merge input0 and input1.
      parents = []
      input0 = block_spec.input_offsets[0]
      input1 = block_spec.input_offsets[1]

      x0 = self._resample_with_alpha(
          inputs=net[input0],
          input_width=net_sizes[input0],
          input_block_fn=net_block_fns[input0],
          target_width=target_width,
          target_num_filters=target_num_filters,
          target_block_fn=target_block_fn,
          alpha=self._resample_alpha)
      parents.append(x0)
      num_outgoing_connections[input0] += 1

      x1 = self._resample_with_alpha(
          inputs=net[input1],
          input_width=net_sizes[input1],
          input_block_fn=net_block_fns[input1],
          target_width=target_width,
          target_num_filters=target_num_filters,
          target_block_fn=target_block_fn,
          alpha=self._resample_alpha)
      parents.append(x1)
      num_outgoing_connections[input1] += 1

      # Merge 0 outdegree blocks to the output block.
      if block_spec.is_output:
        for j, (j_feat, j_connections) in enumerate(
            zip(net, num_outgoing_connections)):
          if j_connections == 0 and (j_feat.shape[2] == target_width and
                                     j_feat.shape[3] == x0.shape[3]):
            parents.append(j_feat)
            num_outgoing_connections[j] += 1

      # pylint: disable=g-direct-tensorflow-import
      if weighted_fusion:
        dtype = parents[0].dtype
        parent_weights = [
            tf.nn.relu(
                tf.cast(
                    tf.Variable(1.0, name='block{}_fusion{}'.format(i, j)),
                    dtype=dtype)) for j in range(len(parents))
        ]
        weights_sum = tf.add_n(parent_weights)
        parents = [
            parents[i] * parent_weights[i] / (weights_sum + 0.0001)
            for i in range(len(parents))
        ]

      # Fuse all parent nodes then build a new block.
      x = tf_utils.get_activation(self._activation)(tf.add_n(parents))
      x = self._block_group(
          inputs=x,
          filters=target_num_filters,
          strides=1,
          block_fn_cand=target_block_fn,
          block_repeats=self._block_repeats,
          name='scale_permuted_block_{}'.format(i + 1))

      net.append(x)
      net_sizes.append(target_width)
      net_block_fns.append(target_block_fn)
      num_outgoing_connections.append(0)

      # Save output feats.
      if block_spec.is_output:
        if block_spec.level in endpoints:
          raise ValueError(
              'Duplicate feats found for output level {}.'.format(
                  block_spec.level))
        if (block_spec.level < self._min_level or
            block_spec.level > self._max_level):
          raise ValueError('Output level is out of range [{}, {}]'.format(
              self._min_level, self._max_level))
        endpoints[block_spec.level] = x

    return endpoints
  def _build_endpoints(self, net):
    """Match filter size for endpoints before sharing conv layers."""
    endpoints = {}
    for level in range(self._min_level, self._max_level + 1):
      x = layers.Conv2D(
          filters=self._endpoints_num_filters,
          kernel_size=1,
          strides=1,
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)(
              net[level])
      x = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)(
              x)
      x = tf_utils.get_activation(self._activation)(x)
      endpoints[level] = x
    return endpoints
  def _resample_with_alpha(self,
                           inputs,
                           input_width,
                           input_block_fn,
                           target_width,
                           target_num_filters,
                           target_block_fn,
                           alpha=0.5):
    """Match resolution and feature dimension."""
    _, _, _, input_num_filters = inputs.get_shape().as_list()
    if input_block_fn == 'bottleneck':
      input_num_filters /= 4
    new_num_filters = int(input_num_filters * alpha)

    x = layers.Conv2D(
        filters=new_num_filters,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)(
            inputs)
    x = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)(
            x)
    x = tf_utils.get_activation(self._activation)(x)

    # Spatial resampling.
    if input_width > target_width:
      x = layers.Conv2D(
          filters=new_num_filters,
          kernel_size=3,
          strides=2,
          padding='SAME',
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)(
              x)
      x = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)(
              x)
      x = tf_utils.get_activation(self._activation)(x)
      input_width /= 2
      while input_width > target_width:
        x = layers.MaxPool2D(pool_size=3, strides=2, padding='SAME')(x)
        input_width /= 2
    elif input_width < target_width:
      scale = target_width // input_width
      x = layers.UpSampling2D(size=(scale, scale))(x)

    # Last 1x1 conv to match filter size.
    if target_block_fn == 'bottleneck':
      target_num_filters *= 4
    x = layers.Conv2D(
        filters=target_num_filters,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)(
            x)
    x = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)(
            x)
    return x
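  # Walkthrough (illustrative, not part of the original file): for a 640-wide
  # input, a level-3 feature map is ceil(640 / 2**3) = 80 wide and a level-5
  # target is 20 wide. Downsampling applies the stride-2 3x3 conv once
  # (80 -> 40), then stride-2 max pooling until the target width is reached
  # (40 -> 20). Going to a finer level instead uses a single UpSampling2D
  # with integer scale, e.g. 20 -> 80 via size=(4, 4).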
class SpineNetBuilder(object):
  """SpineNet builder."""

  def __init__(self,
               model_id,
               input_specs=tf.keras.layers.InputSpec(
                   shape=[None, 640, 640, 3]),
               min_level=3,
               max_level=7,
               block_specs=build_block_specs(),
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001):
    if model_id not in SCALING_MAP:
      raise ValueError(
          'SpineNet {} is not a valid architecture.'.format(model_id))
    scaling_params = SCALING_MAP[model_id]
    self._input_specs = input_specs
    self._min_level = min_level
    self._max_level = max_level
    self._block_specs = block_specs
    self._endpoints_num_filters = scaling_params['endpoints_num_filters']
    self._resample_alpha = scaling_params['resample_alpha']
    self._block_repeats = scaling_params['block_repeats']
    self._filter_size_scale = scaling_params['filter_size_scale']
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon

  def __call__(self, inputs, is_training=None):
    model = SpineNet(
        input_specs=self._input_specs,
        min_level=self._min_level,
        max_level=self._max_level,
        block_specs=self._block_specs,
        endpoints_num_filters=self._endpoints_num_filters,
        resample_alpha=self._resample_alpha,
        block_repeats=self._block_repeats,
        filter_size_scale=self._filter_size_scale,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon)
    return model(inputs)
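# --- Illustrative usage sketch (not part of the original file) ---
# A model_id selects its scaling profile from SCALING_MAP; '49' keeps the
# default filter sizes, while '49S' shrinks them via filter_size_scale=0.65.
#
#   backbone = SpineNetBuilder(model_id='49')
#   images = tf.keras.Input(shape=[640, 640, 3])
#   endpoints = backbone(images)  # {3: P3, ..., 7: P7} multilevel features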
official/legacy/detection/modeling/base_model.py
0 → 100644
View file @
e293e338
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base Model definition."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import re

import tensorflow as tf

from official.legacy.detection.modeling import checkpoint_utils
from official.legacy.detection.modeling import learning_rates
from official.legacy.detection.modeling import optimizers


def _make_filter_trainable_variables_fn(frozen_variable_prefix):
  """Creates a function for filtering trainable variables."""

  def _filter_trainable_variables(variables):
    """Filters trainable variables.

    Args:
      variables: a list of tf.Variable to be filtered.

    Returns:
      filtered_variables: a list of tf.Variable with the frozen ones
        filtered out.
    """
    # frozen_variable_prefix: a regex string specifying the prefix pattern of
    # the frozen variables' names.
    filtered_variables = [
        v for v in variables if not frozen_variable_prefix or
        not re.match(frozen_variable_prefix, v.name)
    ]
    return filtered_variables

  return _filter_trainable_variables
class Model(object):
  """Base class for model function."""

  __metaclass__ = abc.ABCMeta

  def __init__(self, params):
    self._use_bfloat16 = params.architecture.use_bfloat16

    if params.architecture.use_bfloat16:
      tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    # Optimization.
    self._optimizer_fn = optimizers.OptimizerFactory(params.train.optimizer)
    self._learning_rate = learning_rates.learning_rate_generator(
        params.train.total_steps, params.train.learning_rate)

    self._frozen_variable_prefix = params.train.frozen_variable_prefix
    self._regularization_var_regex = params.train.regularization_variable_regex
    self._l2_weight_decay = params.train.l2_weight_decay

    # Checkpoint restoration.
    self._checkpoint = params.train.checkpoint.as_dict()

    # Summary.
    self._enable_summary = params.enable_summary
    self._model_dir = params.model_dir

  @abc.abstractmethod
  def build_outputs(self, inputs, mode):
    """Build the graph of the forward path."""
    pass

  @abc.abstractmethod
  def build_model(self, params, mode):
    """Build the model object."""
    pass

  @abc.abstractmethod
  def build_loss_fn(self):
    """Build the loss function."""
    pass

  def post_processing(self, labels, outputs):
    """Post-processing function."""
    return labels, outputs

  def model_outputs(self, inputs, mode):
    """Build the model outputs."""
    return self.build_outputs(inputs, mode)

  def build_optimizer(self):
    """Returns train_op to optimize total loss."""
    # Sets up the optimizer.
    return self._optimizer_fn(self._learning_rate)

  def make_filter_trainable_variables_fn(self):
    """Creates a function for filtering trainable variables."""
    return _make_filter_trainable_variables_fn(self._frozen_variable_prefix)

  def weight_decay_loss(self, trainable_variables):
    reg_variables = [
        v for v in trainable_variables
        if self._regularization_var_regex is None or
        re.match(self._regularization_var_regex, v.name)
    ]
    return self._l2_weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in reg_variables])

  def make_restore_checkpoint_fn(self):
    """Returns scaffold function to restore parameters from v1 checkpoint."""
    if 'skip_checkpoint_variables' in self._checkpoint:
      skip_regex = self._checkpoint['skip_checkpoint_variables']
    else:
      skip_regex = None
    return checkpoint_utils.make_restore_checkpoint_fn(
        self._checkpoint['path'],
        prefix=self._checkpoint['prefix'],
        skip_regex=skip_regex)

  def eval_metrics(self):
    """Returns tuple of metric function and its inputs for evaluation."""
    raise NotImplementedError('Unimplemented eval_metrics')
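# --- Illustrative sketch (not part of the original file) ---
# Concrete detectors subclass Model and fill in the abstract methods; see
# retinanet_model.py and maskrcnn_model.py in this directory for the real
# implementations.
#
#   class MyDetector(Model):
#
#     def build_outputs(self, inputs, mode):
#       ...  # forward pass producing prediction tensors
#
#     def build_model(self, params, mode):
#       ...  # construct and return the tf.keras.Model
#
#     def build_loss_fn(self):
#       ...  # return a callable computing the training loss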
official/legacy/detection/modeling/checkpoint_utils.py
0 → 100644
View file @
e293e338
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Util functions for loading checkpoints.
Especially for loading Tensorflow 1.x
checkpoint to Tensorflow 2.x (keras) model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re

from absl import logging
import tensorflow as tf


def _build_assignment_map(keras_model,
                          prefix='',
                          skip_variables_regex=None,
                          var_to_shape_map=None):
  """Builds the variable assignment map.

  Compute an assignment mapping for loading older checkpoints into a Keras
  model. Variable names are remapped from the original TPUEstimator model to
  the new Keras name.

  Args:
    keras_model: tf.keras.Model object to provide variables to assign.
    prefix: prefix in the variable name to be removed for alignment with
      names in the checkpoint.
    skip_variables_regex: regular expression to match the names of variables
      that do not need to be assigned.
    var_to_shape_map: variable name to shape mapping from the checkpoint.

  Returns:
    The variable assignment map.
  """
  assignment_map = {}

  checkpoint_names = []
  if var_to_shape_map:
    # pylint: disable=g-long-lambda
    checkpoint_names = list(
        filter(
            lambda x: not x.endswith('Momentum') and not x.endswith(
                'global_step'), var_to_shape_map.keys()))
    # pylint: enable=g-long-lambda

  logging.info('Number of variables in the checkpoint %d',
               len(checkpoint_names))

  for var in keras_model.variables:
    var_name = var.name

    if skip_variables_regex and re.match(skip_variables_regex, var_name):
      continue
    # Trim the index of the variable.
    if ':' in var_name:
      var_name = var_name[:var_name.rindex(':')]
    if var_name.startswith(prefix):
      var_name = var_name[len(prefix):]

    if not var_to_shape_map:
      assignment_map[var_name] = var
      continue

    # Match name with variables in the checkpoint.
    # pylint: disable=cell-var-from-loop
    match_names = list(
        filter(lambda x: x.endswith(var_name), checkpoint_names))
    # pylint: enable=cell-var-from-loop
    try:
      if match_names:
        assert len(match_names) == 1, 'more than one match for {}: {}'.format(
            var_name, match_names)
        checkpoint_names.remove(match_names[0])
        assignment_map[match_names[0]] = var
      else:
        logging.info('Error not found var name: %s', var_name)
    except Exception as e:
      logging.info('Error removing the match_name: %s', match_names)
      logging.info('Exception: %s', e)
      raise

  logging.info('Found matching variable in checkpoint: %d',
               len(assignment_map))
  return assignment_map
def _get_checkpoint_map(checkpoint_path):
  reader = tf.train.load_checkpoint(checkpoint_path)
  return reader.get_variable_to_shape_map()
def make_restore_checkpoint_fn(checkpoint_path, prefix='', skip_regex=None):
  """Returns scaffold function to restore parameters from v1 checkpoint.

  Args:
    checkpoint_path: path of the checkpoint folder or file.
      Example 1: '/path/to/model_dir/'
      Example 2: '/path/to/model.ckpt-22500'
    prefix: prefix in the variable name to be removed for alignment with
      names in the checkpoint.
    skip_regex: regular expression to match the names of variables that do
      not need to be assigned.

  Returns:
    Callable[tf.keras.Model] -> void. Fn to load a v1 checkpoint into a
    Keras model.
  """

  def _restore_checkpoint_fn(keras_model):
    """Loads pretrained model through scaffold function."""
    if not checkpoint_path:
      logging.info('checkpoint_path is empty')
      return
    var_prefix = prefix
    if prefix and not prefix.endswith('/'):
      var_prefix += '/'
    var_to_shape_map = _get_checkpoint_map(checkpoint_path)
    assert var_to_shape_map, 'var_to_shape_map should not be empty'
    vars_to_load = _build_assignment_map(
        keras_model,
        prefix=var_prefix,
        skip_variables_regex=skip_regex,
        var_to_shape_map=var_to_shape_map)
    if not vars_to_load:
      raise ValueError('Variables to load is empty.')
    tf.compat.v1.train.init_from_checkpoint(checkpoint_path, vars_to_load)

  return _restore_checkpoint_fn
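# --- Illustrative usage sketch (not part of the original file) ---
# The checkpoint path and prefix below are hypothetical; the path mirrors
# Example 2 in the docstring above.
#
#   restore_fn = make_restore_checkpoint_fn(
#       '/path/to/model.ckpt-22500', prefix='resnet50/', skip_regex=None)
#   restore_fn(keras_model)  # remaps TF1 variable names onto the Keras model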
official/legacy/detection/modeling/factory.py
0 → 100644
View file @
e293e338
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory to build detection model."""
from official.legacy.detection.modeling import maskrcnn_model
from official.legacy.detection.modeling import olnmask_model
from official.legacy.detection.modeling import retinanet_model
from official.legacy.detection.modeling import shapemask_model


def model_generator(params):
  """Model function generator."""
  if params.type == 'retinanet':
    model_fn = retinanet_model.RetinanetModel(params)
  elif params.type == 'mask_rcnn':
    model_fn = maskrcnn_model.MaskrcnnModel(params)
  elif params.type == 'olnmask':
    model_fn = olnmask_model.OlnMaskModel(params)
  elif params.type == 'shapemask':
    model_fn = shapemask_model.ShapeMaskModel(params)
  else:
    raise ValueError('Model %s is not supported.' % params.type)

  return model_fn
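# --- Illustrative usage sketch (not part of the original file) ---
# `params` here stands for the config object produced by the detection config
# system (see main.py in this directory); only `params.type` is inspected.
#
#   model = model_generator(params)  # e.g. params.type == 'retinanet'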