Commit 9a264c9f authored by Yeqing Li, committed by A. Unique TensorFlower

Deprecating official/vision/detection folder.

The folder is archived under official/legacy/detection.

PiperOrigin-RevId: 419643226
parent 8fb4e6a6
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The COCO-style evaluator.
The following snippet demonstrates the use of interfaces:
  evaluator = COCOEvaluator(...)
  for _ in range(num_evals):
    for _ in range(num_batches_per_eval):
      predictions, groundtruths = predictor.predict(...)  # pop a batch.
      evaluator.update(predictions, groundtruths)  # aggregate internal stats.
    evaluator.evaluate()  # finish one full eval.
See also: https://github.com/cocodataset/cocoapi/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import copy
import tempfile
from absl import logging
import numpy as np
from pycocotools import cocoeval
import six
import tensorflow as tf
from official.vision.detection.evaluation import coco_utils
from official.vision.detection.utils import class_utils
class MetricWrapper(object):
# This is only a wrapper for the COCO metric and works only on numpy arrays,
# so it doesn't inherit from tf.keras.layers.Layer or tf.keras.metrics.Metric.
def __init__(self, evaluator):
self._evaluator = evaluator
def update_state(self, y_true, y_pred):
labels = tf.nest.map_structure(lambda x: x.numpy(), y_true)
outputs = tf.nest.map_structure(lambda x: x.numpy(), y_pred)
groundtruths = {}
predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
predictions[key] = val
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
groundtruths[key] = val
self._evaluator.update(predictions, groundtruths)
def result(self):
return self._evaluator.evaluate()
def reset_states(self):
return self._evaluator.reset()
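# Illustrative usage (added; not part of the original file). MetricWrapper adapts the
# numpy-based evaluator below to the Keras-style update_state/result interface, e.g.:
#   metric = MetricWrapper(COCOEvaluator(annotation_file=None, include_mask=False))
#   metric.update_state(labels, outputs)  # nests of eager tf.Tensors per batch.
#   results = metric.result()             # dict of COCO metrics; evaluate() also
#                                         # resets the evaluator's internal state.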
class COCOEvaluator(object):
"""COCO evaluation metric class."""
def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True):
"""Constructs COCO evaluation class.
The class provides the interface to metrics_fn in TPUEstimator. The
_update_op() takes detections from each image and pushes them to
self.detections. The _evaluate() loads a JSON file in COCO annotation format
as the groundtruths and runs COCO evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If True, bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
"""
if annotation_file:
if annotation_file.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(annotation_file, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = annotation_file
self._coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if include_mask else 'box'),
annotation_file=local_val_json)
self._annotation_file = annotation_file
self._include_mask = include_mask
self._metric_names = [
'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
'ARmax100', 'ARs', 'ARm', 'ARl'
]
self._required_prediction_fields = [
'source_id', 'num_detections', 'detection_classes', 'detection_scores',
'detection_boxes'
]
self._need_rescale_bboxes = need_rescale_bboxes
if self._need_rescale_bboxes:
self._required_prediction_fields.append('image_info')
self._required_groundtruth_fields = [
'source_id', 'height', 'width', 'classes', 'boxes'
]
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self._metric_names]
self._metric_names.extend(mask_metric_names)
self._required_prediction_fields.extend(['detection_masks'])
self._required_groundtruth_fields.extend(['masks'])
self.reset()
def reset(self):
"""Resets internal states for a fresh run."""
self._predictions = {}
if not self._annotation_file:
self._groundtruths = {}
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
coco_metric: a dictionary of coco-style evaluation metrics keyed by metric
name (box metrics, plus mask metrics when `include_mask` is True), each a
float numpy scalar.
"""
if not self._annotation_file:
logging.info('There is no annotation_file in COCOEvaluator.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
# Cleans up the internal variables in order for a fresh eval next time.
self.reset()
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
def _process_predictions(self, predictions):
image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
predictions['detection_boxes'] = (
predictions['detection_boxes'].astype(np.float32))
predictions['detection_boxes'] /= image_scale
if 'detection_outer_boxes' in predictions:
predictions['detection_outer_boxes'] = (
predictions['detection_outer_boxes'].astype(np.float32))
predictions['detection_outer_boxes'] /= image_scale
def update(self, predictions, groundtruths=None):
"""Update and aggregate detection results and groundtruth data.
Args:
predictions: a dictionary of numpy arrays including the fields below. See
different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- image_info [if `need_rescale_bboxes` is True]: a numpy array of
float of shape [batch_size, 4, 2].
- num_detections: a numpy array of int of shape [batch_size].
- detection_boxes: a numpy array of float of shape [batch_size, K, 4].
- detection_classes: a numpy array of int of shape [batch_size, K].
- detection_scores: a numpy array of float of shape [batch_size, K].
Optional fields:
- detection_masks: a numpy array of float of shape [batch_size, K,
mask_height, mask_width].
groundtruths: a dictionary of numpy arrays including the fields below. See
also different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- height: a numpy array of int of shape [batch_size].
- width: a numpy array of int of shape [batch_size].
- num_detections: a numpy array of int of shape [batch_size].
- boxes: a numpy array of float of shape [batch_size, K, 4].
- classes: a numpy array of int of shape [batch_size, K].
Optional fields:
- is_crowds: a numpy array of int of shape [batch_size, K]. If the
field is absent, it is assumed that this instance is not crowd.
- areas: a numpy array of float of shape [batch_size, K]. If the field
is absent, the area is calculated using either boxes or masks
depending on which one is available.
- masks: a numpy array of float of shape [batch_size, K, mask_height,
mask_width],
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
if self._need_rescale_bboxes:
self._process_predictions(predictions)
for k, v in six.iteritems(predictions):
if k not in self._predictions:
self._predictions[k] = [v]
else:
self._predictions[k].append(v)
if not self._annotation_file:
assert groundtruths
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
for k, v in six.iteritems(groundtruths):
if k not in self._groundtruths:
self._groundtruths[k] = [v]
else:
self._groundtruths[k].append(v)
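# Illustrative sketch (added; shapes and the `eval_batches` iterable are assumed):
# one full eval pass with groundtruths coming from the dataloader, i.e. with
# annotation_file=None.
#   evaluator = COCOEvaluator(annotation_file=None, include_mask=False,
#                             need_rescale_bboxes=False)
#   for predictions, groundtruths in eval_batches:  # dicts of numpy arrays per batch.
#     evaluator.update(predictions, groundtruths)
#   metrics = evaluator.evaluate()  # e.g. metrics['AP'], metrics['AP50'], ...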
class OlnXclassEvaluator(COCOEvaluator):
"""COCO evaluation metric class."""
def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True,
use_category=True, seen_class='all'):
"""Constructs COCO evaluation class.
The class provides the interface to metrics_fn in TPUEstimator. The
_update_op() takes detections from each image and pushes them to
self.detections. The _evaluate() loads a JSON file in COCO annotation format
as the groundtruths and runs COCO evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If True, bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
use_category: if `False`, treat all objects in all classes as one
foreground category.
seen_class: 'all' or 'voc' or 'nonvoc'
"""
super(OlnXclassEvaluator, self).__init__(
annotation_file=annotation_file,
include_mask=include_mask,
need_rescale_bboxes=need_rescale_bboxes)
self._use_category = use_category
self._seen_class = seen_class
self._seen_class_ids = class_utils.coco_split_class_ids(seen_class)
self._metric_names = [
'AP', 'AP50', 'AP75',
'APs', 'APm', 'APl',
'ARmax10', 'ARmax20', 'ARmax50', 'ARmax100', 'ARmax200',
'ARmax10s', 'ARmax10m', 'ARmax10l'
]
if self._seen_class != 'all':
self._metric_names.extend([
'AP_seen', 'AP50_seen', 'AP75_seen',
'APs_seen', 'APm_seen', 'APl_seen',
'ARmax10_seen', 'ARmax20_seen', 'ARmax50_seen',
'ARmax100_seen', 'ARmax200_seen',
'ARmax10s_seen', 'ARmax10m_seen', 'ARmax10l_seen',
'AP_novel', 'AP50_novel', 'AP75_novel',
'APs_novel', 'APm_novel', 'APl_novel',
'ARmax10_novel', 'ARmax20_novel', 'ARmax50_novel',
'ARmax100_novel', 'ARmax200_novel',
'ARmax10s_novel', 'ARmax10m_novel', 'ARmax10l_novel',
])
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self._metric_names]
self._metric_names.extend(mask_metric_names)
self._required_prediction_fields.extend(['detection_masks'])
self._required_groundtruth_fields.extend(['masks'])
self.reset()
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
coco_metric: a dictionary of coco-style evaluation metrics keyed by metric
name (box metrics, plus mask metrics when `include_mask` is True), each a
float numpy scalar.
"""
if not self._annotation_file:
logging.info('There is no annotation_file in COCOEvaluator.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
# Class manipulation: 'all' split samples -> ignored_split = 0.
for idx, _ in enumerate(coco_gt.dataset['annotations']):
coco_gt.dataset['annotations'][idx]['ignored_split'] = 0
coco_eval = cocoeval.OlnCOCOevalXclassWrapper(
coco_gt, coco_dt, iou_type='bbox')
coco_eval.params.maxDets = [10, 20, 50, 100, 200]
coco_eval.params.imgIds = image_ids
coco_eval.params.useCats = 0 if not self._use_category else 1
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.OlnCOCOevalXclassWrapper(
coco_gt, coco_dt, iou_type='segm')
mcoco_eval.params.maxDets = [10, 20, 50, 100, 200]
mcoco_eval.params.imgIds = image_ids
mcoco_eval.params.useCats = 0 if not self._use_category else 1
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
if self._seen_class != 'all':
# for seen class eval, samples of novel_class are ignored.
coco_gt_seen = copy.deepcopy(coco_gt)
for idx, ann in enumerate(coco_gt.dataset['annotations']):
if ann['category_id'] in self._seen_class_ids:
coco_gt_seen.dataset['annotations'][idx]['ignored_split'] = 0
else:
coco_gt_seen.dataset['annotations'][idx]['ignored_split'] = 1
coco_eval_seen = cocoeval.OlnCOCOevalXclassWrapper(
coco_gt_seen, coco_dt, iou_type='bbox')
coco_eval_seen.params.maxDets = [10, 20, 50, 100, 200]
coco_eval_seen.params.imgIds = image_ids
coco_eval_seen.params.useCats = 0 if not self._use_category else 1
coco_eval_seen.evaluate()
coco_eval_seen.accumulate()
coco_eval_seen.summarize()
coco_metrics_seen = coco_eval_seen.stats
if self._include_mask:
mcoco_eval_seen = cocoeval.OlnCOCOevalXclassWrapper(
coco_gt_seen, coco_dt, iou_type='segm')
mcoco_eval_seen.params.maxDets = [10, 20, 50, 100, 200]
mcoco_eval_seen.params.imgIds = image_ids
mcoco_eval_seen.params.useCats = 0 if not self._use_category else 1
mcoco_eval_seen.evaluate()
mcoco_eval_seen.accumulate()
mcoco_eval_seen.summarize()
mask_coco_metrics_seen = mcoco_eval_seen.stats
# for novel class eval, samples of seen_class are ignored.
coco_gt_novel = copy.deepcopy(coco_gt)
for idx, ann in enumerate(coco_gt.dataset['annotations']):
if ann['category_id'] in self._seen_class_ids:
coco_gt_novel.dataset['annotations'][idx]['ignored_split'] = 1
else:
coco_gt_novel.dataset['annotations'][idx]['ignored_split'] = 0
coco_eval_novel = cocoeval.OlnCOCOevalXclassWrapper(
coco_gt_novel, coco_dt, iou_type='bbox')
coco_eval_novel.params.maxDets = [10, 20, 50, 100, 200]
coco_eval_novel.params.imgIds = image_ids
coco_eval_novel.params.useCats = 0 if not self._use_category else 1
coco_eval_novel.evaluate()
coco_eval_novel.accumulate()
coco_eval_novel.summarize()
coco_metrics_novel = coco_eval_novel.stats
if self._include_mask:
mcoco_eval_novel = cocoeval.OlnCOCOevalXclassWrapper(
coco_gt_novel, coco_dt, iou_type='segm')
mcoco_eval_novel.params.maxDets = [10, 20, 50, 100, 200]
mcoco_eval_novel.params.imgIds = image_ids
mcoco_eval_novel.params.useCats = 0 if not self._use_category else 1
mcoco_eval_novel.evaluate()
mcoco_eval_novel.accumulate()
mcoco_eval_novel.summarize()
mask_coco_metrics_novel = mcoco_eval_novel.stats
# Combine all splits.
if self._include_mask:
metrics = np.hstack((
coco_metrics, coco_metrics_seen, coco_metrics_novel,
mask_coco_metrics, mask_coco_metrics_seen, mask_coco_metrics_novel))
else:
metrics = np.hstack((
coco_metrics, coco_metrics_seen, coco_metrics_novel))
# Cleans up the internal variables in order for a fresh eval next time.
self.reset()
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
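# Note (added): the seen/novel breakdown above is implemented by deep-copying the
# groundtruth COCO object and setting an 'ignored_split' flag on every annotation;
# OlnCOCOevalXclassWrapper is expected to skip flagged annotations during matching.
# The flagging step reduces to the following, with `seen_ids` standing for
# `self._seen_class_ids` and `coco_gt_copy` for the copied groundtruth object:
#   for ann in coco_gt_copy.dataset['annotations']:
#     ann['ignored_split'] = 0 if ann['category_id'] in seen_ids else 1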
class OlnXdataEvaluator(OlnXclassEvaluator):
"""COCO evaluation metric class."""
def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True,
use_category=True, seen_class='all'):
"""Constructs COCO evaluation class.
The class provides the interface to metrics_fn in TPUEstimator. The
_update_op() takes detections from each image and pushes them to
self.detections. The _evaluate() loads a JSON file in COCO annotation format
as the groundtruths and runs COCO evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If True, bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
use_category: if `False`, treat all objects in all classes as one
foreground category.
seen_class: 'all' or 'voc' or 'nonvoc'
"""
super(OlnXdataEvaluator, self).__init__(
annotation_file=annotation_file,
include_mask=include_mask,
need_rescale_bboxes=need_rescale_bboxes,
use_category=False,
seen_class='all')
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
coco_metric: a dictionary of coco-style evaluation metrics keyed by metric
name (box metrics, plus mask metrics when `include_mask` is True), each a
float numpy scalar.
"""
if not self._annotation_file:
logging.info('There is no annotation_file in COCOEvaluator.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
# Class manipulation: 'all' split samples -> ignored_split = 0.
for idx, _ in enumerate(coco_gt.dataset['annotations']):
coco_gt.dataset['annotations'][idx]['ignored_split'] = 0
coco_eval = cocoeval.OlnCOCOevalWrapper(coco_gt, coco_dt, iou_type='bbox')
coco_eval.params.maxDets = [10, 20, 50, 100, 200]
coco_eval.params.imgIds = image_ids
coco_eval.params.useCats = 0 if not self._use_category else 1
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.OlnCOCOevalWrapper(coco_gt, coco_dt,
iou_type='segm')
mcoco_eval.params.maxDets = [10, 20, 50, 100, 200]
mcoco_eval.params.imgIds = image_ids
mcoco_eval.params.useCats = 0 if not self._use_category else 1
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
# Cleans up the internal variables in order for a fresh eval next time.
self.reset()
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
class ShapeMaskCOCOEvaluator(COCOEvaluator):
"""COCO evaluation metric class for ShapeMask."""
def __init__(self, mask_eval_class, **kwargs):
"""Constructs COCO evaluation class.
The class provides the interface to metrics_fn in TPUEstimator. The
_update_op() takes detections from each image and pushes them to
self.detections. The _evaluate() loads a JSON file in COCO annotation format
as the groundtruths and runs COCO evaluation.
Args:
mask_eval_class: the set of classes for mask evaluation.
**kwargs: other keyword arguments passed to the parent class initializer.
"""
super(ShapeMaskCOCOEvaluator, self).__init__(**kwargs)
self._mask_eval_class = mask_eval_class
self._eval_categories = class_utils.coco_split_class_ids(mask_eval_class)
if mask_eval_class != 'all':
self._metric_names = [
x.replace('mask', 'novel_mask') for x in self._metric_names
]
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
coco_metric: a dictionary of coco-style evaluation metrics keyed by metric
name (box metrics, plus mask metrics when `include_mask` is True), each a
float numpy scalar.
"""
if not self._annotation_file:
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
if self._mask_eval_class == 'all':
metrics = np.hstack((coco_metrics, mcoco_eval.stats))
else:
mask_coco_metrics = mcoco_eval.category_stats
val_catg_idx = np.isin(mcoco_eval.params.catIds, self._eval_categories)
# Gather the valid evaluation of the eval categories.
if np.any(val_catg_idx):
mean_val_metrics = []
for mid in range(len(self._metric_names) // 2):
mean_val_metrics.append(
np.nanmean(mask_coco_metrics[mid][val_catg_idx]))
mean_val_metrics = np.array(mean_val_metrics)
else:
mean_val_metrics = np.zeros(len(self._metric_names) // 2)
metrics = np.hstack((coco_metrics, mean_val_metrics))
else:
metrics = coco_metrics
# Cleans up the internal variables in order for a fresh eval next time.
self.reset()
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
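# Note (added): when `mask_eval_class` is not 'all', the mask metrics above are not
# taken from the global COCO summary; instead the per-category stats
# (`mcoco_eval.category_stats`) are averaged over the category ids returned by
# class_utils.coco_split_class_ids, and the results are reported under the
# 'novel_mask_*' metric names set in the constructor.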
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Util functions related to pycocotools and COCO eval."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import json
from absl import logging
import numpy as np
from PIL import Image
from pycocotools import coco
from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.vision.detection.dataloader import tf_example_decoder
from official.vision.detection.utils import box_utils
from official.vision.detection.utils import mask_utils
class COCOWrapper(coco.COCO):
"""COCO wrapper class.
This class wraps the COCO API object, which provides the following additional
functionalities:
1. Support string type image id.
2. Support loading the groundtruth dataset using the external annotation
dictionary.
3. Support loading the prediction results using the external annotation
dictionary.
"""
def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
"""Instantiates a COCO-style API object.
Args:
eval_type: either 'box' or 'mask'.
annotation_file: a JSON file that stores annotations of the eval dataset.
This is required if `gt_dataset` is not provided.
gt_dataset: the groundtruth eval dataset in COCO API format.
"""
if ((annotation_file and gt_dataset) or
((not annotation_file) and (not gt_dataset))):
raise ValueError('One and only one of `annotation_file` and `gt_dataset` '
'needs to be specified.')
if eval_type not in ['box', 'mask']:
raise ValueError('The `eval_type` can only be either `box` or `mask`.')
coco.COCO.__init__(self, annotation_file=annotation_file)
self._eval_type = eval_type
if gt_dataset:
self.dataset = gt_dataset
self.createIndex()
def loadRes(self, predictions):
"""Loads result file and return a result api object.
Args:
predictions: a list of dictionary each representing an annotation in COCO
format. The required fields are `image_id`, `category_id`, `score`,
`bbox`, `segmentation`.
Returns:
res: result COCO api object.
Raises:
ValueError: if the set of image ids from predictions is not a subset of
the set of image ids of the groundtruth dataset.
"""
res = coco.COCO()
res.dataset['images'] = copy.deepcopy(self.dataset['images'])
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
image_ids = [ann['image_id'] for ann in predictions]
if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
raise ValueError('Results do not correspond to the current dataset!')
for ann in predictions:
x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
if self._eval_type == 'box':
ann['area'] = ann['bbox'][2] * ann['bbox'][3]
ann['segmentation'] = [
[x1, y1, x1, y2, x2, y2, x2, y1]]
elif self._eval_type == 'mask':
ann['area'] = mask_api.area(ann['segmentation'])
res.dataset['annotations'] = copy.deepcopy(predictions)
res.createIndex()
return res
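# Illustrative usage (added; the annotation path and box values are assumed):
# loadRes accepts an in-memory list of COCO-format detections instead of a results
# JSON file. For 'box' eval it also fills in 'area' and a rectangle 'segmentation'.
#   coco_gt = COCOWrapper(eval_type='box', annotation_file='/tmp/instances_val.json')
#   detections = [{'image_id': 42, 'category_id': 1, 'score': 0.9,
#                  'bbox': [10.0, 20.0, 50.0, 80.0]}]  # [x, y, width, height].
#   coco_dt = coco_gt.loadRes(predictions=detections)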
def convert_predictions_to_coco_annotations(predictions):
"""Converts a batch of predictions to annotations in COCO format.
Args:
predictions: a dictionary of lists of numpy arrays including the following
fields. K below denotes the maximum number of instances per image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- detection_boxes: a list of numpy arrays of float of shape
[batch_size, K, 4], where coordinates are in the original image
space (not the scaled image space).
- detection_classes: a list of numpy arrays of int of shape
[batch_size, K].
- detection_scores: a list of numpy arrays of float of shape
[batch_size, K].
Optional fields:
- detection_masks: a list of numpy arrays of float of shape
[batch_size, K, mask_height, mask_width].
Returns:
coco_predictions: prediction in COCO annotation format.
"""
coco_predictions = []
num_batches = len(predictions['source_id'])
batch_size = predictions['source_id'][0].shape[0]
max_num_detections = predictions['detection_classes'][0].shape[1]
use_outer_box = 'detection_outer_boxes' in predictions
for i in range(num_batches):
predictions['detection_boxes'][i] = box_utils.yxyx_to_xywh(
predictions['detection_boxes'][i])
if use_outer_box:
predictions['detection_outer_boxes'][i] = box_utils.yxyx_to_xywh(
predictions['detection_outer_boxes'][i])
mask_boxes = predictions['detection_outer_boxes']
else:
mask_boxes = predictions['detection_boxes']
for j in range(batch_size):
if 'detection_masks' in predictions:
image_masks = mask_utils.paste_instance_masks(
predictions['detection_masks'][i][j],
mask_boxes[i][j],
int(predictions['image_info'][i][j, 0, 0]),
int(predictions['image_info'][i][j, 0, 1]))
binary_masks = (image_masks > 0.0).astype(np.uint8)
encoded_masks = [
mask_api.encode(np.asfortranarray(binary_mask))
for binary_mask in list(binary_masks)]
for k in range(max_num_detections):
ann = {}
ann['image_id'] = predictions['source_id'][i][j]
ann['category_id'] = predictions['detection_classes'][i][j, k]
ann['bbox'] = predictions['detection_boxes'][i][j, k]
ann['score'] = predictions['detection_scores'][i][j, k]
if 'detection_masks' in predictions:
ann['segmentation'] = encoded_masks[k]
coco_predictions.append(ann)
for i, ann in enumerate(coco_predictions):
ann['id'] = i + 1
return coco_predictions
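# Note (added): convert_predictions_to_coco_annotations mutates
# predictions['detection_boxes'] in place (yxyx -> xywh via box_utils.yxyx_to_xywh)
# and flattens all batches into one list of per-detection annotations, each with a
# 1-based 'id'. A minimal input sketch with assumed sizes (1 batch of 2 images,
# 3 detections each) would produce 6 annotations:
#   predictions = {
#       'source_id': [np.array([1, 2])],
#       'num_detections': [np.array([3, 3])],
#       'detection_boxes': [np.zeros((2, 3, 4))],  # yxyx, original image space.
#       'detection_classes': [np.ones((2, 3), dtype=np.int32)],
#       'detection_scores': [np.zeros((2, 3))],
#   }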
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
"""Converts groundtruths to the dataset in COCO format.
Args:
groundtruths: a dictionary of lists of numpy arrays including the fields
below. Each element in a list holds the values for one batch of examples.
K below denotes the actual number of instances for each image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- height: a list of numpy arrays of int of shape [batch_size].
- width: a list of numpy arrays of int of shape [batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
where coordinates are in the original image space (not the
normalized coordinates).
- classes: a list of numpy arrays of int of shape [batch_size, K].
Optional fields:
- is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
the field is absent, it is assumed that this instance is not crowd.
- areas: a list of numpy arrays of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a list of numpy arrays of string of shape [batch_size, K],
label_map: (optional) a dictionary that maps a category id to the category
name. If `None`, collect the category mapping from the `groundtruths`.
Returns:
coco_groundtruths: the groundtruth dataset in COCO format.
"""
source_ids = np.concatenate(groundtruths['source_id'], axis=0)
heights = np.concatenate(groundtruths['height'], axis=0)
widths = np.concatenate(groundtruths['width'], axis=0)
gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w
in zip(source_ids, heights, widths)]
gt_annotations = []
num_batches = len(groundtruths['source_id'])
batch_size = groundtruths['source_id'][0].shape[0]
for i in range(num_batches):
for j in range(batch_size):
num_instances = groundtruths['num_detections'][i][j]
for k in range(num_instances):
ann = {}
ann['image_id'] = int(groundtruths['source_id'][i][j])
if 'is_crowds' in groundtruths:
ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
else:
ann['iscrowd'] = 0
ann['category_id'] = int(groundtruths['classes'][i][j, k])
boxes = groundtruths['boxes'][i]
ann['bbox'] = [
float(boxes[j, k, 1]),
float(boxes[j, k, 0]),
float(boxes[j, k, 3] - boxes[j, k, 1]),
float(boxes[j, k, 2] - boxes[j, k, 0])]
if 'areas' in groundtruths:
ann['area'] = float(groundtruths['areas'][i][j, k])
else:
ann['area'] = float(
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
if 'areas' not in groundtruths:
ann['area'] = mask_api.area(encoded_mask)
gt_annotations.append(ann)
for i, ann in enumerate(gt_annotations):
ann['id'] = i + 1
if label_map:
gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
else:
category_ids = [gt['category_id'] for gt in gt_annotations]
gt_categories = [{'id': i} for i in set(category_ids)]
gt_dataset = {
'images': gt_images,
'categories': gt_categories,
'annotations': copy.deepcopy(gt_annotations),
}
return gt_dataset
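# Note (added): groundtruth boxes enter as [ymin, xmin, ymax, xmax] and are emitted
# as COCO [x, y, width, height]; when 'areas' is absent the area falls back to the
# box area, or to mask_api.area of the encoded mask when 'masks' is provided.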
class COCOGroundtruthGenerator(object):
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, num_examples, include_mask):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = tf.data.TFRecordDataset
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
Args:
example: a serialized tf.Example proto string.
Returns:
A dictionary of groundtruth with the following fields:
source_id: a scalar tensor of int64 representing the image source_id.
height: a scalar tensor of int64 representing the image height.
width: a scalar tensor of int64 representing the image width.
boxes: a float tensor of shape [K, 4], representing the groundtruth
boxes in absolute coordinates with respect to the original image size.
classes: an int64 tensor of shape [K], representing the class labels of
each instance.
is_crowds: a bool tensor of shape [K], indicating whether the instance
is crowd.
areas: a float tensor of shape [K], indicating the area of each
instance.
masks: a string tensor of shape [K], containing the bytes of the png
mask of each instance.
"""
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=self._include_mask)
decoded_tensors = decoder.decode(example)
image = decoded_tensors['image']
image_size = tf.shape(image)[0:2]
boxes = box_utils.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
groundtruths = {
'source_id': tf.string_to_number(
decoded_tensors['source_id'], out_type=tf.int64),
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
'boxes': boxes,
'classes': decoded_tensors['groundtruth_classes'],
'is_crowds': decoded_tensors['groundtruth_is_crowd'],
'areas': decoded_tensors['groundtruth_area'],
}
if self._include_mask:
groundtruths.update({
'masks': decoded_tensors['groundtruth_instance_masks_png'],
})
return groundtruths
def _build_pipeline(self):
"""Builds data pipeline to generate groundtruth annotations."""
dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
dataset = dataset.apply(
tf.data.experimental.parallel_interleave(
lambda filename: self._dataset_fn(filename).prefetch(1),
cycle_length=32,
sloppy=False))
dataset = dataset.map(self._parse_single_example, num_parallel_calls=64)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(1, drop_remainder=False)
return dataset
def __call__(self):
with tf.Graph().as_default():
dataset = self._build_pipeline()
groundtruth = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
for _ in range(self._num_examples):
groundtruth_result = sess.run(groundtruth)
yield groundtruth_result
def scan_and_generator_annotation_file(file_pattern,
num_samples,
include_mask,
annotation_file):
"""Scans and generate the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, num_samples, include_mask)
generate_annotation_file(groundtruth_generator, annotation_file)
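# Illustrative usage (added; paths and the sample count are assumed):
#   scan_and_generator_annotation_file(
#       file_pattern='/data/coco_val-*.tfrecord',
#       num_samples=5000,
#       include_mask=True,
#       annotation_file='/tmp/val_annotations.json')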
def generate_annotation_file(groundtruth_generator,
annotation_file):
"""Generates COCO-style annotation JSON file given a groundtruth generator."""
groundtruths = {}
logging.info('Loading groundtruth annotations from dataset to memory...')
for groundtruth in groundtruth_generator():
for k, v in six.iteritems(groundtruth):
if k not in groundtruths:
groundtruths[k] = [v]
else:
groundtruths[k].append(v)
gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)
logging.info('Saving groundtruth annotations to the JSON file...')
with tf.io.gfile.GFile(annotation_file, 'w') as f:
f.write(json.dumps(gt_dataset))
logging.info('Done saving the JSON file...')
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluator factory."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from official.vision.detection.evaluation import coco_evaluator
def evaluator_generator(params):
"""Generator function for various evaluators."""
if params.type == 'box':
evaluator = coco_evaluator.COCOEvaluator(
annotation_file=params.val_json_file, include_mask=False)
elif params.type == 'box_and_mask':
evaluator = coco_evaluator.COCOEvaluator(
annotation_file=params.val_json_file, include_mask=True)
elif params.type == 'oln_xclass_box':
evaluator = coco_evaluator.OlnXclassEvaluator(
annotation_file=params.val_json_file, include_mask=False,
use_category=False, seen_class=params.seen_class,)
elif params.type == 'oln_xclass_box_and_mask':
evaluator = coco_evaluator.OlnXclassEvaluator(
annotation_file=params.val_json_file, include_mask=True,
use_category=False, seen_class=params.seen_class,)
elif params.type == 'oln_xdata_box':
evaluator = coco_evaluator.OlnXdataEvaluator(
annotation_file=params.val_json_file, include_mask=False,
use_category=False, seen_class='all',)
elif params.type == 'shapemask_box_and_mask':
evaluator = coco_evaluator.ShapeMaskCOCOEvaluator(
mask_eval_class=params.mask_eval_class,
annotation_file=params.val_json_file, include_mask=True)
else:
raise ValueError('Evaluator %s is not supported.' % params.type)
return coco_evaluator.MetricWrapper(evaluator)
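# Illustrative sketch (added): `params` is expected to expose `type`, `val_json_file`
# and, depending on the evaluator type, `seen_class` or `mask_eval_class` as
# attributes, e.g. the eval section of the model config:
#   metric = evaluator_generator(params.eval)  # returns a coco_evaluator.MetricWrapper.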
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An executor class for running model on TensorFlow 2.0."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import tensorflow as tf
from official.vision.detection.executor import distributed_executor as executor
from official.vision.utils.object_detection import visualization_utils
class DetectionDistributedExecutor(executor.DistributedExecutor):
"""Detection specific customer training loop executor.
Subclasses the DistributedExecutor and adds support for numpy based metrics.
"""
def __init__(self,
predict_post_process_fn=None,
trainable_variables_filter=None,
**kwargs):
super(DetectionDistributedExecutor, self).__init__(**kwargs)
if predict_post_process_fn:
assert callable(predict_post_process_fn)
if trainable_variables_filter:
assert callable(trainable_variables_filter)
self._predict_post_process_fn = predict_post_process_fn
self._trainable_variables_filter = trainable_variables_filter
self.eval_steps = tf.Variable(
0,
trainable=False,
dtype=tf.int32,
synchronization=tf.VariableSynchronization.ON_READ,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
shape=[])
def _create_replicated_step(self,
strategy,
model,
loss_fn,
optimizer,
metric=None):
trainable_variables = model.trainable_variables
if self._trainable_variables_filter:
trainable_variables = self._trainable_variables_filter(
trainable_variables)
logging.info('Filter trainable variables from %d to %d',
len(model.trainable_variables), len(trainable_variables))
update_state_fn = lambda labels, outputs: None
if isinstance(metric, tf.keras.metrics.Metric):
update_state_fn = metric.update_state
else:
logging.error('Detection: train metric is not an instance of '
'tf.keras.metrics.Metric.')
def _replicated_step(inputs):
"""Replicated training step."""
inputs, labels = inputs
with tf.GradientTape() as tape:
outputs = model(inputs, training=True)
all_losses = loss_fn(labels, outputs)
losses = {}
for k, v in all_losses.items():
losses[k] = tf.reduce_mean(v)
per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync
update_state_fn(labels, outputs)
grads = tape.gradient(per_replica_loss, trainable_variables)
clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
optimizer.apply_gradients(zip(clipped_grads, trainable_variables))
return losses
return _replicated_step
def _create_test_step(self, strategy, model, metric):
"""Creates a distributed test step."""
@tf.function
def test_step(iterator, eval_steps):
"""Calculates evaluation metrics on distributed devices."""
def _test_step_fn(inputs, eval_steps):
"""Replicated accuracy calculation."""
inputs, labels = inputs
model_outputs = model(inputs, training=False)
if self._predict_post_process_fn:
labels, prediction_outputs = self._predict_post_process_fn(
labels, model_outputs)
num_remaining_visualizations = (
self._params.eval.num_images_to_visualize - eval_steps)
# If there is a remaining number of visualizations that need to be
# done, add the next batch's outputs for visualization.
#
# TODO(hongjunchoi): Once dynamic slicing is supported on TPU, only
# write correct slice of outputs to summary file.
if num_remaining_visualizations > 0:
visualization_utils.visualize_images_with_bounding_boxes(
inputs, prediction_outputs['detection_boxes'],
self.global_train_step, self.eval_summary_writer)
return labels, prediction_outputs
labels, outputs = strategy.run(
_test_step_fn, args=(
next(iterator),
eval_steps,
))
outputs = tf.nest.map_structure(strategy.experimental_local_results,
outputs)
labels = tf.nest.map_structure(strategy.experimental_local_results,
labels)
eval_steps.assign_add(self._params.eval.batch_size)
return labels, outputs
return test_step
def _run_evaluation(self, test_step, current_training_step, metric,
test_iterator):
"""Runs validation steps and aggregate metrics."""
self.eval_steps.assign(0)
if not test_iterator or not metric:
logging.warning(
'Both test_iterator (%s) and metrics (%s) must not be None.',
test_iterator, metric)
return None
logging.info('Running evaluation after step: %s.', current_training_step)
while True:
try:
labels, outputs = test_step(test_iterator, self.eval_steps)
if metric:
metric.update_state(labels, outputs)
except (StopIteration, tf.errors.OutOfRangeError):
break
metric_result = metric.result()
if isinstance(metric, tf.keras.metrics.Metric):
metric_result = tf.nest.map_structure(lambda x: x.numpy().astype(float),
metric_result)
logging.info('Step: [%d] Validation metric = %s', current_training_step,
metric_result)
return metric_result
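# Note (added): `self.eval_steps` above is advanced by the eval batch size after each
# distributed test step; inside the step it is compared against
# `params.eval.num_images_to_visualize` to decide whether the current batch should
# still be rendered with bounding boxes and written to the eval summary.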
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Custom training loop for running TensorFlow 2.0 models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
from typing import Optional, Dict, List, Text, Callable, Union, Iterator, Any
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.common import distribute_utils
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
strategy_flags_dict = hyperparams_flags.strategy_flags_dict
hparam_flags_dict = hyperparams_flags.hparam_flags_dict
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
"""Saves model to model_dir with provided checkpoint prefix."""
checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
saved_path = checkpoint.save(checkpoint_path)
logging.info('Saving model as TF checkpoint: %s', saved_path)
def _steps_to_run(current_step, total_steps, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be a positive integer.')
return min(total_steps - current_step, steps_per_loop)
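# Example (added): with current_step=90, total_steps=100 and steps_per_loop=25 this
# returns min(100 - 90, 25) == 10, so the final host loop is shortened accordingly.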
def _no_metric():
return None
def metrics_as_dict(metric):
"""Puts input metric(s) into a list.
Args:
metric: metric(s) to be put into the list. `metric` could be an object, a
list, or a dict of tf.keras.metrics.Metric or has the `required_method`.
Returns:
A dictionary of valid metrics.
"""
if isinstance(metric, tf.keras.metrics.Metric):
metrics = {metric.name: metric}
elif isinstance(metric, list):
metrics = {m.name: m for m in metric}
elif isinstance(metric, dict):
metrics = metric
elif not metric:
return {}
else:
metrics = {'metric': metric}
return metrics
def metric_results(metric):
"""Collects results from the given metric(s)."""
metrics = metrics_as_dict(metric)
metric_result = {
name: m.result().numpy().astype(float) for name, m in metrics.items()
}
return metric_result
def reset_states(metric):
"""Resets states of the given metric(s)."""
metrics = metrics_as_dict(metric)
for m in metrics.values():
m.reset_states()
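# Illustrative sketch (added): metrics_as_dict normalizes the accepted forms into one
# dictionary so metric_results and reset_states can iterate uniformly, e.g.:
#   metrics_as_dict(tf.keras.metrics.Mean(name='loss'))  # -> {'loss': <Mean>}
#   metrics_as_dict([tf.keras.metrics.Mean(name='a'),
#                    tf.keras.metrics.Mean(name='b')])   # -> {'a': ..., 'b': ...}
#   metrics_as_dict(None)                                # -> {}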
class SummaryWriter(object):
"""Simple SummaryWriter for writing dictionary of metrics.
Attributes:
writer: The tf.SummaryWriter.
"""
def __init__(self, model_dir: Text, name: Text):
"""Inits SummaryWriter with paths.
Args:
model_dir: the model folder path.
name: the summary subfolder name.
"""
self.writer = tf.summary.create_file_writer(os.path.join(model_dir, name))
def __call__(self, metrics: Union[Dict[Text, float], float], step: int):
"""Write metrics to summary with the given writer.
Args:
metrics: a dictionary of metric values, or a single scalar (a dictionary is
preferred).
step: integer. The training step.
"""
if not isinstance(metrics, dict):
# Support scalar metric without name.
logging.warning('Summary writer prefers metrics as a dictionary.')
metrics = {'metric': metrics}
with self.writer.as_default():
for k, v in metrics.items():
tf.summary.scalar(k, v, step=step)
self.writer.flush()
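# Illustrative usage (added; the model directory is assumed):
#   writer = SummaryWriter('/tmp/model_dir', 'eval_test')
#   writer({'AP': 0.37, 'AP50': 0.58}, step=1000)  # writes one scalar per key.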
class DistributedExecutor(object):
"""Interface to train and eval models with tf.distribute.Strategy."""
def __init__(self, strategy, params, model_fn, loss_fn, is_multi_host=False):
"""Constructor.
Args:
strategy: an instance of tf.distribute.Strategy.
params: Model configuration needed to run distribution strategy.
model_fn: Keras model function. Signature:
(params: ParamsDict) -> tf.keras.models.Model.
loss_fn: loss function. Signature:
(y_true: Tensor, y_pred: Tensor) -> Tensor
is_multi_host: Set to True when using multi hosts for training, like multi
worker GPU or TPU pod (slice). Otherwise, False.
"""
self._params = params
self._model_fn = model_fn
self._loss_fn = loss_fn
self._strategy = strategy
self._checkpoint_name = 'ctl_step_{step}.ckpt'
self._is_multi_host = is_multi_host
self.train_summary_writer = None
self.eval_summary_writer = None
self.global_train_step = None
@property
def checkpoint_name(self):
"""Returns default checkpoint name."""
return self._checkpoint_name
@checkpoint_name.setter
def checkpoint_name(self, name):
"""Sets default summary writer for the current thread."""
self._checkpoint_name = name
def loss_fn(self):
return self._loss_fn()
def model_fn(self, params):
return self._model_fn(params)
def _save_config(self, model_dir):
"""Save parameters to config files if model_dir is defined."""
logging.info('Save config to model_dir %s.', model_dir)
if model_dir:
if not tf.io.gfile.exists(model_dir):
tf.io.gfile.makedirs(model_dir)
self._params.lock()
params_dict.save_params_dict_to_yaml(self._params,
model_dir + '/params.yaml')
else:
logging.warning('model_dir is empty, so skip the save config.')
def _get_input_iterator(
self, input_fn: Callable[..., tf.data.Dataset],
strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
"""Returns distributed dataset iterator.
Args:
input_fn: (params: dict) -> tf.data.Dataset.
strategy: an instance of tf.distribute.Strategy.
Returns:
An iterator that yields input tensors.
"""
if input_fn is None:
return None
# When training with multiple TPU workers, datasets need to be cloned
# across workers. Since a Dataset instance cannot be cloned in eager mode,
# we instead pass a callable that returns a dataset.
if self._is_multi_host:
return iter(strategy.distribute_datasets_from_function(input_fn))
else:
input_data = input_fn()
return iter(strategy.experimental_distribute_dataset(input_data))
def _create_replicated_step(self,
strategy,
model,
loss_fn,
optimizer,
metric=None):
"""Creates a single training step.
Args:
strategy: an instance of tf.distribute.Strategy.
model: (Tensor, bool) -> Tensor. model function.
loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
optimizer: tf.keras.optimizers.Optimizer.
metric: tf.keras.metrics.Metric subclass.
Returns:
The training step callable.
"""
metrics = metrics_as_dict(metric)
def _replicated_step(inputs):
"""Replicated training step."""
inputs, labels = inputs
with tf.GradientTape() as tape:
outputs = model(inputs, training=True)
prediction_loss = loss_fn(labels, outputs)
loss = tf.reduce_mean(prediction_loss)
loss = loss / strategy.num_replicas_in_sync
for m in metrics.values():
m.update_state(labels, outputs)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return loss
return _replicated_step
def _create_train_step(self,
strategy,
model,
loss_fn,
optimizer,
metric=None):
"""Creates a distributed training step.
Args:
strategy: an instance of tf.distribute.Strategy.
model: (Tensor, bool) -> Tensor. model function.
loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
optimizer: tf.keras.optimizers.Optimizer.
metric: tf.keras.metrics.Metric subclass.
Returns:
The training step callable.
"""
replicated_step = self._create_replicated_step(strategy, model, loss_fn,
optimizer, metric)
@tf.function
def train_step(iterator, num_steps):
"""Performs a distributed training step.
Args:
iterator: an iterator that yields input tensors.
num_steps: the number of steps in the loop.
Returns:
The loss tensor.
"""
if not isinstance(num_steps, tf.Tensor):
raise ValueError('num_steps should be a Tensor; a Python object may cause '
'retracing.')
per_replica_losses = strategy.run(replicated_step, args=(next(iterator),))
for _ in tf.range(num_steps - 1):
per_replica_losses = strategy.run(
replicated_step, args=(next(iterator),))
# For reporting, we return the mean of losses.
losses = tf.nest.map_structure(
lambda x: strategy.reduce(tf.distribute.ReduceOp.MEAN, x, axis=None),
per_replica_losses)
return losses
return train_step
def _create_test_step(self, strategy, model, metric):
"""Creates a distributed test step."""
metrics = metrics_as_dict(metric)
@tf.function
def test_step(iterator):
"""Calculates evaluation metrics on distributed devices."""
if not metric:
logging.info('Skip test_step because metric is None (%s)', metric)
return None, None
def _test_step_fn(inputs):
"""Replicated accuracy calculation."""
inputs, labels = inputs
model_outputs = model(inputs, training=False)
for m in metrics.values():
m.update_state(labels, model_outputs)
return labels, model_outputs
return strategy.run(_test_step_fn, args=(next(iterator),))
return test_step
def train(
self,
train_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
eval_input_fn: Optional[Callable[[params_dict.ParamsDict],
tf.data.Dataset]] = None,
model_dir: Optional[Text] = None,
total_steps: int = 1,
iterations_per_loop: int = 1,
train_metric_fn: Optional[Callable[[], Any]] = None,
eval_metric_fn: Optional[Callable[[], Any]] = None,
summary_writer_fn: Callable[[Text, Text], SummaryWriter] = SummaryWriter,
init_checkpoint: Optional[Callable[[tf.keras.Model], Any]] = None,
custom_callbacks: Optional[List[tf.keras.callbacks.Callback]] = None,
continuous_eval: bool = False,
save_config: bool = True):
"""Runs distributed training.
Args:
train_input_fn: (params: dict) -> tf.data.Dataset training data input
function.
eval_input_fn: (Optional) same type as train_input_fn. If not None, will
trigger evaluating metric on eval data. If None, will not run the eval
step.
model_dir: the folder path for model checkpoints.
total_steps: total training steps.
iterations_per_loop: train steps per loop. After each loop, this job will
update metrics like loss and save checkpoint.
train_metric_fn: metric_fn for evaluation in train_step.
eval_metric_fn: metric_fn for evaluation in test_step.
summary_writer_fn: function to create summary writer.
init_checkpoint: function to load checkpoint.
custom_callbacks: A list of Keras Callbacks objects to run during
training. More specifically, the `on_batch_begin()` and `on_batch_end()`
methods are invoked during training.
continuous_eval: If `True`, will continuously run evaluation on every
available checkpoint. If `False`, will do the evaluation once after the
final step.
save_config: bool. Whether to save params to model_dir.
Returns:
The training loss and eval metrics.
"""
assert train_input_fn is not None
if train_metric_fn and not callable(train_metric_fn):
raise ValueError('if `train_metric_fn` is specified, '
'train_metric_fn must be a callable.')
if eval_metric_fn and not callable(eval_metric_fn):
raise ValueError('if `eval_metric_fn` is specified, '
'eval_metric_fn must be a callable.')
train_metric_fn = train_metric_fn or _no_metric
eval_metric_fn = eval_metric_fn or _no_metric
if custom_callbacks and iterations_per_loop != 1:
logging.warning(
'It is semantically wrong to run callbacks when '
'iterations_per_loop is not one (%s)', iterations_per_loop)
custom_callbacks = custom_callbacks or []
def _run_callbacks_on_batch_begin(batch):
"""Runs custom callbacks at the start of every step."""
if not custom_callbacks:
return
for callback in custom_callbacks:
if callback:
callback.on_batch_begin(batch)
def _run_callbacks_on_batch_end(batch):
"""Runs custom callbacks at the end of every step."""
if not custom_callbacks:
return
for callback in custom_callbacks:
if callback:
callback.on_batch_end(batch)
if save_config:
self._save_config(model_dir)
if FLAGS.save_checkpoint_freq:
save_freq = FLAGS.save_checkpoint_freq
else:
save_freq = iterations_per_loop
params = self._params
strategy = self._strategy
# To reduce unnecessary send/receive input pipeline operation, we place
# input pipeline ops in worker task.
train_iterator = self._get_input_iterator(train_input_fn, strategy)
train_loss = None
train_metric_result = None
eval_metric_result = None
tf.keras.backend.set_learning_phase(1)
with strategy.scope():
# To correctly place the model weights on accelerators,
# model and optimizer should be created in scope.
model = self.model_fn(params.as_dict())
if not hasattr(model, 'optimizer'):
raise ValueError('User should set the optimizer attribute on the model '
'inside `model_fn`.')
optimizer = model.optimizer
# Training loop starts here.
checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
initial_step = 0
if latest_checkpoint_file:
logging.info(
'Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(latest_checkpoint_file)
initial_step = optimizer.iterations.numpy()
logging.info('Loading from checkpoint file completed. Init step %d',
initial_step)
elif init_checkpoint:
logging.info('Restoring from init checkpoint function')
init_checkpoint(model)
logging.info('Loading from init checkpoint file completed')
current_step = optimizer.iterations.numpy()
checkpoint_name = self.checkpoint_name
eval_metric = eval_metric_fn()
train_metric = train_metric_fn()
train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
self.train_summary_writer = train_summary_writer.writer
test_summary_writer = summary_writer_fn(model_dir, 'eval_test')
self.eval_summary_writer = test_summary_writer.writer
# Use training summary writer in TimeHistory if it's in use
for cb in custom_callbacks:
if isinstance(cb, keras_utils.TimeHistory):
cb.summary_writer = self.train_summary_writer
# Continue training loop.
train_step = self._create_train_step(
strategy=strategy,
model=model,
loss_fn=self.loss_fn(),
optimizer=optimizer,
metric=train_metric)
test_step = None
if eval_input_fn and eval_metric:
self.global_train_step = model.optimizer.iterations
test_step = self._create_test_step(strategy, model, metric=eval_metric)
# Step-0 operations
if current_step == 0 and not latest_checkpoint_file:
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if test_step:
eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
eval_metric_result = self._run_evaluation(test_step, current_step,
eval_metric, eval_iterator)
logging.info('Step: %s evaluation metric = %s.', current_step,
eval_metric_result)
test_summary_writer(metrics=eval_metric_result, step=optimizer.iterations)
reset_states(eval_metric)
logging.info('Training started')
last_save_checkpoint_step = current_step
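# Each pass through this loop runs up to `iterations_per_loop` training steps
# inside a single call to the compiled `train_step`, then logs results, writes
# summaries, and checkpoints / evaluates at the loop boundary.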
while current_step < total_steps:
num_steps = _steps_to_run(current_step, total_steps, iterations_per_loop)
_run_callbacks_on_batch_begin(current_step)
train_loss = train_step(train_iterator,
tf.convert_to_tensor(num_steps, dtype=tf.int32))
current_step += num_steps
train_loss = tf.nest.map_structure(lambda x: x.numpy().astype(float),
train_loss)
_run_callbacks_on_batch_end(current_step - 1)
if not isinstance(train_loss, dict):
train_loss = {'total_loss': train_loss}
if np.isnan(train_loss['total_loss']):
raise ValueError('total loss is NaN.')
if train_metric:
train_metric_result = metric_results(train_metric)
train_metric_result.update(train_loss)
else:
train_metric_result = train_loss
if callable(optimizer.lr):
train_metric_result.update(
{'learning_rate': optimizer.lr(current_step).numpy()})
else:
train_metric_result.update({'learning_rate': optimizer.lr.numpy()})
logging.info('Train Step: %d/%d / loss = %s / training metric = %s',
current_step, total_steps, train_loss, train_metric_result)
train_summary_writer(
metrics=train_metric_result, step=optimizer.iterations)
# Saves model checkpoints and runs validation steps at every
# iterations_per_loop steps.
# To avoid repeated model saving, we do not save after the last
# step of training.
if save_freq > 0 and current_step < total_steps and (
current_step - last_save_checkpoint_step) >= save_freq:
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
last_save_checkpoint_step = current_step
if continuous_eval and current_step < total_steps and test_step:
eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
eval_metric_result = self._run_evaluation(test_step, current_step,
eval_metric, eval_iterator)
logging.info('Step: %s evaluation metric = %s.', current_step,
eval_metric_result)
test_summary_writer(
metrics=eval_metric_result, step=optimizer.iterations)
# Re-initialize the evaluation metric, except after the last step.
if eval_metric and current_step < total_steps:
reset_states(eval_metric)
if train_metric and current_step < total_steps:
reset_states(train_metric)
# Reaches the end of training and saves the last checkpoint.
if last_save_checkpoint_step < total_steps:
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if test_step:
logging.info('Running final evaluation after training is complete.')
eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
eval_metric_result = self._run_evaluation(test_step, current_step,
eval_metric, eval_iterator)
logging.info('Final evaluation metric = %s.', eval_metric_result)
test_summary_writer(metrics=eval_metric_result, step=optimizer.iterations)
self.train_summary_writer.close()
self.eval_summary_writer.close()
return train_metric_result, eval_metric_result
def _run_evaluation(self, test_step, current_training_step, metric,
test_iterator):
"""Runs validation steps and aggregate metrics."""
if not test_iterator or not metric:
logging.warning(
'Both test_iterator (%s) and metric (%s) must not be None.',
test_iterator, metric)
return None
logging.info('Running evaluation after step: %s.', current_training_step)
eval_step = 0
while True:
try:
with tf.experimental.async_scope():
test_step(test_iterator)
eval_step += 1
except (StopIteration, tf.errors.OutOfRangeError):
tf.experimental.async_clear_error()
break
metric_result = metric_results(metric)
logging.info('Total eval steps: [%d]', eval_step)
logging.info('At training step: [%r] Validation metric = %r',
current_training_step, metric_result)
return metric_result
def evaluate_from_model_dir(
self,
model_dir: Text,
eval_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
eval_metric_fn: Callable[[], Any],
total_steps: int = -1,
eval_timeout: Optional[int] = None,
min_eval_interval: int = 180,
summary_writer_fn: Callable[[Text, Text], SummaryWriter] = SummaryWriter):
"""Runs distributed evaluation on model folder.
Args:
model_dir: the folder for storing model checkpoints.
eval_input_fn: (Optional) same type as train_input_fn. If not None, will
trigger evaluating metrics on the eval data. If None, will not run eval step.
eval_metric_fn: metric_fn for evaluation in test_step.
total_steps: total training steps. If the current step reaches the
total_steps, the evaluation loop will stop.
eval_timeout: The maximum number of seconds to wait between checkpoints.
If left as None, then the process will wait indefinitely. Used by
tf.train.checkpoints_iterator.
min_eval_interval: The minimum number of seconds between yielding
checkpoints. Used by tf.train.checkpoints_iterator.
summary_writer_fn: function to create summary writer.
Returns:
Eval metrics dictionary of the last checkpoint.
"""
if not model_dir:
raise ValueError('model_dir must be set.')
def terminate_eval():
logging.info('Terminating eval after %d seconds of no new checkpoints.',
eval_timeout)
return True
summary_writer = summary_writer_fn(model_dir, 'eval')
self.eval_summary_writer = summary_writer.writer
# Read checkpoints from the given model directory
# until `eval_timeout` seconds elapse with no new checkpoint.
for checkpoint_path in tf.train.checkpoints_iterator(
model_dir,
min_interval_secs=min_eval_interval,
timeout=eval_timeout,
timeout_fn=terminate_eval):
eval_metric_result, current_step = self.evaluate_checkpoint(
checkpoint_path=checkpoint_path,
eval_input_fn=eval_input_fn,
eval_metric_fn=eval_metric_fn,
summary_writer=summary_writer)
if total_steps > 0 and current_step >= total_steps:
logging.info('Evaluation finished after training step %d', current_step)
break
return eval_metric_result
def evaluate_checkpoint(self,
checkpoint_path: Text,
eval_input_fn: Callable[[params_dict.ParamsDict],
tf.data.Dataset],
eval_metric_fn: Callable[[], Any],
summary_writer: Optional[SummaryWriter] = None):
"""Runs distributed evaluation on the one checkpoint.
Args:
checkpoint_path: the checkpoint to evaluate.
eval_input_fn: (Optional) same type as train_input_fn. If not None, will
trigger evaluating metrics on the eval data. If None, will not run eval step.
eval_metric_fn: metric_fn for evaluation in test_step.
summary_writer: function to create summary writer.
Returns:
Eval metrics dictionary of the last checkpoint.
"""
if not callable(eval_metric_fn):
raise ValueError('if `eval_metric_fn` is specified, '
'eval_metric_fn must be a callable.')
old_phase = tf.keras.backend.learning_phase()
tf.keras.backend.set_learning_phase(0)
params = self._params
strategy = self._strategy
# To reduce unnecessary send/receive of input pipeline ops, we place the
# input pipeline ops on the worker task.
with strategy.scope():
# To correctly place the model weights on accelerators,
# model and optimizer should be created in scope.
model = self.model_fn(params.as_dict())
checkpoint = tf.train.Checkpoint(model=model)
eval_metric = eval_metric_fn()
assert eval_metric, 'eval_metric does not exist'
test_step = self._create_test_step(strategy, model, metric=eval_metric)
logging.info('Starting to evaluate.')
if not checkpoint_path:
raise ValueError('checkpoint path is empty')
reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
current_step = reader.get_tensor(
'optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE')
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', checkpoint_path)
status = checkpoint.restore(checkpoint_path)
status.expect_partial().assert_existing_objects_matched()
self.global_train_step = model.optimizer.iterations
eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
eval_metric_result = self._run_evaluation(test_step, current_step,
eval_metric, eval_iterator)
logging.info('Step: %s evaluation metric = %s.', current_step,
eval_metric_result)
summary_writer(metrics=eval_metric_result, step=current_step)
reset_states(eval_metric)
tf.keras.backend.set_learning_phase(old_phase)
return eval_metric_result, current_step
def predict(self):
raise NotImplementedError('Unimplemented function.')
class ExecutorBuilder(object):
"""Builder of DistributedExecutor.
Example 1: Builds an executor with supported Strategy.
builder = ExecutorBuilder(
strategy_type='tpu',
strategy_config={'tpu': '/bns/xxx'})
dist_executor = builder.build_executor(
params=params,
model_fn=my_model_fn,
loss_fn=my_loss_fn,
metric_fn=my_metric_fn)
Example 2: Builds an executor with customized Strategy.
builder = ExecutorBuilder()
builder.strategy = <some customized Strategy>
dist_executor = builder.build_executor(
params=params,
model_fn=my_model_fn,
loss_fn=my_loss_fn,
metric_fn=my_metric_fn)
Example 3: Builds a customized executor with customized Strategy.
class MyDistributedExecutor(DistributedExecutor):
# implementation ...
builder = ExecutorBuilder()
builder.strategy = <some customized Strategy>
dist_executor = builder.build_executor(
class_ctor=MyDistributedExecutor,
params=params,
model_fn=my_model_fn,
loss_fn=my_loss_fn,
metric_fn=my_metric_fn)
"""
def __init__(self, strategy_type=None, strategy_config=None):
"""Constructor.
Args:
strategy_type: string. One of 'tpu', 'mirrored', 'multi_worker_mirrored'.
If None, the user is responsible to set the strategy before calling
build_executor(...).
strategy_config: necessary config for constructing the proper Strategy.
Check strategy_flags_dict() for examples of the structure.
"""
_ = distribute_utils.configure_cluster(strategy_config.worker_hosts,
strategy_config.task_index)
self._strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=strategy_type,
num_gpus=strategy_config.num_gpus,
all_reduce_alg=strategy_config.all_reduce_alg,
num_packs=strategy_config.num_packs,
tpu_address=strategy_config.tpu)
@property
def strategy(self):
"""Returns default checkpoint name."""
return self._strategy
@strategy.setter
def strategy(self, new_strategy):
"""Sets default summary writer for the current thread."""
self._strategy = new_strategy
def build_executor(self,
class_ctor=DistributedExecutor,
params=None,
model_fn=None,
loss_fn=None,
**kwargs):
"""Creates an executor according to strategy type.
See the docstring of DistributedExecutor.__init__ for more information on
the input arguments.
Args:
class_ctor: A constructor of executor (default: DistributedExecutor).
params: ParamsDict, all the model parameters and runtime parameters.
model_fn: Keras model function.
loss_fn: loss function.
**kwargs: other arguments to the executor constructor.
Returns:
An instance of DistributedExecutor or its subclass.
"""
if self._strategy is None:
raise ValueError('`strategy` should not be None. You need to specify '
'`strategy_type` in the builder constructor or directly '
'set the `strategy` property of the builder.')
return class_ctor(
strategy=self._strategy,
params=params,
model_fn=model_fn,
loss_fn=loss_fn,
**kwargs)
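# Illustrative usage sketch: continuous evaluation over checkpoints produced
# by a separate training job. The `my_*` functions are placeholder names.
#
#   dist_executor = builder.build_executor(
#       params=params, model_fn=my_model_fn, loss_fn=my_loss_fn)
#   metrics = dist_executor.evaluate_from_model_dir(
#       model_dir=params.model_dir,
#       eval_input_fn=my_eval_input_fn,
#       eval_metric_fn=my_eval_metric_fn,
#       total_steps=params.train.total_steps,
#       eval_timeout=3600,
#       min_eval_interval=180)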
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Main function to train various object detection models."""
import functools
import pprint
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from official.common import distribute_utils
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.vision.detection.configs import factory as config_factory
from official.vision.detection.dataloader import input_reader
from official.vision.detection.dataloader import mode_keys as ModeKeys
from official.vision.detection.executor import distributed_executor as executor
from official.vision.detection.executor.detection_executor import DetectionDistributedExecutor
from official.vision.detection.modeling import factory as model_factory
hyperparams_flags.initialize_common_flags()
flags_core.define_log_steps()
flags.DEFINE_bool('enable_xla', default=False, help='Enable XLA for GPU')
flags.DEFINE_string(
'mode',
default='train',
help='Mode to run: `train`, `eval` or `eval_once`.')
flags.DEFINE_string(
'model', default='retinanet',
help='Model to run: `retinanet`, `mask_rcnn` or `shapemask`.')
flags.DEFINE_string('training_file_pattern', None,
'Location of the train data.')
flags.DEFINE_string('eval_file_pattern', None, 'Location of the eval data.')
flags.DEFINE_string(
'checkpoint_path', None,
'The checkpoint path to eval. Only used in eval_once mode.')
FLAGS = flags.FLAGS
def run_executor(params,
mode,
checkpoint_path=None,
train_input_fn=None,
eval_input_fn=None,
callbacks=None,
prebuilt_strategy=None):
"""Runs the object detection model on distribution strategy defined by the user."""
if params.architecture.use_bfloat16:
tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')
model_builder = model_factory.model_generator(params)
if prebuilt_strategy is not None:
strategy = prebuilt_strategy
else:
strategy_config = params.strategy_config
distribute_utils.configure_cluster(strategy_config.worker_hosts,
strategy_config.task_index)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.strategy_type,
num_gpus=strategy_config.num_gpus,
all_reduce_alg=strategy_config.all_reduce_alg,
num_packs=strategy_config.num_packs,
tpu_address=strategy_config.tpu)
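# Estimate the number of worker hosts assuming 8 replicas per host; the
# `+ 7` makes this a ceiling division.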
num_workers = int(strategy.num_replicas_in_sync + 7) // 8
is_multi_host = (int(num_workers) >= 2)
if mode == 'train':
def _model_fn(params):
return model_builder.build_model(params, mode=ModeKeys.TRAIN)
logging.info(
'Train num_replicas_in_sync %d num_workers %d is_multi_host %s',
strategy.num_replicas_in_sync, num_workers, is_multi_host)
dist_executor = DetectionDistributedExecutor(
strategy=strategy,
params=params,
model_fn=_model_fn,
loss_fn=model_builder.build_loss_fn,
is_multi_host=is_multi_host,
predict_post_process_fn=model_builder.post_processing,
trainable_variables_filter=model_builder
.make_filter_trainable_variables_fn())
if is_multi_host:
train_input_fn = functools.partial(
train_input_fn,
batch_size=params.train.batch_size // strategy.num_replicas_in_sync)
return dist_executor.train(
train_input_fn=train_input_fn,
model_dir=params.model_dir,
iterations_per_loop=params.train.iterations_per_loop,
total_steps=params.train.total_steps,
init_checkpoint=model_builder.make_restore_checkpoint_fn(),
custom_callbacks=callbacks,
save_config=True)
elif mode == 'eval' or mode == 'eval_once':
def _model_fn(params):
return model_builder.build_model(params, mode=ModeKeys.PREDICT_WITH_GT)
logging.info('Eval num_replicas_in_sync %d num_workers %d is_multi_host %s',
strategy.num_replicas_in_sync, num_workers, is_multi_host)
if is_multi_host:
eval_input_fn = functools.partial(
eval_input_fn,
batch_size=params.eval.batch_size // strategy.num_replicas_in_sync)
dist_executor = DetectionDistributedExecutor(
strategy=strategy,
params=params,
model_fn=_model_fn,
loss_fn=model_builder.build_loss_fn,
is_multi_host=is_multi_host,
predict_post_process_fn=model_builder.post_processing,
trainable_variables_filter=model_builder
.make_filter_trainable_variables_fn())
if mode == 'eval':
results = dist_executor.evaluate_from_model_dir(
model_dir=params.model_dir,
eval_input_fn=eval_input_fn,
eval_metric_fn=model_builder.eval_metrics,
eval_timeout=params.eval.eval_timeout,
min_eval_interval=params.eval.min_eval_interval,
total_steps=params.train.total_steps)
else:
# Run evaluation once for a single checkpoint.
if not checkpoint_path:
raise ValueError('checkpoint_path cannot be empty.')
if tf.io.gfile.isdir(checkpoint_path):
checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
summary_writer = executor.SummaryWriter(params.model_dir, 'eval')
results, _ = dist_executor.evaluate_checkpoint(
checkpoint_path=checkpoint_path,
eval_input_fn=eval_input_fn,
eval_metric_fn=model_builder.eval_metrics,
summary_writer=summary_writer)
for k, v in results.items():
logging.info('Final eval metric %s: %f', k, v)
return results
else:
raise ValueError('Mode not found: %s.' % mode)
def run(callbacks=None):
keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)
params = config_factory.config_generator(FLAGS.model)
params = params_dict.override_params_dict(
params, FLAGS.config_file, is_strict=True)
params = params_dict.override_params_dict(
params, FLAGS.params_override, is_strict=True)
params.override(
{
'strategy_type': FLAGS.strategy_type,
'model_dir': FLAGS.model_dir,
'strategy_config': executor.strategy_flags_dict(),
},
is_strict=False)
# Make sure use_tpu and strategy_type are in sync.
params.use_tpu = (params.strategy_type == 'tpu')
if not params.use_tpu:
params.override({
'architecture': {
'use_bfloat16': False,
},
'norm_activation': {
'use_sync_bn': False,
},
}, is_strict=True)
params.validate()
params.lock()
pp = pprint.PrettyPrinter()
params_str = pp.pformat(params.as_dict())
logging.info('Model Parameters: %s', params_str)
train_input_fn = None
eval_input_fn = None
training_file_pattern = FLAGS.training_file_pattern or params.train.train_file_pattern
eval_file_pattern = FLAGS.eval_file_pattern or params.eval.eval_file_pattern
if not training_file_pattern and not eval_file_pattern:
raise ValueError('Must provide at least one of training_file_pattern and '
'eval_file_pattern.')
if training_file_pattern:
# Use global batch size for single host.
train_input_fn = input_reader.InputFn(
file_pattern=training_file_pattern,
params=params,
mode=input_reader.ModeKeys.TRAIN,
batch_size=params.train.batch_size)
if eval_file_pattern:
eval_input_fn = input_reader.InputFn(
file_pattern=eval_file_pattern,
params=params,
mode=input_reader.ModeKeys.PREDICT_WITH_GT,
batch_size=params.eval.batch_size,
num_examples=params.eval.eval_samples)
if callbacks is None:
callbacks = []
if FLAGS.log_steps:
callbacks.append(
keras_utils.TimeHistory(
batch_size=params.train.batch_size,
log_steps=FLAGS.log_steps,
))
return run_executor(
params,
FLAGS.mode,
checkpoint_path=FLAGS.checkpoint_path,
train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn,
callbacks=callbacks)
def main(argv):
del argv # Unused.
run()
if __name__ == '__main__':
tf.config.set_soft_device_placement(True)
app.run(main)
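# Illustrative invocations (script name, paths, and flag values below are
# placeholders):
#
#   python3 main.py --mode=train --model=retinanet \
#       --training_file_pattern=/path/to/train-* \
#       --eval_file_pattern=/path/to/val-* \
#       --model_dir=/tmp/retinanet \
#       --strategy_type=mirrored
#
#   python3 main.py --mode=eval_once --model=retinanet \
#       --eval_file_pattern=/path/to/val-* \
#       --model_dir=/tmp/retinanet \
#       --checkpoint_path=/tmp/retinanet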
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model architecture factory."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from official.vision.detection.modeling.architecture import fpn
from official.vision.detection.modeling.architecture import heads
from official.vision.detection.modeling.architecture import identity
from official.vision.detection.modeling.architecture import nn_ops
from official.vision.detection.modeling.architecture import resnet
from official.vision.detection.modeling.architecture import spinenet
def norm_activation_generator(params):
return nn_ops.norm_activation_builder(
momentum=params.batch_norm_momentum,
epsilon=params.batch_norm_epsilon,
trainable=params.batch_norm_trainable,
activation=params.activation)
def backbone_generator(params):
"""Generator function for various backbone models."""
if params.architecture.backbone == 'resnet':
resnet_params = params.resnet
backbone_fn = resnet.Resnet(
resnet_depth=resnet_params.resnet_depth,
activation=params.norm_activation.activation,
norm_activation=norm_activation_generator(
params.norm_activation))
elif params.architecture.backbone == 'spinenet':
spinenet_params = params.spinenet
backbone_fn = spinenet.SpineNetBuilder(model_id=spinenet_params.model_id)
else:
raise ValueError('Backbone model `{}` is not supported.'
.format(params.architecture.backbone))
return backbone_fn
def multilevel_features_generator(params):
"""Generator function for various FPN models."""
if params.architecture.multilevel_features == 'fpn':
fpn_params = params.fpn
fpn_fn = fpn.Fpn(
min_level=params.architecture.min_level,
max_level=params.architecture.max_level,
fpn_feat_dims=fpn_params.fpn_feat_dims,
use_separable_conv=fpn_params.use_separable_conv,
activation=params.norm_activation.activation,
use_batch_norm=fpn_params.use_batch_norm,
norm_activation=norm_activation_generator(
params.norm_activation))
elif params.architecture.multilevel_features == 'identity':
fpn_fn = identity.Identity()
else:
raise ValueError('The multi-level feature model `{}` is not supported.'
.format(params.architecture.multilevel_features))
return fpn_fn
def retinanet_head_generator(params):
"""Generator function for RetinaNet head architecture."""
head_params = params.retinanet_head
anchors_per_location = params.anchor.num_scales * len(
params.anchor.aspect_ratios)
return heads.RetinanetHead(
params.architecture.min_level,
params.architecture.max_level,
params.architecture.num_classes,
anchors_per_location,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
norm_activation=norm_activation_generator(params.norm_activation))
def rpn_head_generator(params):
"""Generator function for RPN head architecture."""
head_params = params.rpn_head
anchors_per_location = params.anchor.num_scales * len(
params.anchor.aspect_ratios)
return heads.RpnHead(
params.architecture.min_level,
params.architecture.max_level,
anchors_per_location,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def oln_rpn_head_generator(params):
"""Generator function for OLN-proposal (OLN-RPN) head architecture."""
head_params = params.rpn_head
anchors_per_location = params.anchor.num_scales * len(
params.anchor.aspect_ratios)
return heads.OlnRpnHead(
params.architecture.min_level,
params.architecture.max_level,
anchors_per_location,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def fast_rcnn_head_generator(params):
"""Generator function for Fast R-CNN head architecture."""
head_params = params.frcnn_head
return heads.FastrcnnHead(
params.architecture.num_classes,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
head_params.num_fcs,
head_params.fc_dims,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def oln_box_score_head_generator(params):
"""Generator function for Scoring Fast R-CNN head architecture."""
head_params = params.frcnn_head
return heads.OlnBoxScoreHead(
params.architecture.num_classes,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
head_params.num_fcs,
head_params.fc_dims,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def mask_rcnn_head_generator(params):
"""Generator function for Mask R-CNN head architecture."""
head_params = params.mrcnn_head
return heads.MaskrcnnHead(
params.architecture.num_classes,
params.architecture.mask_target_size,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def oln_mask_score_head_generator(params):
"""Generator function for Scoring Mask R-CNN head architecture."""
head_params = params.mrcnn_head
return heads.OlnMaskScoreHead(
params.architecture.num_classes,
params.architecture.mask_target_size,
head_params.num_convs,
head_params.num_filters,
head_params.use_separable_conv,
params.norm_activation.activation,
head_params.use_batch_norm,
norm_activation=norm_activation_generator(params.norm_activation))
def shapeprior_head_generator(params):
"""Generator function for shape prior head architecture."""
head_params = params.shapemask_head
return heads.ShapemaskPriorHead(
params.architecture.num_classes,
head_params.num_downsample_channels,
head_params.mask_crop_size,
head_params.use_category_for_mask,
head_params.shape_prior_path)
def coarsemask_head_generator(params):
"""Generator function for ShapeMask coarse mask head architecture."""
head_params = params.shapemask_head
return heads.ShapemaskCoarsemaskHead(
params.architecture.num_classes,
head_params.num_downsample_channels,
head_params.mask_crop_size,
head_params.use_category_for_mask,
head_params.num_convs,
norm_activation=norm_activation_generator(params.norm_activation))
def finemask_head_generator(params):
"""Generator function for Shapemask fine mask head architecture."""
head_params = params.shapemask_head
return heads.ShapemaskFinemaskHead(
params.architecture.num_classes,
head_params.num_downsample_channels,
head_params.mask_crop_size,
head_params.use_category_for_mask,
head_params.num_convs,
head_params.upsample_factor)
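# Illustrative sketch: the generators above are driven by a fully populated
# ParamsDict; `params` below is a placeholder for such a config.
#
#   backbone_fn = backbone_generator(params)        # resnet or spinenet
#   fpn_fn = multilevel_features_generator(params)  # fpn or identity
#   head_fn = retinanet_head_generator(params)      # RetinaNet box/class head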
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature Pyramid Networks.
Feature Pyramid Networks were proposed in:
[1] Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan,
and Serge Belongie.
Feature Pyramid Networks for Object Detection. CVPR 2017.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from official.vision.detection.modeling.architecture import nn_ops
from official.vision.detection.ops import spatial_transform_ops
class Fpn(object):
"""Feature pyramid networks."""
def __init__(self,
min_level=3,
max_level=7,
fpn_feat_dims=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
norm_activation=nn_ops.norm_activation_builder(
activation='relu')):
"""FPN initialization function.
Args:
min_level: `int` minimum level in FPN output feature maps.
max_level: `int` maximum level in FPN output feature maps.
fpn_feat_dims: `int` number of filters in FPN layers.
use_separable_conv: `bool`, if True use separable convolution for
convolution in FPN layers.
activation: activation function. Support 'relu' and 'swish'.
use_batch_norm: `bool`, indicating whether batchnorm layers are added.
norm_activation: an operation that includes a normalization layer
followed by an optional activation layer.
"""
self._min_level = min_level
self._max_level = max_level
self._fpn_feat_dims = fpn_feat_dims
if use_separable_conv:
self._conv2d_op = functools.partial(
tf.keras.layers.SeparableConv2D, depth_multiplier=1)
else:
self._conv2d_op = tf.keras.layers.Conv2D
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._norm_activation = norm_activation
self._norm_activations = {}
self._lateral_conv2d_op = {}
self._post_hoc_conv2d_op = {}
self._coarse_conv2d_op = {}
for level in range(self._min_level, self._max_level + 1):
if self._use_batch_norm:
self._norm_activations[level] = norm_activation(
use_activation=False, name='p%d-bn' % level)
self._lateral_conv2d_op[level] = self._conv2d_op(
filters=self._fpn_feat_dims,
kernel_size=(1, 1),
padding='same',
name='l%d' % level)
self._post_hoc_conv2d_op[level] = self._conv2d_op(
filters=self._fpn_feat_dims,
strides=(1, 1),
kernel_size=(3, 3),
padding='same',
name='post_hoc_d%d' % level)
self._coarse_conv2d_op[level] = self._conv2d_op(
filters=self._fpn_feat_dims,
strides=(2, 2),
kernel_size=(3, 3),
padding='same',
name='p%d' % level)
def __call__(self, multilevel_features, is_training=None):
"""Returns the FPN features for a given multilevel features.
Args:
multilevel_features: a `dict` containing `int` keys for continuous feature
levels, e.g., [2, 3, 4, 5]. The values are corresponding features with
shape [batch_size, height_l, width_l, num_filters].
is_training: `bool` if True, the model is in training mode.
Returns:
a `dict` containing `int` keys for continuous feature levels
[min_level, min_level + 1, ..., max_level]. The values are corresponding
FPN features with shape [batch_size, height_l, width_l, fpn_feat_dims].
"""
input_levels = list(multilevel_features.keys())
if min(input_levels) > self._min_level:
raise ValueError(
'The minimum backbone level %d should be ' % (min(input_levels)) +
'less than or equal to the FPN minimum level %d.' % (self._min_level))
backbone_max_level = min(max(input_levels), self._max_level)
with tf.name_scope('fpn'):
# Adds lateral connections.
feats_lateral = {}
for level in range(self._min_level, backbone_max_level + 1):
feats_lateral[level] = self._lateral_conv2d_op[level](
multilevel_features[level])
# Adds top-down path.
feats = {backbone_max_level: feats_lateral[backbone_max_level]}
for level in range(backbone_max_level - 1, self._min_level - 1, -1):
feats[level] = spatial_transform_ops.nearest_upsampling(
feats[level + 1], 2) + feats_lateral[level]
# Adds post-hoc 3x3 convolution kernel.
for level in range(self._min_level, backbone_max_level + 1):
feats[level] = self._post_hoc_conv2d_op[level](feats[level])
# Adds coarser FPN levels introduced for RetinaNet.
for level in range(backbone_max_level + 1, self._max_level + 1):
feats_in = feats[level - 1]
if level > backbone_max_level + 1:
feats_in = self._activation_op(feats_in)
feats[level] = self._coarse_conv2d_op[level](feats_in)
if self._use_batch_norm:
# Adds batch_norm layer.
for level in range(self._min_level, self._max_level + 1):
feats[level] = self._norm_activations[level](
feats[level], is_training=is_training)
return feats
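# Illustrative shape check (random placeholder features; image size 256):
#
#   fpn_fn = Fpn(min_level=3, max_level=7, fpn_feat_dims=256)
#   features = {l: tf.random.normal([1, 256 // 2**l, 256 // 2**l, 64])
#               for l in range(2, 6)}
#   outputs = fpn_fn(features, is_training=False)
#   # outputs has keys 3..7; outputs[l] has shape
#   # [1, 256 // 2**l, 256 // 2**l, 256].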
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes to build various prediction heads in all supported models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf
from official.vision.detection.modeling.architecture import nn_ops
from official.vision.detection.ops import spatial_transform_ops
class RpnHead(tf.keras.layers.Layer):
"""Region Proposal Network head."""
def __init__(
self,
min_level,
max_level,
anchors_per_location,
num_convs=2,
num_filters=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
norm_activation=nn_ops.norm_activation_builder(activation='relu')):
"""Initialize params to build Region Proposal Network head.
Args:
min_level: `int` number of minimum feature level.
max_level: `int` number of maximum feature level.
anchors_per_location: `int` number of anchors per pixel location.
num_convs: `int` number that represents the number of the intermediate
conv layers before the prediction.
num_filters: `int` number that represents the number of filters of the
intermediate conv layers.
use_separable_conv: `bool`, indicating whether separable conv layers
are used.
activation: activation function. Support 'relu' and 'swish'.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
super().__init__(autocast=False)
self._min_level = min_level
self._max_level = max_level
self._anchors_per_location = anchors_per_location
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
if use_separable_conv:
self._conv2d_op = functools.partial(
tf.keras.layers.SeparableConv2D,
depth_multiplier=1,
bias_initializer=tf.zeros_initializer())
else:
self._conv2d_op = functools.partial(
tf.keras.layers.Conv2D,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer())
self._rpn_conv = self._conv2d_op(
num_filters,
kernel_size=(3, 3),
strides=(1, 1),
activation=(None if self._use_batch_norm else self._activation_op),
padding='same',
name='rpn')
self._rpn_class_conv = self._conv2d_op(
anchors_per_location,
kernel_size=(1, 1),
strides=(1, 1),
padding='valid',
name='rpn-class')
self._rpn_box_conv = self._conv2d_op(
4 * anchors_per_location,
kernel_size=(1, 1),
strides=(1, 1),
padding='valid',
name='rpn-box')
self._norm_activations = {}
if self._use_batch_norm:
for level in range(self._min_level, self._max_level + 1):
self._norm_activations[level] = norm_activation(name='rpn-l%d-bn' %
level)
def _shared_rpn_heads(self, features, anchors_per_location, level,
is_training):
"""Shared RPN heads."""
features = self._rpn_conv(features)
if self._use_batch_norm:
# The batch normalization layers are not shared between levels.
features = self._norm_activations[level](
features, is_training=is_training)
# Proposal classification scores
scores = self._rpn_class_conv(features)
# Proposal bbox regression deltas
bboxes = self._rpn_box_conv(features)
return scores, bboxes
def call(self, features, is_training=None):
scores_outputs = {}
box_outputs = {}
with tf.name_scope('rpn_head'):
for level in range(self._min_level, self._max_level + 1):
scores_output, box_output = self._shared_rpn_heads(
features[level], self._anchors_per_location, level, is_training)
scores_outputs[level] = scores_output
box_outputs[level] = box_output
return scores_outputs, box_outputs
class OlnRpnHead(tf.keras.layers.Layer):
"""Region Proposal Network for Object Localization Network (OLN)."""
def __init__(
self,
min_level,
max_level,
anchors_per_location,
num_convs=2,
num_filters=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
norm_activation=nn_ops.norm_activation_builder(activation='relu')):
"""Initialize params to build Region Proposal Network head.
Args:
min_level: `int` number of minimum feature level.
max_level: `int` number of maximum feature level.
anchors_per_location: `int` number of anchors per pixel location.
num_convs: `int` number that represents the number of the intermediate
conv layers before the prediction.
num_filters: `int` number that represents the number of filters of the
intermediate conv layers.
use_separable_conv: `bool`, indicating whether separable conv layers
are used.
activation: activation function. Support 'relu' and 'swish'.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
super().__init__(autocast=False)
self._min_level = min_level
self._max_level = max_level
self._anchors_per_location = anchors_per_location
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
if use_separable_conv:
self._conv2d_op = functools.partial(
tf.keras.layers.SeparableConv2D,
depth_multiplier=1,
bias_initializer=tf.zeros_initializer())
else:
self._conv2d_op = functools.partial(
tf.keras.layers.Conv2D,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer())
self._rpn_conv = self._conv2d_op(
num_filters,
kernel_size=(3, 3),
strides=(1, 1),
activation=(None if self._use_batch_norm else self._activation_op),
padding='same',
name='rpn')
self._rpn_class_conv = self._conv2d_op(
anchors_per_location,
kernel_size=(1, 1),
strides=(1, 1),
padding='valid',
name='rpn-class')
self._rpn_box_conv = self._conv2d_op(
4 * anchors_per_location,
kernel_size=(1, 1),
strides=(1, 1),
padding='valid',
name='rpn-box-lrtb')
self._rpn_center_conv = self._conv2d_op(
anchors_per_location,
kernel_size=(1, 1),
strides=(1, 1),
padding='valid',
name='rpn-centerness')
self._norm_activations = {}
if self._use_batch_norm:
for level in range(self._min_level, self._max_level + 1):
self._norm_activations[level] = norm_activation(name='rpn-l%d-bn' %
level)
def _shared_rpn_heads(self, features, anchors_per_location, level,
is_training):
"""Shared RPN heads."""
features = self._rpn_conv(features)
if self._use_batch_norm:
# The batch normalization layers are not shared between levels.
features = self._norm_activations[level](
features, is_training=is_training)
# Feature L2 normalization for training stability
features = tf.math.l2_normalize(
features,
axis=-1,
name='rpn-norm',)
# Proposal classification scores
scores = self._rpn_class_conv(features)
# Proposal bbox regression deltas
bboxes = self._rpn_box_conv(features)
# Proposal centerness scores
centers = self._rpn_center_conv(features)
return scores, bboxes, centers
def __call__(self, features, is_training=None):
scores_outputs = {}
box_outputs = {}
center_outputs = {}
with tf.name_scope('rpn_head'):
for level in range(self._min_level, self._max_level + 1):
scores_output, box_output, center_output = self._shared_rpn_heads(
features[level], self._anchors_per_location, level, is_training)
scores_outputs[level] = scores_output
box_outputs[level] = box_output
center_outputs[level] = center_output
return scores_outputs, box_outputs, center_outputs
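# Note: unlike RpnHead, OlnRpnHead also returns per-level centerness
# predictions. For each level the shapes are [batch, h_l, w_l, anchors] for
# scores and centerness, and [batch, h_l, w_l, 4 * anchors] for boxes.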
class FastrcnnHead(tf.keras.layers.Layer):
"""Fast R-CNN box head."""
def __init__(
self,
num_classes,
num_convs=0,
num_filters=256,
use_separable_conv=False,
num_fcs=2,
fc_dims=1024,
activation='relu',
use_batch_norm=True,
norm_activation=nn_ops.norm_activation_builder(activation='relu')):
"""Initialize params to build Fast R-CNN box head.
Args:
num_classes: an integer for the number of classes.
num_convs: `int` number that represents the number of the intermediate
conv layers before the FC layers.
num_filters: `int` number that represents the number of filters of the
intermediate conv layers.
use_separable_conv: `bool`, indicating whether separable conv layers
are used.
num_fcs: `int` number that represents the number of FC layers before the
predictions.
fc_dims: `int` number that represents the number of dimension of the FC
layers.
activation: activation function. Support 'relu' and 'swish'.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
super(FastrcnnHead, self).__init__(autocast=False)
self._num_classes = num_classes
self._num_convs = num_convs
self._num_filters = num_filters
if use_separable_conv:
self._conv2d_op = functools.partial(
tf.keras.layers.SeparableConv2D,
depth_multiplier=1,
bias_initializer=tf.zeros_initializer())
else:
self._conv2d_op = functools.partial(
tf.keras.layers.Conv2D,
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer())
self._num_fcs = num_fcs
self._fc_dims = fc_dims
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._norm_activation = norm_activation
self._conv_ops = []
self._conv_bn_ops = []
for i in range(self._num_convs):
self._conv_ops.append(
self._conv2d_op(
self._num_filters,
kernel_size=(3, 3),
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
activation=(None
if self._use_batch_norm else self._activation_op),
name='conv_{}'.format(i)))
if self._use_batch_norm:
self._conv_bn_ops.append(self._norm_activation())
self._fc_ops = []
self._fc_bn_ops = []
for i in range(self._num_fcs):
self._fc_ops.append(
tf.keras.layers.Dense(
units=self._fc_dims,
activation=(None
if self._use_batch_norm else self._activation_op),
name='fc{}'.format(i)))
if self._use_batch_norm:
self._fc_bn_ops.append(self._norm_activation(fused=False))
self._class_predict = tf.keras.layers.Dense(
self._num_classes,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer(),
name='class-predict')
self._box_predict = tf.keras.layers.Dense(
self._num_classes * 4,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
bias_initializer=tf.zeros_initializer(),
name='box-predict')
def call(self, roi_features, is_training=None):
"""Box and class branches for the Mask-RCNN model.
Args:
roi_features: A ROI feature tensor of shape [batch_size, num_rois,
height_l, width_l, num_filters].
is_training: `boolean`, if True, the model is in training mode.
Returns:
class_outputs: a tensor with a shape of
[batch_size, num_rois, num_classes], representing the class predictions.
box_outputs: a tensor with a shape of
[batch_size, num_rois, num_classes * 4], representing the box
predictions.
"""
with tf.name_scope('fast_rcnn_head'):
# Reshape inputs before the FC layers.
_, num_rois, height, width, filters = roi_features.get_shape().as_list()
net = tf.reshape(roi_features, [-1, height, width, filters])
for i in range(self._num_convs):
net = self._conv_ops[i](net)
if self._use_batch_norm:
net = self._conv_bn_ops[i](net, is_training=is_training)
filters = self._num_filters if self._num_convs > 0 else filters
net = tf.reshape(net, [-1, num_rois, height * width * filters])
for i in range(self._num_fcs):
net = self._fc_ops[i](net)
if self._use_batch_norm:
net = self._fc_bn_ops[i](net, is_training=is_training)
class_outputs = self._class_predict(net)
box_outputs = self._box_predict(net)
return class_outputs, box_outputs
class OlnBoxScoreHead(tf.keras.layers.Layer):
"""Box head of Object Localization Network (OLN)."""
def __init__(
self,
num_classes,
num_convs=0,
num_filters=256,
use_separable_conv=False,
num_fcs=2,
fc_dims=1024,
activation='relu',
use_batch_norm=True,
norm_activation=nn_ops.norm_activation_builder(activation='relu')):
"""Initialize params to build OLN box head.
Args:
num_classes: an integer for the number of classes.
num_convs: `int` number that represents the number of the intermediate
conv layers before the FC layers.
num_filters: `int` number that represents the number of filters of the
intermediate conv layers.
use_separable_conv: `bool`, indicating whether separable conv layers
are used.
num_fcs: `int` number that represents the number of FC layers before the
predictions.
fc_dims: `int` number that represents the number of dimension of the FC
layers.
activation: activation function. Support 'relu' and 'swish'.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
super().__init__(autocast=False)
self._num_classes = num_classes
self._num_convs = num_convs
self._num_filters = num_filters
if use_separable_conv:
self._conv2d_op = functools.partial(
tf.keras.layers.SeparableConv2D,
depth_multiplier=1,
bias_initializer=tf.zeros_initializer())
else:
self._conv2d_op = functools.partial(
tf.keras.layers.Conv2D,
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer())
self._num_fcs = num_fcs
self._fc_dims = fc_dims
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._norm_activation = norm_activation
self._conv_ops = []
self._conv_bn_ops = []
for i in range(self._num_convs):
self._conv_ops.append(
self._conv2d_op(
self._num_filters,
kernel_size=(3, 3),
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
activation=(None
if self._use_batch_norm else self._activation_op),
name='conv_{}'.format(i)))
if self._use_batch_norm:
self._conv_bn_ops.append(self._norm_activation())
self._fc_ops = []
self._fc_bn_ops = []
for i in range(self._num_fcs):
self._fc_ops.append(
tf.keras.layers.Dense(
units=self._fc_dims,
activation=(None
if self._use_batch_norm else self._activation_op),
name='fc{}'.format(i)))
if self._use_batch_norm:
self._fc_bn_ops.append(self._norm_activation(fused=False))
self._class_predict = tf.keras.layers.Dense(
self._num_classes,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer(),
name='class-predict')
self._box_predict = tf.keras.layers.Dense(
self._num_classes * 4,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
bias_initializer=tf.zeros_initializer(),
name='box-predict')
self._score_predict = tf.keras.layers.Dense(
1,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer(),
name='score-predict')
def __call__(self, roi_features, is_training=None):
"""Box and class branches for the Mask-RCNN model.
Args:
roi_features: A ROI feature tensor of shape [batch_size, num_rois,
height_l, width_l, num_filters].
is_training: `boolean`, if True, the model is in training mode.
Returns:
class_outputs: a tensor with a shape of
[batch_size, num_rois, num_classes], representing the class predictions.
box_outputs: a tensor with a shape of
[batch_size, num_rois, num_classes * 4], representing the box
predictions.
score_outputs: a tensor with a shape of [batch_size, num_rois, 1],
representing the box score predictions.
"""
with tf.name_scope('fast_rcnn_head'):
# Reshape inputs before the FC layers.
_, num_rois, height, width, filters = roi_features.get_shape().as_list()
net = tf.reshape(roi_features, [-1, height, width, filters])
for i in range(self._num_convs):
net = self._conv_ops[i](net)
if self._use_batch_norm:
net = self._conv_bn_ops[i](net, is_training=is_training)
filters = self._num_filters if self._num_convs > 0 else filters
net = tf.reshape(net, [-1, num_rois, height * width * filters])
for i in range(self._num_fcs):
net = self._fc_ops[i](net)
if self._use_batch_norm:
net = self._fc_bn_ops[i](net, is_training=is_training)
class_outputs = self._class_predict(net)
box_outputs = self._box_predict(net)
score_outputs = self._score_predict(net)
return class_outputs, box_outputs, score_outputs
class MaskrcnnHead(tf.keras.layers.Layer):
"""Mask R-CNN head."""
def __init__(
self,
num_classes,
mask_target_size,
num_convs=4,
num_filters=256,
use_separable_conv=False,
activation='relu',
use_batch_norm=True,
norm_activation=nn_ops.norm_activation_builder(activation='relu')):
"""Initialize params to build Fast R-CNN head.
Args:
num_classes: an integer for the number of classes.
mask_target_size: an integer that is the resolution of masks.
num_convs: `int` number that represents the number of the intermediate
conv layers before the prediction.
num_filters: `int` number that represents the number of filters of the
intermediate conv layers.
use_separable_conv: `bool`, indicating whether separable conv layers
are used.
activation: activation function. Support 'relu' and 'swish'.
use_batch_norm: 'bool', indicating whether batchnorm layers are added.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
super(MaskrcnnHead, self).__init__(autocast=False)
self._num_classes = num_classes
self._mask_target_size = mask_target_size
self._num_convs = num_convs
self._num_filters = num_filters
if use_separable_conv:
self._conv2d_op = functools.partial(
tf.keras.layers.SeparableConv2D,
depth_multiplier=1,
bias_initializer=tf.zeros_initializer())
else:
self._conv2d_op = functools.partial(
tf.keras.layers.Conv2D,
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer())
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._use_batch_norm = use_batch_norm
self._norm_activation = norm_activation
self._conv2d_ops = []
for i in range(self._num_convs):
self._conv2d_ops.append(
self._conv2d_op(
self._num_filters,
kernel_size=(3, 3),
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
activation=(None
if self._use_batch_norm else self._activation_op),
name='mask-conv-l%d' % i))
self._mask_conv_transpose = tf.keras.layers.Conv2DTranspose(
self._num_filters,
kernel_size=(2, 2),
strides=(2, 2),
padding='valid',
activation=(None if self._use_batch_norm else self._activation_op),
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer(),
name='conv5-mask')
with tf.name_scope('mask_head'):
self._mask_conv2d_op = self._conv2d_op(
self._num_classes,
kernel_size=(1, 1),
strides=(1, 1),
padding='valid',
name='mask_fcn_logits')
def call(self, roi_features, class_indices, is_training=None):
"""Mask branch for the Mask-RCNN model.
Args:
roi_features: A ROI feature tensor of shape [batch_size, num_rois,
height_l, width_l, num_filters].
class_indices: a Tensor of shape [batch_size, num_rois], indicating which
class the ROI is.
is_training: `boolean`, if True, the model is in training mode.
Returns:
mask_outputs: a tensor with a shape of
[batch_size, num_rois, mask_height, mask_width], representing the mask
predictions for the predicted class of each RoI.
"""
with tf.name_scope('mask_head'):
_, num_rois, height, width, filters = roi_features.get_shape().as_list()
net = tf.reshape(roi_features, [-1, height, width, filters])
for i in range(self._num_convs):
net = self._conv2d_ops[i](net)
if self._use_batch_norm:
net = self._norm_activation()(net, is_training=is_training)
net = self._mask_conv_transpose(net)
if self._use_batch_norm:
net = self._norm_activation()(net, is_training=is_training)
mask_outputs = self._mask_conv2d_op(net)
mask_outputs = tf.reshape(mask_outputs, [
-1, num_rois, self._mask_target_size, self._mask_target_size,
self._num_classes
])
with tf.name_scope('masks_post_processing'):
# TODO(pengchong): Figure out the way not to use the static inferred
# batch size.
batch_size, num_masks = class_indices.get_shape().as_list()
mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])
# Constructs indices for gather.
batch_indices = tf.tile(
tf.expand_dims(tf.range(batch_size), axis=1), [1, num_masks])
mask_indices = tf.tile(
tf.expand_dims(tf.range(num_masks), axis=0), [batch_size, 1])
gather_indices = tf.stack(
[batch_indices, mask_indices, class_indices], axis=2)
mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
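# The gather above keeps, for every RoI, only the mask channel of its
# predicted class, so the returned tensor has shape
# [batch_size, num_rois, mask_height, mask_width].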
return mask_outputs
class RetinanetHead(object):
"""RetinaNet head."""
def __init__(
self,
min_level,
max_level,
num_classes,
anchors_per_location,
num_convs=4,
num_filters=256,
use_separable_conv=False,
norm_activation=nn_ops.norm_activation_builder(activation='relu')):
"""Initialize params to build RetinaNet head.
Args:
min_level: `int` number of minimum feature level.
max_level: `int` number of maximum feature level.
num_classes: `int` number of classification categories.
anchors_per_location: `int` number of anchors per pixel location.
num_convs: `int` number of stacked convolution before the last prediction
layer.
num_filters: `int` number of filters used in the head architecture.
use_separable_conv: `bool` to indicate whether to use separable
convolution.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
self._min_level = min_level
self._max_level = max_level
self._num_classes = num_classes
self._anchors_per_location = anchors_per_location
self._num_convs = num_convs
self._num_filters = num_filters
self._use_separable_conv = use_separable_conv
with tf.name_scope('class_net') as scope_name:
self._class_name_scope = tf.name_scope(scope_name)
with tf.name_scope('box_net') as scope_name:
self._box_name_scope = tf.name_scope(scope_name)
self._build_class_net_layers(norm_activation)
self._build_box_net_layers(norm_activation)
def _class_net_batch_norm_name(self, i, level):
return 'class-%d-%d' % (i, level)
def _box_net_batch_norm_name(self, i, level):
return 'box-%d-%d' % (i, level)
def _build_class_net_layers(self, norm_activation):
"""Build re-usable layers for class prediction network."""
if self._use_separable_conv:
self._class_predict = tf.keras.layers.SeparableConv2D(
self._num_classes * self._anchors_per_location,
kernel_size=(3, 3),
bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
padding='same',
name='class-predict')
else:
self._class_predict = tf.keras.layers.Conv2D(
self._num_classes * self._anchors_per_location,
kernel_size=(3, 3),
bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-5),
padding='same',
name='class-predict')
self._class_conv = []
self._class_norm_activation = {}
for i in range(self._num_convs):
if self._use_separable_conv:
self._class_conv.append(
tf.keras.layers.SeparableConv2D(
self._num_filters,
kernel_size=(3, 3),
bias_initializer=tf.zeros_initializer(),
activation=None,
padding='same',
name='class-' + str(i)))
else:
self._class_conv.append(
tf.keras.layers.Conv2D(
self._num_filters,
kernel_size=(3, 3),
bias_initializer=tf.zeros_initializer(),
kernel_initializer=tf.keras.initializers.RandomNormal(
stddev=0.01),
activation=None,
padding='same',
name='class-' + str(i)))
for level in range(self._min_level, self._max_level + 1):
name = self._class_net_batch_norm_name(i, level)
self._class_norm_activation[name] = norm_activation(name=name)
def _build_box_net_layers(self, norm_activation):
"""Build re-usable layers for box prediction network."""
if self._use_separable_conv:
self._box_predict = tf.keras.layers.SeparableConv2D(
4 * self._anchors_per_location,
kernel_size=(3, 3),
bias_initializer=tf.zeros_initializer(),
padding='same',
name='box-predict')
else:
self._box_predict = tf.keras.layers.Conv2D(
4 * self._anchors_per_location,
kernel_size=(3, 3),
bias_initializer=tf.zeros_initializer(),
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-5),
padding='same',
name='box-predict')
self._box_conv = []
self._box_norm_activation = {}
for i in range(self._num_convs):
if self._use_separable_conv:
self._box_conv.append(
tf.keras.layers.SeparableConv2D(
self._num_filters,
kernel_size=(3, 3),
activation=None,
bias_initializer=tf.zeros_initializer(),
padding='same',
name='box-' + str(i)))
else:
self._box_conv.append(
tf.keras.layers.Conv2D(
self._num_filters,
kernel_size=(3, 3),
activation=None,
bias_initializer=tf.zeros_initializer(),
kernel_initializer=tf.keras.initializers.RandomNormal(
stddev=0.01),
padding='same',
name='box-' + str(i)))
for level in range(self._min_level, self._max_level + 1):
name = self._box_net_batch_norm_name(i, level)
self._box_norm_activation[name] = norm_activation(name=name)
def __call__(self, fpn_features, is_training=None):
"""Returns outputs of RetinaNet head."""
class_outputs = {}
box_outputs = {}
with tf.name_scope('retinanet_head'):
for level in range(self._min_level, self._max_level + 1):
features = fpn_features[level]
class_outputs[level] = self.class_net(
features, level, is_training=is_training)
box_outputs[level] = self.box_net(
features, level, is_training=is_training)
return class_outputs, box_outputs
def class_net(self, features, level, is_training):
"""Class prediction network for RetinaNet."""
with self._class_name_scope:
for i in range(self._num_convs):
features = self._class_conv[i](features)
# The convolution layers in the class net are shared among all levels,
# but each level has its own batch normalization to capture the statistical
# difference among different levels.
name = self._class_net_batch_norm_name(i, level)
features = self._class_norm_activation[name](
features, is_training=is_training)
classes = self._class_predict(features)
return classes
def box_net(self, features, level, is_training=None):
"""Box regression network for RetinaNet."""
with self._box_name_scope:
for i in range(self._num_convs):
features = self._box_conv[i](features)
# The convolution layers in the box net are shared among all levels, but
# each level has its own batch normalization to capture the statistical
# difference among different levels.
name = self._box_net_batch_norm_name(i, level)
features = self._box_norm_activation[name](
features, is_training=is_training)
boxes = self._box_predict(features)
return boxes
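# --- Illustrative usage sketch (editorial addition, not part of the original
# API): builds a RetinanetHead and applies it to a dictionary of dummy FPN
# features. The batch size, image size, number of classes and anchors below
# are assumptions chosen only to make the snippet self-contained.
example_head = RetinanetHead(
    min_level=3, max_level=7, num_classes=91, anchors_per_location=9)
example_fpn_features = {
    level: tf.zeros([2, 640 // 2**level, 640 // 2**level, 256])
    for level in range(3, 8)
}
example_class_outputs, example_box_outputs = example_head(
    example_fpn_features, is_training=False)
# example_class_outputs[l]: [2, H_l, W_l, num_classes * anchors_per_location]
# example_box_outputs[l]:   [2, H_l, W_l, 4 * anchors_per_location]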
# TODO(yeqing): Refactor this class when it is ready for var_scope reuse.
class ShapemaskPriorHead(object):
"""ShapeMask Prior head."""
def __init__(self, num_classes, num_downsample_channels, mask_crop_size,
use_category_for_mask, shape_prior_path):
"""Initialize params to build RetinaNet head.
Args:
num_classes: Number of output classes.
num_downsample_channels: number of channels in mask branch.
mask_crop_size: feature crop size.
use_category_for_mask: use class information in mask branch.
shape_prior_path: the path to load shape priors.
"""
self._mask_num_classes = num_classes if use_category_for_mask else 1
self._num_downsample_channels = num_downsample_channels
self._mask_crop_size = mask_crop_size
self._shape_prior_path = shape_prior_path
self._use_category_for_mask = use_category_for_mask
self._shape_prior_fc = tf.keras.layers.Dense(
self._num_downsample_channels, name='shape-prior-fc')
def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training):
"""Generate the detection priors from the box detections and FPN features.
This corresponds to Fig. 4 of the ShapeMask paper at
https://arxiv.org/pdf/1904.03239.pdf
Args:
fpn_features: a dictionary of FPN features.
boxes: a float tensor of shape [batch_size, num_instances, 4] representing
the tight gt boxes from dataloader/detection.
outer_boxes: a float tensor of shape [batch_size, num_instances, 4]
representing the loose gt boxes from dataloader/detection.
classes: an int Tensor of shape [batch_size, num_instances] of instance
classes.
is_training: training mode or not.
Returns:
instance_features: a float Tensor of shape [batch_size * num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
detection_priors: A float Tensor of shape [batch_size * num_instances,
mask_size, mask_size, 1].
"""
with tf.name_scope('prior_mask'):
batch_size, num_instances, _ = boxes.get_shape().as_list()
outer_boxes = tf.cast(outer_boxes, tf.float32)
boxes = tf.cast(boxes, tf.float32)
instance_features = spatial_transform_ops.multilevel_crop_and_resize(
fpn_features, outer_boxes, output_size=self._mask_crop_size)
instance_features = self._shape_prior_fc(instance_features)
shape_priors = self._get_priors()
# Get uniform priors for each outer box.
uniform_priors = tf.ones([
batch_size, num_instances, self._mask_crop_size, self._mask_crop_size
])
uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
uniform_priors, boxes, outer_boxes, self._mask_crop_size)
# Classify shape priors using uniform priors + instance features.
prior_distribution = self._classify_shape_priors(
tf.cast(instance_features, tf.float32), uniform_priors, classes)
instance_priors = tf.gather(shape_priors, classes)
instance_priors *= tf.expand_dims(
tf.expand_dims(tf.cast(prior_distribution, tf.float32), axis=-1),
axis=-1)
instance_priors = tf.reduce_sum(instance_priors, axis=2)
detection_priors = spatial_transform_ops.crop_mask_in_target_box(
instance_priors, boxes, outer_boxes, self._mask_crop_size)
return instance_features, detection_priors
def _get_priors(self):
"""Load shape priors from file."""
# loads class specific or agnostic shape priors
if self._shape_prior_path:
# Priors are loaded into shape [mask_num_classes, num_clusters, 32, 32].
priors = np.load(tf.io.gfile.GFile(self._shape_prior_path, 'rb'))
priors = tf.convert_to_tensor(priors, dtype=tf.float32)
self._num_clusters = priors.get_shape().as_list()[1]
else:
# If no prior path is provided, do not use learned priors, i.e., priors
# equal to a uniform empty 32x32 patch.
self._num_clusters = 1
priors = tf.zeros([
self._mask_num_classes, self._num_clusters, self._mask_crop_size,
self._mask_crop_size
])
return priors
def _classify_shape_priors(self, features, uniform_priors, classes):
"""Classify the uniform prior by predicting the shape modes.
Classify the object crop features into K modes of the clusters for each
category.
Args:
features: A float Tensor of shape [batch_size, num_instances, mask_size,
mask_size, num_channels].
uniform_priors: A float Tensor of shape [batch_size, num_instances,
mask_size, mask_size] representing the uniform detection priors.
classes: An int Tensor of shape [batch_size, num_instances] of detection
class ids.
Returns:
prior_distribution: A float Tensor of shape
[batch_size, num_instances, num_clusters] representing the classifier
output probability over all possible shapes.
"""
batch_size, num_instances, _, _, _ = features.get_shape().as_list()
features *= tf.expand_dims(uniform_priors, axis=-1)
# Reduce spatial dimension of features. The features have shape
# [batch_size, num_instances, num_channels].
features = tf.reduce_mean(features, axis=(2, 3))
logits = tf.keras.layers.Dense(
self._mask_num_classes * self._num_clusters,
kernel_initializer=tf.random_normal_initializer(stddev=0.01),
name='classify-shape-prior-fc')(features)
logits = tf.reshape(
logits,
[batch_size, num_instances, self._mask_num_classes, self._num_clusters])
if self._use_category_for_mask:
logits = tf.gather(logits, tf.expand_dims(classes, axis=-1), batch_dims=2)
logits = tf.squeeze(logits, axis=2)
else:
logits = logits[:, :, 0, :]
distribution = tf.nn.softmax(logits, name='shape_prior_weights')
return distribution
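# --- Illustrative sketch (editorial addition) of the prior-weighting step in
# ShapemaskPriorHead.__call__ on dummy data: the predicted distribution over
# K shape clusters weights the class-gathered priors, which are then summed
# over the cluster axis. All shapes below are assumptions.
example_instance_priors = tf.random.uniform(
    [2, 3, 4, 32, 32])  # [batch, instances, clusters, h, w]
example_distribution = tf.nn.softmax(
    tf.random.uniform([2, 3, 4]), axis=-1)  # [batch, instances, clusters]
example_weighted = (
    example_instance_priors * example_distribution[..., tf.newaxis, tf.newaxis])
example_detection_priors = tf.reduce_sum(
    example_weighted, axis=2)  # [batch, instances, 32, 32]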
class ShapemaskCoarsemaskHead(object):
"""ShapemaskCoarsemaskHead head."""
def __init__(self,
num_classes,
num_downsample_channels,
mask_crop_size,
use_category_for_mask,
num_convs,
norm_activation=nn_ops.norm_activation_builder()):
"""Initialize params to build ShapeMask coarse and fine prediction head.
Args:
num_classes: `int` number of mask classification categories.
num_downsample_channels: `int` number of filters at mask head.
mask_crop_size: feature crop size.
use_category_for_mask: use class information in mask branch.
num_convs: `int` number of stacked convolutions before the last prediction
layer.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
"""
self._mask_num_classes = num_classes if use_category_for_mask else 1
self._use_category_for_mask = use_category_for_mask
self._num_downsample_channels = num_downsample_channels
self._mask_crop_size = mask_crop_size
self._num_convs = num_convs
self._norm_activation = norm_activation
self._coarse_mask_fc = tf.keras.layers.Dense(
self._num_downsample_channels, name='coarse-mask-fc')
self._class_conv = []
self._class_norm_activation = []
for i in range(self._num_convs):
self._class_conv.append(
tf.keras.layers.Conv2D(
self._num_downsample_channels,
kernel_size=(3, 3),
bias_initializer=tf.zeros_initializer(),
kernel_initializer=tf.keras.initializers.RandomNormal(
stddev=0.01),
padding='same',
name='coarse-mask-class-%d' % i))
self._class_norm_activation.append(
norm_activation(name='coarse-mask-class-%d-bn' % i))
self._class_predict = tf.keras.layers.Conv2D(
self._mask_num_classes,
kernel_size=(1, 1),
# Focal loss bias initialization to have foreground 0.01 probability.
bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
padding='same',
name='coarse-mask-class-predict')
def __call__(self, features, detection_priors, classes, is_training):
"""Generate instance masks from FPN features and detection priors.
This corresponds to Figs. 5-6 of the ShapeMask paper at
https://arxiv.org/pdf/1904.03239.pdf
Args:
features: a float Tensor of shape [batch_size, num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
detection_priors: a float Tensor of shape [batch_size, num_instances,
mask_crop_size, mask_crop_size, 1]. This is the detection prior for the
instance.
classes: an int Tensor of shape [batch_size, num_instances] of instance
classes.
is_training: a bool indicating whether in training mode.
Returns:
mask_outputs: instance mask prediction as a float Tensor of shape
[batch_size, num_instances, mask_size, mask_size].
"""
with tf.name_scope('coarse_mask'):
# Transform detection priors to have the same dimension as features.
detection_priors = tf.expand_dims(detection_priors, axis=-1)
detection_priors = self._coarse_mask_fc(detection_priors)
features += detection_priors
mask_logits = self.decoder_net(features, is_training)
# Gather the logits with right input class.
if self._use_category_for_mask:
mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
mask_logits = tf.gather(
mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
mask_logits = tf.squeeze(mask_logits, axis=2)
else:
mask_logits = mask_logits[..., 0]
return mask_logits
def decoder_net(self, features, is_training=False):
"""Coarse mask decoder network architecture.
Args:
features: A tensor of size [batch, height_in, width_in, channels_in].
is_training: Whether batch_norm layers are in training mode.
Returns:
images: A feature tensor of size [batch, output_size, output_size,
num_channels]
"""
(batch_size, num_instances, height, width,
num_channels) = features.get_shape().as_list()
features = tf.reshape(
features, [batch_size * num_instances, height, width, num_channels])
for i in range(self._num_convs):
features = self._class_conv[i](features)
features = self._class_norm_activation[i](
features, is_training=is_training)
mask_logits = self._class_predict(features)
mask_logits = tf.reshape(
mask_logits,
[batch_size, num_instances, height, width, self._mask_num_classes])
return mask_logits
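# --- Illustrative sketch (editorial addition) of the class-specific gather
# used in ShapemaskCoarsemaskHead.__call__ above, on dummy data: move the
# class axis next to the instance axis, pick the logit map matching each
# detected class, and drop the singleton class axis. Shapes are assumptions.
example_mask_logits = tf.random.uniform(
    [2, 3, 16, 16, 5])  # [batch, instances, h, w, classes]
example_classes = tf.constant([[0, 2, 4], [1, 1, 3]])  # [batch, instances]
example_logits = tf.transpose(example_mask_logits, [0, 1, 4, 2, 3])
example_logits = tf.gather(
    example_logits, tf.expand_dims(example_classes, -1), batch_dims=2)
example_logits = tf.squeeze(example_logits, axis=2)  # [batch, instances, h, w]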
class ShapemaskFinemaskHead(object):
"""ShapemaskFinemaskHead head."""
def __init__(self,
num_classes,
num_downsample_channels,
mask_crop_size,
use_category_for_mask,
num_convs,
upsample_factor,
norm_activation=nn_ops.norm_activation_builder()):
"""Initialize params to build ShapeMask coarse and fine prediction head.
Args:
num_classes: `int` number of mask classification categories.
num_downsample_channels: `int` number of filters at mask head.
mask_crop_size: feature crop size.
use_category_for_mask: use class information in mask branch.
num_convs: `int` number of stacked convolutions before the last prediction
layer.
upsample_factor: `int` number of fine mask upsampling factor.
norm_activation: an operation that includes a batch normalization layer
followed by an optional ReLU layer.
"""
self._use_category_for_mask = use_category_for_mask
self._mask_num_classes = num_classes if use_category_for_mask else 1
self._num_downsample_channels = num_downsample_channels
self._mask_crop_size = mask_crop_size
self._num_convs = num_convs
self.up_sample_factor = upsample_factor
self._fine_mask_fc = tf.keras.layers.Dense(
self._num_downsample_channels, name='fine-mask-fc')
self._upsample_conv = tf.keras.layers.Conv2DTranspose(
self._num_downsample_channels,
(self.up_sample_factor, self.up_sample_factor),
(self.up_sample_factor, self.up_sample_factor),
name='fine-mask-conv2d-tran')
self._fine_class_conv = []
self._fine_class_bn = []
for i in range(self._num_convs):
self._fine_class_conv.append(
tf.keras.layers.Conv2D(
self._num_downsample_channels,
kernel_size=(3, 3),
bias_initializer=tf.zeros_initializer(),
kernel_initializer=tf.keras.initializers.RandomNormal(
stddev=0.01),
activation=None,
padding='same',
name='fine-mask-class-%d' % i))
self._fine_class_bn.append(
norm_activation(name='fine-mask-class-%d-bn' % i))
self._class_predict_conv = tf.keras.layers.Conv2D(
self._mask_num_classes,
kernel_size=(1, 1),
# Focal loss bias initialization to have foreground 0.01 probability.
bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
padding='same',
name='fine-mask-class-predict')
def __call__(self, features, mask_logits, classes, is_training):
"""Generate instance masks from FPN features and detection priors.
This corresponds to Figs. 5-6 of the ShapeMask paper at
https://arxiv.org/pdf/1904.03239.pdf
Args:
features: a float Tensor of shape [batch_size, num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
mask_logits: a float Tensor of shape [batch_size, num_instances,
mask_crop_size, mask_crop_size] indicating predicted mask logits.
classes: an int Tensor of shape [batch_size, num_instances] of instance
classes.
is_training: a bool indicating whether in training mode.
Returns:
mask_outputs: instance mask prediction as a float Tensor of shape
[batch_size, num_instances, mask_size, mask_size].
"""
# Extract the foreground mean features
# with tf.variable_scope('fine_mask', reuse=tf.AUTO_REUSE):
with tf.name_scope('fine_mask'):
mask_probs = tf.nn.sigmoid(mask_logits)
# Compute instance embedding for hard average.
binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype)
instance_embedding = tf.reduce_sum(
features * tf.expand_dims(binary_mask, axis=-1), axis=(2, 3))
instance_embedding /= tf.expand_dims(
tf.reduce_sum(binary_mask, axis=(2, 3)) + 1e-20, axis=-1)
# Take the difference between crop features and mean instance features.
features -= tf.expand_dims(
tf.expand_dims(instance_embedding, axis=2), axis=2)
features += self._fine_mask_fc(tf.expand_dims(mask_probs, axis=-1))
# Decoder to generate upsampled segmentation mask.
mask_logits = self.decoder_net(features, is_training)
if self._use_category_for_mask:
mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
mask_logits = tf.gather(
mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
mask_logits = tf.squeeze(mask_logits, axis=2)
else:
mask_logits = mask_logits[..., 0]
return mask_logits
def decoder_net(self, features, is_training=False):
"""Fine mask decoder network architecture.
Args:
features: A tensor of size [batch, height_in, width_in, channels_in].
is_training: Whether batch_norm layers are in training mode.
Returns:
images: A feature tensor of size [batch, output_size, output_size,
num_channels], where output size is `self.up_sample_factor` times
that of the input.
"""
(batch_size, num_instances, height, width,
num_channels) = features.get_shape().as_list()
features = tf.reshape(
features, [batch_size * num_instances, height, width, num_channels])
for i in range(self._num_convs):
features = self._fine_class_conv[i](features)
features = self._fine_class_bn[i](features, is_training=is_training)
if self.up_sample_factor > 1:
features = self._upsample_conv(features)
# Predict per-class instance masks.
mask_logits = self._class_predict_conv(features)
mask_logits = tf.reshape(mask_logits, [
batch_size, num_instances, height * self.up_sample_factor,
width * self.up_sample_factor, self._mask_num_classes
])
return mask_logits
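# --- Illustrative sketch (editorial addition) of the hard-average instance
# embedding computed in ShapemaskFinemaskHead.__call__: average the crop
# features over pixels whose coarse mask probability exceeds 0.5, then
# subtract that mean from every pixel of the crop. Shapes are assumptions.
example_features = tf.random.uniform(
    [2, 3, 16, 16, 8])  # [batch, instances, h, w, channels]
example_mask_probs = tf.random.uniform([2, 3, 16, 16])
example_binary_mask = tf.cast(example_mask_probs > 0.5, example_features.dtype)
example_embedding = tf.reduce_sum(
    example_features * example_binary_mask[..., tf.newaxis], axis=(2, 3))
example_embedding /= tf.expand_dims(
    tf.reduce_sum(example_binary_mask, axis=(2, 3)) + 1e-20, axis=-1)
example_centered = (
    example_features - example_embedding[:, :, tf.newaxis, tf.newaxis, :])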
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Identity Fn that forwards the input features."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class Identity(object):
"""Identity function that forwards the input features."""
def __call__(self, features, is_training=False):
"""Only forwards the input features."""
return features
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.modeling import tf_utils
class ResidualBlock(tf.keras.layers.Layer):
"""A residual block."""
def __init__(self,
filters,
strides,
use_projection=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""A residual block with BN after convolutions.
Args:
filters: `int` number of filters for the two 3x3 convolutions in the block.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Defaults to None.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Defaults to None.
activation: `str` name of the activation function.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
super(ResidualBlock, self).__init__(**kwargs)
self._filters = filters
self._strides = strides
self._use_projection = use_projection
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape):
if self._use_projection:
self._shortcut = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=self._strides,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=self._strides,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
super(ResidualBlock, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'use_projection': self._use_projection,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(ResidualBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
shortcut = inputs
if self._use_projection:
shortcut = self._shortcut(shortcut)
shortcut = self._norm0(shortcut)
x = self._conv1(inputs)
x = self._norm1(x)
x = self._activation_fn(x)
x = self._conv2(x)
x = self._norm2(x)
return self._activation_fn(x + shortcut)
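# --- Illustrative usage sketch (editorial addition): a single ResidualBlock
# applied to a dummy feature map. The input shape and filter count are
# assumptions for demonstration only.
example_residual_block = ResidualBlock(filters=64, strides=1, use_projection=True)
example_residual_out = example_residual_block(tf.zeros([2, 56, 56, 32]))
# example_residual_out has shape [2, 56, 56, 64].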
class BottleneckBlock(tf.keras.layers.Layer):
"""A standard bottleneck block."""
def __init__(self,
filters,
strides,
use_projection=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""A standard bottleneck block with BN after convolutions.
Args:
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Defaults to None.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Defaults to None.
activation: `str` name of the activation function.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
super(BottleneckBlock, self).__init__(**kwargs)
self._filters = filters
self._strides = strides
self._use_projection = use_projection
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape):
if self._use_projection:
self._shortcut = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=self._strides,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=self._strides,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv3 = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm3 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
super(BottleneckBlock, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'use_projection': self._use_projection,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(BottleneckBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
shortcut = inputs
if self._use_projection:
shortcut = self._shortcut(shortcut)
shortcut = self._norm0(shortcut)
x = self._conv1(inputs)
x = self._norm1(x)
x = self._activation_fn(x)
x = self._conv2(x)
x = self._norm2(x)
x = self._activation_fn(x)
x = self._conv3(x)
x = self._norm3(x)
return self._activation_fn(x + shortcut)
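# --- Illustrative usage sketch (editorial addition): a BottleneckBlock that
# downsamples by 2 and expands the channel count to 4 * filters. The input
# shape is an assumption for demonstration only.
example_bottleneck_block = BottleneckBlock(filters=64, strides=2, use_projection=True)
example_bottleneck_out = example_bottleneck_block(tf.zeros([2, 56, 56, 64]))
# example_bottleneck_out has shape [2, 28, 28, 256].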
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Neural network operations commonly shared by the architectures."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
class NormActivation(tf.keras.layers.Layer):
"""Combined Normalization and Activation layers."""
def __init__(self,
momentum=0.997,
epsilon=1e-4,
trainable=True,
init_zero=False,
use_activation=True,
activation='relu',
fused=True,
name=None):
"""A class to construct layers for a batch normalization followed by a ReLU.
Args:
momentum: momentum for the moving average.
epsilon: small float added to variance to avoid dividing by zero.
trainable: `bool`, if True also add variables to the graph collection
GraphKeys.TRAINABLE_VARIABLES. If False, freeze batch normalization
layer.
init_zero: `bool` if True, initializes scale parameter of batch
normalization with 0. If False, initialize it with 1.
use_activation: `bool`, whether to add the optional activation layer after
the batch normalization layer.
activation: `str`, the type of the activation layer. Currently supports
`relu` and `swish`.
fused: `bool` fused option in batch normalization.
name: `str` name for the operation.
"""
super(NormActivation, self).__init__(trainable=trainable)
if init_zero:
gamma_initializer = tf.keras.initializers.Zeros()
else:
gamma_initializer = tf.keras.initializers.Ones()
self._normalization_op = tf.keras.layers.BatchNormalization(
momentum=momentum,
epsilon=epsilon,
center=True,
scale=True,
trainable=trainable,
fused=fused,
gamma_initializer=gamma_initializer,
name=name)
self._use_activation = use_activation
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
def __call__(self, inputs, is_training=None):
"""Builds the normalization layer followed by an optional activation layer.
Args:
inputs: `Tensor` of shape `[batch, channels, ...]`.
is_training: `boolean`, True if the model is in training mode.
Returns:
A normalized `Tensor` with the same `data_format`.
"""
# We will need to keep training=None by default, so that it can be inherited
# from keras.Model.training
if is_training and self.trainable:
is_training = True
inputs = self._normalization_op(inputs, training=is_training)
if self._use_activation:
inputs = self._activation_op(inputs)
return inputs
def norm_activation_builder(momentum=0.997,
epsilon=1e-4,
trainable=True,
activation='relu',
**kwargs):
return functools.partial(
NormActivation,
momentum=momentum,
epsilon=epsilon,
trainable=trainable,
activation=activation,
**kwargs)
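# --- Illustrative usage sketch (editorial addition): norm_activation_builder
# returns a factory that the detection heads above later call with a per-layer
# `name`. The momentum value and feature shape below are assumptions.
example_builder = norm_activation_builder(momentum=0.99, activation='relu')
example_norm_act = example_builder(name='example-bn')  # a NormActivation layer
example_out = example_norm_act(tf.zeros([2, 8, 8, 16]), is_training=False)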
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions for the post-activation form of Residual Networks.
Residual networks (ResNets) were proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.modeling.architecture import nn_ops
# TODO(b/140112644): Refactor the code with Keras style, i.e. build and call.
class Resnet(object):
"""Class to build ResNet family model."""
def __init__(
self,
resnet_depth,
activation='relu',
norm_activation=nn_ops.norm_activation_builder(activation='relu'),
data_format='channels_last'):
"""ResNet initialization function.
Args:
resnet_depth: `int` depth of ResNet backbone model.
activation: `str` name of the activation function. Currently supports `relu`
and `swish`.
norm_activation: an operation that includes a normalization layer followed
by an optional activation layer.
data_format: `str` either "channels_first" for `[batch, channels, height,
width]` or "channels_last" for `[batch, height, width, channels]`.
"""
self._resnet_depth = resnet_depth
if activation == 'relu':
self._activation_op = tf.nn.relu
elif activation == 'swish':
self._activation_op = tf.nn.swish
else:
raise ValueError('Unsupported activation `{}`.'.format(activation))
self._norm_activation = norm_activation
self._data_format = data_format
model_params = {
10: {
'block': self.residual_block,
'layers': [1, 1, 1, 1]
},
18: {
'block': self.residual_block,
'layers': [2, 2, 2, 2]
},
34: {
'block': self.residual_block,
'layers': [3, 4, 6, 3]
},
50: {
'block': self.bottleneck_block,
'layers': [3, 4, 6, 3]
},
101: {
'block': self.bottleneck_block,
'layers': [3, 4, 23, 3]
},
152: {
'block': self.bottleneck_block,
'layers': [3, 8, 36, 3]
},
200: {
'block': self.bottleneck_block,
'layers': [3, 24, 36, 3]
}
}
if resnet_depth not in model_params:
valid_resnet_depths = ', '.join(
[str(depth) for depth in sorted(model_params.keys())])
raise ValueError(
'resnet_depth should be one of [%s]; got an invalid value: %s.' %
(valid_resnet_depths, self._resnet_depth))
params = model_params[resnet_depth]
self._resnet_fn = self.resnet_v1_generator(params['block'],
params['layers'])
def __call__(self, inputs, is_training=None):
"""Returns the ResNet model for a given size and number of output classes.
Args:
inputs: a `Tensor` with shape [batch_size, height, width, 3] representing
a batch of images.
is_training: `bool` if True, the model is in training mode.
Returns:
a `dict` containing `int` keys for continuous feature levels [2, 3, 4, 5].
The values are corresponding feature hierarchy in ResNet with shape
[batch_size, height_l, width_l, num_filters].
"""
with tf.name_scope('resnet%s' % self._resnet_depth):
return self._resnet_fn(inputs, is_training)
def fixed_padding(self, inputs, kernel_size):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]` or `[batch,
height, width, channels]` depending on `data_format`.
kernel_size: `int` kernel size to be used for `conv2d` or `max_pool2d`
operations. Should be a positive integer.
Returns:
A padded `Tensor` of the same `data_format` with size either intact
(if `kernel_size == 1`) or padded (if `kernel_size > 1`).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if self._data_format == 'channels_first':
padded_inputs = tf.pad(
tensor=inputs,
paddings=[[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
else:
padded_inputs = tf.pad(
tensor=inputs,
paddings=[[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
return padded_inputs
def conv2d_fixed_padding(self, inputs, filters, kernel_size, strides):
"""Strided 2-D convolution with explicit padding.
The padding is consistent and is based only on `kernel_size`, not on the
dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
Args:
inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
filters: `int` number of filters in the convolution.
kernel_size: `int` size of the kernel to be used in the convolution.
strides: `int` strides of the convolution.
Returns:
A `Tensor` of shape `[batch, filters, height_out, width_out]`.
"""
if strides > 1:
inputs = self.fixed_padding(inputs, kernel_size)
return tf.keras.layers.Conv2D(
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.initializers.VarianceScaling(),
data_format=self._data_format)(
inputs=inputs)
def residual_block(self,
inputs,
filters,
strides,
use_projection=False,
is_training=None):
"""Standard building block for residual networks with BN after convolutions.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the two 3x3 convolutions in the block.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
is_training: `bool` if True, the model is in training mode.
Returns:
The output `Tensor` of the block.
"""
shortcut = inputs
if use_projection:
# Projection shortcut in first layer to match filters and strides
shortcut = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=1, strides=strides)
shortcut = self._norm_activation(use_activation=False)(
shortcut, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=strides)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=1)
inputs = self._norm_activation(
use_activation=False, init_zero=True)(
inputs, is_training=is_training)
return self._activation_op(inputs + shortcut)
def bottleneck_block(self,
inputs,
filters,
strides,
use_projection=False,
is_training=None):
"""Bottleneck block variant for residual networks with BN after convolutions.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
is_training: `bool` if True, the model is in training mode.
Returns:
The output `Tensor` of the block.
"""
shortcut = inputs
if use_projection:
# Projection shortcut only in first block within a group. Bottleneck
# blocks end with 4 times the number of filters.
filters_out = 4 * filters
shortcut = self.conv2d_fixed_padding(
inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
shortcut = self._norm_activation(use_activation=False)(
shortcut, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=1, strides=1)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=strides)
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
inputs = self._norm_activation(
use_activation=False, init_zero=True)(
inputs, is_training=is_training)
return self._activation_op(inputs + shortcut)
def block_group(self, inputs, filters, block_fn, blocks, strides, name,
is_training):
"""Creates one group of blocks for the ResNet model.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first convolution of the layer.
block_fn: `function` for the block to use within the model
blocks: `int` number of blocks contained in the layer.
strides: `int` stride to use for the first convolution of the layer. If
greater than 1, this layer will downsample the input.
name: `str` name for the Tensor output of the block layer.
is_training: `bool` if True, the model is in training mode.
Returns:
The output `Tensor` of the block layer.
"""
# Only the first block per block_group uses projection shortcut and strides.
inputs = block_fn(
inputs, filters, strides, use_projection=True, is_training=is_training)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, 1, is_training=is_training)
return tf.identity(inputs, name)
def resnet_v1_generator(self, block_fn, layers):
"""Generator for ResNet v1 models.
Args:
block_fn: `function` for the block to use within the model. Either
`residual_block` or `bottleneck_block`.
layers: list of 4 `int`s denoting the number of blocks to include in each
of the 4 block groups. Each group consists of blocks that take inputs of
the same resolution.
Returns:
Model `function` that takes in `inputs` and `is_training` and returns the
output `Tensor` of the ResNet model.
"""
def model(inputs, is_training=None):
"""Creation of the model graph."""
inputs = self.conv2d_fixed_padding(
inputs=inputs, filters=64, kernel_size=7, strides=2)
inputs = tf.identity(inputs, 'initial_conv')
inputs = self._norm_activation()(inputs, is_training=is_training)
inputs = tf.keras.layers.MaxPool2D(
pool_size=3, strides=2, padding='SAME',
data_format=self._data_format)(
inputs)
inputs = tf.identity(inputs, 'initial_max_pool')
c2 = self.block_group(
inputs=inputs,
filters=64,
block_fn=block_fn,
blocks=layers[0],
strides=1,
name='block_group1',
is_training=is_training)
c3 = self.block_group(
inputs=c2,
filters=128,
block_fn=block_fn,
blocks=layers[1],
strides=2,
name='block_group2',
is_training=is_training)
c4 = self.block_group(
inputs=c3,
filters=256,
block_fn=block_fn,
blocks=layers[2],
strides=2,
name='block_group3',
is_training=is_training)
c5 = self.block_group(
inputs=c4,
filters=512,
block_fn=block_fn,
blocks=layers[3],
strides=2,
name='block_group4',
is_training=is_training)
return {2: c2, 3: c3, 4: c4, 5: c5}
return model
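# --- Illustrative usage sketch (editorial addition): building a ResNet-50
# backbone and extracting the multilevel feature dictionary. The input
# resolution and batch size are assumptions for demonstration only.
example_backbone = Resnet(resnet_depth=50)
example_features = example_backbone(tf.zeros([2, 256, 256, 3]), is_training=False)
# example_features maps levels {2, 3, 4, 5} to tensors; e.g. example_features[5]
# has shape [2, 8, 8, 2048] for a 256x256 input.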
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
# ==============================================================================
"""Implementation of SpineNet model.
X. Du, T-Y. Lin, P. Jin, G. Ghiasi, M. Tan, Y. Cui, Q. V. Le, X. Song
SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization
https://arxiv.org/abs/1912.05027
"""
import math
from absl import logging
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.detection.modeling.architecture import nn_blocks
layers = tf.keras.layers
FILTER_SIZE_MAP = {
1: 32,
2: 64,
3: 128,
4: 256,
5: 256,
6: 256,
7: 256,
}
# The fixed SpineNet architecture discovered by NAS.
# Each element represents a specification of a building block:
# (block_level, block_fn, (input_offset0, input_offset1), is_output).
SPINENET_BLOCK_SPECS = [
(2, 'bottleneck', (0, 1), False),
(4, 'residual', (0, 1), False),
(3, 'bottleneck', (2, 3), False),
(4, 'bottleneck', (2, 4), False),
(6, 'residual', (3, 5), False),
(4, 'bottleneck', (3, 5), False),
(5, 'residual', (6, 7), False),
(7, 'residual', (6, 8), False),
(5, 'bottleneck', (8, 9), False),
(5, 'bottleneck', (8, 10), False),
(4, 'bottleneck', (5, 10), True),
(3, 'bottleneck', (4, 10), True),
(5, 'bottleneck', (7, 12), True),
(7, 'bottleneck', (5, 14), True),
(6, 'bottleneck', (12, 14), True),
]
SCALING_MAP = {
'49S': {
'endpoints_num_filters': 128,
'filter_size_scale': 0.65,
'resample_alpha': 0.5,
'block_repeats': 1,
},
'49': {
'endpoints_num_filters': 256,
'filter_size_scale': 1.0,
'resample_alpha': 0.5,
'block_repeats': 1,
},
'96': {
'endpoints_num_filters': 256,
'filter_size_scale': 1.0,
'resample_alpha': 0.5,
'block_repeats': 2,
},
'143': {
'endpoints_num_filters': 256,
'filter_size_scale': 1.0,
'resample_alpha': 1.0,
'block_repeats': 3,
},
'190': {
'endpoints_num_filters': 512,
'filter_size_scale': 1.3,
'resample_alpha': 1.0,
'block_repeats': 4,
},
}
class BlockSpec(object):
"""A container class that specifies the block configuration for SpineNet."""
def __init__(self, level, block_fn, input_offsets, is_output):
self.level = level
self.block_fn = block_fn
self.input_offsets = input_offsets
self.is_output = is_output
def build_block_specs(block_specs=None):
"""Builds the list of BlockSpec objects for SpineNet."""
if not block_specs:
block_specs = SPINENET_BLOCK_SPECS
logging.info('Building SpineNet block specs: %s', block_specs)
return [BlockSpec(*b) for b in block_specs]
class SpineNet(tf.keras.Model):
"""Class to build SpineNet models."""
def __init__(self,
input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
min_level=3,
max_level=7,
block_specs=build_block_specs(),
endpoints_num_filters=256,
resample_alpha=0.5,
block_repeats=1,
filter_size_scale=1.0,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""SpineNet model."""
self._min_level = min_level
self._max_level = max_level
self._block_specs = block_specs
self._endpoints_num_filters = endpoints_num_filters
self._resample_alpha = resample_alpha
self._block_repeats = block_repeats
self._filter_size_scale = filter_size_scale
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
if activation == 'relu':
self._activation = tf.nn.relu
elif activation == 'swish':
self._activation = tf.nn.swish
else:
raise ValueError('Activation {} not implemented.'.format(activation))
self._init_block_fn = 'bottleneck'
self._num_init_blocks = 2
if use_sync_bn:
self._norm = layers.experimental.SyncBatchNormalization
else:
self._norm = layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
# Build SpineNet.
inputs = tf.keras.Input(shape=input_specs.shape[1:])
net = self._build_stem(inputs=inputs)
net = self._build_scale_permuted_network(
net=net, input_width=input_specs.shape[1])
net = self._build_endpoints(net=net)
super(SpineNet, self).__init__(inputs=inputs, outputs=net)
def _block_group(self,
inputs,
filters,
strides,
block_fn_cand,
block_repeats=1,
name='block_group'):
"""Creates one group of blocks for the SpineNet model."""
block_fn_candidates = {
'bottleneck': nn_blocks.BottleneckBlock,
'residual': nn_blocks.ResidualBlock,
}
block_fn = block_fn_candidates[block_fn_cand]
_, _, _, num_filters = inputs.get_shape().as_list()
if block_fn_cand == 'bottleneck':
use_projection = not (num_filters == (filters * 4) and strides == 1)
else:
use_projection = not (num_filters == filters and strides == 1)
x = block_fn(
filters=filters,
strides=strides,
use_projection=use_projection,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
inputs)
for _ in range(1, block_repeats):
x = block_fn(
filters=filters,
strides=1,
use_projection=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
x)
return tf.identity(x, name=name)
def _build_stem(self, inputs):
"""Build SpineNet stem."""
x = layers.Conv2D(
filters=64,
kernel_size=7,
strides=2,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
inputs)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x)
net = []
# Build the initial level 2 blocks.
for i in range(self._num_init_blocks):
x = self._block_group(
inputs=x,
filters=int(FILTER_SIZE_MAP[2] * self._filter_size_scale),
strides=1,
block_fn_cand=self._init_block_fn,
block_repeats=self._block_repeats,
name='stem_block_{}'.format(i + 1))
net.append(x)
return net
def _build_scale_permuted_network(self,
net,
input_width,
weighted_fusion=False):
"""Build scale-permuted network."""
net_sizes = [int(math.ceil(input_width / 2**2))] * len(net)
net_block_fns = [self._init_block_fn] * len(net)
num_outgoing_connections = [0] * len(net)
endpoints = {}
for i, block_spec in enumerate(self._block_specs):
# Find out specs for the target block.
target_width = int(math.ceil(input_width / 2**block_spec.level))
target_num_filters = int(FILTER_SIZE_MAP[block_spec.level] *
self._filter_size_scale)
target_block_fn = block_spec.block_fn
# Resample then merge input0 and input1.
parents = []
input0 = block_spec.input_offsets[0]
input1 = block_spec.input_offsets[1]
x0 = self._resample_with_alpha(
inputs=net[input0],
input_width=net_sizes[input0],
input_block_fn=net_block_fns[input0],
target_width=target_width,
target_num_filters=target_num_filters,
target_block_fn=target_block_fn,
alpha=self._resample_alpha)
parents.append(x0)
num_outgoing_connections[input0] += 1
x1 = self._resample_with_alpha(
inputs=net[input1],
input_width=net_sizes[input1],
input_block_fn=net_block_fns[input1],
target_width=target_width,
target_num_filters=target_num_filters,
target_block_fn=target_block_fn,
alpha=self._resample_alpha)
parents.append(x1)
num_outgoing_connections[input1] += 1
# Merge 0 outdegree blocks to the output block.
if block_spec.is_output:
for j, (j_feat,
j_connections) in enumerate(zip(net, num_outgoing_connections)):
if j_connections == 0 and (j_feat.shape[2] == target_width and
j_feat.shape[3] == x0.shape[3]):
parents.append(j_feat)
num_outgoing_connections[j] += 1
# pylint: disable=g-direct-tensorflow-import
if weighted_fusion:
dtype = parents[0].dtype
parent_weights = [
tf.nn.relu(tf.cast(tf.Variable(1.0, name='block{}_fusion{}'.format(
i, j)), dtype=dtype)) for j in range(len(parents))]
weights_sum = tf.add_n(parent_weights)
parents = [
parents[i] * parent_weights[i] / (weights_sum + 0.0001)
for i in range(len(parents))
]
# Fuse all parent nodes then build a new block.
x = tf_utils.get_activation(self._activation)(tf.add_n(parents))
x = self._block_group(
inputs=x,
filters=target_num_filters,
strides=1,
block_fn_cand=target_block_fn,
block_repeats=self._block_repeats,
name='scale_permuted_block_{}'.format(i + 1))
net.append(x)
net_sizes.append(target_width)
net_block_fns.append(target_block_fn)
num_outgoing_connections.append(0)
# Save output feats.
if block_spec.is_output:
if block_spec.level in endpoints:
raise ValueError('Duplicate feats found for output level {}.'.format(
block_spec.level))
if (block_spec.level < self._min_level or
block_spec.level > self._max_level):
raise ValueError('Output level is out of range [{}, {}]'.format(
self._min_level, self._max_level))
endpoints[block_spec.level] = x
return endpoints
def _build_endpoints(self, net):
"""Match filter size for endpoints before sharing conv layers."""
endpoints = {}
for level in range(self._min_level, self._max_level + 1):
x = layers.Conv2D(
filters=self._endpoints_num_filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
net[level])
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
endpoints[level] = x
return endpoints
def _resample_with_alpha(self,
inputs,
input_width,
input_block_fn,
target_width,
target_num_filters,
target_block_fn,
alpha=0.5):
"""Match resolution and feature dimension."""
_, _, _, input_num_filters = inputs.get_shape().as_list()
if input_block_fn == 'bottleneck':
input_num_filters /= 4
new_num_filters = int(input_num_filters * alpha)
x = layers.Conv2D(
filters=new_num_filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
inputs)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
# Spatial resampling.
if input_width > target_width:
x = layers.Conv2D(
filters=new_num_filters,
kernel_size=3,
strides=2,
padding='SAME',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
input_width /= 2
while input_width > target_width:
x = layers.MaxPool2D(pool_size=3, strides=2, padding='SAME')(x)
input_width /= 2
elif input_width < target_width:
scale = target_width // input_width
x = layers.UpSampling2D(size=(scale, scale))(x)
# Last 1x1 conv to match filter size.
if target_block_fn == 'bottleneck':
target_num_filters *= 4
x = layers.Conv2D(
filters=target_num_filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
return x
class SpineNetBuilder(object):
"""SpineNet builder."""
def __init__(self,
model_id,
input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
min_level=3,
max_level=7,
block_specs=build_block_specs(),
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001):
if model_id not in SCALING_MAP:
raise ValueError(
'SpineNet {} is not a valid architecture.'.format(model_id))
scaling_params = SCALING_MAP[model_id]
self._input_specs = input_specs
self._min_level = min_level
self._max_level = max_level
self._block_specs = block_specs
self._endpoints_num_filters = scaling_params['endpoints_num_filters']
self._resample_alpha = scaling_params['resample_alpha']
self._block_repeats = scaling_params['block_repeats']
self._filter_size_scale = scaling_params['filter_size_scale']
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._activation = activation
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
def __call__(self, inputs, is_training=None):
model = SpineNet(
input_specs=self._input_specs,
min_level=self._min_level,
max_level=self._max_level,
block_specs=self._block_specs,
endpoints_num_filters=self._endpoints_num_filters,
resample_alpha=self._resample_alpha,
block_repeats=self._block_repeats,
filter_size_scale=self._filter_size_scale,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)
return model(inputs)
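# --- Illustrative usage sketch (editorial addition): using SpineNetBuilder to
# obtain multilevel endpoints for levels [min_level, max_level]. The '49S'
# variant, batch size, and default 640x640 input are assumptions for
# demonstration only.
example_spinenet = SpineNetBuilder(model_id='49S')
example_endpoints = example_spinenet(tf.zeros([1, 640, 640, 3]))
# example_endpoints maps each level l in [3, 7] to a tensor of shape
# [1, ceil(640 / 2**l), ceil(640 / 2**l), 128] (128 endpoint filters for '49S').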
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base Model definition."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
import functools
import re
import tensorflow as tf
from official.vision.detection.modeling import checkpoint_utils
from official.vision.detection.modeling import learning_rates
from official.vision.detection.modeling import optimizers
def _make_filter_trainable_variables_fn(frozen_variable_prefix):
"""Creates a function for filtering trainable varialbes."""
def _filter_trainable_variables(variables):
"""Filters trainable varialbes.
Args:
variables: a list of tf.Variable to be filtered.
Returns:
filtered_variables: a list of tf.Variable with the frozen variables filtered out.
"""
# frozen_variable_prefix: a regex string specifying the prefix pattern of
# the frozen variables' names.
filtered_variables = [
v for v in variables if not frozen_variable_prefix or
not re.match(frozen_variable_prefix, v.name)
]
return filtered_variables
return _filter_trainable_variables
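# Illustrative sketch (not part of the original file): freeze a backbone by
# filtering out variables whose names match a prefix regex. The prefix
# 'resnet50' below is a hypothetical example; any regex over variable names
# can be used.
def _example_freeze_backbone_variables(keras_model):
  filter_fn = _make_filter_trainable_variables_fn(r'resnet50')
  return filter_fn(keras_model.trainable_variables)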
class Model(object):
"""Base class for model function."""
__metaclass__ = abc.ABCMeta
def __init__(self, params):
self._use_bfloat16 = params.architecture.use_bfloat16
if params.architecture.use_bfloat16:
tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')
# Optimization.
self._optimizer_fn = optimizers.OptimizerFactory(params.train.optimizer)
self._learning_rate = learning_rates.learning_rate_generator(
params.train.total_steps, params.train.learning_rate)
self._frozen_variable_prefix = params.train.frozen_variable_prefix
self._regularization_var_regex = params.train.regularization_variable_regex
self._l2_weight_decay = params.train.l2_weight_decay
# Checkpoint restoration.
self._checkpoint = params.train.checkpoint.as_dict()
# Summary.
self._enable_summary = params.enable_summary
self._model_dir = params.model_dir
@abc.abstractmethod
def build_outputs(self, inputs, mode):
"""Build the graph of the forward path."""
pass
@abc.abstractmethod
def build_model(self, params, mode):
"""Build the model object."""
pass
@abc.abstractmethod
def build_loss_fn(self):
"""Build the model object."""
pass
def post_processing(self, labels, outputs):
"""Post-processing function."""
return labels, outputs
def model_outputs(self, inputs, mode):
"""Build the model outputs."""
return self.build_outputs(inputs, mode)
def build_optimizer(self):
"""Returns train_op to optimize total loss."""
# Sets up the optimizer.
return self._optimizer_fn(self._learning_rate)
def make_filter_trainable_variables_fn(self):
"""Creates a function for filtering trainable varialbes."""
return _make_filter_trainable_variables_fn(self._frozen_variable_prefix)
def weight_decay_loss(self, trainable_variables):
reg_variables = [
v for v in trainable_variables
if self._regularization_var_regex is None or
re.match(self._regularization_var_regex, v.name)
]
return self._l2_weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in reg_variables])
def make_restore_checkpoint_fn(self):
"""Returns scaffold function to restore parameters from v1 checkpoint."""
if 'skip_checkpoint_variables' in self._checkpoint:
skip_regex = self._checkpoint['skip_checkpoint_variables']
else:
skip_regex = None
return checkpoint_utils.make_restore_checkpoint_fn(
self._checkpoint['path'],
prefix=self._checkpoint['prefix'],
skip_regex=skip_regex)
def eval_metrics(self):
"""Returns tuple of metric function and its inputs for evaluation."""
raise NotImplementedError('Unimplemented eval_metrics')
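# Illustrative sketch (not part of the original file): how the pieces of the
# base Model are typically combined when setting up training. `detector` is
# assumed to be an instance of a concrete subclass and `keras_model` the
# tf.keras.Model it built.
def _example_train_setup(detector, keras_model):
  optimizer = detector.build_optimizer()
  loss_fn = detector.build_loss_fn()
  filter_fn = detector.make_filter_trainable_variables_fn()
  trainable_variables = filter_fn(keras_model.trainable_variables)
  # The total training loss usually adds this L2 weight decay term.
  l2_loss = detector.weight_decay_loss(trainable_variables)
  return optimizer, loss_fn, trainable_variables, l2_loss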
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Util functions for loading checkpoints.
Especially for loading a TensorFlow 1.x
checkpoint into a TensorFlow 2.x (Keras) model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
from absl import logging
import tensorflow as tf
def _build_assignment_map(keras_model,
prefix='',
skip_variables_regex=None,
var_to_shape_map=None):
"""Compute an assignment mapping for loading older checkpoints into a Keras
model. Variable names are remapped from the original TPUEstimator model to
the new Keras name.
Args:
keras_model: tf.keras.Model object to provide variables to assign.
prefix: prefix in the variable name to be removed for alignment with names
in the checkpoint.
skip_variables_regex: regular expression to match the names of variables
that do not need to be assigned.
var_to_shape_map: variable name to shape mapping from the checkpoint.
Returns:
The variable assignment map.
"""
assignment_map = {}
checkpoint_names = []
if var_to_shape_map:
checkpoint_names = list(
filter(
lambda x: not x.endswith('Momentum') and not x.endswith(
'global_step'), var_to_shape_map.keys()))
logging.info('Number of variables in the checkpoint %d',
len(checkpoint_names))
for var in keras_model.variables:
var_name = var.name
if skip_variables_regex and re.match(skip_variables_regex, var_name):
continue
# Trim the index of the variable.
if ':' in var_name:
var_name = var_name[:var_name.rindex(':')]
if var_name.startswith(prefix):
var_name = var_name[len(prefix):]
if not var_to_shape_map:
assignment_map[var_name] = var
continue
# Match name with variables in the checkpoint.
match_names = list(filter(lambda x: x.endswith(var_name), checkpoint_names))
try:
if match_names:
assert len(match_names) == 1, 'more than one match for {}: {}'.format(
var_name, match_names)
checkpoint_names.remove(match_names[0])
assignment_map[match_names[0]] = var
else:
logging.info('Variable name not found in checkpoint: %s', var_name)
except Exception as e:
logging.info('Error removing the match_name: %s', match_names)
logging.info('Exception: %s', e)
raise
logging.info('Found %d matching variables in the checkpoint.', len(assignment_map))
return assignment_map
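# Worked example (illustrative, not part of the original file): a Keras
# variable named 'retinanet/resnet50/conv2d/kernel:0', with prefix
# 'retinanet/', is trimmed to 'resnet50/conv2d/kernel' (the ':0' index and
# the prefix are stripped) and then matched by suffix against the checkpoint
# names, so a checkpoint entry ending in 'resnet50/conv2d/kernel' would map
# to that variable.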
def _get_checkpoint_map(checkpoint_path):
reader = tf.train.load_checkpoint(checkpoint_path)
return reader.get_variable_to_shape_map()
def make_restore_checkpoint_fn(checkpoint_path, prefix='', skip_regex=None):
"""Returns scaffold function to restore parameters from v1 checkpoint.
Args:
checkpoint_path: path of the checkpoint folder or file.
Example 1: '/path/to/model_dir/'
Example 2: '/path/to/model.ckpt-22500'
prefix: prefix in the variable name to be removed for alignment with names
in the checkpoint.
skip_regex: regular expression to match the names of variables that do not
need to be assigned.
Returns:
Callable[[tf.keras.Model], None]. Function to load a v1 checkpoint into a Keras model.
"""
def _restore_checkpoint_fn(keras_model):
"""Loads pretrained model through scaffold function."""
if not checkpoint_path:
logging.info('checkpoint_path is empty')
return
var_prefix = prefix
if prefix and not prefix.endswith('/'):
var_prefix += '/'
var_to_shape_map = _get_checkpoint_map(checkpoint_path)
assert var_to_shape_map, 'var_to_shape_map should not be empty'
vars_to_load = _build_assignment_map(
keras_model,
prefix=var_prefix,
skip_variables_regex=skip_regex,
var_to_shape_map=var_to_shape_map)
if not vars_to_load:
raise ValueError('Variables to load is empty.')
tf.compat.v1.train.init_from_checkpoint(checkpoint_path, vars_to_load)
return _restore_checkpoint_fn
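# Illustrative usage sketch (not part of the original file): restore a TF1
# backbone checkpoint into a Keras model while skipping prediction-head
# variables. The checkpoint path, prefix, and regex below are hypothetical
# placeholders.
def _example_restore_from_v1_checkpoint(keras_model):
  restore_fn = make_restore_checkpoint_fn(
      '/path/to/model.ckpt-22500',
      prefix='retinanet/',
      skip_regex=r'.*predict.*')
  restore_fn(keras_model)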
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory to build detection model."""
from official.vision.detection.modeling import maskrcnn_model
from official.vision.detection.modeling import olnmask_model
from official.vision.detection.modeling import retinanet_model
from official.vision.detection.modeling import shapemask_model
def model_generator(params):
"""Model function generator."""
if params.type == 'retinanet':
model_fn = retinanet_model.RetinanetModel(params)
elif params.type == 'mask_rcnn':
model_fn = maskrcnn_model.MaskrcnnModel(params)
elif params.type == 'olnmask':
model_fn = olnmask_model.OlnMaskModel(params)
elif params.type == 'shapemask':
model_fn = shapemask_model.ShapeMaskModel(params)
else:
raise ValueError('Model %s is not supported.' % params.type)
return model_fn
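# Illustrative usage (not part of the original file): `params` is assumed to
# be a config object whose `type` field is one of the strings handled above;
# the returned object is the corresponding model class instance, e.g.
#
#   params.type = 'retinanet'
#   model_fn = model_generator(params)  # -> retinanet_model.RetinanetModel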