Deprecating official/vision/detection folder.

The folder is archived in the official/legacy/detection folder.

PiperOrigin-RevId: 419643226

Deprecating official/vision/detection folder.
The folder is archived in the official/legacy/detection folder.

PiperOrigin-RevId: 419643226
9a264c9f · Yeqing Li · A. Unique TensorFlower · 8fb4e6a6 · 8fb4e6a6 · 8fb4e6a6
Commit 9a264c9f, authored Jan 04, 2022 by Yeqing Li; committed by A. Unique TensorFlower on Jan 04, 2022.
19 changed files
--- a/official/vision/detection/modeling/learning_rates.py
+++ b/official/vision/detection/modeling/learning_rates.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Learning rate schedule."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-import numpy as np
-import tensorflow as tf
-from official.modeling.hyperparams import params_dict
-
-
-class StepLearningRateWithLinearWarmup(
-    tf.keras.optimizers.schedules.LearningRateSchedule):
-  """Class to generate learning rate tensor."""
-
-  def __init__(self, total_steps, params):
-    """Creates the step learning rate tensor with linear warmup."""
-    super(StepLearningRateWithLinearWarmup, self).__init__()
-    self._total_steps = total_steps
-    assert isinstance(params, (dict, params_dict.ParamsDict))
-    if isinstance(params, dict):
-      params = params_dict.ParamsDict(params)
-    self._params = params
-
-  def __call__(self, global_step):
-    warmup_lr = self._params.warmup_learning_rate
-    warmup_steps = self._params.warmup_steps
-    init_lr = self._params.init_learning_rate
-    lr_levels = self._params.learning_rate_levels
-    lr_steps = self._params.learning_rate_steps
-    linear_warmup = (
-        warmup_lr + tf.cast(global_step, dtype=tf.float32) / warmup_steps *
-        (init_lr - warmup_lr))
-    learning_rate = tf.where(global_step < warmup_steps, linear_warmup, init_lr)
-
-    for next_learning_rate, start_step in zip(lr_levels, lr_steps):
-      learning_rate = tf.where(global_step >= start_step, next_learning_rate,
-                               learning_rate)
-    return learning_rate
-
-  def get_config(self):
-    return {'_params': self._params.as_dict()}
-
-
-class CosineLearningRateWithLinearWarmup(
-    tf.keras.optimizers.schedules.LearningRateSchedule):
-  """Class to generate learning rate tensor."""
-
-  def __init__(self, total_steps, params):
-    """Creates the consine learning rate tensor with linear warmup."""
-    super(CosineLearningRateWithLinearWarmup, self).__init__()
-    self._total_steps = total_steps
-    assert isinstance(params, (dict, params_dict.ParamsDict))
-    if isinstance(params, dict):
-      params = params_dict.ParamsDict(params)
-    self._params = params
-
-  def __call__(self, global_step):
-    global_step = tf.cast(global_step, dtype=tf.float32)
-    warmup_lr = self._params.warmup_learning_rate
-    warmup_steps = self._params.warmup_steps
-    init_lr = self._params.init_learning_rate
-    total_steps = self._total_steps
-    linear_warmup = (
-        warmup_lr + global_step / warmup_steps * (init_lr - warmup_lr))
-    cosine_learning_rate = (
-        init_lr * (tf.cos(np.pi * (global_step - warmup_steps) /
-                          (total_steps - warmup_steps)) + 1.0) / 2.0)
-    learning_rate = tf.where(global_step < warmup_steps, linear_warmup,
-                             cosine_learning_rate)
-    return learning_rate
-
-  def get_config(self):
-    return {'_params': self._params.as_dict()}
-
-
-def learning_rate_generator(total_steps, params):
-  """The learning rate function generator."""
-  if params.type == 'step':
-    return StepLearningRateWithLinearWarmup(total_steps, params)
-  elif params.type == 'cosine':
-    return CosineLearningRateWithLinearWarmup(total_steps, params)
-  else:
-    raise ValueError('Unsupported learning rate type: {}.'.format(params.type))
--- a/official/vision/detection/modeling/losses.py
+++ b/official/vision/detection/modeling/losses.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Losses used for detection models."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl import logging
-import tensorflow as tf
-
-
-def focal_loss(logits, targets, alpha, gamma, normalizer):
-  """Compute the focal loss between `logits` and the golden `target` values.
-
-  Focal loss = -(1-pt)^gamma * log(pt)
-  where pt is the probability of being classified to the true class.
-
-  Args:
-    logits: A float32 tensor of size
-      [batch, height_in, width_in, num_predictions].
-    targets: A float32 tensor of size
-      [batch, height_in, width_in, num_predictions].
-    alpha: A float32 scalar multiplying alpha to the loss from positive examples
-      and (1-alpha) to the loss from negative examples.
-    gamma: A float32 scalar modulating loss from hard and easy examples.
-    normalizer: A float32 scalar normalizes the total loss from all examples.
-
-  Returns:
-    loss: A float32 Tensor of size [batch, height_in, width_in, num_predictions]
-      representing normalized loss on the prediction map.
-  """
-  with tf.name_scope('focal_loss'):
-    positive_label_mask = tf.math.equal(targets, 1.0)
-    cross_entropy = (
-        tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits))
-    # Below are comments/derivations for computing modulator.
-    # For brevity, let x = logits,  z = targets, r = gamma, and p_t = sigmod(x)
-    # for positive samples and 1 - sigmoid(x) for negative examples.
-    #
-    # The modulator, defined as (1 - P_t)^r, is a critical part in focal loss
-    # computation. For r > 0, it puts more weights on hard examples, and less
-    # weights on easier ones. However if it is directly computed as (1 - P_t)^r,
-    # its back-propagation is not stable when r < 1. The implementation here
-    # resolves the issue.
-    #
-    # For positive samples (labels being 1),
-    #    (1 - p_t)^r
-    #  = (1 - sigmoid(x))^r
-    #  = (1 - (1 / (1 + exp(-x))))^r
-    #  = (exp(-x) / (1 + exp(-x)))^r
-    #  = exp(log((exp(-x) / (1 + exp(-x)))^r))
-    #  = exp(r * log(exp(-x)) - r * log(1 + exp(-x)))
-    #  = exp(- r * x - r * log(1 + exp(-x)))
-    #
-    # For negative samples (labels being 0),
-    #    (1 - p_t)^r
-    #  = (sigmoid(x))^r
-    #  = (1 / (1 + exp(-x)))^r
-    #  = exp(log((1 / (1 + exp(-x)))^r))
-    #  = exp(-r * log(1 + exp(-x)))
-    #
-    # Therefore one unified form for positive (z = 1) and negative (z = 0)
-    # samples is:
-    #      (1 - p_t)^r = exp(-r * z * x - r * log(1 + exp(-x))).
-    neg_logits = -1.0 * logits
-    modulator = tf.math.exp(gamma * targets * neg_logits -
-                            gamma * tf.math.log1p(tf.math.exp(neg_logits)))
-    loss = modulator * cross_entropy
-    weighted_loss = tf.where(positive_label_mask, alpha * loss,
-                             (1.0 - alpha) * loss)
-    weighted_loss /= normalizer
-  return weighted_loss
-
-
-class RpnScoreLoss(object):
-  """Region Proposal Network score loss function."""
-
-  def __init__(self, params):
-    self._rpn_batch_size_per_im = params.rpn_batch_size_per_im
-    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
-        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
-
-  def __call__(self, score_outputs, labels):
-    """Computes total RPN detection loss.
-
-    Computes total RPN detection loss including box and score from all levels.
-
-    Args:
-      score_outputs: an OrderDict with keys representing levels and values
-        representing scores in [batch_size, height, width, num_anchors].
-      labels: the dictionary that returned from dataloader that includes
-        groundturth targets.
-
-    Returns:
-      rpn_score_loss: a scalar tensor representing total score loss.
-    """
-    with tf.name_scope('rpn_loss'):
-      levels = sorted(score_outputs.keys())
-
-      score_losses = []
-      for level in levels:
-        score_losses.append(
-            self._rpn_score_loss(
-                score_outputs[level],
-                labels[level],
-                normalizer=tf.cast(
-                    tf.shape(score_outputs[level])[0] *
-                    self._rpn_batch_size_per_im, dtype=tf.float32)))
-
-      # Sums per level losses to total loss.
-      return tf.math.add_n(score_losses)
-
-  def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
-    """Computes score loss."""
-    # score_targets has three values:
-    # (1) score_targets[i]=1, the anchor is a positive sample.
-    # (2) score_targets[i]=0, negative.
-    # (3) score_targets[i]=-1, the anchor is don't care (ignore).
-    with tf.name_scope('rpn_score_loss'):
-      mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
-                                tf.math.equal(score_targets, 0))
-
-      score_targets = tf.math.maximum(score_targets,
-                                      tf.zeros_like(score_targets))
-
-      score_targets = tf.expand_dims(score_targets, axis=-1)
-      score_outputs = tf.expand_dims(score_outputs, axis=-1)
-      score_loss = self._binary_crossentropy(
-          score_targets, score_outputs, sample_weight=mask)
-
-      score_loss /= normalizer
-      return score_loss
-
-
-class RpnBoxLoss(object):
-  """Region Proposal Network box regression loss function."""
-
-  def __init__(self, params):
-    logging.info('RpnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
-    self._huber_loss = tf.keras.losses.Huber(
-        delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
-
-  def __call__(self, box_outputs, labels):
-    """Computes total RPN detection loss.
-
-    Computes total RPN detection loss including box and score from all levels.
-
-    Args:
-      box_outputs: an OrderDict with keys representing levels and values
-        representing box regression targets in
-        [batch_size, height, width, num_anchors * 4].
-      labels: the dictionary that returned from dataloader that includes
-        groundturth targets.
-
-    Returns:
-      rpn_box_loss: a scalar tensor representing total box regression loss.
-    """
-    with tf.name_scope('rpn_loss'):
-      levels = sorted(box_outputs.keys())
-
-      box_losses = []
-      for level in levels:
-        box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level]))
-
-      # Sum per level losses to total loss.
-      return tf.add_n(box_losses)
-
-  def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
-    """Computes box regression loss."""
-    with tf.name_scope('rpn_box_loss'):
-      mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
-      box_targets = tf.expand_dims(box_targets, axis=-1)
-      box_outputs = tf.expand_dims(box_outputs, axis=-1)
-      box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
-      # The loss is normalized by the sum of non-zero weights and additional
-      # normalizer provided by the function caller. Using + 0.01 here to avoid
-      # division by zero.
-      box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
-      return box_loss
-
-
-class OlnRpnCenterLoss(object):
-  """Object Localization Network RPN centerness regression loss function."""
-
-  def __init__(self):
-    self._l1_loss = tf.keras.losses.MeanAbsoluteError(
-        reduction=tf.keras.losses.Reduction.SUM)
-
-  def __call__(self, center_outputs, labels):
-    """Computes total RPN centerness regression loss.
-
-    Computes total RPN centerness score regression loss from all levels.
-
-    Args:
-      center_outputs: an OrderDict with keys representing levels and values
-        representing anchor centerness regression targets in
-        [batch_size, height, width, num_anchors * 4].
-      labels: the dictionary that returned from dataloader that includes
-        groundturth targets.
-
-    Returns:
-      rpn_center_loss: a scalar tensor representing total centerness regression
-        loss.
-    """
-    with tf.name_scope('rpn_loss'):
-      # Normalizer.
-      levels = sorted(center_outputs.keys())
-      num_valid = 0
-      # 0<pos<1, neg=0, ign=-1
-      for level in levels:
-        num_valid += tf.reduce_sum(tf.cast(
-            tf.greater(labels[level], -1.0), tf.float32))  # in and out of box
-      num_valid += 1e-12
-
-      # Centerness loss over multi levels.
-      center_losses = []
-      for level in levels:
-        center_losses.append(
-            self._rpn_center_l1_loss(
-                center_outputs[level], labels[level],
-                normalizer=num_valid))
-
-      # Sum per level losses to total loss.
-      return tf.add_n(center_losses)
-
-  def _rpn_center_l1_loss(self, center_outputs, center_targets,
-                          normalizer=1.0):
-    """Computes centerness regression loss."""
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
-    with tf.name_scope('rpn_center_loss'):
-
-      # mask = tf.greater(center_targets, 0.0)  # inside box only.
-      mask = tf.greater(center_targets, -1.0)  # in and out of box.
-      center_targets = tf.maximum(center_targets, tf.zeros_like(center_targets))
-      center_outputs = tf.sigmoid(center_outputs)
-      center_targets = tf.expand_dims(center_targets, -1)
-      center_outputs = tf.expand_dims(center_outputs, -1)
-      mask = tf.cast(mask, dtype=tf.float32)
-      center_loss = self._l1_loss(center_targets, center_outputs,
-                                  sample_weight=mask)
-      center_loss /= normalizer
-      return center_loss
-
-
-class OlnRpnIoULoss(object):
-  """Object Localization Network RPN box-lrtb regression iou loss function."""
-
-  def __call__(self, box_outputs, labels, center_targets):
-    """Computes total RPN detection loss.
-
-    Computes total RPN box regression loss from all levels.
-
-    Args:
-      box_outputs: an OrderDict with keys representing levels and values
-        representing box regression targets in
-        [batch_size, height, width, num_anchors * 4].
-        last channel: (left, right, top, bottom).
-      labels: the dictionary that returned from dataloader that includes
-        groundturth targets (left, right, top, bottom).
-      center_targets: valid_target mask.
-
-    Returns:
-      rpn_iou_loss: a scalar tensor representing total box regression loss.
-    """
-    with tf.name_scope('rpn_loss'):
-      # Normalizer.
-      levels = sorted(box_outputs.keys())
-      normalizer = 0.
-      for level in levels:
-        # center_targets pos>0, neg=0, ign=-1.
-        mask_ = tf.cast(tf.logical_and(
-            tf.greater(center_targets[level][..., 0], 0.0),
-            tf.greater(tf.reduce_min(labels[level], -1), 0.0)), tf.float32)
-        normalizer += tf.reduce_sum(mask_)
-      normalizer += 1e-8
-      # iou_loss over multi levels.
-      iou_losses = []
-      for level in levels:
-        iou_losses.append(
-            self._rpn_iou_loss(
-                box_outputs[level], labels[level],
-                center_weight=center_targets[level][..., 0],
-                normalizer=normalizer))
-      # Sum per level losses to total loss.
-      return tf.add_n(iou_losses)
-
-  def _rpn_iou_loss(self, box_outputs, box_targets,
-                    center_weight=None, normalizer=1.0):
-    """Computes box regression loss."""
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
-    with tf.name_scope('rpn_iou_loss'):
-      mask = tf.logical_and(
-          tf.greater(center_weight, 0.0),
-          tf.greater(tf.reduce_min(box_targets, -1), 0.0))
-
-      pred_left = box_outputs[..., 0]
-      pred_right = box_outputs[..., 1]
-      pred_top = box_outputs[..., 2]
-      pred_bottom = box_outputs[..., 3]
-
-      gt_left = box_targets[..., 0]
-      gt_right = box_targets[..., 1]
-      gt_top = box_targets[..., 2]
-      gt_bottom = box_targets[..., 3]
-
-      inter_width = (tf.minimum(pred_left, gt_left) +
-                     tf.minimum(pred_right, gt_right))
-      inter_height = (tf.minimum(pred_top, gt_top) +
-                      tf.minimum(pred_bottom, gt_bottom))
-      inter_area = inter_width * inter_height
-      union_area = ((pred_left + pred_right) * (pred_top + pred_bottom) +
-                    (gt_left + gt_right) * (gt_top + gt_bottom) -
-                    inter_area)
-      iou = inter_area / (union_area + 1e-8)
-      mask_ = tf.cast(mask, tf.float32)
-      iou = tf.clip_by_value(iou, clip_value_min=1e-8, clip_value_max=1.0)
-      neg_log_iou = -tf.math.log(iou)
-      iou_loss = tf.reduce_sum(neg_log_iou * mask_)
-      iou_loss /= normalizer
-      return iou_loss
-
-
-class FastrcnnClassLoss(object):
-  """Fast R-CNN classification loss function."""
-
-  def __init__(self):
-    self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(
-        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
-
-  def __call__(self, class_outputs, class_targets):
-    """Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
-
-    This function implements the classification loss of the Fast-RCNN.
-
-    The classification loss is softmax on all RoIs.
-    Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py  # pylint: disable=line-too-long
-
-    Args:
-      class_outputs: a float tensor representing the class prediction for each box
-        with a shape of [batch_size, num_boxes, num_classes].
-      class_targets: a float tensor representing the class label for each box
-        with a shape of [batch_size, num_boxes].
-
-    Returns:
-      a scalar tensor representing total class loss.
-    """
-    with tf.name_scope('fast_rcnn_loss'):
-      batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
-      class_targets = tf.cast(class_targets, dtype=tf.int32)
-      class_targets_one_hot = tf.one_hot(class_targets, num_classes)
-      return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
-                                        normalizer=batch_size * num_boxes / 2.0)
-
-  def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
-                            normalizer):
-    """Computes classification loss."""
-    with tf.name_scope('fast_rcnn_class_loss'):
-      class_loss = self._categorical_crossentropy(class_targets_one_hot,
-                                                  class_outputs)
-
-      class_loss /= normalizer
-      return class_loss
-
-
-class FastrcnnBoxLoss(object):
-  """Fast R-CNN box regression loss function."""
-
-  def __init__(self, params):
-    logging.info('FastrcnnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
-    self._huber_loss = tf.keras.losses.Huber(
-        delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
-
-  def __call__(self, box_outputs, class_targets, box_targets):
-    """Computes the box loss (Fast-RCNN branch) of Mask-RCNN.
-
-    This function implements the box regression loss of the Fast-RCNN. As the
-    `box_outputs` produces `num_classes` boxes for each RoI, the reference model
-    expands `box_targets` to match the shape of `box_outputs` and selects only
-    the target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py)  # pylint: disable=line-too-long
-    Instead, this function selects the `box_outputs` by the `class_targets` so
-    that it doesn't expand `box_targets`.
-
-    The box loss is smooth L1-loss on only positive samples of RoIs.
-    Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py  # pylint: disable=line-too-long
-
-    Args:
-      box_outputs: a float tensor representing the box prediction for each box
-        with a shape of [batch_size, num_boxes, num_classes * 4].
-      class_targets: a float tensor representing the class label for each box
-        with a shape of [batch_size, num_boxes].
-      box_targets: a float tensor representing the box label for each box
-        with a shape of [batch_size, num_boxes, 4].
-
-    Returns:
-      box_loss: a scalar tensor representing total box regression loss.
-    """
-    with tf.name_scope('fast_rcnn_loss'):
-      class_targets = tf.cast(class_targets, dtype=tf.int32)
-
-      # Selects the box from `box_outputs` based on `class_targets`, with which
-      # the box has the maximum overlap.
-      (batch_size, num_rois,
-       num_class_specific_boxes) = box_outputs.get_shape().as_list()
-      num_classes = num_class_specific_boxes // 4
-      box_outputs = tf.reshape(box_outputs,
-                               [batch_size, num_rois, num_classes, 4])
-
-      box_indices = tf.reshape(
-          class_targets + tf.tile(
-              tf.expand_dims(
-                  tf.range(batch_size) * num_rois * num_classes, 1),
-              [1, num_rois]) + tf.tile(
-                  tf.expand_dims(tf.range(num_rois) * num_classes, 0),
-                  [batch_size, 1]), [-1])
-
-      box_outputs = tf.matmul(
-          tf.one_hot(
-              box_indices,
-              batch_size * num_rois * num_classes,
-              dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
-      box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
-
-      return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)
-
-  def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
-                          normalizer=1.0):
-    """Computes box regression loss."""
-    with tf.name_scope('fast_rcnn_box_loss'):
-      mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
-                     [1, 1, 4])
-      mask = tf.cast(mask, dtype=tf.float32)
-      box_targets = tf.expand_dims(box_targets, axis=-1)
-      box_outputs = tf.expand_dims(box_outputs, axis=-1)
-      box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
-      # The loss is normalized by the number of ones in mask,
-      # additianal normalizer provided by the user and using 0.01 here to avoid
-      # division by 0.
-      box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
-      return box_loss
-
-
-class OlnBoxScoreLoss(object):
-  """Object Localization Network Box-Iou scoring function."""
-
-  def __init__(self, params):
-    self._ignore_threshold = params.ignore_threshold
-    self._l1_loss = tf.keras.losses.MeanAbsoluteError(
-        reduction=tf.keras.losses.Reduction.SUM)
-
-  def __call__(self, score_outputs, score_targets):
-    """Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
-
-    This function implements the classification loss of the Fast-RCNN.
-
-    The classification loss is softmax on all RoIs.
-    Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py  # pylint: disable=line-too-long
-
-    Args:
-      score_outputs: a float tensor representing the class prediction for each box
-        with a shape of [batch_size, num_boxes, num_classes].
-      score_targets: a float tensor representing the class label for each box
-        with a shape of [batch_size, num_boxes].
-
-    Returns:
-      a scalar tensor representing total score loss.
-    """
-    with tf.name_scope('fast_rcnn_loss'):
-      score_outputs = tf.squeeze(score_outputs, -1)
-
-      mask = tf.greater(score_targets, self._ignore_threshold)
-      num_valid = tf.reduce_sum(tf.cast(mask, tf.float32))
-      score_targets = tf.maximum(score_targets, tf.zeros_like(score_targets))
-      score_outputs = tf.sigmoid(score_outputs)
-      score_targets = tf.expand_dims(score_targets, -1)
-      score_outputs = tf.expand_dims(score_outputs, -1)
-      mask = tf.cast(mask, dtype=tf.float32)
-      score_loss = self._l1_loss(score_targets, score_outputs,
-                                 sample_weight=mask)
-      score_loss /= (num_valid + 1e-10)
-      return score_loss
-
-
-class MaskrcnnLoss(object):
-  """Mask R-CNN instance segmentation mask loss function."""
-
-  def __init__(self):
-    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
-        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
-
-  def __call__(self, mask_outputs, mask_targets, select_class_targets):
-    """Computes the mask loss of Mask-RCNN.
-
-    This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
-    produces `num_classes` masks for each RoI, the reference model expands
-    `mask_targets` to match the shape of `mask_outputs` and selects only the
-    target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py)  # pylint: disable=line-too-long
-    Instead, this implementation selects the `mask_outputs` by the `class_targets`
-    so that it doesn't expand `mask_targets`. Note that the selection logic is
-    done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
-
-    Args:
-      mask_outputs: a float tensor representing the prediction for each mask,
-        with a shape of
-        [batch_size, num_masks, mask_height, mask_width].
-      mask_targets: a float tensor representing the binary mask of ground truth
-        labels for each mask with a shape of
-        [batch_size, num_masks, mask_height, mask_width].
-      select_class_targets: a tensor with a shape of [batch_size, num_masks],
-        representing the foreground mask targets.
-
-    Returns:
-      mask_loss: a float tensor representing total mask loss.
-    """
-    with tf.name_scope('mask_rcnn_loss'):
-      (batch_size, num_masks, mask_height,
-       mask_width) = mask_outputs.get_shape().as_list()
-
-      weights = tf.tile(
-          tf.reshape(tf.greater(select_class_targets, 0),
-                     [batch_size, num_masks, 1, 1]),
-          [1, 1, mask_height, mask_width])
-      weights = tf.cast(weights, dtype=tf.float32)
-
-      mask_targets = tf.expand_dims(mask_targets, axis=-1)
-      mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
-      mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
-                                            sample_weight=weights)
-
-      # The loss is normalized by the number of 1's in weights and
-      # + 0.01 is used to avoid division by zero.
-      return mask_loss / (tf.reduce_sum(weights) + 0.01)
-
-
-class RetinanetClassLoss(object):
-  """RetinaNet class loss."""
-
-  def __init__(self, params, num_classes):
-    self._num_classes = num_classes
-    self._focal_loss_alpha = params.focal_loss_alpha
-    self._focal_loss_gamma = params.focal_loss_gamma
-
-  def __call__(self, cls_outputs, labels, num_positives):
-    """Computes total detection loss.
-
-    Computes total detection loss including box and class loss from all levels.
-
-    Args:
-      cls_outputs: an OrderDict with keys representing levels and values
-        representing logits in [batch_size, height, width,
-        num_anchors * num_classes].
-      labels: the dictionary that returned from dataloader that includes
-        class groundturth targets.
-      num_positives: number of positive examples in the minibatch.
-
-    Returns:
-      an integar tensor representing total class loss.
-    """
-    # Sums all positives in a batch for normalization and avoids zero
-    # num_positives_sum, which would lead to inf loss during training
-    num_positives_sum = tf.reduce_sum(input_tensor=num_positives) + 1.0
-
-    cls_losses = []
-    for level in cls_outputs.keys():
-      cls_losses.append(self.class_loss(
-          cls_outputs[level], labels[level], num_positives_sum))
-    # Sums per level losses to total loss.
-    return tf.add_n(cls_losses)
-
-  def class_loss(self, cls_outputs, cls_targets, num_positives,
-                 ignore_label=-2):
-    """Computes RetinaNet classification loss."""
-    # Onehot encoding for classification labels.
-    cls_targets_one_hot = tf.one_hot(cls_targets, self._num_classes)
-    bs, height, width, _, _ = cls_targets_one_hot.get_shape().as_list()
-    cls_targets_one_hot = tf.reshape(cls_targets_one_hot,
-                                     [bs, height, width, -1])
-    loss = focal_loss(tf.cast(cls_outputs, dtype=tf.float32),
-                      tf.cast(cls_targets_one_hot, dtype=tf.float32),
-                      self._focal_loss_alpha,
-                      self._focal_loss_gamma,
-                      num_positives)
-
-    ignore_loss = tf.where(
-        tf.equal(cls_targets, ignore_label),
-        tf.zeros_like(cls_targets, dtype=tf.float32),
-        tf.ones_like(cls_targets, dtype=tf.float32),
-    )
-    ignore_loss = tf.expand_dims(ignore_loss, -1)
-    ignore_loss = tf.tile(ignore_loss, [1, 1, 1, 1, self._num_classes])
-    ignore_loss = tf.reshape(ignore_loss, tf.shape(input=loss))
-    return tf.reduce_sum(input_tensor=ignore_loss * loss)
-
-
-class RetinanetBoxLoss(object):
-  """RetinaNet box loss."""
-
-  def __init__(self, params):
-    self._huber_loss = tf.keras.losses.Huber(
-        delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
-
-  def __call__(self, box_outputs, labels, num_positives):
-    """Computes box detection loss.
-
-    Computes total detection loss including box and class loss from all levels.
-
-    Args:
-      box_outputs: an OrderDict with keys representing levels and values
-        representing box regression targets in [batch_size, height, width,
-        num_anchors * 4].
-      labels: the dictionary that returned from dataloader that includes
-        box groundturth targets.
-      num_positives: number of positive examples in the minibatch.
-
-    Returns:
-      an integer tensor representing total box regression loss.
-    """
-    # Sums all positives in a batch for normalization and avoids zero
-    # num_positives_sum, which would lead to inf loss during training
-    num_positives_sum = tf.reduce_sum(input_tensor=num_positives) + 1.0
-
-    box_losses = []
-    for level in box_outputs.keys():
-      box_targets_l = labels[level]
-      box_losses.append(
-          self.box_loss(box_outputs[level], box_targets_l, num_positives_sum))
-    # Sums per level losses to total loss.
-    return tf.add_n(box_losses)
-
-  def box_loss(self, box_outputs, box_targets, num_positives):
-    """Computes RetinaNet box regression loss."""
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P3-P7 pyramid is about [0.1, 0.1, 0.2, 0.2].
-    normalizer = num_positives * 4.0
-    mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
-    box_targets = tf.expand_dims(box_targets, axis=-1)
-    box_outputs = tf.expand_dims(box_outputs, axis=-1)
-    box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
-    box_loss /= normalizer
-    return box_loss
-
-
-class ShapemaskMseLoss(object):
-  """ShapeMask mask Mean Squared Error loss function wrapper."""
-
-  def __call__(self, probs, labels, valid_mask):
-    """Compute instance segmentation loss.
-
-    Args:
-      probs: A Tensor of shape [batch_size * num_points, height, width,
-        num_classes]. The logits are not necessarily between 0 and 1.
-      labels: A float32/float16 Tensor of shape [batch_size, num_instances,
-          mask_size, mask_size], where mask_size =
-          mask_crop_size * gt_upsample_scale for fine mask, or mask_crop_size
-          for coarse masks and shape priors.
-      valid_mask: a binary mask indicating valid training masks.
-
-    Returns:
-      loss: an float tensor representing total mask classification loss.
-    """
-    with tf.name_scope('shapemask_prior_loss'):
-      batch_size, num_instances = valid_mask.get_shape().as_list()[:2]
-      diff = (tf.cast(labels, dtype=tf.float32) -
-              tf.cast(probs, dtype=tf.float32))
-      diff *= tf.cast(
-          tf.reshape(valid_mask, [batch_size, num_instances, 1, 1]),
-          tf.float32)
-      # Adding 0.001 in the denominator to avoid division by zero.
-      loss = tf.nn.l2_loss(diff) / (tf.reduce_sum(labels) + 0.001)
-    return loss
-
-
-class ShapemaskLoss(object):
-  """ShapeMask mask loss function wrapper."""
-
-  def __init__(self):
-    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
-        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
-
-  def __call__(self, logits, labels, valid_mask):
-    """ShapeMask mask cross entropy loss function wrapper.
-
-    Args:
-      logits: A Tensor of shape [batch_size * num_instances, height, width,
-        num_classes]. The logits are not necessarily between 0 and 1.
-      labels: A float16/float32 Tensor of shape [batch_size, num_instances,
-        mask_size, mask_size], where mask_size =
-        mask_crop_size * gt_upsample_scale for fine mask, or mask_crop_size
-        for coarse masks and shape priors.
-      valid_mask: a binary mask of shape [batch_size, num_instances]
-        indicating valid training masks.
-    Returns:
-      loss: an float tensor representing total mask classification loss.
-    """
-    with tf.name_scope('shapemask_loss'):
-      batch_size, num_instances = valid_mask.get_shape().as_list()[:2]
-      labels = tf.cast(labels, tf.float32)
-      logits = tf.cast(logits, tf.float32)
-      loss = self._binary_crossentropy(labels, logits)
-      loss *= tf.cast(tf.reshape(
-          valid_mask, [batch_size, num_instances, 1, 1]), loss.dtype)
-      # Adding 0.001 in the denominator to avoid division by zero.
-      loss = tf.reduce_sum(loss) / (tf.reduce_sum(labels) + 0.001)
-    return loss
--- a/official/vision/detection/modeling/maskrcnn_model.py
+++ b/official/vision/detection/modeling/maskrcnn_model.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Model definition for the Mask R-CNN Model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.dataloader import anchor
-from official.vision.detection.dataloader import mode_keys
-from official.vision.detection.evaluation import factory as eval_factory
-from official.vision.detection.modeling import base_model
-from official.vision.detection.modeling import losses
-from official.vision.detection.modeling.architecture import factory
-from official.vision.detection.ops import postprocess_ops
-from official.vision.detection.ops import roi_ops
-from official.vision.detection.ops import spatial_transform_ops
-from official.vision.detection.ops import target_ops
-from official.vision.detection.utils import box_utils
-
-
-class MaskrcnnModel(base_model.Model):
-  """Mask R-CNN model function."""
-
-  def __init__(self, params):
-    """Creates the sub-networks, samplers, losses and detection generator.
-
-    Args:
-      params: the model config. This constructor reads its `architecture`,
-        `roi_proposal`, `roi_sampling`, `mask_sampling`, `rpn_score_loss`,
-        `rpn_box_loss`, `frcnn_box_loss`, `postprocess` and `train` entries.
-    """
-    super(MaskrcnnModel, self).__init__(params)
-
-    # Kept for eval_metrics(), anchor generation in build_outputs() and input
-    # layer construction in build_model().
-    self._params = params
-    # Built lazily and cached by build_model().
-    self._keras_model = None
-
-    # Whether the mask head and the mask loss are part of the model.
-    self._include_mask = params.architecture.include_mask
-
-    # Architecture generators.
-    self._backbone_fn = factory.backbone_generator(params)
-    self._fpn_fn = factory.multilevel_features_generator(params)
-    self._rpn_head_fn = factory.rpn_head_generator(params)
-    self._generate_rois_fn = roi_ops.ROIGenerator(params.roi_proposal)
-    self._sample_rois_fn = target_ops.ROISampler(params.roi_sampling)
-    self._sample_masks_fn = target_ops.MaskSampler(
-        params.architecture.mask_target_size,
-        params.mask_sampling.num_mask_samples_per_image)
-
-    self._frcnn_head_fn = factory.fast_rcnn_head_generator(params)
-    # The mask head only exists when masks are enabled.
-    if self._include_mask:
-      self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params)
-
-    # Loss functions.
-    self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
-    self._rpn_box_loss_fn = losses.RpnBoxLoss(params.rpn_box_loss)
-    self._frcnn_class_loss_fn = losses.FastrcnnClassLoss()
-    self._frcnn_box_loss_fn = losses.FastrcnnBoxLoss(params.frcnn_box_loss)
-    if self._include_mask:
-      self._mask_loss_fn = losses.MaskrcnnLoss()
-
-    # Turns box/class head outputs into final detections in build_outputs().
-    self._generate_detections_fn = postprocess_ops.GenericDetectionGenerator(
-        params.postprocess)
-
-    # Transposed inputs are rejected up front.
-    # NOTE(review): this check is an assert, so it is skipped under `python -O`.
-    self._transpose_input = params.train.transpose_input
-    assert not self._transpose_input, 'Transpose input is not supportted.'
-
-  def build_outputs(self, inputs, mode):
-    """Builds the Mask R-CNN output tensors for the given inputs.
-
-    Args:
-      inputs: a dict with 'image' and 'image_info'; in training mode also
-        'gt_boxes' and 'gt_classes', plus 'gt_masks' when masks are enabled.
-      mode: the run mode; training behavior is used when it equals
-        mode_keys.TRAIN.
-
-    Returns:
-      A dict of outputs. It always holds 'rpn_score_outputs',
-      'rpn_box_outputs', 'class_outputs', 'box_outputs', 'num_detections',
-      'detection_boxes', 'detection_classes' and 'detection_scores'. In
-      training it also holds 'class_targets' and 'box_targets', and with masks
-      enabled 'mask_targets', 'sampled_class_targets' and 'mask_outputs'.
-      Outside training with masks enabled it holds 'detection_masks'.
-    """
-    is_training = mode == mode_keys.TRAIN
-    model_outputs = {}
-
-    image = inputs['image']
-    # The static height/width are needed below to generate the anchors.
-    _, image_height, image_width, _ = image.get_shape().as_list()
-    backbone_features = self._backbone_fn(image, is_training)
-    fpn_features = self._fpn_fn(backbone_features, is_training)
-
-    rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
-        fpn_features, is_training)
-    # Exported head outputs are cast to float32.
-    model_outputs.update({
-        'rpn_score_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  rpn_score_outputs),
-        'rpn_box_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  rpn_box_outputs),
-    })
-    input_anchor = anchor.Anchor(self._params.architecture.min_level,
-                                 self._params.architecture.max_level,
-                                 self._params.anchor.num_scales,
-                                 self._params.anchor.aspect_ratios,
-                                 self._params.anchor.anchor_size,
-                                 (image_height, image_width))
-    # NOTE(review): row 1 of image_info is presumably the image size used to
-    # clip proposals - confirm against the dataloader.
-    rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
-                                         input_anchor.multilevel_boxes,
-                                         inputs['image_info'][:, 1, :],
-                                         is_training)
-    if is_training:
-      # Proposals are treated as constants by the second stage.
-      rpn_rois = tf.stop_gradient(rpn_rois)
-
-      # Sample proposals.
-      rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
-          self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
-                               inputs['gt_classes']))
-
-      # Create bounding box training targets.
-      box_targets = box_utils.encode_boxes(
-          matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
-      # If the target is background, the box target is set to all 0s.
-      box_targets = tf.where(
-          tf.tile(
-              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
-              [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
-      model_outputs.update({
-          'class_targets': matched_gt_classes,
-          'box_targets': box_targets,
-      })
-
-    # Box head features are cropped from the FPN at 7x7 per RoI.
-    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
-        fpn_features, rpn_rois, output_size=7)
-
-    class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)
-
-    model_outputs.update({
-        'class_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  class_outputs),
-        'box_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  box_outputs),
-    })
-
-    # Add this output to train to make the checkpoint loadable in predict mode.
-    # If we skip it in train mode, the heads will be out-of-order and checkpoint
-    # loading will fail.
-    boxes, scores, classes, valid_detections = self._generate_detections_fn(
-        box_outputs, class_outputs, rpn_rois, inputs['image_info'][:, 1:2, :])
-    model_outputs.update({
-        'num_detections': valid_detections,
-        'detection_boxes': boxes,
-        'detection_classes': classes,
-        'detection_scores': scores,
-    })
-
-    # Box-only models stop here.
-    if not self._include_mask:
-      return model_outputs
-
-    if is_training:
-      # In training the mask head runs on sampled RoIs and their matched
-      # groundtruth classes; `classes` is rebound to those sampled classes.
-      rpn_rois, classes, mask_targets = self._sample_masks_fn(
-          rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
-          inputs['gt_masks'])
-      mask_targets = tf.stop_gradient(mask_targets)
-
-      classes = tf.cast(classes, dtype=tf.int32)
-
-      model_outputs.update({
-          'mask_targets': mask_targets,
-          'sampled_class_targets': classes,
-      })
-    else:
-      # Outside training the mask head runs on the final detected boxes and
-      # their predicted classes.
-      rpn_rois = boxes
-      classes = tf.cast(classes, dtype=tf.int32)
-
-    # Mask head features are cropped from the FPN at 14x14 per RoI.
-    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
-        fpn_features, rpn_rois, output_size=14)
-
-    mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)
-
-    if is_training:
-      # Raw mask outputs are exported for the mask loss.
-      model_outputs.update({
-          'mask_outputs':
-              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                    mask_outputs),
-      })
-    else:
-      # A sigmoid is applied to the mask outputs for prediction.
-      model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})
-
-    return model_outputs
-
-  def build_loss_fn(self):
-    if self._keras_model is None:
-      raise ValueError('build_loss_fn() must be called after build_model().')
-
-    filter_fn = self.make_filter_trainable_variables_fn()
-    trainable_variables = filter_fn(self._keras_model.trainable_variables)
-
-    def _total_loss_fn(labels, outputs):
-      rpn_score_loss = self._rpn_score_loss_fn(outputs['rpn_score_outputs'],
-                                               labels['rpn_score_targets'])
-      rpn_box_loss = self._rpn_box_loss_fn(outputs['rpn_box_outputs'],
-                                           labels['rpn_box_targets'])
-
-      frcnn_class_loss = self._frcnn_class_loss_fn(outputs['class_outputs'],
-                                                   outputs['class_targets'])
-      frcnn_box_loss = self._frcnn_box_loss_fn(outputs['box_outputs'],
-                                               outputs['class_targets'],
-                                               outputs['box_targets'])
-
-      if self._include_mask:
-        mask_loss = self._mask_loss_fn(outputs['mask_outputs'],
-                                       outputs['mask_targets'],
-                                       outputs['sampled_class_targets'])
-      else:
-        mask_loss = 0.0
-
-      model_loss = (
-          rpn_score_loss + rpn_box_loss + frcnn_class_loss + frcnn_box_loss +
-          mask_loss)
-
-      l2_regularization_loss = self.weight_decay_loss(trainable_variables)
-      total_loss = model_loss + l2_regularization_loss
-      return {
-          'total_loss': total_loss,
-          'loss': total_loss,
-          'fast_rcnn_class_loss': frcnn_class_loss,
-          'fast_rcnn_box_loss': frcnn_box_loss,
-          'mask_loss': mask_loss,
-          'model_loss': model_loss,
-          'l2_regularization_loss': l2_regularization_loss,
-          'rpn_score_loss': rpn_score_loss,
-          'rpn_box_loss': rpn_box_loss,
-      }
-
-    return _total_loss_fn
-
-  def build_input_layers(self, params, mode):
-    is_training = mode == mode_keys.TRAIN
-    input_shape = (
-        params.maskrcnn_parser.output_size +
-        [params.maskrcnn_parser.num_channels])
-    if is_training:
-      batch_size = params.train.batch_size
-      input_layer = {
-          'image':
-              tf.keras.layers.Input(
-                  shape=input_shape,
-                  batch_size=batch_size,
-                  name='image',
-                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-          'image_info':
-              tf.keras.layers.Input(
-                  shape=[4, 2],
-                  batch_size=batch_size,
-                  name='image_info',
-              ),
-          'gt_boxes':
-              tf.keras.layers.Input(
-                  shape=[params.maskrcnn_parser.max_num_instances, 4],
-                  batch_size=batch_size,
-                  name='gt_boxes'),
-          'gt_classes':
-              tf.keras.layers.Input(
-                  shape=[params.maskrcnn_parser.max_num_instances],
-                  batch_size=batch_size,
-                  name='gt_classes',
-                  dtype=tf.int64),
-      }
-      if self._include_mask:
-        input_layer['gt_masks'] = tf.keras.layers.Input(
-            shape=[
-                params.maskrcnn_parser.max_num_instances,
-                params.maskrcnn_parser.mask_crop_size,
-                params.maskrcnn_parser.mask_crop_size
-            ],
-            batch_size=batch_size,
-            name='gt_masks')
-    else:
-      batch_size = params.eval.batch_size
-      input_layer = {
-          'image':
-              tf.keras.layers.Input(
-                  shape=input_shape,
-                  batch_size=batch_size,
-                  name='image',
-                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-          'image_info':
-              tf.keras.layers.Input(
-                  shape=[4, 2],
-                  batch_size=batch_size,
-                  name='image_info',
-              ),
-      }
-    return input_layer
-
-  def build_model(self, params, mode):
-    if self._keras_model is None:
-      input_layers = self.build_input_layers(self._params, mode)
-      outputs = self.model_outputs(input_layers, mode)
-
-      model = tf.keras.models.Model(
-          inputs=input_layers, outputs=outputs, name='maskrcnn')
-      assert model is not None, 'Fail to build tf.keras.Model.'
-      model.optimizer = self.build_optimizer()
-      self._keras_model = model
-
-    return self._keras_model
-
-  def post_processing(self, labels, outputs):
-    required_output_fields = ['class_outputs', 'box_outputs']
-    for field in required_output_fields:
-      if field not in outputs:
-        raise ValueError('"%s" is missing in outputs, requried %s found %s' %
-                         (field, required_output_fields, outputs.keys()))
-    predictions = {
-        'image_info': labels['image_info'],
-        'num_detections': outputs['num_detections'],
-        'detection_boxes': outputs['detection_boxes'],
-        'detection_classes': outputs['detection_classes'],
-        'detection_scores': outputs['detection_scores'],
-    }
-    if self._include_mask:
-      predictions.update({
-          'detection_masks': outputs['detection_masks'],
-      })
-
-    if 'groundtruths' in labels:
-      predictions['source_id'] = labels['groundtruths']['source_id']
-      predictions['gt_source_id'] = labels['groundtruths']['source_id']
-      predictions['gt_height'] = labels['groundtruths']['height']
-      predictions['gt_width'] = labels['groundtruths']['width']
-      predictions['gt_image_info'] = labels['image_info']
-      predictions['gt_num_detections'] = (
-          labels['groundtruths']['num_detections'])
-      predictions['gt_boxes'] = labels['groundtruths']['boxes']
-      predictions['gt_classes'] = labels['groundtruths']['classes']
-      predictions['gt_areas'] = labels['groundtruths']['areas']
-      predictions['gt_is_crowds'] = labels['groundtruths']['is_crowds']
-    return labels, predictions
-
-  def eval_metrics(self):
-    return eval_factory.evaluator_generator(self._params.eval)
--- a/official/vision/detection/modeling/olnmask_model.py
+++ b/official/vision/detection/modeling/olnmask_model.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Model definition for the Object Localization Network (OLN) Model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.dataloader import anchor
-from official.vision.detection.dataloader import mode_keys
-from official.vision.detection.modeling import losses
-from official.vision.detection.modeling.architecture import factory
-from official.vision.detection.modeling.maskrcnn_model import MaskrcnnModel
-from official.vision.detection.ops import postprocess_ops
-from official.vision.detection.ops import roi_ops
-from official.vision.detection.ops import spatial_transform_ops
-from official.vision.detection.ops import target_ops
-from official.vision.detection.utils import box_utils
-
-
-class OlnMaskModel(MaskrcnnModel):
-  """OLN-Mask model function."""
-
-  def __init__(self, params):
-    """Creates the OLN sub-networks, samplers, losses and detection generator.
-
-    Args:
-      params: the model config. On top of what the parent constructor reads,
-        this reads `rpn_head.has_centerness`, `frcnn_head.has_scoring`,
-        `mrcnn_head.has_scoring`, `frcnn_box_score_loss` and the
-        `architecture.include_*` switches.
-    """
-    # NOTE(review): the parent constructor already builds a backbone, FPN,
-    # heads and losses; most of them are created again below.
-    super(OlnMaskModel, self).__init__(params)
-
-    self._params = params
-
-    # Different heads and layers.
-    self._include_rpn_class = params.architecture.include_rpn_class
-    self._include_mask = params.architecture.include_mask
-    self._include_frcnn_class = params.architecture.include_frcnn_class
-    self._include_frcnn_box = params.architecture.include_frcnn_box
-    self._include_centerness = params.rpn_head.has_centerness
-    # Box scoring needs both the scoring option and the box head.
-    self._include_box_score = (params.frcnn_head.has_scoring and
-                               params.architecture.include_frcnn_box)
-    # Mask scoring needs both the scoring option and the mask head.
-    self._include_mask_score = (params.mrcnn_head.has_scoring and
-                                params.architecture.include_mask)
-
-    # Architecture generators.
-    self._backbone_fn = factory.backbone_generator(params)
-    self._fpn_fn = factory.multilevel_features_generator(params)
-    # NOTE(review): this assignment is overwritten by the if/else right below,
-    # so it looks redundant - confirm before removing.
-    self._rpn_head_fn = factory.rpn_head_generator(params)
-    if self._include_centerness:
-      self._rpn_head_fn = factory.oln_rpn_head_generator(params)
-    else:
-      self._rpn_head_fn = factory.rpn_head_generator(params)
-    self._generate_rois_fn = roi_ops.OlnROIGenerator(params.roi_proposal)
-    self._sample_rois_fn = target_ops.ROIScoreSampler(params.roi_sampling)
-    self._sample_masks_fn = target_ops.MaskSampler(
-        params.architecture.mask_target_size,
-        params.mask_sampling.num_mask_samples_per_image)
-
-    # The box head variant depends on whether box scoring is enabled.
-    if self._include_box_score:
-      self._frcnn_head_fn = factory.oln_box_score_head_generator(params)
-    else:
-      self._frcnn_head_fn = factory.fast_rcnn_head_generator(params)
-
-    # The mask head variant depends on whether mask scoring is enabled.
-    if self._include_mask:
-      if self._include_mask_score:
-        self._mrcnn_head_fn = factory.oln_mask_score_head_generator(params)
-      else:
-        self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params)
-
-    # Loss functions.
-    self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
-    self._rpn_box_loss_fn = losses.RpnBoxLoss(params.rpn_box_loss)
-    # With centerness, build_loss_fn() uses the IoU and centerness losses for
-    # the RPN.
-    if self._include_centerness:
-      self._rpn_iou_loss_fn = losses.OlnRpnIoULoss()
-      self._rpn_center_loss_fn = losses.OlnRpnCenterLoss()
-    self._frcnn_class_loss_fn = losses.FastrcnnClassLoss()
-    self._frcnn_box_loss_fn = losses.FastrcnnBoxLoss(params.frcnn_box_loss)
-    if self._include_box_score:
-      self._frcnn_box_score_loss_fn = losses.OlnBoxScoreLoss(
-          params.frcnn_box_score_loss)
-    if self._include_mask:
-      self._mask_loss_fn = losses.MaskrcnnLoss()
-
-    # Turns head outputs into final detections in build_outputs().
-    self._generate_detections_fn = postprocess_ops.OlnDetectionGenerator(
-        params.postprocess)
-
-    # Transposed inputs are rejected up front.
-    # NOTE(review): this check is an assert, so it is skipped under `python -O`.
-    self._transpose_input = params.train.transpose_input
-    assert not self._transpose_input, 'Transpose input is not supportted.'
-
-  def build_outputs(self, inputs, mode):
-    is_training = mode == mode_keys.TRAIN
-    model_outputs = {}
-
-    image = inputs['image']
-    _, image_height, image_width, _ = image.get_shape().as_list()
-    backbone_features = self._backbone_fn(image, is_training)
-    fpn_features = self._fpn_fn(backbone_features, is_training)
-
-    # rpn_centerness.
-    if self._include_centerness:
-      rpn_score_outputs, rpn_box_outputs, rpn_center_outputs = (
-          self._rpn_head_fn(fpn_features, is_training))
-      model_outputs.update({
-          'rpn_center_outputs':
-              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                    rpn_center_outputs),
-      })
-      object_scores = rpn_center_outputs
-    else:
-      rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
-          fpn_features, is_training)
-      object_scores = None
-    model_outputs.update({
-        'rpn_score_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  rpn_score_outputs),
-        'rpn_box_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  rpn_box_outputs),
-    })
-    input_anchor = anchor.Anchor(self._params.architecture.min_level,
-                                 self._params.architecture.max_level,
-                                 self._params.anchor.num_scales,
-                                 self._params.anchor.aspect_ratios,
-                                 self._params.anchor.anchor_size,
-                                 (image_height, image_width))
-    rpn_rois, rpn_roi_scores = self._generate_rois_fn(
-        rpn_box_outputs,
-        rpn_score_outputs,
-        input_anchor.multilevel_boxes,
-        inputs['image_info'][:, 1, :],
-        is_training,
-        is_box_lrtb=self._include_centerness,
-        object_scores=object_scores,
-        )
-    if (not self._include_frcnn_class and
-        not self._include_frcnn_box and
-        not self._include_mask):
-      # if not is_training:
-      # For direct RPN detection,
-      # use dummy box_outputs = (dy,dx,dh,dw = 0,0,0,0)
-      box_outputs = tf.zeros_like(rpn_rois)
-      box_outputs = tf.concat([box_outputs, box_outputs], -1)
-      boxes, scores, classes, valid_detections = self._generate_detections_fn(
-          box_outputs, rpn_roi_scores, rpn_rois,
-          inputs['image_info'][:, 1:2, :],
-          is_single_fg_score=True,  # if no_background, no softmax is applied.
-          keep_nms=True)
-      model_outputs.update({
-          'num_detections': valid_detections,
-          'detection_boxes': boxes,
-          'detection_classes': classes,
-          'detection_scores': scores,
-      })
-      return model_outputs
-
-    # ---- OLN-Proposal finishes here. ----
-
-    if is_training:
-      rpn_rois = tf.stop_gradient(rpn_rois)
-      rpn_roi_scores = tf.stop_gradient(rpn_roi_scores)
-
-      # Sample proposals.
-      (rpn_rois, rpn_roi_scores, matched_gt_boxes, matched_gt_classes,
-       matched_gt_indices) = (
-           self._sample_rois_fn(rpn_rois, rpn_roi_scores, inputs['gt_boxes'],
-                                inputs['gt_classes']))
-      # Create bounding box training targets.
-      box_targets = box_utils.encode_boxes(
-          matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
-      # If the target is background, the box target is set to all 0s.
-      box_targets = tf.where(
-          tf.tile(
-              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
-              [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
-      model_outputs.update({
-          'class_targets': matched_gt_classes,
-          'box_targets': box_targets,
-      })
-      # Create Box-IoU targets. {
-      box_ious = box_utils.bbox_overlap(
-          rpn_rois, inputs['gt_boxes'])
-      matched_box_ious = tf.reduce_max(box_ious, 2)
-      model_outputs.update({
-          'box_iou_targets': matched_box_ious,})  # }
-
-    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
-        fpn_features, rpn_rois, output_size=7)
-
-    if not self._include_box_score:
-      class_outputs, box_outputs = self._frcnn_head_fn(
-          roi_features, is_training)
-    else:
-      class_outputs, box_outputs, score_outputs = self._frcnn_head_fn(
-          roi_features, is_training)
-      model_outputs.update({
-          'box_score_outputs':
-              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                    score_outputs),})
-    model_outputs.update({
-        'class_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  class_outputs),
-        'box_outputs':
-            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                  box_outputs),
-    })
-
-    # Add this output to train to make the checkpoint loadable in predict mode.
-    # If we skip it in train mode, the heads will be out-of-order and checkpoint
-    # loading will fail.
-    if not self._include_frcnn_box:
-      box_outputs = tf.zeros_like(box_outputs)  # dummy zeros.
-
-    if self._include_box_score:
-      score_outputs = tf.cast(tf.squeeze(score_outputs, -1),
-                              rpn_roi_scores.dtype)
-
-      # box-score = (rpn-centerness * box-iou)^(1/2)
-      # TR: rpn_roi_scores: b,1000, score_outputs: b,512
-      # TS: rpn_roi_scores: b,1000, score_outputs: b,1000
-      box_scores = tf.pow(
-          rpn_roi_scores * tf.sigmoid(score_outputs), 1/2.)
-
-    if not self._include_frcnn_class:
-      boxes, scores, classes, valid_detections = self._generate_detections_fn(
-          box_outputs,
-          box_scores,
-          rpn_rois,
-          inputs['image_info'][:, 1:2, :],
-          is_single_fg_score=True,
-          keep_nms=True,)
-    else:
-      boxes, scores, classes, valid_detections = self._generate_detections_fn(
-          box_outputs, class_outputs, rpn_rois,
-          inputs['image_info'][:, 1:2, :],
-          keep_nms=True,)
-    model_outputs.update({
-        'num_detections': valid_detections,
-        'detection_boxes': boxes,
-        'detection_classes': classes,
-        'detection_scores': scores,
-    })
-
-    # ---- OLN-Box finishes here. ----
-
-    if not self._include_mask:
-      return model_outputs
-
-    if is_training:
-      rpn_rois, classes, mask_targets = self._sample_masks_fn(
-          rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
-          inputs['gt_masks'])
-      mask_targets = tf.stop_gradient(mask_targets)
-
-      classes = tf.cast(classes, dtype=tf.int32)
-
-      model_outputs.update({
-          'mask_targets': mask_targets,
-          'sampled_class_targets': classes,
-      })
-    else:
-      rpn_rois = boxes
-      classes = tf.cast(classes, dtype=tf.int32)
-
-    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
-        fpn_features, rpn_rois, output_size=14)
-
-    mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)
-
-    if is_training:
-      model_outputs.update({
-          'mask_outputs':
-              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
-                                    mask_outputs),
-      })
-    else:
-      model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})
-
-    return model_outputs
-
-  def build_loss_fn(self):
-    if self._keras_model is None:
-      raise ValueError('build_loss_fn() must be called after build_model().')
-
-    filter_fn = self.make_filter_trainable_variables_fn()
-    trainable_variables = filter_fn(self._keras_model.trainable_variables)
-
-    def _total_loss_fn(labels, outputs):
-      if self._include_rpn_class:
-        rpn_score_loss = self._rpn_score_loss_fn(outputs['rpn_score_outputs'],
-                                                 labels['rpn_score_targets'])
-      else:
-        rpn_score_loss = 0.0
-      if self._include_centerness:
-        rpn_center_loss = self._rpn_center_loss_fn(
-            outputs['rpn_center_outputs'], labels['rpn_center_targets'])
-        rpn_box_loss = self._rpn_iou_loss_fn(
-            outputs['rpn_box_outputs'], labels['rpn_box_targets'],
-            labels['rpn_center_targets'])
-      else:
-        rpn_center_loss = 0.0
-        rpn_box_loss = self._rpn_box_loss_fn(
-            outputs['rpn_box_outputs'], labels['rpn_box_targets'])
-
-      if self._include_frcnn_class:
-        frcnn_class_loss = self._frcnn_class_loss_fn(
-            outputs['class_outputs'], outputs['class_targets'])
-      else:
-        frcnn_class_loss = 0.0
-      if self._include_frcnn_box:
-        frcnn_box_loss = self._frcnn_box_loss_fn(
-            outputs['box_outputs'], outputs['class_targets'],
-            outputs['box_targets'])
-      else:
-        frcnn_box_loss = 0.0
-      if self._include_box_score:
-        box_score_loss = self._frcnn_box_score_loss_fn(
-            outputs['box_score_outputs'], outputs['box_iou_targets'])
-      else:
-        box_score_loss = 0.0
-
-      if self._include_mask:
-        mask_loss = self._mask_loss_fn(outputs['mask_outputs'],
-                                       outputs['mask_targets'],
-                                       outputs['sampled_class_targets'])
-      else:
-        mask_loss = 0.0
-
-      model_loss = (
-          rpn_score_loss + rpn_box_loss + rpn_center_loss +
-          frcnn_class_loss + frcnn_box_loss + box_score_loss +
-          mask_loss)
-
-      l2_regularization_loss = self.weight_decay_loss(trainable_variables)
-      total_loss = model_loss + l2_regularization_loss
-      return {
-          'total_loss': total_loss,
-          'loss': total_loss,
-          'fast_rcnn_class_loss': frcnn_class_loss,
-          'fast_rcnn_box_loss': frcnn_box_loss,
-          'fast_rcnn_box_score_loss': box_score_loss,
-          'mask_loss': mask_loss,
-          'model_loss': model_loss,
-          'l2_regularization_loss': l2_regularization_loss,
-          'rpn_score_loss': rpn_score_loss,
-          'rpn_box_loss': rpn_box_loss,
-          'rpn_center_loss': rpn_center_loss,
-      }
-
-    return _total_loss_fn
-
-  def build_input_layers(self, params, mode):
-    """Creates the Keras `Input` placeholders used to build the model.
-
-    Args:
-      params: config object. Reads `olnmask_parser`, plus `train.batch_size`
-        in training mode or `eval.batch_size` otherwise.
-      mode: compared against `mode_keys.TRAIN` to select the training inputs.
-
-    Returns:
-      A dict of `tf.keras.layers.Input` tensors keyed by input name. It always
-      holds 'image' and 'image_info'; in training mode it also holds
-      'gt_boxes' and 'gt_classes', and 'gt_masks' when masks are included.
-    """
-    parser = params.olnmask_parser
-    training = mode == mode_keys.TRAIN
-    image_shape = parser.output_size + [parser.num_channels]
-    batch = params.train.batch_size if training else params.eval.batch_size
-
-    # Both modes share the image and image_info placeholders.
-    inputs = {
-        'image':
-            tf.keras.layers.Input(
-                shape=image_shape,
-                batch_size=batch,
-                name='image',
-                dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-        'image_info':
-            tf.keras.layers.Input(
-                shape=[4, 2],
-                batch_size=batch,
-                name='image_info',
-            ),
-    }
-    if not training:
-      return inputs
-
-    # Ground-truth placeholders are only needed for training.
-    inputs['gt_boxes'] = tf.keras.layers.Input(
-        shape=[parser.max_num_instances, 4],
-        batch_size=batch,
-        name='gt_boxes')
-    inputs['gt_classes'] = tf.keras.layers.Input(
-        shape=[parser.max_num_instances],
-        batch_size=batch,
-        name='gt_classes',
-        dtype=tf.int64)
-    if self._include_mask:
-      mask_shape = [
-          parser.max_num_instances,
-          parser.mask_crop_size,
-          parser.mask_crop_size,
-      ]
-      inputs['gt_masks'] = tf.keras.layers.Input(
-          shape=mask_shape, batch_size=batch, name='gt_masks')
-    return inputs
-
-  def build_model(self, params, mode):
-    """Returns the Keras model, constructing and caching it on first call.
-
-    Args:
-      params: not read here; the inputs are built from `self._params`.
-      mode: forwarded to `build_input_layers` and `model_outputs`.
-
-    Returns:
-      The cached `tf.keras.models.Model` named 'olnmask'.
-    """
-    if self._keras_model is not None:
-      return self._keras_model
-
-    layer_inputs = self.build_input_layers(self._params, mode)
-    layer_outputs = self.model_outputs(layer_inputs, mode)
-    keras_model = tf.keras.models.Model(
-        inputs=layer_inputs, outputs=layer_outputs, name='olnmask')
-    assert keras_model is not None, 'Fail to build tf.keras.Model.'
-    # The optimizer is attached to the model so it travels with it.
-    keras_model.optimizer = self.build_optimizer()
-    self._keras_model = keras_model
-    return self._keras_model
--- a/official/vision/detection/modeling/optimizers.py
+++ b/official/vision/detection/modeling/optimizers.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Optimizers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-import numpy as np
-import tensorflow as tf
-
-
-class OptimizerFactory(object):
-  """Builds Keras optimizers of a configured type on demand."""
-
-  def __init__(self, params):
-    """Selects the optimizer constructor named by `params.type`.
-
-    Args:
-      params: config object whose `type` is one of 'momentum', 'adam',
-        'adadelta', 'adagrad' or 'rmsprop'. `momentum` is also read for
-        'momentum' and 'rmsprop', and `nesterov` for 'momentum'.
-
-    Raises:
-      ValueError: if `params.type` is not one of the names above.
-    """
-    optimizer_type = params.type
-    keras_optimizers = tf.keras.optimizers
-
-    # Optimizers with extra hyperparameters are pre-bound with
-    # functools.partial so that __call__ only has to supply the learning rate.
-    if optimizer_type == 'momentum':
-      builder = functools.partial(
-          keras_optimizers.SGD,
-          momentum=params.momentum,
-          nesterov=params.nesterov)
-    elif optimizer_type == 'adam':
-      builder = keras_optimizers.Adam
-    elif optimizer_type == 'adadelta':
-      builder = keras_optimizers.Adadelta
-    elif optimizer_type == 'adagrad':
-      builder = keras_optimizers.Adagrad
-    elif optimizer_type == 'rmsprop':
-      builder = functools.partial(
-          keras_optimizers.RMSprop, momentum=params.momentum)
-    else:
-      raise ValueError(
-          'Unsupported optimizer type `{}`.'.format(optimizer_type))
-    self._optimizer = builder
-
-  def __call__(self, learning_rate):
-    """Instantiates the selected optimizer with `learning_rate`."""
-    build = self._optimizer
-    return build(learning_rate=learning_rate)
--- a/official/vision/detection/modeling/retinanet_model.py
+++ b/official/vision/detection/modeling/retinanet_model.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Model definition for the RetinaNet Model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.dataloader import mode_keys
-from official.vision.detection.evaluation import factory as eval_factory
-from official.vision.detection.modeling import base_model
-from official.vision.detection.modeling import losses
-from official.vision.detection.modeling.architecture import factory
-from official.vision.detection.ops import postprocess_ops
-
-
-class RetinanetModel(base_model.Model):
-  """RetinaNet model function.
-
-  Chains the backbone, multilevel feature and RetinaNet head generators from
-  the architecture `factory` into a Keras model, and builds the matching loss
-  function, detection post-processing and evaluator.
-  """
-
-  def __init__(self, params):
-    """Creates the architecture generators, losses and detection generator.
-
-    Args:
-      params: config object. Reads `retinanet_loss`, `architecture`,
-        `postprocess`, `train.transpose_input` and
-        `retinanet_parser.num_channels`.
-    """
-    super(RetinanetModel, self).__init__(params)
-
-    # For eval metrics.
-    self._params = params
-
-    # Architecture generators.
-    self._backbone_fn = factory.backbone_generator(params)
-    self._fpn_fn = factory.multilevel_features_generator(params)
-    self._head_fn = factory.retinanet_head_generator(params)
-
-    # Loss function.
-    self._cls_loss_fn = losses.RetinanetClassLoss(
-        params.retinanet_loss, params.architecture.num_classes)
-    self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
-    self._box_loss_weight = params.retinanet_loss.box_loss_weight
-    self._keras_model = None
-
-    # Predict function.
-    self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
-        params.architecture.min_level, params.architecture.max_level,
-        params.postprocess)
-
-    self._transpose_input = params.train.transpose_input
-    assert not self._transpose_input, 'Transpose input is not supported.'
-    # Input layer. Height and width are left unspecified (None).
-    # `self._use_bfloat16` is not assigned in this class; presumably the base
-    # class sets it -- TODO confirm.
-    self._input_layer = tf.keras.layers.Input(
-        shape=(None, None, params.retinanet_parser.num_channels),
-        name='',
-        dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32)
-
-  def build_outputs(self, inputs, mode):
-    """Runs backbone, feature generator and head on `inputs`.
-
-    Args:
-      inputs: the image tensor fed to the backbone.
-      mode: compared against `mode_keys.TRAIN` to set `is_training`.
-
-    Returns:
-      A dict with 'cls_outputs' and 'box_outputs', each a per-level dict as
-      returned by the head. When bfloat16 is in use the per-level tensors are
-      cast to float32.
-    """
-    # If the input image is transposed (from NHWC to HWCN), we need to revert it
-    # back to the original shape before it's used in the computation.
-    if self._transpose_input:
-      inputs = tf.transpose(inputs, [3, 0, 1, 2])
-
-    backbone_features = self._backbone_fn(
-        inputs, is_training=(mode == mode_keys.TRAIN))
-    fpn_features = self._fpn_fn(
-        backbone_features, is_training=(mode == mode_keys.TRAIN))
-    cls_outputs, box_outputs = self._head_fn(
-        fpn_features, is_training=(mode == mode_keys.TRAIN))
-
-    if self._use_bfloat16:
-      levels = cls_outputs.keys()
-      for level in levels:
-        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
-        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
-
-    model_outputs = {
-        'cls_outputs': cls_outputs,
-        'box_outputs': box_outputs,
-    }
-    return model_outputs
-
-  def build_loss_fn(self):
-    """Returns a function computing the training losses.
-
-    Returns:
-      A callable `(labels, outputs) -> dict` with keys 'total_loss',
-      'cls_loss', 'box_loss', 'model_loss' and 'l2_regularization_loss'.
-
-    Raises:
-      ValueError: if called before `build_model()`.
-    """
-    if self._keras_model is None:
-      raise ValueError('build_loss_fn() must be called after build_model().')
-
-    # The variable list for weight decay is captured once, when the loss
-    # function is built.
-    filter_fn = self.make_filter_trainable_variables_fn()
-    trainable_variables = filter_fn(self._keras_model.trainable_variables)
-
-    def _total_loss_fn(labels, outputs):
-      cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
-                                   labels['cls_targets'],
-                                   labels['num_positives'])
-      box_loss = self._box_loss_fn(outputs['box_outputs'],
-                                   labels['box_targets'],
-                                   labels['num_positives'])
-      model_loss = cls_loss + self._box_loss_weight * box_loss
-      l2_regularization_loss = self.weight_decay_loss(trainable_variables)
-      total_loss = model_loss + l2_regularization_loss
-      return {
-          'total_loss': total_loss,
-          'cls_loss': cls_loss,
-          'box_loss': box_loss,
-          'model_loss': model_loss,
-          'l2_regularization_loss': l2_regularization_loss,
-      }
-
-    return _total_loss_fn
-
-  def build_model(self, params, mode=None):
-    """Returns the Keras model, constructing and caching it on first call.
-
-    Args:
-      params: not read here; the model is built from state set in `__init__`.
-      mode: forwarded to `model_outputs`.
-
-    Returns:
-      The cached `tf.keras.models.Model` named 'retinanet'.
-    """
-    if self._keras_model is None:
-      outputs = self.model_outputs(self._input_layer, mode)
-
-      model = tf.keras.models.Model(
-          inputs=self._input_layer, outputs=outputs, name='retinanet')
-      assert model is not None, 'Fail to build tf.keras.Model.'
-      model.optimizer = self.build_optimizer()
-      self._keras_model = model
-
-    return self._keras_model
-
-  def post_processing(self, labels, outputs):
-    """Converts raw head outputs into detections for evaluation.
-
-    Args:
-      labels: dict that must contain 'image_info' and 'groundtruths'. The
-        key 'anchor_boxes' is also read but is not validated here.
-      outputs: dict that must contain 'cls_outputs' and 'box_outputs'.
-
-    Returns:
-      A `(labels, outputs)` tuple. `outputs` is a new dict holding the
-      detections; `labels` is the input dict with the ground-truth fields
-      copied to the top level.
-
-    Raises:
-      ValueError: if a required field is missing from `outputs` or `labels`.
-    """
-    # TODO(yeqing): Moves the output related part into build_outputs.
-    required_output_fields = ['cls_outputs', 'box_outputs']
-    for field in required_output_fields:
-      if field not in outputs:
-        # Format the message explicitly: ValueError does not interpolate
-        # logging-style `%s` arguments.
-        raise ValueError(
-            '"{}" is missing in outputs, required {} found {}'.format(
-                field, required_output_fields, outputs.keys()))
-    required_label_fields = ['image_info', 'groundtruths']
-    for field in required_label_fields:
-      if field not in labels:
-        raise ValueError(
-            '"{}" is missing in labels, required {} found {}'.format(
-                field, required_label_fields, labels.keys()))
-    boxes, scores, classes, valid_detections = self._generate_detections_fn(
-        outputs['box_outputs'], outputs['cls_outputs'], labels['anchor_boxes'],
-        labels['image_info'][:, 1:2, :])
-    # Discards the old output tensors to save memory. The `cls_outputs` and
-    # `box_outputs` are pretty big and could potentially lead to memory issue.
-    outputs = {
-        'source_id': labels['groundtruths']['source_id'],
-        'image_info': labels['image_info'],
-        'num_detections': valid_detections,
-        'detection_boxes': boxes,
-        'detection_classes': classes,
-        'detection_scores': scores,
-    }
-
-    if 'groundtruths' in labels:
-      labels['source_id'] = labels['groundtruths']['source_id']
-      labels['boxes'] = labels['groundtruths']['boxes']
-      labels['classes'] = labels['groundtruths']['classes']
-      labels['areas'] = labels['groundtruths']['areas']
-      labels['is_crowds'] = labels['groundtruths']['is_crowds']
-
-    return labels, outputs
-
-  def eval_metrics(self):
-    """Returns the evaluator built from `params.eval`."""
-    return eval_factory.evaluator_generator(self._params.eval)
--- a/official/vision/detection/modeling/shapemask_model.py
+++ b/official/vision/detection/modeling/shapemask_model.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Model definition for the ShapeMask Model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.dataloader import anchor
-from official.vision.detection.dataloader import mode_keys
-from official.vision.detection.evaluation import factory as eval_factory
-from official.vision.detection.modeling import base_model
-from official.vision.detection.modeling import losses
-from official.vision.detection.modeling.architecture import factory
-from official.vision.detection.ops import postprocess_ops
-from official.vision.detection.utils import box_utils
-
-
-class ShapeMaskModel(base_model.Model):
-  """ShapeMask model function.
-
-  Combines a RetinaNet detection head with shape-prior, coarse-mask and
-  fine-mask heads, all obtained from the architecture `factory`.
-  """
-
-  def __init__(self, params):
-    """Creates the architecture generators, loss functions and detection op.
-
-    Args:
-      params: config object. This constructor reads `retinanet_loss`,
-        `architecture`, `shapemask_loss` and `postprocess`, and keeps a
-        reference to `params` for the other methods.
-    """
-    super(ShapeMaskModel, self).__init__(params)
-
-    self._params = params
-    self._keras_model = None
-
-    # Architecture generators.
-    self._backbone_fn = factory.backbone_generator(params)
-    self._fpn_fn = factory.multilevel_features_generator(params)
-    self._retinanet_head_fn = factory.retinanet_head_generator(params)
-    self._shape_prior_head_fn = factory.shapeprior_head_generator(params)
-    self._coarse_mask_fn = factory.coarsemask_head_generator(params)
-    self._fine_mask_fn = factory.finemask_head_generator(params)
-
-    # Loss functions.
-    self._cls_loss_fn = losses.RetinanetClassLoss(
-        params.retinanet_loss, params.architecture.num_classes)
-    self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
-    self._box_loss_weight = params.retinanet_loss.box_loss_weight
-
-    # Mask loss function.
-    self._shapemask_prior_loss_fn = losses.ShapemaskMseLoss()
-    self._shapemask_loss_fn = losses.ShapemaskLoss()
-    self._shape_prior_loss_weight = (
-        params.shapemask_loss.shape_prior_loss_weight)
-    self._coarse_mask_loss_weight = (
-        params.shapemask_loss.coarse_mask_loss_weight)
-    self._fine_mask_loss_weight = (params.shapemask_loss.fine_mask_loss_weight)
-
-    # Predict function.
-    self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
-        params.architecture.min_level, params.architecture.max_level,
-        params.postprocess)
-
-  def build_outputs(self, inputs, mode):
-    """Builds detection and mask outputs from the input dict.
-
-    Args:
-      inputs: dict with 'image' and 'image_info'. 'anchor_boxes' is used when
-        present, otherwise anchors are generated here. In training mode
-        'mask_boxes', 'mask_outer_boxes' and 'mask_classes' are also read.
-      mode: compared against `mode_keys.TRAIN` to set `is_training`.
-
-    Returns:
-      A dict with 'cls_outputs', 'box_outputs', 'fine_mask_logits',
-      'coarse_mask_logits' and 'prior_masks'. Outside training it also holds
-      'num_detections', 'detection_boxes', 'detection_outer_boxes',
-      'detection_masks', 'detection_classes' and 'detection_scores'.
-    """
-    is_training = mode == mode_keys.TRAIN
-    images = inputs['image']
-
-    if 'anchor_boxes' in inputs:
-      anchor_boxes = inputs['anchor_boxes']
-    else:
-      # Anchors are generated from dims 1:3 of the static image shape
-      # (presumably height and width -- TODO confirm the layout).
-      anchor_boxes = anchor.Anchor(
-          self._params.architecture.min_level,
-          self._params.architecture.max_level, self._params.anchor.num_scales,
-          self._params.anchor.aspect_ratios, self._params.anchor.anchor_size,
-          images.get_shape().as_list()[1:3]).multilevel_boxes
-
-      # Add a leading batch dimension to each level and repeat it per image.
-      batch_size = tf.shape(images)[0]
-      for level in anchor_boxes:
-        anchor_boxes[level] = tf.tile(
-            tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1, 1])
-
-    backbone_features = self._backbone_fn(images, is_training=is_training)
-    fpn_features = self._fpn_fn(backbone_features, is_training=is_training)
-    cls_outputs, box_outputs = self._retinanet_head_fn(
-        fpn_features, is_training=is_training)
-
-    # Detections are generated in every mode, including training.
-    valid_boxes, valid_scores, valid_classes, valid_detections = (
-        self._generate_detections_fn(box_outputs, cls_outputs, anchor_boxes,
-                                     inputs['image_info'][:, 1:2, :]))
-
-    # Outer boxes are computed on the flattened boxes, then restored to the
-    # shape of `valid_boxes`.
-    image_size = images.get_shape().as_list()[1:3]
-    valid_outer_boxes = box_utils.compute_outer_boxes(
-        tf.reshape(valid_boxes, [-1, 4]),
-        image_size,
-        scale=self._params.shapemask_parser.outer_box_scale)
-    valid_outer_boxes = tf.reshape(valid_outer_boxes, tf.shape(valid_boxes))
-
-    # Wrapping if else code paths into a layer to make the checkpoint loadable
-    # in prediction mode.
-    class SampledBoxesLayer(tf.keras.layers.Layer):
-      """Selects the boxes, classes and outer boxes fed to the mask heads."""
-
-      def call(self, inputs, val_boxes, val_classes, val_outer_boxes, training):
-        # Training uses the sampled mask boxes from the inputs; otherwise the
-        # detections passed in by the caller are used.
-        if training:
-          boxes = inputs['mask_boxes']
-          outer_boxes = inputs['mask_outer_boxes']
-          classes = inputs['mask_classes']
-        else:
-          boxes = val_boxes
-          classes = val_classes
-          outer_boxes = val_outer_boxes
-        return boxes, classes, outer_boxes
-
-    boxes, classes, outer_boxes = SampledBoxesLayer()(
-        inputs,
-        valid_boxes,
-        valid_classes,
-        valid_outer_boxes,
-        training=is_training)
-
-    # Mask heads are chained: prior masks feed the coarse head, and the
-    # coarse logits feed the fine head.
-    instance_features, prior_masks = self._shape_prior_head_fn(
-        fpn_features, boxes, outer_boxes, classes, is_training)
-    coarse_mask_logits = self._coarse_mask_fn(instance_features, prior_masks,
-                                              classes, is_training)
-    fine_mask_logits = self._fine_mask_fn(instance_features, coarse_mask_logits,
-                                          classes, is_training)
-
-    model_outputs = {
-        'cls_outputs': cls_outputs,
-        'box_outputs': box_outputs,
-        'fine_mask_logits': fine_mask_logits,
-        'coarse_mask_logits': coarse_mask_logits,
-        'prior_masks': prior_masks,
-    }
-
-    if not is_training:
-      model_outputs.update({
-          'num_detections': valid_detections,
-          'detection_boxes': valid_boxes,
-          'detection_outer_boxes': valid_outer_boxes,
-          'detection_masks': fine_mask_logits,
-          'detection_classes': valid_classes,
-          'detection_scores': valid_scores,
-      })
-
-    return model_outputs
-
-  def build_loss_fn(self):
-    """Returns a function computing the training losses.
-
-    Returns:
-      A callable `(labels, outputs) -> dict` of named loss tensors. Both
-      'total_loss' and 'loss' map to the same total.
-
-    Raises:
-      ValueError: if called before `build_model()`.
-    """
-    if self._keras_model is None:
-      raise ValueError('build_loss_fn() must be called after build_model().')
-
-    # The variable list for weight decay is captured once, when the loss
-    # function is built.
-    filter_fn = self.make_filter_trainable_variables_fn()
-    trainable_variables = filter_fn(self._keras_model.trainable_variables)
-
-    def _total_loss_fn(labels, outputs):
-      cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
-                                   labels['cls_targets'],
-                                   labels['num_positives'])
-      box_loss = self._box_loss_fn(outputs['box_outputs'],
-                                   labels['box_targets'],
-                                   labels['num_positives'])
-
-      # Adds Shapemask model losses.
-      # The prior and coarse losses share 'mask_targets'; the fine loss uses
-      # 'fine_mask_targets'.
-      shape_prior_loss = self._shapemask_prior_loss_fn(outputs['prior_masks'],
-                                                       labels['mask_targets'],
-                                                       labels['mask_is_valid'])
-      coarse_mask_loss = self._shapemask_loss_fn(outputs['coarse_mask_logits'],
-                                                 labels['mask_targets'],
-                                                 labels['mask_is_valid'])
-      fine_mask_loss = self._shapemask_loss_fn(outputs['fine_mask_logits'],
-                                               labels['fine_mask_targets'],
-                                               labels['mask_is_valid'])
-
-      model_loss = (
-          cls_loss + self._box_loss_weight * box_loss +
-          shape_prior_loss * self._shape_prior_loss_weight +
-          coarse_mask_loss * self._coarse_mask_loss_weight +
-          fine_mask_loss * self._fine_mask_loss_weight)
-
-      l2_regularization_loss = self.weight_decay_loss(trainable_variables)
-      total_loss = model_loss + l2_regularization_loss
-
-      shapemask_losses = {
-          'total_loss': total_loss,
-          'loss': total_loss,
-          'retinanet_cls_loss': cls_loss,
-          'l2_regularization_loss': l2_regularization_loss,
-          'retinanet_box_loss': box_loss,
-          'shapemask_prior_loss': shape_prior_loss,
-          'shapemask_coarse_mask_loss': coarse_mask_loss,
-          'shapemask_fine_mask_loss': fine_mask_loss,
-          'model_loss': model_loss,
-      }
-      return shapemask_losses
-
-    return _total_loss_fn
-
-  def build_input_layers(self, params, mode):
-    """Creates the Keras `Input` placeholders used to build the model.
-
-    Args:
-      params: config object. Reads `shapemask_parser`, plus
-        `train.batch_size` in training mode or `eval.batch_size` otherwise.
-      mode: compared against `mode_keys.TRAIN` to select the training inputs.
-
-    Returns:
-      A dict of `tf.keras.layers.Input` tensors keyed by input name. It always
-      holds 'image' and 'image_info'; in training mode it also holds
-      'mask_classes', 'mask_outer_boxes' and 'mask_boxes'.
-    """
-    is_training = mode == mode_keys.TRAIN
-    input_shape = (
-        params.shapemask_parser.output_size +
-        [params.shapemask_parser.num_channels])
-    if is_training:
-      batch_size = params.train.batch_size
-      input_layer = {
-          'image':
-              tf.keras.layers.Input(
-                  shape=input_shape,
-                  batch_size=batch_size,
-                  name='image',
-                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-          'image_info':
-              tf.keras.layers.Input(
-                  shape=[4, 2], batch_size=batch_size, name='image_info'),
-          'mask_classes':
-              tf.keras.layers.Input(
-                  shape=[params.shapemask_parser.num_sampled_masks],
-                  batch_size=batch_size,
-                  name='mask_classes',
-                  dtype=tf.int64),
-          'mask_outer_boxes':
-              tf.keras.layers.Input(
-                  shape=[params.shapemask_parser.num_sampled_masks, 4],
-                  batch_size=batch_size,
-                  name='mask_outer_boxes',
-                  dtype=tf.float32),
-          'mask_boxes':
-              tf.keras.layers.Input(
-                  shape=[params.shapemask_parser.num_sampled_masks, 4],
-                  batch_size=batch_size,
-                  name='mask_boxes',
-                  dtype=tf.float32),
-      }
-    else:
-      batch_size = params.eval.batch_size
-      input_layer = {
-          'image':
-              tf.keras.layers.Input(
-                  shape=input_shape,
-                  batch_size=batch_size,
-                  name='image',
-                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-          'image_info':
-              tf.keras.layers.Input(
-                  shape=[4, 2], batch_size=batch_size, name='image_info'),
-      }
-    return input_layer
-
-  def build_model(self, params, mode):
-    """Returns the Keras model, constructing and caching it on first call.
-
-    Args:
-      params: not read here; the inputs are built from `self._params`.
-      mode: forwarded to `build_input_layers` and `model_outputs`.
-
-    Returns:
-      The cached `tf.keras.models.Model` named 'shapemask'.
-    """
-    if self._keras_model is None:
-      input_layers = self.build_input_layers(self._params, mode)
-      outputs = self.model_outputs(input_layers, mode)
-
-      model = tf.keras.models.Model(
-          inputs=input_layers, outputs=outputs, name='shapemask')
-      assert model is not None, 'Fail to build tf.keras.Model.'
-      model.optimizer = self.build_optimizer()
-      self._keras_model = model
-
-    return self._keras_model
-
-  def post_processing(self, labels, outputs):
-    """Packs model outputs into a predictions dict for evaluation.
-
-    Args:
-      labels: dict that must contain 'image_info'; 'groundtruths' is optional.
-      outputs: dict that must contain the detection fields listed in
-        `required_output_fields` below.
-
-    Returns:
-      A `(labels, predictions)` tuple. When 'groundtruths' is present in
-      `labels`, the returned labels are that sub-dict and its 'source_id' is
-      copied into `predictions`.
-
-    Raises:
-      ValueError: if a required field is missing from `outputs` or `labels`.
-    """
-    required_output_fields = [
-        'num_detections', 'detection_boxes', 'detection_classes',
-        'detection_masks', 'detection_scores'
-    ]
-
-    for field in required_output_fields:
-      if field not in outputs:
-        raise ValueError(
-            '"{}" is missing in outputs, requried {} found {}'.format(
-                field, required_output_fields, outputs.keys()))
-
-    required_label_fields = ['image_info']
-    for field in required_label_fields:
-      if field not in labels:
-        raise ValueError(
-            '"{}" is missing in labels, requried {} found {}'.format(
-                field, required_label_fields, labels.keys()))
-
-    # NOTE(review): 'detection_outer_boxes' is read below but is not in
-    # `required_output_fields`, so a missing key surfaces as a KeyError.
-    predictions = {
-        'image_info': labels['image_info'],
-        'num_detections': outputs['num_detections'],
-        'detection_boxes': outputs['detection_boxes'],
-        'detection_outer_boxes': outputs['detection_outer_boxes'],
-        'detection_classes': outputs['detection_classes'],
-        'detection_scores': outputs['detection_scores'],
-        'detection_masks': outputs['detection_masks'],
-    }
-
-    if 'groundtruths' in labels:
-      predictions['source_id'] = labels['groundtruths']['source_id']
-      labels = labels['groundtruths']
-
-    return labels, predictions
-
-  def eval_metrics(self):
-    """Returns the evaluator built from `params.eval`."""
-    return eval_factory.evaluator_generator(self._params.eval)
--- a/official/vision/detection/ops/__init__.py
+++ b/official/vision/detection/ops/__init__.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/official/vision/detection/ops/nms.py
+++ b/official/vision/detection/ops/nms.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tensorflow implementation of non max suppression."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.utils import box_utils
-
-# Number of boxes per tile; the NMS routines in this module slice and pad the
-# box tensor in units of this size.
-NMS_TILE_SIZE = 512
-
-
-def _self_suppression(iou, _, iou_sum):
-  """Runs one refinement step of suppression within a single tile.
-
-  Used as the body of a `tf.while_loop` in `_suppression_loop_body`, so the
-  arguments and the returned list follow that loop's variables.
-
-  Args:
-    iou: a float tensor whose leading dimension is the batch. The caller
-      passes the masked, thresholded overlaps of one tile with itself.
-    _: the loop-condition slot of the while loop; unused here.
-    iou_sum: per-batch sum of `iou` from the previous iteration.
-
-  Returns:
-    A list `[iou_suppressed, keep_looping, iou_sum_new]`: `iou` with some
-    slices along axis 1 zeroed, a scalar bool that is true when any batch
-    entry's sum dropped by more than 0.5, and the new per-batch sums.
-  """
-  # NOTE(review): the 0.5 cutoffs below are hard-coded and independent of the
-  # caller's iou_threshold -- confirm this is intended for thresholds != 0.5.
-  batch_size = tf.shape(iou)[0]
-  # 1.0 where the max over axis 1 is at most 0.5, i.e. the box is not
-  # overlapped strongly enough by another box to be suppressed.
-  can_suppress_others = tf.cast(
-      tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
-  # Keep only overlaps that come from boxes able to suppress others, then
-  # zero the axis-1 slices of boxes that those overlaps suppress.
-  iou_suppressed = tf.reshape(
-      tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
-      [batch_size, -1, 1]) * iou
-  iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
-  return [
-      iou_suppressed,
-      tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
-  ]
-
-
-def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
-  """Filters `box_slice` against the tile of `boxes` at `inner_idx`.
-
-  Used as the body of a `tf.while_loop` in `_suppression_loop_body`, so the
-  returned tuple mirrors the arguments.
-
-  Args:
-    boxes: the full box tensor; tiles of NMS_TILE_SIZE boxes are sliced from
-      its second dimension.
-    box_slice: the tile of boxes being filtered.
-    iou_threshold: overlap value at or above which a box is cleared.
-    inner_idx: index of the tile of `boxes` to compare against.
-
-  Returns:
-    `(boxes, ret_slice, iou_threshold, inner_idx + 1)`, where `ret_slice` is
-    `box_slice` with a box zeroed unless all of its overlaps reduced over
-    axis 1 are below `iou_threshold`.
-  """
-  batch_size = tf.shape(boxes)[0]
-  new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
-                       [batch_size, NMS_TILE_SIZE, 4])
-  # Presumably `iou` is [batch, boxes in new_slice, boxes in box_slice], so
-  # reducing axis 1 gives one keep flag per box in box_slice -- TODO confirm
-  # against box_utils.bbox_overlap.
-  iou = box_utils.bbox_overlap(new_slice, box_slice)
-  ret_slice = tf.expand_dims(
-      tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
-      2) * box_slice
-  return boxes, ret_slice, iou_threshold, inner_idx + 1
-
-
-def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
-  """Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
-
-  Args:
-    boxes: a tensor with a shape of [batch_size, anchors, 4].
-    iou_threshold: a float representing the threshold for deciding whether boxes
-      overlap too much with respect to IOU.
-    output_size: an int32 tensor of size [batch_size]. Representing the number
-      of selected boxes for each batch.
-    idx: an integer scalar representing induction variable.
-
-  Returns:
-    boxes: updated boxes.
-    iou_threshold: pass down iou_threshold to the next iteration.
-    output_size: the updated output_size.
-    idx: the updated induction variable.
-  """
-  # The integer division assumes the box count is a multiple of NMS_TILE_SIZE.
-  num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
-  batch_size = tf.shape(boxes)[0]
-
-  # Iterates over tiles that can possibly suppress the current tile.
-  box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
-                       [batch_size, NMS_TILE_SIZE, 4])
-  _, box_slice, _, _ = tf.while_loop(
-      lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
-      _cross_suppression, [boxes, box_slice, iou_threshold,
-                           tf.constant(0)])
-
-  # Iterates over the current tile to compute self-suppression.
-  iou = box_utils.bbox_overlap(box_slice, box_slice)
-  # Keep only pairs whose second index is greater than the first, and only
-  # overlaps at or above the threshold; everything else becomes 0.
-  mask = tf.expand_dims(
-      tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
-          tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
-  iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
-  # NOTE(review): _self_suppression is invoked with only the three loop
-  # variables, so iou_threshold is not forwarded to it.
-  suppressed_iou, _, _ = tf.while_loop(
-      lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
-      [iou, tf.constant(True),
-       tf.reduce_sum(iou, [1, 2])])
-  # A box with any remaining nonzero overlap is suppressed and zeroed out.
-  suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
-  box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
-
-  # Uses box_slice to update the input boxes.
-  # `mask` is 1 only at tile `idx`, so that tile is replaced by box_slice and
-  # all other tiles keep their current values.
-  mask = tf.reshape(
-      tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
-  boxes = tf.tile(tf.expand_dims(
-      box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
-          boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
-  boxes = tf.reshape(boxes, [batch_size, -1, 4])
-
-  # Updates output_size.
-  # A box counts as selected when any of its coordinates is greater than 0.
-  output_size += tf.reduce_sum(
-      tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
-  return boxes, iou_threshold, output_size, idx + 1
-
-
-def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
-                                      iou_threshold):
-  """A wrapper that handles non-maximum suppression.
-
-  Assumption:
-    * The boxes are sorted by scores unless the box is a dot (all coordinates
-      are zero).
-    * Boxes with higher scores can be used to suppress boxes with lower scores.
-
-  The overall design of the algorithm is to handle boxes tile-by-tile:
-
-  boxes = boxes.pad_to_multiple_of(tile_size)
-  num_tiles = len(boxes) // tile_size
-  output_boxes = []
-  for i in range(num_tiles):
-    box_tile = boxes[i*tile_size : (i+1)*tile_size]
-    for j in range(i):
-      suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
-      iou = bbox_overlap(box_tile, suppressing_tile)
-      # if the box is suppressed in iou, clear it to a dot
-      box_tile *= _update_boxes(iou)
-    # Iteratively handle the diagonal tile.
-    iou = _box_overlap(box_tile, box_tile)
-    iou_changed = True
-    while iou_changed:
-      # boxes that are not suppressed by anything else
-      suppressing_boxes = _get_suppressing_boxes(iou)
-      # boxes that are suppressed by suppressing_boxes
-      suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
-      # clear iou to 0 for boxes that are suppressed, as they cannot be used
-      # to suppress other boxes any more
-      new_iou = _clear_iou(iou, suppressed_boxes)
-      iou_changed = (new_iou != iou)
-      iou = new_iou
-    # remaining boxes that can still suppress others, are selected boxes.
-    output_boxes.append(_get_suppressing_boxes(iou))
-    if len(output_boxes) >= max_output_size:
-      break
-
-  Args:
-    scores: a tensor with a shape of [batch_size, anchors].
-    boxes: a tensor with a shape of [batch_size, anchors, 4].
-    max_output_size: a scalar integer `Tensor` representing the maximum number
-      of boxes to be selected by non max suppression.
-    iou_threshold: a float representing the threshold for deciding whether boxes
-      overlap too much with respect to IOU.
-
-  Returns:
-    nms_scores: a float32 tensor with a shape of [batch_size, max_output_size].
-      Entries past the number of selected boxes are zero.
-    nms_proposals: a float32 tensor with a shape of
-      [batch_size, max_output_size, 4]. Entries past the number of selected
-      boxes are zero.
-  """
-  batch_size = tf.shape(boxes)[0]
-  num_boxes = tf.shape(boxes)[1]
-  # Pad the box dimension up to the next multiple of NMS_TILE_SIZE. Padded
-  # boxes are all zeros and padded scores are -1.
-  pad = tf.cast(
-      tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
-      tf.int32) * NMS_TILE_SIZE - num_boxes
-  boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
-  scores = tf.pad(
-      tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
-  num_boxes += pad
-
-  def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
-    # Continue while some batch entry still has fewer than max_output_size
-    # selected boxes and there are tiles left to process.
-    return tf.logical_and(
-        tf.reduce_min(output_size) < max_output_size,
-        idx < num_boxes // NMS_TILE_SIZE)
-
-  selected_boxes, _, output_size, _ = tf.while_loop(
-      _loop_cond, _suppression_loop_body,
-      [boxes, iou_threshold,
-       tf.zeros([batch_size], tf.int32),
-       tf.constant(0)])
-  # Weight each surviving box by (num_boxes - position) so that top_k returns
-  # the earliest survivors first; subtracting from num_boxes recovers the
-  # position. Cleared boxes have weight 0 and map to num_boxes, which is
-  # clamped to the last valid index below.
-  idx = num_boxes - tf.cast(
-      tf.nn.top_k(
-          tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
-          tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
-      tf.int32)
-  idx = tf.minimum(idx, num_boxes - 1)
-  # Convert per-batch positions into indices into the flattened batch.
-  idx = tf.reshape(idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]),
-                   [-1])
-  # Gather from the padded input boxes, then zero the slots past each batch
-  # entry's selected count.
-  boxes = tf.reshape(
-      tf.gather(tf.reshape(boxes, [-1, 4]), idx),
-      [batch_size, max_output_size, 4])
-  boxes = boxes * tf.cast(
-      tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
-          output_size, [-1, 1, 1]), boxes.dtype)
-  scores = tf.reshape(
-      tf.gather(tf.reshape(scores, [-1, 1]), idx),
-      [batch_size, max_output_size])
-  scores = scores * tf.cast(
-      tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
-          output_size, [-1, 1]), scores.dtype)
-  return scores, boxes
--- a/official/vision/detection/ops/postprocess_ops.py
+++ b/official/vision/detection/ops/postprocess_ops.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Post-processing model outputs to generate detection."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-import tensorflow as tf
-
-from official.vision.detection.ops import nms
-from official.vision.detection.utils import box_utils
-
-
-def generate_detections_factory(params):
-  """Returns the detection-generation function selected by `params`.
-
-  Args:
-    params: a config object exposing `use_batched_nms`, `max_total_size`,
-      `nms_iou_threshold` and `score_threshold`. `pre_nms_num_boxes` is read
-      as well when `use_batched_nms` is false.
-
-  Returns:
-    A `functools.partial` around `_generate_detections_batched` when
-    `params.use_batched_nms` is true, otherwise around `_generate_detections`,
-    with the NMS settings already bound as keyword arguments.
-  """
-  if not params.use_batched_nms:
-    return functools.partial(
-        _generate_detections,
-        max_total_size=params.max_total_size,
-        nms_iou_threshold=params.nms_iou_threshold,
-        score_threshold=params.score_threshold,
-        pre_nms_num_boxes=params.pre_nms_num_boxes)
-  return functools.partial(
-      _generate_detections_batched,
-      max_total_size=params.max_total_size,
-      nms_iou_threshold=params.nms_iou_threshold,
-      score_threshold=params.score_threshold)
-
-
-def _select_top_k_scores(scores_in, pre_nms_num_detections):
-  """Picks the highest scores, and their anchor indices, for every class.
-
-  Args:
-    scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
-      class logit outputs on all feature levels. N is the total number of
-      anchors on all levels and num_classes is the number of classes predicted
-      by the model. The shape is read statically via `get_shape()`.
-    pre_nms_num_detections: Number of candidates kept per class before NMS.
-
-  Returns:
-    scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
-      num_classes]. Scores are sorted in descending order along axis 1 and the
-      indices point into the anchor axis of `scores_in`.
-  """
-  batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
-  per_class_shape = [batch_size, num_class, pre_nms_num_detections]
-
-  # Moves the anchor axis last so that top_k ranks anchors within each class.
-  flat_scores = tf.reshape(
-      tf.transpose(scores_in, perm=[0, 2, 1]), [-1, num_anchors])
-  best_scores, best_indices = tf.nn.top_k(
-      flat_scores, k=pre_nms_num_detections, sorted=True)
-
-  # Restores the [batch, candidates, classes] layout.
-  best_scores = tf.transpose(
-      tf.reshape(best_scores, per_class_shape), [0, 2, 1])
-  best_indices = tf.transpose(
-      tf.reshape(best_indices, per_class_shape), [0, 2, 1])
-  return best_scores, best_indices
-
-
-def _generate_detections(boxes,
-                         scores,
-                         max_total_size=100,
-                         nms_iou_threshold=0.3,
-                         score_threshold=0.05,
-                         pre_nms_num_boxes=5000):
-  """Generate the final detections given the model outputs.
-
-  This unrolls the classes in a Python loop and runs the while-loop based NMS
-  for each of them, so it can be parallelized along the batch dimension.
-
-  Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
-      N, 1, 4], which box predictions on all feature levels. The N is the number
-      of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which stacks class
-      probability on all feature levels. The N is the number of total anchors on
-      all levels. The num_classes is the number of classes predicted by the
-      model. Note that the class_outputs here is the raw score.
-    max_total_size: a scalar representing maximum number of boxes retained over
-      all classes.
-    nms_iou_threshold: a float representing the threshold for deciding whether
-      boxes overlap too much with respect to IOU.
-    score_threshold: a float representing the threshold for deciding when to
-      remove boxes based on score.
-    pre_nms_num_boxes: an int number of top candidate detections per class
-      before NMS.
-
-  Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
-      representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_total_size]
-      representing sorted confidence scores for detected boxes. The values are
-      between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
-      classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
-      `valid_detections` boxes are valid detections.
-  """
-  with tf.name_scope('generate_detections'):
-    nmsed_boxes = []
-    nmsed_classes = []
-    nmsed_scores = []
-    # Static shapes are required here: they size the Python loop over classes
-    # and the `tf.fill` call below.
-    batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
-    _, total_anchors, num_classes = scores.get_shape().as_list()
-    # Selects top pre_nms_num scores and indices before NMS.
-    scores, indices = _select_top_k_scores(
-        scores, min(total_anchors, pre_nms_num_boxes))
-    for i in range(num_classes):
-      # Class-agnostic boxes ([..., 1, 4]) reuse slot 0 for every class.
-      boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
-      scores_i = scores[:, :, i]
-      # Obtains pre_nms_num_boxes before running NMS.
-      boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
-
-      # Filter out scores.
-      boxes_i, scores_i = box_utils.filter_boxes_by_scores(
-          boxes_i, scores_i, min_score_threshold=score_threshold)
-
-      (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
-          tf.cast(scores_i, tf.float32),
-          tf.cast(boxes_i, tf.float32),
-          max_total_size,
-          iou_threshold=nms_iou_threshold)
-      nmsed_classes_i = tf.fill([batch_size, max_total_size], i)
-      nmsed_boxes.append(nmsed_boxes_i)
-      nmsed_scores.append(nmsed_scores_i)
-      nmsed_classes.append(nmsed_classes_i)
-  # Concats results from all classes and keeps the overall top scoring ones.
-  nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
-  nmsed_scores = tf.concat(nmsed_scores, axis=1)
-  nmsed_classes = tf.concat(nmsed_classes, axis=1)
-  nmsed_scores, indices = tf.nn.top_k(
-      nmsed_scores, k=max_total_size, sorted=True)
-  nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
-  nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
-  # NOTE(review): this counts scores greater than -1 as valid. Confirm that
-  # `nms.sorted_non_max_suppression_padded` marks padded slots with -1 rather
-  # than 0, otherwise every slot is counted.
-  valid_detections = tf.reduce_sum(
-      input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32), axis=1)
-  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
-
-
-def _generate_detections_per_image(boxes,
-                                   scores,
-                                   max_total_size=100,
-                                   nms_iou_threshold=0.3,
-                                   score_threshold=0.05,
-                                   pre_nms_num_boxes=5000):
-  """Runs per-class NMS on one image and merges the results.
-
-  Args:
-    boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4] holding the box
-      predictions on all feature levels, where N is the total number of anchors
-      on all levels.
-    scores: a tensor with shape [N, num_classes] holding the class scores on
-      all feature levels, where num_classes is the number of classes predicted
-      by the model. Note that the class_outputs here is the raw score.
-    max_total_size: a scalar representing maximum number of boxes retained over
-      all classes.
-    nms_iou_threshold: a float representing the threshold for deciding whether
-      boxes overlap too much with respect to IOU.
-    score_threshold: a float representing the threshold for deciding when to
-      remove boxes based on score.
-    pre_nms_num_boxes: an int number of top candidate detections per class
-      before NMS.
-
-  Returns:
-    nms_boxes: `float` Tensor of shape [max_total_size, 4] representing top
-      detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [max_total_size] representing sorted
-      confidence scores for detected boxes. Padded slots carry a score of -1.
-    nms_classes: `int` Tensor of shape [max_total_size] representing classes for
-      detected boxes.
-    valid_detections: scalar `int` Tensor counting the slots whose score is
-      greater than -1.
-  """
-  per_class_boxes = []
-  per_class_scores = []
-  per_class_ids = []
-  num_classes_for_box = boxes.get_shape().as_list()[1]
-  num_classes = scores.get_shape().as_list()[1]
-  for class_id in range(num_classes):
-    # Class-agnostic boxes ([N, 1, 4]) reuse slot 0 for every class.
-    box_slot = min(num_classes_for_box - 1, class_id)
-    class_boxes = boxes[:, box_slot]
-    class_scores = scores[:, class_id]
-
-    # Keeps at most pre_nms_num_boxes candidates before running NMS.
-    num_candidates = tf.minimum(
-        tf.shape(input=class_scores)[-1], pre_nms_num_boxes)
-    class_scores, top_indices = tf.nn.top_k(class_scores, k=num_candidates)
-    class_boxes = tf.gather(class_boxes, top_indices)
-
-    kept_indices, num_kept = tf.image.non_max_suppression_padded(
-        tf.cast(class_boxes, tf.float32),
-        tf.cast(class_scores, tf.float32),
-        max_total_size,
-        iou_threshold=nms_iou_threshold,
-        score_threshold=score_threshold,
-        pad_to_max_output_size=True,
-        name='nms_detections_' + str(class_id))
-    kept_boxes = tf.gather(class_boxes, kept_indices)
-    kept_scores = tf.gather(class_scores, kept_indices)
-    # Marks the padded (invalid) slots with a score of -1.
-    is_valid = tf.less(tf.range(max_total_size), [num_kept])
-    kept_scores = tf.where(is_valid, kept_scores, -tf.ones_like(kept_scores))
-
-    per_class_boxes.append(kept_boxes)
-    per_class_scores.append(kept_scores)
-    per_class_ids.append(tf.fill([max_total_size], class_id))
-
-  # Merges all classes and keeps the max_total_size best scoring entries.
-  merged_boxes = tf.concat(per_class_boxes, axis=0)
-  merged_scores = tf.concat(per_class_scores, axis=0)
-  merged_ids = tf.concat(per_class_ids, axis=0)
-  nmsed_scores, order = tf.nn.top_k(
-      merged_scores, k=max_total_size, sorted=True)
-  nmsed_boxes = tf.gather(merged_boxes, order)
-  nmsed_classes = tf.gather(merged_ids, order)
-  valid_detections = tf.reduce_sum(
-      input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
-  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
-
-
-def _generate_detections_batched(boxes, scores, max_total_size,
-                                 nms_iou_threshold, score_threshold):
-  """Generates detections with batched multi-class NMS.
-
-  Wraps `tf.image.combined_non_max_suppression`. Boxes are divided by their
-  global maximum coordinate before the call and scaled back afterwards. Note
-  that batched NMS is not supported on TPU currently.
-
-  Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
-      N, 1, 4] holding the box predictions on all feature levels, where N is
-      the total number of anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes] holding the class
-      scores on all feature levels, where num_classes is the number of classes
-      predicted by the model. Note that the class_outputs here is the raw
-      score.
-    max_total_size: a scalar representing maximum number of boxes retained over
-      all classes. It is also used as the per-class output limit.
-    nms_iou_threshold: a float representing the threshold for deciding whether
-      boxes overlap too much with respect to IOU.
-    score_threshold: a float representing the threshold for deciding when to
-      remove boxes based on score.
-
-  Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
-      representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_total_size]
-      representing sorted confidence scores for detected boxes. The values are
-      between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
-      classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
-      `valid_detections` boxes are valid detections.
-  """
-  with tf.name_scope('generate_detections'):
-    # TODO(tsungyi): Removes normalization/denormalization once the
-    # tf.image.combined_non_max_suppression is coordinate system agnostic.
-    # Rescales the boxes so that the largest coordinate equals 1.
-    scale = tf.reduce_max(boxes)
-    normalized_boxes = boxes / scale
-    nms_outputs = tf.image.combined_non_max_suppression(
-        normalized_boxes,
-        scores,
-        max_output_size_per_class=max_total_size,
-        max_total_size=max_total_size,
-        iou_threshold=nms_iou_threshold,
-        score_threshold=score_threshold,
-        pad_per_class=False,
-    )
-    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = nms_outputs
-    # Restores the original coordinate scale.
-    nmsed_boxes = nmsed_boxes * scale
-  nmsed_classes = tf.cast(nmsed_classes, tf.int32)
-  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
-
-
-class MultilevelDetectionGenerator(tf.keras.layers.Layer):
-  """Generates detected boxes with scores and classes for one-stage detector."""
-
-  def __init__(self, min_level, max_level, params):
-    """Stores the level range and picks the NMS implementation.
-
-    Args:
-      min_level: an int, the first feature level consumed by `call`.
-      max_level: an int, the last feature level (inclusive) consumed by `call`.
-      params: config object forwarded to `generate_detections_factory`.
-    """
-    # Initializes the Keras layer before assigning any attribute, matching
-    # GenericDetectionGenerator.
-    super(MultilevelDetectionGenerator, self).__init__(autocast=False)
-    self._min_level = min_level
-    self._max_level = max_level
-    self._generate_detections = generate_detections_factory(params)
-
-  def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
-    """Decodes the per-level outputs and runs NMS over all levels.
-
-    Args:
-      box_outputs: a mapping indexed by level (every int from min_level to
-        max_level) to box regression tensors whose last dimension is
-        num_anchors_per_location * 4.
-      class_outputs: a mapping indexed by level to class logit tensors whose
-        last dimension is num_anchors_per_location * num_classes, where class 0
-        is the background.
-      anchor_boxes: a mapping indexed by level to anchor box tensors that can
-        be reshaped to [batch_size, -1, 4].
-      image_shape: tensor passed to `box_utils.clip_boxes` to clip the decoded
-        boxes; presumably [batch_size, 2] as [height, width] -- confirm
-        against the caller.
-
-    Returns:
-      A tuple (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) as
-      produced by the configured detection function, with the class ids
-      shifted by one so that 0 stays reserved for the background.
-    """
-    # Collects outputs from all levels into a list.
-    boxes = []
-    scores = []
-    for level in range(self._min_level, self._max_level + 1):
-      box_outputs_shape = tf.shape(box_outputs[level])
-      batch_size = box_outputs_shape[0]
-      num_anchors_per_location = box_outputs_shape[-1] // 4
-      num_classes = (
-          tf.shape(class_outputs[level])[-1] // num_anchors_per_location)
-
-      # Applies score transformation and remove the implicit background class.
-      level_scores = tf.sigmoid(
-          tf.reshape(class_outputs[level], [batch_size, -1, num_classes]))
-      level_scores = tf.slice(level_scores, [0, 0, 1], [-1, -1, -1])
-
-      # Box decoding.
-      # The anchor boxes are shared for all data in a batch.
-      # One stage detector only supports class agnostic box regression.
-      level_anchors = tf.reshape(anchor_boxes[level], [batch_size, -1, 4])
-      level_deltas = tf.reshape(box_outputs[level], [batch_size, -1, 4])
-      level_boxes = box_utils.decode_boxes(level_deltas, level_anchors)
-
-      # Box clipping.
-      level_boxes = box_utils.clip_boxes(level_boxes, image_shape)
-
-      boxes.append(level_boxes)
-      scores.append(level_scores)
-    boxes = tf.concat(boxes, axis=1)
-    scores = tf.concat(scores, axis=1)
-
-    # Adds a singleton class axis because the boxes are class agnostic.
-    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
-        self._generate_detections(tf.expand_dims(boxes, axis=2), scores))
-
-    # Adds 1 to offset the background class which has index 0.
-    nmsed_classes += 1
-    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
-
-
-class GenericDetectionGenerator(tf.keras.layers.Layer):
-  """Turns class-specific box and class outputs into final detections."""
-
-  def __init__(self, params):
-    super(GenericDetectionGenerator, self).__init__(autocast=False)
-    self._generate_detections = generate_detections_factory(params)
-
-  def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
-    """Generate final detections.
-
-    Args:
-      box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
-        representing the class-specific box coordinates relative to anchors.
-      class_outputs: a tensor of shape of [batch_size, K, num_classes]
-        representing the class logits before applying score activation.
-      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
-        corresponding anchor boxes w.r.t `box_outputs`.
-      image_shape: a tensor of shape of [batch_size, 2] storing the image height
-        and width w.r.t. the scaled image, i.e. the same image space as
-        `box_outputs` and `anchor_boxes`.
-
-    Returns:
-      nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
-        representing top detected boxes in [y1, x1, y2, x2].
-      nms_scores: `float` Tensor of shape [batch_size, max_total_size]
-        representing sorted confidence scores for detected boxes. The values are
-        between [0, 1].
-      nms_classes: `int` Tensor of shape [batch_size, max_total_size]
-        representing classes for detected boxes.
-      valid_detections: `int` Tensor of shape [batch_size] only the top
-        `valid_detections` boxes are valid detections.
-    """
-    class_probs = tf.nn.softmax(class_outputs, axis=-1)
-
-    probs_shape = tf.shape(class_probs)
-    batch_size = probs_shape[0]
-    num_locations = probs_shape[1]
-    num_classes = probs_shape[-1]
-    num_fg_classes = num_classes - 1
-    # Shape with every (location, foreground class) pair flattened together.
-    flat_shape = tf.stack(
-        [batch_size, num_locations * num_fg_classes, 4], axis=-1)
-
-    # Drops the background class (index 0) from the scores and the boxes.
-    fg_probs = tf.slice(class_probs, [0, 0, 1], [-1, -1, -1])
-    per_class_deltas = tf.reshape(
-        box_outputs,
-        tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
-    fg_deltas = tf.slice(per_class_deltas, [0, 0, 1, 0], [-1, -1, -1, -1])
-    fg_deltas = tf.reshape(fg_deltas, flat_shape)
-
-    # Repeats each anchor once per foreground class to line up with the boxes.
-    tiled_anchors = tf.tile(
-        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])
-    tiled_anchors = tf.reshape(tiled_anchors, flat_shape)
-
-    # Decodes the boxes, clips them and restores the per-class layout.
-    decoded_boxes = box_utils.decode_boxes(
-        fg_deltas, tiled_anchors, weights=[10.0, 10.0, 5.0, 5.0])
-    decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
-    decoded_boxes = tf.reshape(
-        decoded_boxes,
-        tf.stack([batch_size, num_locations, num_fg_classes, 4], axis=-1))
-
-    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
-        self._generate_detections(decoded_boxes, fg_probs))
-
-    # Shifts class ids by one because index 0 is the background class.
-    nmsed_classes = nmsed_classes + 1
-
-    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
-
-
-class OlnDetectionGenerator(GenericDetectionGenerator):
-  """Detection generator for the Object Localization Network (OLN)."""
-
-  def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape,
-               is_single_fg_score=False, keep_nms=True):
-    """Generate final detections for Object Localization Network (OLN).
-
-    Args:
-      box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
-        representing the class-specific box coordinates relative to anchors.
-      class_outputs: a tensor of shape of [batch_size, K, num_classes]
-        representing the class logits before applying score activation. When
-        `is_single_fg_score` is True it holds foreground scores only and a new
-        last axis is added by this method.
-      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
-        corresponding anchor boxes w.r.t `box_outputs`.
-      image_shape: a tensor of shape of [batch_size, 2] storing the image height
-        and width w.r.t. the scaled image, i.e. the same image space as
-        `box_outputs` and `anchor_boxes`.
-      is_single_fg_score: a Bool. If True, `class_outputs` is used as the
-        foreground score as is: an all-zero background score is stacked in
-        front of it along a new last axis and no softmax is applied. If False
-        (the default), softmax is applied over the last axis of
-        `class_outputs`, whose index 0 is the background.
-      keep_nms: a Bool indicator of whether to perform NMS or not.
-
-    Returns:
-      nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
-        representing top detected boxes in [y1, x1, y2, x2].
-      nms_scores: `float` Tensor of shape [batch_size, max_total_size]
-        representing sorted confidence scores for detected boxes. The values are
-        between [0, 1].
-      nms_classes: `int` Tensor of shape [batch_size, max_total_size]
-        representing classes for detected boxes.
-      valid_detections: `int` Tensor of shape [batch_size] only the top
-        `valid_detections` boxes are valid detections.
-      When `keep_nms` is False, the boxes and scores of the first foreground
-      class are returned for every location instead, every class id is 1 and
-      `valid_detections` equals the number of locations.
-    """
-    if not is_single_fg_score:
-      class_probs = tf.nn.softmax(class_outputs, axis=-1)
-    else:
-      # Concatenates dummy background scores.
-      zero_bg_scores = tf.zeros_like(class_outputs)
-      class_probs = tf.stack([zero_bg_scores, class_outputs], -1)
-
-    probs_shape = tf.shape(class_probs)
-    batch_size = probs_shape[0]
-    num_locations = probs_shape[1]
-    num_classes = probs_shape[-1]
-    num_fg_classes = num_classes - 1
-    # Shape with every (location, foreground class) pair flattened together.
-    flat_shape = tf.stack(
-        [batch_size, num_locations * num_fg_classes, 4], axis=-1)
-
-    # Drops the background class (index 0) from the scores and the boxes.
-    fg_probs = tf.slice(class_probs, [0, 0, 1], [-1, -1, -1])
-    per_class_deltas = tf.reshape(
-        box_outputs,
-        tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
-    fg_deltas = tf.slice(per_class_deltas, [0, 0, 1, 0], [-1, -1, -1, -1])
-    fg_deltas = tf.reshape(fg_deltas, flat_shape)
-
-    # Repeats each anchor once per foreground class to line up with the boxes.
-    tiled_anchors = tf.tile(
-        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])
-    tiled_anchors = tf.reshape(tiled_anchors, flat_shape)
-
-    # Box decoding. For RPN outputs, box_outputs are all zeros.
-    decoded_boxes = box_utils.decode_boxes(
-        fg_deltas, tiled_anchors, weights=[10.0, 10.0, 5.0, 5.0])
-    # Box clipping, then back to the per-class layout.
-    decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
-    decoded_boxes = tf.reshape(
-        decoded_boxes,
-        tf.stack([batch_size, num_locations, num_fg_classes, 4], axis=-1))
-
-    if not keep_nms:
-      # Skips NMS: passes through the first foreground class of each location.
-      nmsed_boxes = decoded_boxes[:, :, 0, :]
-      nmsed_scores = fg_probs[:, :, 0]
-      nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
-      valid_detections = tf.cast(
-          tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)
-      return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
-
-    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
-        self._generate_detections(decoded_boxes, fg_probs))
-    # Adds 1 to offset the background class which has index 0.
-    nmsed_classes = nmsed_classes + 1
-    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
--- a/official/vision/detection/ops/roi_ops.py
+++ b/official/vision/detection/ops/roi_ops.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""ROI-related ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.ops import nms
-from official.vision.detection.utils import box_utils
-
-
-def multilevel_propose_rois(rpn_boxes,
-                            rpn_scores,
-                            anchor_boxes,
-                            image_shape,
-                            rpn_pre_nms_top_k=2000,
-                            rpn_post_nms_top_k=1000,
-                            rpn_nms_threshold=0.7,
-                            rpn_score_threshold=0.0,
-                            rpn_min_size_threshold=0.0,
-                            decode_boxes=True,
-                            clip_boxes=True,
-                            use_batched_nms=False,
-                            apply_sigmoid_to_score=True):
-  """Proposes RoIs given a group of candidates from different FPN levels.
-
-  The following describes the steps:
-    1. For each individual level:
-      a. Apply sigmoid transform if specified.
-      b. Decode boxes if specified.
-      c. Clip boxes if specified.
-      d. Filter small boxes and those fall outside image if specified.
-      e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
-      f. Apply NMS.
-    2. Aggregate post-NMS boxes from each level.
-    3. Apply an overall top k to generate the final selected RoIs.
-
-  Args:
-    rpn_boxes: a dict with keys representing FPN levels and values representing
-      box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
-    rpn_scores: a dict with keys representing FPN levels and values representing
-      logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
-    anchor_boxes: a dict with keys representing FPN levels and values
-      representing anchor box tensors of shape [batch_size, feature_h,
-      feature_w, num_anchors * 4].
-    image_shape: a tensor of shape [batch_size, 2] where the last dimension are
-      [height, width] of the scaled image.
-    rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
-      keep before applying NMS. Default: 2000.
-    rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
-      keep after applying NMS. Default: 1000.
-    rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
-      used for NMS. If 0.0, no NMS is applied and each level only keeps its
-      top `rpn_post_nms_top_k` boxes by score. Default: 0.7.
-    rpn_score_threshold: a float between 0 and 1 representing the minimal box
-      score to keep before applying NMS. This is often used as a pre-filtering
-      step for better performance. If 0, no filtering is applied. Default: 0.
-    rpn_min_size_threshold: a float representing the minimal box size in each
-      side (w.r.t. the scaled image) to keep before applying NMS. This is often
-      used as a pre-filtering step for better performance. If 0, no filtering is
-      applied. Default: 0.
-    decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
-      using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
-      `anchor_boxes`. Default: True.
-    clip_boxes: a boolean indicating whether boxes are first clipped to the
-      scaled image size before applying NMS. If False, no clipping is applied
-      and `image_shape` is ignored. Default: True.
-    use_batched_nms: a boolean indicating whether NMS is applied in batch using
-      `tf.image.combined_non_max_suppression`. Currently only available in
-      CPU/GPU. Default: False.
-    apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
-      `rpn_scores` before applying NMS. Default: True.
-
-  Returns:
-    selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
-      representing the box coordinates of the selected proposals w.r.t. the
-      scaled image.
-    selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k, 1],
-      representing the scores of the selected proposals.
-      NOTE(review): the scores fed to the final top-k are rank 2 here; confirm
-      the trailing dimension against `box_utils.top_k_boxes`.
-  """
-  with tf.name_scope('multilevel_propose_rois'):
-    rois = []
-    roi_scores = []
-    # Adds a middle axis to image_shape before it is handed to the box_utils
-    # helpers, presumably so it broadcasts over the box axis.
-    image_shape = tf.expand_dims(image_shape, axis=1)
-    for level in sorted(rpn_scores.keys()):
-      with tf.name_scope('level_%d' % level):
-        # The static spatial shape is required to size the reshapes below.
-        _, feature_h, feature_w, num_anchors_per_location = (
-            rpn_scores[level].get_shape().as_list())
-
-        num_boxes = feature_h * feature_w * num_anchors_per_location
-        this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
-        this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
-        this_level_anchors = tf.cast(
-            tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
-            dtype=this_level_scores.dtype)
-
-        if apply_sigmoid_to_score:
-          this_level_scores = tf.sigmoid(this_level_scores)
-
-        if decode_boxes:
-          this_level_boxes = box_utils.decode_boxes(this_level_boxes,
-                                                    this_level_anchors)
-        if clip_boxes:
-          this_level_boxes = box_utils.clip_boxes(this_level_boxes, image_shape)
-
-        if rpn_min_size_threshold > 0.0:
-          this_level_boxes, this_level_scores = box_utils.filter_boxes(
-              this_level_boxes, this_level_scores, image_shape,
-              rpn_min_size_threshold)
-
-        # A level cannot contribute more boxes than it has.
-        this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
-        this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
-        if rpn_nms_threshold > 0.0:
-          if use_batched_nms:
-            this_level_rois, this_level_roi_scores, _, _ = (
-                tf.image.combined_non_max_suppression(
-                    tf.expand_dims(this_level_boxes, axis=2),
-                    tf.expand_dims(this_level_scores, axis=-1),
-                    max_output_size_per_class=this_level_pre_nms_top_k,
-                    max_total_size=this_level_post_nms_top_k,
-                    iou_threshold=rpn_nms_threshold,
-                    score_threshold=rpn_score_threshold,
-                    pad_per_class=False,
-                    clip_boxes=False))
-          else:
-            if rpn_score_threshold > 0.0:
-              this_level_boxes, this_level_scores = (
-                  box_utils.filter_boxes_by_scores(this_level_boxes,
-                                                   this_level_scores,
-                                                   rpn_score_threshold))
-            this_level_boxes, this_level_scores = box_utils.top_k_boxes(
-                this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
-            this_level_roi_scores, this_level_rois = (
-                nms.sorted_non_max_suppression_padded(
-                    this_level_scores,
-                    this_level_boxes,
-                    max_output_size=this_level_post_nms_top_k,
-                    iou_threshold=rpn_nms_threshold))
-        else:
-          # No NMS: keeps the top scoring boxes of this level. The boxes must
-          # come from `this_level_boxes`; `this_level_rois` is not assigned yet
-          # on this path.
-          this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
-              this_level_boxes, this_level_scores, k=this_level_post_nms_top_k)
-
-        rois.append(this_level_rois)
-        roi_scores.append(this_level_roi_scores)
-
-    all_rois = tf.concat(rois, axis=1)
-    all_roi_scores = tf.concat(roi_scores, axis=1)
-
-    with tf.name_scope('top_k_rois'):
-      _, num_valid_rois = all_roi_scores.get_shape().as_list()
-      overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
-
-      selected_rois, selected_roi_scores = box_utils.top_k_boxes(
-          all_rois, all_roi_scores, k=overall_top_k)
-
-    return selected_rois, selected_roi_scores
-
-
-class ROIGenerator(tf.keras.layers.Layer):
-  """Proposes RoIs for the second stage processing."""
-
-  def __init__(self, params):
-    self._rpn_pre_nms_top_k = params.rpn_pre_nms_top_k
-    self._rpn_post_nms_top_k = params.rpn_post_nms_top_k
-    self._rpn_nms_threshold = params.rpn_nms_threshold
-    self._rpn_score_threshold = params.rpn_score_threshold
-    self._rpn_min_size_threshold = params.rpn_min_size_threshold
-    self._test_rpn_pre_nms_top_k = params.test_rpn_pre_nms_top_k
-    self._test_rpn_post_nms_top_k = params.test_rpn_post_nms_top_k
-    self._test_rpn_nms_threshold = params.test_rpn_nms_threshold
-    self._test_rpn_score_threshold = params.test_rpn_score_threshold
-    self._test_rpn_min_size_threshold = params.test_rpn_min_size_threshold
-    self._use_batched_nms = params.use_batched_nms
-    super(ROIGenerator, self).__init__(autocast=False)
-
-  def call(self, boxes, scores, anchor_boxes, image_shape, is_training):
-    """Generates RoI proposals.
-
-    Args:
-      boxes: a dict with keys representing FPN levels and values representing
-        box tenors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
-      scores: a dict with keys representing FPN levels and values representing
-        logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
-      anchor_boxes: a dict with keys representing FPN levels and values
-        representing anchor box tensors of shape [batch_size, feature_h,
-        feature_w, num_anchors * 4].
-      image_shape: a tensor of shape [batch_size, 2] where the last dimension
-        are [height, width] of the scaled image.
-      is_training: a bool indicating whether it is in training or inference
-        mode.
-
-    Returns:
-      proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
-        representing the box coordinates of the proposed RoIs w.r.t. the
-        scaled image.
-      proposed_roi_scores: a tensor of shape
-        [batch_size, rpn_post_nms_top_k, 1], representing the scores of the
-        proposed RoIs.
-
-    """
-    proposed_rois, proposed_roi_scores = multilevel_propose_rois(
-        boxes,
-        scores,
-        anchor_boxes,
-        image_shape,
-        rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
-                           if is_training else self._test_rpn_pre_nms_top_k),
-        rpn_post_nms_top_k=(self._rpn_post_nms_top_k
-                            if is_training else self._test_rpn_post_nms_top_k),
-        rpn_nms_threshold=(self._rpn_nms_threshold
-                           if is_training else self._test_rpn_nms_threshold),
-        rpn_score_threshold=(self._rpn_score_threshold if is_training else
-                             self._test_rpn_score_threshold),
-        rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
-                                self._test_rpn_min_size_threshold),
-        decode_boxes=True,
-        clip_boxes=True,
-        use_batched_nms=self._use_batched_nms,
-        apply_sigmoid_to_score=True)
-    return proposed_rois, proposed_roi_scores
-
-
-class OlnROIGenerator(ROIGenerator):
-  """Proposes RoIs for the second stage processing."""
-
-  def __call__(self, boxes, scores, anchor_boxes, image_shape, is_training,
-               is_box_lrtb=False, object_scores=None):
-    """Generates RoI proposals.
-
-    Args:
-      boxes: a dict with keys representing FPN levels and values representing
-        box tenors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
-      scores: a dict with keys representing FPN levels and values representing
-        logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
-      anchor_boxes: a dict with keys representing FPN levels and values
-        representing anchor box tensors of shape [batch_size, feature_h,
-        feature_w, num_anchors * 4].
-      image_shape: a tensor of shape [batch_size, 2] where the last dimension
-        are [height, width] of the scaled image.
-      is_training: a bool indicating whether it is in training or inference
-        mode.
-      is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
-        bottom) format.
-      object_scores: another objectness score (e.g., centerness). In OLN, we use
-        object_scores=centerness as a replacement of the scores at each level.
-        A dict with keys representing FPN levels and values representing logit
-        tensors of shape [batch_size, feature_h, feature_w, num_anchors].
-
-    Returns:
-      proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
-        representing the box coordinates of the proposed RoIs w.r.t. the
-        scaled image.
-      proposed_roi_scores: a tensor of shape
-        [batch_size, rpn_post_nms_top_k, 1], representing the scores of the
-        proposed RoIs.
-
-    """
-    proposed_rois, proposed_roi_scores = self.oln_multilevel_propose_rois(
-        boxes,
-        scores,
-        anchor_boxes,
-        image_shape,
-        rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
-                           if is_training else self._test_rpn_pre_nms_top_k),
-        rpn_post_nms_top_k=(self._rpn_post_nms_top_k
-                            if is_training else self._test_rpn_post_nms_top_k),
-        rpn_nms_threshold=(self._rpn_nms_threshold
-                           if is_training else self._test_rpn_nms_threshold),
-        rpn_score_threshold=(self._rpn_score_threshold if is_training else
-                             self._test_rpn_score_threshold),
-        rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
-                                self._test_rpn_min_size_threshold),
-        decode_boxes=True,
-        clip_boxes=True,
-        use_batched_nms=self._use_batched_nms,
-        apply_sigmoid_to_score=True,
-        is_box_lrtb=is_box_lrtb,
-        rpn_object_scores=object_scores,)
-    return proposed_rois, proposed_roi_scores
-
-  def oln_multilevel_propose_rois(self,
-                                  rpn_boxes,
-                                  rpn_scores,
-                                  anchor_boxes,
-                                  image_shape,
-                                  rpn_pre_nms_top_k=2000,
-                                  rpn_post_nms_top_k=1000,
-                                  rpn_nms_threshold=0.7,
-                                  rpn_score_threshold=0.0,
-                                  rpn_min_size_threshold=0.0,
-                                  decode_boxes=True,
-                                  clip_boxes=True,
-                                  use_batched_nms=False,
-                                  apply_sigmoid_to_score=True,
-                                  is_box_lrtb=False,
-                                  rpn_object_scores=None,):
-    """Proposes RoIs given a group of candidates from different FPN levels.
-
-    The following describes the steps:
-      1. For each individual level:
-        a. Adjust scores for each level if specified by rpn_object_scores.
-        b. Apply sigmoid transform if specified.
-        c. Decode boxes (either of xyhw or left-right-top-bottom format) if
-          specified.
-        d. Clip boxes if specified.
-        e. Filter small boxes and those fall outside image if specified.
-        f. Apply pre-NMS filtering including pre-NMS top k and score
-           thresholding.
-        g. Apply NMS.
-      2. Aggregate post-NMS boxes from each level.
-      3. Apply an overall top k to generate the final selected RoIs.
-
-    Args:
-      rpn_boxes: a dict with keys representing FPN levels and values
-        representing box tenors of shape [batch_size, feature_h, feature_w,
-        num_anchors * 4].
-      rpn_scores: a dict with keys representing FPN levels and values
-        representing logit tensors of shape [batch_size, feature_h, feature_w,
-        num_anchors].
-      anchor_boxes: a dict with keys representing FPN levels and values
-        representing anchor box tensors of shape [batch_size, feature_h,
-        feature_w, num_anchors * 4].
-      image_shape: a tensor of shape [batch_size, 2] where the last dimension
-        are [height, width] of the scaled image.
-      rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
-        keep before applying NMS. Default: 2000.
-      rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
-        keep after applying NMS. Default: 1000.
-      rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
-        used for NMS. If 0.0, no NMS is applied. Default: 0.7.
-      rpn_score_threshold: a float between 0 and 1 representing the minimal box
-        score to keep before applying NMS. This is often used as a pre-filtering
-        step for better performance. If 0, no filtering is applied. Default: 0.
-      rpn_min_size_threshold: a float representing the minimal box size in each
-        side (w.r.t. the scaled image) to keep before applying NMS. This is
-        often used as a pre-filtering step for better performance. If 0, no
-        filtering is applied. Default: 0.
-      decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
-        using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
-        `anchor_boxes`. Default: True.
-      clip_boxes: a boolean indicating whether boxes are first clipped to the
-        scaled image size before appliying NMS. If False, no clipping is applied
-        and `image_shape` is ignored. Default: True.
-      use_batched_nms: a boolean indicating whether NMS is applied in batch
-        using `tf.image.combined_non_max_suppression`. Currently only available
-        in CPU/GPU. Default: False.
-      apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
-        `rpn_scores` before applying NMS. Default: True.
-      is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
-        bottom) format.
-      rpn_object_scores: a predicted objectness score (e.g., centerness). In
-        OLN, we use object_scores=centerness as a replacement of the scores at
-        each level. A dict with keys representing FPN levels and values
-        representing logit tensors of shape [batch_size, feature_h, feature_w,
-        num_anchors].
-
-    Returns:
-      selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
-        representing the box coordinates of the selected proposals w.r.t. the
-        scaled image.
-      selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k,
-      1],representing the scores of the selected proposals.
-    """
-    with tf.name_scope('multilevel_propose_rois'):
-      rois = []
-      roi_scores = []
-      image_shape = tf.expand_dims(image_shape, axis=1)
-      for level in sorted(rpn_scores.keys()):
-        with tf.name_scope('level_%d' % level):
-          _, feature_h, feature_w, num_anchors_per_location = (
-              rpn_scores[level].get_shape().as_list())
-
-          num_boxes = feature_h * feature_w * num_anchors_per_location
-          this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
-          this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
-          this_level_anchors = tf.cast(
-              tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
-              dtype=this_level_scores.dtype)
-
-          if rpn_object_scores:
-            this_level_object_scores = rpn_object_scores[level]
-            this_level_object_scores = tf.reshape(this_level_object_scores,
-                                                  [-1, num_boxes])
-            this_level_object_scores = tf.cast(this_level_object_scores,
-                                               this_level_scores.dtype)
-            this_level_scores = this_level_object_scores
-
-          if apply_sigmoid_to_score:
-            this_level_scores = tf.sigmoid(this_level_scores)
-
-          if decode_boxes:
-            if is_box_lrtb:  # Box in left-right-top-bottom format.
-              this_level_boxes = box_utils.decode_boxes_lrtb(
-                  this_level_boxes, this_level_anchors)
-            else:  # Box in standard x-y-h-w format.
-              this_level_boxes = box_utils.decode_boxes(
-                  this_level_boxes, this_level_anchors)
-
-          if clip_boxes:
-            this_level_boxes = box_utils.clip_boxes(
-                this_level_boxes, image_shape)
-
-          if rpn_min_size_threshold > 0.0:
-            this_level_boxes, this_level_scores = box_utils.filter_boxes(
-                this_level_boxes, this_level_scores, image_shape,
-                rpn_min_size_threshold)
-
-          this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
-          this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
-          if rpn_nms_threshold > 0.0:
-            if use_batched_nms:
-              this_level_rois, this_level_roi_scores, _, _ = (
-                  tf.image.combined_non_max_suppression(
-                      tf.expand_dims(this_level_boxes, axis=2),
-                      tf.expand_dims(this_level_scores, axis=-1),
-                      max_output_size_per_class=this_level_pre_nms_top_k,
-                      max_total_size=this_level_post_nms_top_k,
-                      iou_threshold=rpn_nms_threshold,
-                      score_threshold=rpn_score_threshold,
-                      pad_per_class=False,
-                      clip_boxes=False))
-            else:
-              if rpn_score_threshold > 0.0:
-                this_level_boxes, this_level_scores = (
-                    box_utils.filter_boxes_by_scores(this_level_boxes,
-                                                     this_level_scores,
-                                                     rpn_score_threshold))
-              this_level_boxes, this_level_scores = box_utils.top_k_boxes(
-                  this_level_boxes, this_level_scores,
-                  k=this_level_pre_nms_top_k)
-              this_level_roi_scores, this_level_rois = (
-                  nms.sorted_non_max_suppression_padded(
-                      this_level_scores,
-                      this_level_boxes,
-                      max_output_size=this_level_post_nms_top_k,
-                      iou_threshold=rpn_nms_threshold))
-          else:
-            this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
-                this_level_rois, this_level_scores, k=this_level_post_nms_top_k)
-
-          rois.append(this_level_rois)
-          roi_scores.append(this_level_roi_scores)
-
-      all_rois = tf.concat(rois, axis=1)
-      all_roi_scores = tf.concat(roi_scores, axis=1)
-
-      with tf.name_scope('top_k_rois'):
-        _, num_valid_rois = all_roi_scores.get_shape().as_list()
-        overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
-
-        selected_rois, selected_roi_scores = box_utils.top_k_boxes(
-            all_rois, all_roi_scores, k=overall_top_k)
-
-      return selected_rois, selected_roi_scores
--- a/official/vision/detection/ops/spatial_transform_ops.py
+++ b/official/vision/detection/ops/spatial_transform_ops.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Functions to perform spatial transformations on Tensors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-_EPSILON = 1e-8
-
-
-def nearest_upsampling(data, scale):
-  """Nearest neighbor upsampling implementation.
-
-  Args:
-    data: A tensor with a shape of [batch, height_in, width_in, channels].
-    scale: An integer multiple to scale resolution of input data.
-
-  Returns:
-    data_up: A tensor with a shape of
-      [batch, height_in*scale, width_in*scale, channels]. Same dtype as input
-      data.
-  """
-  with tf.name_scope('nearest_upsampling'):
-    bs, _, _, c = data.get_shape().as_list()
-    shape = tf.shape(input=data)
-    h = shape[1]
-    w = shape[2]
-    bs = -1 if bs is None else bs
-    # Uses reshape to quickly upsample the input.  The nearest pixel is selected
-    # implicitly via broadcasting.
-    data = tf.reshape(data, [bs, h, 1, w, 1, c]) * tf.ones(
-        [1, 1, scale, 1, scale, 1], dtype=data.dtype)
-    return tf.reshape(data, [bs, h * scale, w * scale, c])
-
-
-def feature_bilinear_interpolation(features, kernel_y, kernel_x):
-  """Feature bilinear interpolation.
-
-  The RoIAlign feature f can be computed by bilinear interpolation
-  of four neighboring feature points f0, f1, f2, and f3.
-
-  f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
-                        [f10, f11]]
-  f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
-  f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
-  kernel_y = [hy, ly]
-  kernel_x = [hx, lx]
-
-  Args:
-    features: The features are in shape of [batch_size, num_boxes, output_size *
-      2, output_size * 2, num_filters].
-    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
-    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
-
-  Returns:
-    A 5-D tensor representing feature crop of shape
-    [batch_size, num_boxes, output_size, output_size, num_filters].
-
-  """
-  (batch_size, num_boxes, output_size, _,
-   num_filters) = features.get_shape().as_list()
-  output_size = output_size // 2
-  kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1])
-  kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2])
-  # Use implicit broadcast to generate the interpolation kernel. The
-  # multiplier `4` is for avg pooling.
-  interpolation_kernel = kernel_y * kernel_x * 4
-
-  # Interpolate the gathered features with computed interpolation kernels.
-  features *= tf.cast(
-      tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype)
-  features = tf.reshape(
-      features,
-      [batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
-  features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
-  features = tf.reshape(
-      features, [batch_size, num_boxes, output_size, output_size, num_filters])
-  return features
-
-
-def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
-  """Compute the grid position w.r.t.
-
-  the corresponding feature map.
-
-  Args:
-    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
-      information of each box w.r.t. the corresponding feature map.
-      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
-      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
-        in terms of the number of pixels of the corresponding feature map size.
-    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
-      the boundary (in (y, x)) of the corresponding feature map for each box.
-      Any resampled grid points that go beyond the bounary will be clipped.
-    output_size: a scalar indicating the output crop size.
-    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
-      from grid point.
-
-  Returns:
-    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
-    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
-    box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
-    box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
-  """
-  batch_size, num_boxes, _ = boxes.get_shape().as_list()
-  box_grid_x = []
-  box_grid_y = []
-  for i in range(output_size):
-    box_grid_x.append(boxes[:, :, 1] +
-                      (i + sample_offset) * boxes[:, :, 3] / output_size)
-    box_grid_y.append(boxes[:, :, 0] +
-                      (i + sample_offset) * boxes[:, :, 2] / output_size)
-  box_grid_x = tf.stack(box_grid_x, axis=2)
-  box_grid_y = tf.stack(box_grid_y, axis=2)
-
-  box_grid_y0 = tf.floor(box_grid_y)
-  box_grid_x0 = tf.floor(box_grid_x)
-  box_grid_x0 = tf.maximum(0., box_grid_x0)
-  box_grid_y0 = tf.maximum(0., box_grid_y0)
-
-  box_grid_x0 = tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1))
-  box_grid_x1 = tf.minimum(box_grid_x0 + 1,
-                           tf.expand_dims(boundaries[:, :, 1], -1))
-  box_grid_y0 = tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1))
-  box_grid_y1 = tf.minimum(box_grid_y0 + 1,
-                           tf.expand_dims(boundaries[:, :, 0], -1))
-
-  box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
-  box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)
-
-  # The RoIAlign feature f can be computed by bilinear interpolation of four
-  # neighboring feature points f0, f1, f2, and f3.
-  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
-  #                       [f10, f11]]
-  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
-  # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
-  ly = box_grid_y - box_grid_y0
-  lx = box_grid_x - box_grid_x0
-  hy = 1.0 - ly
-  hx = 1.0 - lx
-  kernel_y = tf.reshape(
-      tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1])
-  kernel_x = tf.reshape(
-      tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1])
-  return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
-
-
-def get_grid_one_hot(box_gridy0y1, box_gridx0x1, feature_height, feature_width):
-  """Get grid_one_hot from indices and feature_size."""
-  (batch_size, num_boxes, output_size, _) = box_gridx0x1.get_shape().as_list()
-  y_indices = tf.cast(
-      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]),
-      dtype=tf.int32)
-  x_indices = tf.cast(
-      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]),
-      dtype=tf.int32)
-
-  # shape is [batch_size, num_boxes, output_size, 2, height]
-  grid_y_one_hot = tf.one_hot(tf.cast(y_indices, tf.int32), feature_height)
-  # shape is [batch_size, num_boxes, output_size, 2, width]
-  grid_x_one_hot = tf.one_hot(tf.cast(x_indices, tf.int32), feature_width)
-
-  return grid_y_one_hot, grid_x_one_hot
-
-
-def selective_crop_and_resize(features,
-                              boxes,
-                              box_levels,
-                              boundaries,
-                              output_size=7,
-                              sample_offset=0.5,
-                              use_einsum_gather=False):
-  """Crop and resize boxes on a set of feature maps.
-
-  Given multiple features maps indexed by different levels, and a set of boxes
-  where each box is mapped to a certain level, it selectively crops and resizes
-  boxes from the corresponding feature maps to generate the box features.
-
-  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
-  figure 3 for reference). Specifically, for each feature map, we select an
-  (output_size, output_size) set of pixels corresponding to the box location,
-  and then use bilinear interpolation to select the feature value for each
-  pixel.
-
-  For performance, we perform the gather and interpolation on all layers as a
-  single operation. In this op the multi-level features are first stacked and
-  gathered into [2*output_size, 2*output_size] feature points. Then bilinear
-  interpolation is performed on the gathered feature points to generate
-  [output_size, output_size] RoIAlign feature map.
-
-  Here is the step-by-step algorithm:
-    1. The multi-level features are gathered into a
-       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
-       Tensor. The Tensor contains four neighboring feature points for each
-       vertice in the output grid.
-    2. Compute the interpolation kernel of shape
-       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
-       can be seen as stacking 2x2 interpolation kernels for all vertices in the
-       output grid.
-    3. Element-wise multiply the gathered features and interpolation kernel.
-       Then apply 2x2 average pooling to reduce spatial dimension to
-       output_size.
-
-  Args:
-    features: a 5-D tensor of shape [batch_size, num_levels, max_height,
-      max_width, num_filters] where cropping and resizing are based.
-    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
-      information of each box w.r.t. the corresponding feature map.
-      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
-      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
-        in terms of the number of pixels of the corresponding feature map size.
-    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
-      the 0-based corresponding feature level index of each box.
-    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
-      the boundary (in (y, x)) of the corresponding feature map for each box.
-      Any resampled grid points that go beyond the bounary will be clipped.
-    output_size: a scalar indicating the output crop size.
-    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
-      from grid point.
-    use_einsum_gather: use einsum to replace gather or not. Replacing einsum
-      with gather can improve performance when feature size is not large, einsum
-      is friendly with model partition as well. Gather's performance is better
-      when feature size is very large and there are multiple box levels.
-
-  Returns:
-    features_per_box: a 5-D tensor of shape
-      [batch_size, num_boxes, output_size, output_size, num_filters]
-      representing the cropped features.
-  """
-  (batch_size, num_levels, max_feature_height, max_feature_width,
-   num_filters) = features.get_shape().as_list()
-  _, num_boxes, _ = boxes.get_shape().as_list()
-
-  kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
-      boxes, boundaries, output_size, sample_offset)
-  x_indices = tf.cast(
-      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
-      dtype=tf.int32)
-  y_indices = tf.cast(
-      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
-      dtype=tf.int32)
-
-  if use_einsum_gather:
-    # Blinear interpolation is done during the last two gathers:
-    #        f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
-    #                              [f10, f11]]
-    #        [[f00, f01],
-    #         [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
-    #       where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.
-
-    # shape is [batch_size, boxes, output_size, 2, 1]
-    grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(box_gridy0y1,
-                                                      box_gridx0x1,
-                                                      max_feature_height,
-                                                      max_feature_width)
-
-    # shape is [batch_size, num_boxes, output_size, height]
-    grid_y_weight = tf.reduce_sum(
-        tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
-    # shape is [batch_size, num_boxes, output_size, width]
-    grid_x_weight = tf.reduce_sum(
-        tf.multiply(grid_x_one_hot, kernel_x), axis=-2)
-
-    # Gather for y_axis.
-    # shape is [batch_size, num_boxes, output_size, width, features]
-    features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
-                                 tf.cast(grid_y_weight, features.dtype))
-    # Gather for x_axis.
-    # shape is [batch_size, num_boxes, output_size, output_size, features]
-    features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
-                                 tf.cast(grid_x_weight, features.dtype))
-  else:
-    height_dim_offset = max_feature_width
-    level_dim_offset = max_feature_height * height_dim_offset
-    batch_dim_offset = num_levels * level_dim_offset
-
-    batch_size_offset = tf.tile(
-        tf.reshape(
-            tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
-        [1, num_boxes, output_size * 2, output_size * 2])
-    box_levels_offset = tf.tile(
-        tf.reshape(box_levels * level_dim_offset,
-                   [batch_size, num_boxes, 1, 1]),
-        [1, 1, output_size * 2, output_size * 2])
-    y_indices_offset = tf.tile(
-        tf.reshape(y_indices * height_dim_offset,
-                   [batch_size, num_boxes, output_size * 2, 1]),
-        [1, 1, 1, output_size * 2])
-    x_indices_offset = tf.tile(
-        tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
-        [1, 1, output_size * 2, 1])
-
-    indices = tf.reshape(
-        batch_size_offset + box_levels_offset + y_indices_offset +
-        x_indices_offset, [-1])
-
-    features = tf.reshape(features, [-1, num_filters])
-    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
-    # performance.
-    features_per_box = tf.reshape(
-        tf.gather(features, indices),
-        [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
-    features_per_box = feature_bilinear_interpolation(features_per_box,
-                                                      kernel_y, kernel_x)
-
-  return features_per_box
-
-
-def multilevel_crop_and_resize(features, boxes, output_size=7):
-  """Crop and resize on multilevel feature pyramid.
-
-  Generate the (output_size, output_size) set of pixels for each input box
-  by first locating the box into the correct feature level, and then cropping
-  and resizing it using the corresponding feature map of that level.
-
-  Args:
-    features: A dictionary with key as pyramid level and value as features. The
-      features are in shape of [batch_size, height_l, width_l, num_filters].
-      Every integer level between the smallest and the largest key is looked
-      up, and shapes (including batch_size) are read statically via
-      `get_shape()`.
-    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
-      a box with [y1, x1, y2, x2] in un-normalized coordinates.
-    output_size: A scalar to indicate the output crop size.
-
-  Returns:
-    A 5-D tensor representing feature crop of shape
-    [batch_size, num_boxes, output_size, output_size, num_filters].
-  """
-
-  with tf.name_scope('multilevel_crop_and_resize'):
-    pyramid_levels = list(features.keys())
-    min_level = min(pyramid_levels)
-    max_level = max(pyramid_levels)
-    batch_size, max_feature_height, max_feature_width, num_filters = (
-        features[min_level].get_shape().as_list())
-    _, num_boxes, _ = boxes.get_shape().as_list()
-
-    # Flatten each level to [batch_size, height_l * width_l, num_filters] and
-    # remember its spatial size.
-    features_all = []
-    feature_heights = []
-    feature_widths = []
-    for level in range(min_level, max_level + 1):
-      shape = features[level].get_shape().as_list()
-      feature_heights.append(shape[1])
-      feature_widths.append(shape[2])
-      features_all.append(
-          tf.reshape(features[level], [batch_size, -1, num_filters]))
-    # Concatenate all levels along the flattened spatial axis and collapse the
-    # batch axis too, giving one [rows, num_filters] lookup table. This is done
-    # once, after the loop, so no intermediate tables are built per level.
-    features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])
-
-    # Calculate height_l * width_l for each level.
-    level_dim_sizes = [
-        width * height
-        for width, height in zip(feature_widths, feature_heights)
-    ]
-    # level_dim_offsets is the running sum of level_dim_sizes, i.e. the row at
-    # which each level starts inside one batch element of the lookup table.
-    level_dim_offsets = [0]
-    for level_dim_size in level_dim_sizes[:-1]:
-      level_dim_offsets.append(level_dim_offsets[-1] + level_dim_size)
-    # Number of lookup-table rows occupied by a single batch element.
-    batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
-    level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
-    # Moving one row down in a level skips `width` entries of the flat table.
-    height_dim_sizes = tf.constant(feature_widths, tf.int32)
-
-    # Assigns boxes to the right level:
-    # level = floor(log2(sqrt(area) / 224)) + 4.
-    box_width = boxes[:, :, 3] - boxes[:, :, 1]
-    box_height = boxes[:, :, 2] - boxes[:, :, 0]
-    areas_sqrt = tf.sqrt(box_height * box_width)
-    levels = tf.cast(
-        tf.math.floordiv(
-            tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) + 4.0,
-        dtype=tf.int32)
-    # Maps levels between [min_level, max_level].
-    levels = tf.minimum(max_level, tf.maximum(levels, min_level))
-
-    # Projects box location and sizes to corresponding feature levels.
-    scale_to_level = tf.cast(
-        tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
-        dtype=boxes.dtype)
-    boxes /= tf.expand_dims(scale_to_level, axis=2)
-    box_width /= scale_to_level
-    box_height /= scale_to_level
-    # From here on each box is [y1, x1, height, width] in level coordinates.
-    boxes = tf.concat([
-        boxes[:, :, 0:2],
-        tf.expand_dims(box_height, -1),
-        tf.expand_dims(box_width, -1)
-    ],
-                      axis=-1)
-
-    # Maps levels to [0, max_level-min_level].
-    levels -= min_level
-    level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
-    # Largest valid (y, x) index on each box's level, derived from the
-    # min_level feature size divided by the relative stride.
-    boundary = tf.cast(
-        tf.concat([
-            tf.expand_dims(
-                [[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
-                axis=-1),
-            tf.expand_dims(
-                [[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
-                axis=-1),
-        ],
-                  axis=-1), boxes.dtype)
-
-    # Compute grid positions.
-    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
-        boxes, boundary, output_size, sample_offset=0.5)
-
-    # output_size * 2 integer coordinates per box and axis, as returned by
-    # compute_grid_positions.
-    x_indices = tf.cast(
-        tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
-        dtype=tf.int32)
-    y_indices = tf.cast(
-        tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
-        dtype=tf.int32)
-
-    # Flat row index = batch offset + level offset + y * width + x.
-    batch_size_offset = tf.tile(
-        tf.reshape(
-            tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
-        [1, num_boxes, output_size * 2, output_size * 2])
-    # Get level offset for each box. Each box belongs to one level.
-    levels_offset = tf.tile(
-        tf.reshape(
-            tf.gather(level_dim_offsets, levels),
-            [batch_size, num_boxes, 1, 1]),
-        [1, 1, output_size * 2, output_size * 2])
-    y_indices_offset = tf.tile(
-        tf.reshape(
-            y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
-            [batch_size, num_boxes, output_size * 2, 1]),
-        [1, 1, 1, output_size * 2])
-    x_indices_offset = tf.tile(
-        tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
-        [1, 1, output_size * 2, 1])
-    indices = tf.reshape(
-        batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
-        [-1])
-
-    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
-    # performance.
-    features_per_box = tf.reshape(
-        tf.gather(features_r2, indices),
-        [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
-
-    # Bilinear interpolation.
-    features_per_box = feature_bilinear_interpolation(features_per_box,
-                                                      kernel_y, kernel_x)
-    return features_per_box
-
-
-def single_level_feature_crop(features, level_boxes, detection_prior_levels,
-                              min_mask_level, mask_crop_size):
-  """Gathers one square feature window per detection from its FPN level.
-
-  The window of each instance starts at the (y, x) stored in the first two
-  entries of its level box (cast to int32) and spans `mask_crop_size` pixels
-  along both axes. Flat indexing uses `max_feature_size` as the row stride, so
-  both spatial dimensions of `features` are treated as `max_feature_size`.
-
-  Args:
-    features: a float tensor of shape [batch_size, num_levels, max_feature_size,
-      max_feature_size, num_downsample_channels].
-    level_boxes: a float Tensor of the level boxes to crop from. [batch_size,
-      num_instances, 4].
-    detection_prior_levels: an int Tensor of instance assigned level of shape
-      [batch_size, num_instances].
-    min_mask_level: minimum FPN level to crop mask feature from.
-    mask_crop_size: an int of mask crop size.
-
-  Returns:
-    crop_features: a float Tensor of shape [batch_size * num_instances,
-        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
-        instance feature crop.
-  """
-  (batch_size, num_levels, max_feature_size, _,
-   num_downsample_channels) = features.get_shape().as_list()
-  _, num_of_instances, _ = level_boxes.get_shape().as_list()
-  level_boxes = tf.cast(level_boxes, tf.int32)
-  assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]
-
-  # Expand each start coordinate into the full run of window coordinates.
-  x_start = level_boxes[:, :, 1]
-  y_start = level_boxes[:, :, 0]
-  window = range(mask_crop_size)
-  x_indices = tf.stack([x_start + step for step in window], axis=2)
-  y_indices = tf.stack([y_start + step for step in window], axis=2)
-
-  # Strides of the flattened [batch, level, y, x] layout.
-  row_stride = max_feature_size
-  level_stride = max_feature_size * row_stride
-  batch_stride = num_levels * level_stride
-  relative_levels = detection_prior_levels - min_mask_level
-
-  # TODO(weicheng) change this to gather_nd for better readability.
-  batch_term = tf.tile(
-      tf.reshape(tf.range(batch_size) * batch_stride, [batch_size, 1, 1, 1]),
-      [1, num_of_instances, mask_crop_size, mask_crop_size])
-  level_term = tf.tile(
-      tf.reshape(relative_levels * level_stride,
-                 [batch_size, num_of_instances, 1, 1]),
-      [1, 1, mask_crop_size, mask_crop_size])
-  row_term = tf.tile(
-      tf.reshape(y_indices * row_stride,
-                 [batch_size, num_of_instances, mask_crop_size, 1]),
-      [1, 1, 1, mask_crop_size])
-  column_term = tf.tile(
-      tf.reshape(x_indices,
-                 [batch_size, num_of_instances, 1, mask_crop_size]),
-      [1, 1, mask_crop_size, 1])
-  indices = tf.reshape(batch_term + level_term + row_term + column_term, [-1])
-
-  # One row per (batch, level, y, x) position.
-  flat_features = tf.reshape(features, [-1, num_downsample_channels])
-  crop_shape = [
-      batch_size * num_of_instances, mask_crop_size, mask_crop_size,
-      num_downsample_channels
-  ]
-  return tf.reshape(tf.gather(flat_features, indices), crop_shape)
-
-
-def crop_mask_in_target_box(masks,
-                            boxes,
-                            target_boxes,
-                            output_size,
-                            sample_offset=0,
-                            use_einsum=True):
-  """Crop masks in target boxes.
-
-  Each mask is zero-padded by 2 pixels on every side, the target box is
-  re-expressed in the pixel coordinates of that padded mask, and the actual
-  sampling is delegated to `selective_crop_and_resize`.
-
-  Args:
-    masks: A tensor with a shape of [batch_size, num_masks, height, width].
-    boxes: a float tensor representing box coordinates that tightly enclose
-      masks with a shape of [batch_size, num_masks, 4] in un-normalized
-      coordinates. A box is represented by [ymin, xmin, ymax, xmax].
-    target_boxes: a float tensor representing target box coordinates for masks
-      with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A
-      box is represented by [ymin, xmin, ymax, xmax].
-    output_size: A scalar to indicate the output crop size. It currently only
-      supports square outputs.
-    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
-      from grid point.
-    use_einsum: Use einsum to replace gather in selective_crop_and_resize.
-
-  Returns:
-    A 4-D tensor representing feature crop of shape
-    [batch_size, num_masks, output_size, output_size].
-  """
-  with tf.name_scope('crop_mask_in_target_box'):
-    batch_size, num_masks, height, width = masks.get_shape().as_list()
-    masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
-    # Pad zeros on the boundary of masks.
-    masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
-    masks = tf.reshape(masks, [batch_size, num_masks, height + 4, width + 4, 1])
-
-    # Projects target box locations and sizes to corresponding cropped
-    # mask coordinates.
-    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
-        value=boxes, num_or_size_splits=4, axis=2)
-    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
-        value=target_boxes, num_or_size_splits=4, axis=2)
-    gt_height = gt_y_max - gt_y_min + _EPSILON
-    gt_width = gt_x_max - gt_x_min + _EPSILON
-    # Vertical quantities scale with the mask height and horizontal ones with
-    # the mask width, so non-square masks are projected correctly. The "+ 2"
-    # accounts for the zero padding added above.
-    y_transform = (bb_y_min - gt_y_min) * height / gt_height + 2
-    x_transform = (bb_x_min - gt_x_min) * width / gt_width + 2
-    h_transform = (bb_y_max - bb_y_min) * height / gt_height
-    w_transform = (bb_x_max - bb_x_min) * width / gt_width
-
-    # Largest valid (y, x) index inside the padded mask.
-    boundaries = tf.concat([
-        tf.cast(
-            tf.ones_like(y_transform) * ((height + 4) - 1), dtype=tf.float32),
-        tf.cast(
-            tf.ones_like(x_transform) * ((width + 4) - 1), dtype=tf.float32)
-    ],
-                           axis=-1)
-
-    # Reshape tensors to have the right shape for selective_crop_and_resize.
-    transformed_boxes = tf.concat(
-        [y_transform, x_transform, h_transform, w_transform], -1)
-    # Mask i is addressed as "level" i of its batch element.
-    levels = tf.tile(
-        tf.reshape(tf.range(num_masks), [1, num_masks]), [batch_size, 1])
-
-    cropped_masks = selective_crop_and_resize(
-        masks,
-        transformed_boxes,
-        levels,
-        boundaries,
-        output_size,
-        sample_offset=sample_offset,
-        use_einsum_gather=use_einsum)
-    # Drop the singleton channel axis added for the crop.
-    cropped_masks = tf.squeeze(cropped_masks, axis=-1)
-
-  return cropped_masks
--- a/official/vision/detection/ops/target_ops.py
+++ b/official/vision/detection/ops/target_ops.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Target and sampling related ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.vision.detection.ops import spatial_transform_ops
-from official.vision.detection.utils import box_utils
-from official.vision.utils.object_detection import balanced_positive_negative_sampler
-
-
-def box_matching(boxes, gt_boxes, gt_classes):
-  """Pairs every box with the groundtruth box it overlaps most.
-
-  The IoU between all boxes and all groundtruth boxes is computed with
-  `box_utils.bbox_overlap`; each box is matched to the groundtruth with the
-  largest IoU. Boxes whose best IoU is less than or equal to 0 are treated as
-  background.
-
-  Args:
-    boxes: a tensor of shape of [batch_size, N, 4] representing the box
-      coordinates to be matched to groundtruth boxes.
-    gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
-      the groundtruth box coordinates. It is padded with -1s to indicate the
-      invalid boxes.
-    gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
-      classes. It is padded with -1s to indicate the invalid classes.
-
-  Returns:
-    matched_gt_boxes: a tensor of shape of [batch_size, N, 4], the matched
-      groundtruth box coordinates for each input box; all 0s for background
-      boxes.
-    matched_gt_classes: a tensor of shape of [batch_size, N], the matched
-      groundtruth class for each input box; 0 (the background class) for
-      background boxes.
-    matched_gt_indices: a tensor of shape of [batch_size, N], the index of the
-      matched groundtruth box in `gt_boxes`; -1 for background boxes.
-    matched_iou: a tensor of shape of [batch_size, N], the maximum IoU of each
-      box over all the groundtruth boxes.
-    iou: a tensor of shape of [batch_size, N, K], the IoU matrix between boxes
-      and the groundtruth boxes. Per the contract of
-      `box_utils.bbox_overlap` (not visible here), the IoU against an invalid
-      groundtruth box with coordinates [-1, -1, -1, -1] is expected to be -1.
-  """
-  # Pairwise overlaps, shape [batch_size, N, K].
-  iou = box_utils.bbox_overlap(boxes, gt_boxes)
-
-  # Best overlap per box, shape [batch_size, N]. A value of 0.0 means no
-  # overlap with any groundtruth, -1.0 means only invalid groundtruth.
-  matched_iou = tf.reduce_max(iou, axis=-1)
-  is_background = tf.less_equal(matched_iou, 0.0)
-
-  best_gt = tf.argmax(iou, axis=-1, output_type=tf.int32)
-
-  # Pair every groundtruth index with its batch index for gather_nd.
-  best_gt_shape = tf.shape(best_gt)
-  batch_ids = (
-      tf.expand_dims(tf.range(best_gt_shape[0]), axis=-1) *
-      tf.ones([1, best_gt_shape[-1]], dtype=tf.int32))
-  batch_and_gt = tf.stack([batch_ids, best_gt], axis=-1)
-
-  gathered_boxes = tf.gather_nd(gt_boxes, batch_and_gt)
-  box_background_mask = tf.tile(
-      tf.expand_dims(is_background, axis=-1), [1, 1, 4])
-  matched_gt_boxes = tf.where(
-      box_background_mask,
-      tf.zeros_like(gathered_boxes, dtype=gathered_boxes.dtype),
-      gathered_boxes)
-
-  gathered_classes = tf.gather_nd(gt_classes, batch_and_gt)
-  matched_gt_classes = tf.where(
-      is_background, tf.zeros_like(gathered_classes), gathered_classes)
-
-  matched_gt_indices = tf.where(
-      is_background, -tf.ones_like(best_gt), best_gt)
-
-  return (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
-          iou)
-
-
-def assign_and_sample_proposals(proposed_boxes,
-                                gt_boxes,
-                                gt_classes,
-                                num_samples_per_image=512,
-                                mix_gt_boxes=True,
-                                fg_fraction=0.25,
-                                fg_iou_thresh=0.5,
-                                bg_iou_thresh_hi=0.5,
-                                bg_iou_thresh_lo=0.0):
-  """Labels proposals with groundtruth and subsamples a fixed number of RoIs.
-
-  Steps performed:
-    1. Optionally appends `gt_boxes` to `proposed_boxes`.
-    2. Matches every box to the groundtruth with the largest IoU via
-       `box_matching`.
-    3. Marks boxes as foreground / background / ignored from their best IoU.
-    4. Draws `num_samples_per_image` boxes per image with a balanced
-       positive/negative sampler and gathers their targets.
-
-  Args:
-    proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number of
-      proposals before groundtruth assignment. The last dimension is the box
-      coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
-    gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
-      coordinates of gt_boxes are in the pixel coordinates of the scaled image.
-      This tensor might have padding of values -1 indicating the invalid box
-      coordinates.
-    gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
-      tensor might have paddings with values of -1 indicating the invalid
-      classes.
-    num_samples_per_image: an integer, the RoI minibatch size per image.
-    mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes into
-      the proposals before sampling.
-    fg_fraction: a float, the target fraction of the RoI minibatch that is
-      labeled foreground (i.e., class > 0).
-    fg_iou_thresh: a float; an RoI is foreground when its best IoU is strictly
-      greater than this value.
-    bg_iou_thresh_hi: a float; an RoI is background (class = 0) when its best
-      IoU lies in [bg_iou_thresh_lo, bg_iou_thresh_hi).
-    bg_iou_thresh_lo: a float, the inclusive lower end of the background IoU
-      interval.
-
-  Returns:
-    sampled_rois: a tensor of shape of [batch_size, K, 4], the coordinates of
-      the sampled RoIs, where K = num_samples_per_image.
-    sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], the matched
-      groundtruth box of each sampled RoI.
-    sampled_gt_classes: a tensor of shape of [batch_size, K], the matched
-      groundtruth class of each sampled RoI.
-    sampled_gt_indices: a tensor of shape of [batch_size, K], the index of the
-      matched groundtruth box in the original `gt_boxes` tensor.
-  """
-
-  with tf.name_scope('sample_proposals'):
-    boxes = (
-        tf.concat([proposed_boxes, gt_boxes], axis=1)
-        if mix_gt_boxes else proposed_boxes)
-
-    (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
-     _) = box_matching(boxes, gt_boxes, gt_classes)
-
-    is_foreground = tf.greater(matched_iou, fg_iou_thresh)
-    is_background = tf.logical_and(
-        tf.greater_equal(matched_iou, bg_iou_thresh_lo),
-        tf.less(matched_iou, bg_iou_thresh_hi))
-    is_ignored = tf.less(matched_iou, 0.0)
-
-    # Background boxes get class 0 and groundtruth index 0.
-    matched_gt_classes = tf.where(
-        is_background, tf.zeros_like(matched_gt_classes), matched_gt_classes)
-    matched_gt_indices = tf.where(
-        is_background, tf.zeros_like(matched_gt_indices), matched_gt_indices)
-
-    # Only labeled, non-ignored boxes may be drawn by the sampler.
-    sample_candidates = tf.logical_and(
-        tf.logical_or(is_foreground, is_background),
-        tf.logical_not(is_ignored))
-
-    sampler = (
-        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
-            positive_fraction=fg_fraction, is_static=True))
-
-    # The sampler works on one image at a time.
-    batch_size, _ = sample_candidates.get_shape().as_list()
-    sampled_indicators = tf.stack([
-        sampler.subsample(sample_candidates[image_index],
-                          num_samples_per_image, is_foreground[image_index])
-        for image_index in range(batch_size)
-    ])
-    # top_k over the 0/1 indicators yields the positions of the chosen boxes.
-    _, sampled_indices = tf.nn.top_k(
-        tf.cast(sampled_indicators, dtype=tf.int32),
-        k=num_samples_per_image,
-        sorted=True)
-
-    # Pair every sampled position with its batch index for gather_nd.
-    sampled_shape = tf.shape(sampled_indices)
-    batch_ids = (
-        tf.expand_dims(tf.range(sampled_shape[0]), axis=-1) *
-        tf.ones([1, sampled_shape[-1]], dtype=tf.int32))
-    batch_and_box = tf.stack([batch_ids, sampled_indices], axis=-1)
-
-    sampled_rois = tf.gather_nd(boxes, batch_and_box)
-    sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, batch_and_box)
-    sampled_gt_classes = tf.gather_nd(matched_gt_classes, batch_and_box)
-    sampled_gt_indices = tf.gather_nd(matched_gt_indices, batch_and_box)
-
-    return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
-            sampled_gt_indices)
-
-
-def sample_and_crop_foreground_masks(candidate_rois,
-                                     candidate_gt_boxes,
-                                     candidate_gt_classes,
-                                     candidate_gt_indices,
-                                     gt_masks,
-                                     num_mask_samples_per_image=128,
-                                     mask_target_size=28):
-  """Picks foreground RoIs and builds their cropped mask training targets.
-
-  RoIs whose groundtruth class is greater than 0 are ranked first by `top_k`;
-  the first `num_mask_samples_per_image` entries per image are kept, their
-  groundtruth masks are looked up through `candidate_gt_indices`, and each
-  mask is cropped to its RoI with
-  `spatial_transform_ops.crop_mask_in_target_box`.
-
-  Args:
-    candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
-      number of candidate RoIs to be considered for mask sampling. It includes
-      both positive and negative RoIs.
-    candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
-      corresponding groundtruth boxes to the `candidate_rois`.
-    candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
-      corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
-      corresponds to the background class, i.e. negative RoIs.
-    candidate_gt_indices: a tensor of shape [batch_size, N], storing the
-      groundtruth instance index of each entry of `candidate_gt_boxes`, i.e.
-      gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i], where
-      gt_boxes has shape [batch_size, MAX_INSTANCES, 4].
-    gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
-      containing all the groundtruth masks which sample masks are drawn from.
-    num_mask_samples_per_image: an integer which specifies the number of masks
-      to sample.
-    mask_target_size: an integer which specifies the final cropped mask size
-      after sampling. The output masks are resized w.r.t the sampled RoIs.
-
-  Returns:
-    foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
-      that corresponds to the sampled foreground masks, where
-      K = num_mask_samples_per_image.
-    foreground_classes: a tensor of shape of [batch_size, K] storing the classes
-      corresponding to the sampled foreground masks.
-    cropped_foreground_masks: a tensor of shape of
-      [batch_size, K, mask_target_size, mask_target_size] storing the cropped
-      foreground masks used for training.
-  """
-
-  def _pair_with_batch_index(indices):
-    """Turns [batch_size, K] indices into [batch_size, K, 2] for gather_nd."""
-    indices_shape = tf.shape(indices)
-    batch_ids = (
-        tf.expand_dims(tf.range(indices_shape[0]), axis=-1) *
-        tf.ones([1, indices_shape[-1]], dtype=tf.int32))
-    return tf.stack([batch_ids, indices], axis=-1)
-
-  with tf.name_scope('sample_and_crop_foreground_masks'):
-    # Foreground entries have indicator 1 and therefore come first in top_k.
-    is_foreground = tf.cast(
-        tf.greater(candidate_gt_classes, 0), dtype=tf.int32)
-    _, fg_instance_indices = tf.nn.top_k(
-        is_foreground, k=num_mask_samples_per_image)
-
-    roi_lookup = _pair_with_batch_index(fg_instance_indices)
-    foreground_rois = tf.gather_nd(candidate_rois, roi_lookup)
-    foreground_boxes = tf.gather_nd(candidate_gt_boxes, roi_lookup)
-    foreground_classes = tf.gather_nd(candidate_gt_classes, roi_lookup)
-    foreground_gt_indices = tf.gather_nd(candidate_gt_indices, roi_lookup)
-
-    # Fetch the groundtruth mask of every selected RoI.
-    mask_lookup = _pair_with_batch_index(foreground_gt_indices)
-    foreground_masks = tf.gather_nd(gt_masks, mask_lookup)
-
-    cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
-        foreground_masks,
-        foreground_boxes,
-        foreground_rois,
-        mask_target_size,
-        sample_offset=0.5)
-
-    return foreground_rois, foreground_classes, cropped_foreground_masks
-
-
-class ROISampler(tf.keras.layers.Layer):
-  """Samples RoIs and creates training targets.
-
-  Keras layer wrapper around `assign_and_sample_proposals` that stores the
-  sampling hyper-parameters taken from `params`.
-  """
-
-  def __init__(self, params):
-    """Stores the sampling configuration.
-
-    Args:
-      params: an object exposing the attributes `num_samples_per_image`,
-        `fg_fraction`, `fg_iou_thresh`, `bg_iou_thresh_hi`, `bg_iou_thresh_lo`
-        and `mix_gt_boxes`.
-    """
-    self._num_samples_per_image = params.num_samples_per_image
-    self._fg_fraction = params.fg_fraction
-    self._fg_iou_thresh = params.fg_iou_thresh
-    self._bg_iou_thresh_hi = params.bg_iou_thresh_hi
-    self._bg_iou_thresh_lo = params.bg_iou_thresh_lo
-    self._mix_gt_boxes = params.mix_gt_boxes
-    super(ROISampler, self).__init__(autocast=False)
-
-  def call(self, rois, gt_boxes, gt_classes):
-    """Sample and assign RoIs for training.
-
-    Args:
-      rois: a tensor of shape of [batch_size, N, 4]. N is the number of
-        proposals before groundtruth assignment. The last dimension is the box
-        coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
-      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
-        coordinates of gt_boxes are in the pixel coordinates of the scaled
-        image. This tensor might have padding of values -1 indicating the
-        invalid box coordinates.
-      gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
-        tensor might have paddings with values of -1 indicating the invalid
-        classes.
-
-    Returns:
-      sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
-        coordinates of the sampled RoIs, where K is the number of the sampled
-        RoIs, i.e. K = num_samples_per_image.
-      sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
-        box coordinates of the matched groundtruth boxes of the samples RoIs.
-      sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
-        classes of the matched groundtruth boxes of the sampled RoIs.
-      sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
-        indices of the matched groundtruth boxes in the original `gt_boxes`
-        tensor.
-    """
-    # All work is delegated to the module-level function; this layer only
-    # supplies the stored configuration.
-    sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
-        assign_and_sample_proposals(
-            rois,
-            gt_boxes,
-            gt_classes,
-            num_samples_per_image=self._num_samples_per_image,
-            mix_gt_boxes=self._mix_gt_boxes,
-            fg_fraction=self._fg_fraction,
-            fg_iou_thresh=self._fg_iou_thresh,
-            bg_iou_thresh_hi=self._bg_iou_thresh_hi,
-            bg_iou_thresh_lo=self._bg_iou_thresh_lo))
-    return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
-            sampled_gt_indices)
-
-
-class ROIScoreSampler(ROISampler):
-  """Samples RoIs, RoI-scores and creates training targets."""
-
-  def __call__(self, rois, roi_scores, gt_boxes, gt_classes):
-    """Sample and assign RoIs together with their scores for training.
-
-    Args:
-      rois: a tensor of shape of [batch_size, N, 4]. N is the number of
-        proposals before groundtruth assignment. The last dimension is the box
-        coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
-      roi_scores: a tensor of shape of [batch_size, N] holding one score per
-        proposal; it is forwarded as `proposed_scores` to
-        `assign_and_sample_proposals_and_scores`.
-      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
-        coordinates of gt_boxes are in the pixel coordinates of the scaled
-        image. This tensor might have padding of values -1 indicating the
-        invalid box coordinates.
-      gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
-        tensor might have paddings with values of -1 indicating the invalid
-        classes.
-
-    Returns:
-      sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
-        coordinates of the sampled RoIs, where K is the number of the sampled
-        RoIs, i.e. K = num_samples_per_image.
-      sampled_roi_scores: a tensor of shape of [batch_size, K], the scores of
-        the sampled RoIs.
-      sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
-        box coordinates of the matched groundtruth boxes of the samples RoIs.
-      sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
-        classes of the matched groundtruth boxes of the sampled RoIs.
-      sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
-        indices of the matched groundtruth boxes in the original `gt_boxes`
-        tensor.
-    """
-    # Delegate to the score-aware sampler using the configuration stored on
-    # the instance.
-    (sampled_rois, sampled_roi_scores, sampled_gt_boxes, sampled_gt_classes,
-     sampled_gt_indices) = (
-         self.assign_and_sample_proposals_and_scores(
-             rois,
-             roi_scores,
-             gt_boxes,
-             gt_classes,
-             num_samples_per_image=self._num_samples_per_image,
-             mix_gt_boxes=self._mix_gt_boxes,
-             fg_fraction=self._fg_fraction,
-             fg_iou_thresh=self._fg_iou_thresh,
-             bg_iou_thresh_hi=self._bg_iou_thresh_hi,
-             bg_iou_thresh_lo=self._bg_iou_thresh_lo))
-    return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
-            sampled_gt_classes, sampled_gt_indices)
-
-  def assign_and_sample_proposals_and_scores(self,
-                                             proposed_boxes,
-                                             proposed_scores,
-                                             gt_boxes,
-                                             gt_classes,
-                                             num_samples_per_image=512,
-                                             mix_gt_boxes=True,
-                                             fg_fraction=0.25,
-                                             fg_iou_thresh=0.5,
-                                             bg_iou_thresh_hi=0.5,
-                                             bg_iou_thresh_lo=0.0):
-    """Assigns the proposals with groundtruth classes and performs subsmpling.
-
-    Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
-    following algorithm to generate the final `num_samples_per_image` RoIs.
-      1. Calculates the IoU between each proposal box and each gt_boxes.
-      2. Assigns each proposed box with a groundtruth class and box by choosing
-         the largest IoU overlap.
-      3. Samples `num_samples_per_image` boxes from all proposed boxes, and
-         returns box_targets, class_targets, and RoIs.
-
-    Args:
-      proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
-        of proposals before groundtruth assignment. The last dimension is the
-        box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
-        format.
-      proposed_scores: a tensor of shape of [batch_size, N]. N is the number of
-        proposals before groundtruth assignment. It is the rpn scores for all
-        proposed boxes which can be either their classification or centerness
-        scores.
-      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
-        coordinates of gt_boxes are in the pixel coordinates of the scaled
-        image. This tensor might have padding of values -1 indicating the
-        invalid box coordinates.
-      gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
-        tensor might have paddings with values of -1 indicating the invalid
-        classes.
-      num_samples_per_image: a integer represents RoI minibatch size per image.
-      mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
-      before sampling proposals.
-      fg_fraction: a float represents the target fraction of RoI minibatch that
-        is labeled foreground (i.e., class > 0).
-      fg_iou_thresh: a float represents the IoU overlap threshold for an RoI to
-        be considered foreground (if >= fg_iou_thresh).
-      bg_iou_thresh_hi: a float represents the IoU overlap threshold for an RoI
-        to be considered background (class = 0 if overlap in [LO, HI)).
-      bg_iou_thresh_lo: a float represents the IoU overlap threshold for an RoI
-        to be considered background (class = 0 if overlap in [LO, HI)).
-
-    Returns:
-      sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
-        coordinates of the sampled RoIs, where K is the number of the sampled
-        RoIs, i.e. K = num_samples_per_image.
-      sampled_scores: a tensor of shape of [batch_size, K], representing the
-        confidence score of the sampled RoIs, where K is the number of the
-        sampled RoIs, i.e. K = num_samples_per_image.
-      sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
-        box coordinates of the matched groundtruth boxes of the samples RoIs.
-      sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
-        classes of the matched groundtruth boxes of the sampled RoIs.
-      sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
-        indices of the sampled groundtruth boxes in the original `gt_boxes`
-        tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] =
-        sampled_gt_boxes[:, i].
-    """
-
-    with tf.name_scope('sample_proposals_and_scores'):
-      if mix_gt_boxes:
-        boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
-        gt_scores = tf.ones_like(gt_boxes[:, :, 0])
-        scores = tf.concat([proposed_scores, gt_scores], axis=1)
-      else:
-        boxes = proposed_boxes
-        scores = proposed_scores
-
-      (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
-       _) = box_matching(boxes, gt_boxes, gt_classes)
-
-      positive_match = tf.greater(matched_iou, fg_iou_thresh)
-      negative_match = tf.logical_and(
-          tf.greater_equal(matched_iou, bg_iou_thresh_lo),
-          tf.less(matched_iou, bg_iou_thresh_hi))
-      ignored_match = tf.less(matched_iou, 0.0)
-
-      # re-assign negatively matched boxes to the background class.
-      matched_gt_classes = tf.where(negative_match,
-                                    tf.zeros_like(matched_gt_classes),
-                                    matched_gt_classes)
-      matched_gt_indices = tf.where(negative_match,
-                                    tf.zeros_like(matched_gt_indices),
-                                    matched_gt_indices)
-
-      sample_candidates = tf.logical_and(
-          tf.logical_or(positive_match, negative_match),
-          tf.logical_not(ignored_match))
-
-      sampler = (
-          balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
-              positive_fraction=fg_fraction, is_static=True))
-
-      batch_size, _ = sample_candidates.get_shape().as_list()
-      sampled_indicators = []
-      for i in range(batch_size):
-        sampled_indicator = sampler.subsample(sample_candidates[i],
-                                              num_samples_per_image,
-                                              positive_match[i])
-        sampled_indicators.append(sampled_indicator)
-      sampled_indicators = tf.stack(sampled_indicators)
-      _, sampled_indices = tf.nn.top_k(
-          tf.cast(sampled_indicators, dtype=tf.int32),
-          k=num_samples_per_image,
-          sorted=True)
-
-      sampled_indices_shape = tf.shape(sampled_indices)
-      batch_indices = (
-          tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
-          tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
-      gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
-
-      sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
-      sampled_roi_scores = tf.gather_nd(scores, gather_nd_indices)
-      sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
-      sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
-      sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
-
-      return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
-              sampled_gt_classes, sampled_gt_indices)
-
-
-class MaskSampler(tf.keras.layers.Layer):
-  """Keras layer that samples foreground RoIs and builds mask targets."""
-
-  def __init__(self, mask_target_size, num_mask_samples_per_image):
-    """Stores the sampling configuration.
-
-    Args:
-      mask_target_size: value forwarded to `sample_and_crop_foreground_masks`
-        as the mask target size on every `call`.
-      num_mask_samples_per_image: value forwarded to
-        `sample_and_crop_foreground_masks` as the number of mask samples per
-        image on every `call`.
-    """
-    # The configuration is recorded before the base Layer initializer runs.
-    self._num_mask_samples_per_image = num_mask_samples_per_image
-    self._mask_target_size = mask_target_size
-    super(MaskSampler, self).__init__(autocast=False)
-
-  def call(self, candidate_rois, candidate_gt_boxes, candidate_gt_classes,
-           candidate_gt_indices, gt_masks):
-    """Samples foreground RoIs and crops their groundtruth masks.
-
-    The work is delegated to `sample_and_crop_foreground_masks`, using the
-    sample count and target size given at construction time.
-
-    Args:
-      candidate_rois: a tensor of shape [batch_size, N, 4], where N is the
-        number of candidate RoIs (positive and negative) considered for mask
-        sampling.
-      candidate_gt_boxes: a tensor of shape [batch_size, N, 4] holding the
-        groundtruth box matched to each entry of `candidate_rois`.
-      candidate_gt_classes: a tensor of shape [batch_size, N] holding the
-        groundtruth class matched to each entry of `candidate_rois`. 0 is the
-        background class, i.e. a negative RoI.
-      candidate_gt_indices: a tensor of shape [batch_size, N] holding, for
-        each candidate, the index of its groundtruth instance, i.e.
-        gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i], where
-        gt_boxes has shape [batch_size, MAX_INSTANCES, 4].
-      gt_masks: a tensor of shape
-        [batch_size, MAX_INSTANCES, mask_height, mask_width] containing all
-        the groundtruth masks that the sampled masks are drawn from.
-
-    Returns:
-      foreground_rois: a tensor of shape [batch_size, K, 4] storing the RoIs
-        that correspond to the sampled foreground masks, where
-        K = num_mask_samples_per_image.
-      foreground_classes: a tensor of shape [batch_size, K] storing the
-        classes corresponding to the sampled foreground masks.
-      cropped_foreground_masks: a tensor of shape
-        [batch_size, K, mask_target_size, mask_target_size] storing the
-        cropped foreground masks used for training.
-    """
-    sampled = sample_and_crop_foreground_masks(
-        candidate_rois, candidate_gt_boxes, candidate_gt_classes,
-        candidate_gt_indices, gt_masks, self._num_mask_samples_per_image,
-        self._mask_target_size)
-    # Unpack explicitly so exactly three values are returned as a tuple.
-    rois, classes, masks = sampled
-    return rois, classes, masks
--- a/official/vision/detection/utils/__init__.py
+++ b/official/vision/detection/utils/__init__.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/official/vision/detection/utils/box_utils.py
+++ b/official/vision/detection/utils/box_utils.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utility functions for bounding box processing."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-# Small constant added to box/anchor extents and ratio denominators in the
-# lrtb encode/decode helpers below to avoid division by zero.
-EPSILON = 1e-8
-# Upper bound applied to the log-scale height/width deltas in `decode_boxes`
-# before they are exponentiated.
-BBOX_XFORM_CLIP = np.log(1000. / 16.)
-
-
-def visualize_images_with_bounding_boxes(images, box_outputs, step,
-                                         summary_writer):
-  """Writes an image summary of `images` with `box_outputs` drawn on top.
-
-  Args:
-    images: a batch of images. The height and width used for normalization
-      are read from the shape of `images[0]`.
-    box_outputs: boxes to draw. They are passed through `normalize_boxes`
-      with the height/width of the first image before drawing.
-    step: the step value recorded with the summary.
-    summary_writer: the summary writer used as the default writer while the
-      image summary is written; it is flushed afterwards.
-  """
-  first_image_shape = tf.shape(images[0])
-  height_f = tf.cast(first_image_shape[0], tf.float32)
-  width_f = tf.cast(first_image_shape[1], tf.float32)
-  boxes_normalized = normalize_boxes(box_outputs, [height_f, width_f])
-
-  # A single color row, used for every box.
-  box_color = tf.constant([[1.0, 1.0, 0.0, 1.0]])
-  annotated = tf.image.draw_bounding_boxes(images, boxes_normalized,
-                                           box_color)
-  with summary_writer.as_default():
-    tf.summary.image('bounding_box_summary', annotated, step=step)
-    summary_writer.flush()
-
-
-def yxyx_to_xywh(boxes):
-  """Reorders [ymin, xmin, ymax, xmax] boxes into [xmin, ymin, width, height].
-
-  Args:
-    boxes: a numpy array whose last dimension is 4, holding box coordinates
-      in ymin, xmin, ymax, xmax order.
-
-  Returns:
-    A numpy array of the same shape as `boxes`, holding
-    xmin, ymin, width, height along the last dimension, where
-    width = xmax - xmin and height = ymax - ymin.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  last_dim = boxes.shape[-1]
-  if last_dim != 4:
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        last_dim))
-
-  ymin, xmin, ymax, xmax = (boxes[..., i] for i in range(4))
-  return np.stack([xmin, ymin, xmax - xmin, ymax - ymin], axis=-1)
-
-
-def jitter_boxes(boxes, noise_scale=0.025):
-  """Randomly perturbs the center and size of each box.
-
-  Normal noise with standard deviation `noise_scale` is drawn once per
-  coordinate. The first two noise channels shift the box center by a fraction
-  of the box width/height; the last two scale the width/height through
-  `exp(noise)`.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    noise_scale: a python float which specifies the magnitude of noise. The
-      rule of thumb is to set this between (0, 0.1]. The default value is
-      found to mimic the noisy detections best empirically.
-
-  Returns:
-    jittered_boxes: a tensor whose shape is the same as `boxes` representing
-      the jittered boxes.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        boxes.shape[-1]))
-
-  with tf.name_scope('jitter_boxes'):
-    noise = tf.random.normal(boxes.get_shape(), stddev=noise_scale)
-    y0, x0 = boxes[..., 0:1], boxes[..., 1:2]
-    y1, x1 = boxes[..., 2:3], boxes[..., 3:4]
-    box_w = x1 - x0
-    box_h = y1 - y0
-
-    # Shift the center proportionally to the box extent.
-    center_x = (x0 + x1) / 2.0 + noise[..., 0:1] * box_w
-    center_y = (y0 + y1) / 2.0 + noise[..., 1:2] * box_h
-    # Rescale the extent multiplicatively, then take half of it.
-    half_w = box_w * tf.math.exp(noise[..., 2:3]) * 0.5
-    half_h = box_h * tf.math.exp(noise[..., 3:4]) * 0.5
-
-    return tf.concat([
-        center_y - half_h, center_x - half_w,
-        center_y + half_h, center_x + half_w
-    ], axis=-1)
-
-
-def normalize_boxes(boxes, image_shape):
-  """Divides pixel-coordinate boxes by the image height and width.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    image_shape: a list or tuple of two values [height, width], or a tensor
-      whose last dimension is 2 ([height, width]) and whose other dimensions
-      are broadcastable to `boxes`. A tensor is cast to `boxes.dtype`.
-
-  Returns:
-    normalized_boxes: a tensor whose shape is the same as `boxes` representing
-      the normalized boxes.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        boxes.shape[-1]))
-
-  with tf.name_scope('normalize_boxes'):
-    if isinstance(image_shape, (list, tuple)):
-      height, width = image_shape
-    else:
-      shape_t = tf.cast(image_shape, dtype=boxes.dtype)
-      # Keep the trailing dimension so the divisors broadcast against boxes.
-      height, width = shape_t[..., 0:1], shape_t[..., 1:2]
-
-    scaled = [
-        boxes[..., 0:1] / height,
-        boxes[..., 1:2] / width,
-        boxes[..., 2:3] / height,
-        boxes[..., 3:4] / width,
-    ]
-    return tf.concat(scaled, axis=-1)
-
-
-def denormalize_boxes(boxes, image_shape):
-  """Multiplies normalized boxes by the image height and width.
-
-  Unlike the sibling helpers, this function does not itself validate that the
-  last dimension of `boxes` is 4; `boxes` is split into four equal parts along
-  the last axis.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    image_shape: a list or tuple of two values [height, width], or a tensor
-      whose last dimension is 2 ([height, width]) and whose other dimensions
-      are broadcastable to `boxes`. A tensor is cast to `boxes.dtype`.
-
-  Returns:
-    denormalized_boxes: a tensor whose shape is the same as `boxes`
-      representing the denormalized boxes.
-  """
-  with tf.name_scope('denormalize_boxes'):
-    if isinstance(image_shape, (list, tuple)):
-      height, width = image_shape
-    else:
-      height, width = tf.split(
-          tf.cast(image_shape, dtype=boxes.dtype), 2, axis=-1)
-
-    y0, x0, y1, x1 = tf.split(boxes, 4, axis=-1)
-    return tf.concat([y0 * height, x0 * width, y1 * height, x1 * width],
-                     axis=-1)
-
-
-def clip_boxes(boxes, image_shape):
-  """Clamps box coordinates into [0, height - 1] x [0, width - 1].
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    image_shape: a list or tuple of two values [height, width], or a tensor
-      whose last dimension is 2 ([height, width]) and whose other dimensions
-      are broadcastable to `boxes`. A tensor is cast to `boxes.dtype`.
-
-  Returns:
-    clipped_boxes: a tensor whose shape is the same as `boxes` representing the
-      clipped boxes.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        boxes.shape[-1]))
-
-  with tf.name_scope('clip_boxes'):
-    if isinstance(image_shape, (list, tuple)):
-      height, width = image_shape
-      max_y = height - 1.0
-      max_x = width - 1.0
-      upper = [max_y, max_x, max_y, max_x]
-    else:
-      height, width = tf.unstack(
-          tf.cast(image_shape, dtype=boxes.dtype), axis=-1)
-      max_y = height - 1.0
-      max_x = width - 1.0
-      upper = tf.stack([max_y, max_x, max_y, max_x], axis=-1)
-
-    # Cap at the last valid pixel first, then floor at zero.
-    capped = tf.math.minimum(boxes, upper)
-    return tf.math.maximum(capped, 0.0)
-
-
-def compute_outer_boxes(boxes, image_shape, scale=1.0):
-  """Computes outer boxes that enclose each object with a margin.
-
-  Each box is scaled about its center by `scale` and the result is clipped to
-  the image boundaries with `clip_boxes`.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    image_shape: a list of two integers, a two-element vector or a tensor such
-      that all but the last dimensions are `broadcastable` to `boxes`. The last
-      dimension is 2, which represents [height, width].
-    scale: a float number specifying the scale of output outer boxes to input
-      `boxes`. Must be at least 1.0.
-
-  Returns:
-    outer_boxes: a tensor whose shape is the same as `boxes` representing the
-      outer boxes.
-
-  Raises:
-    ValueError: If `scale` is less than 1.0, or (from `clip_boxes`) if the
-      last dimension of the boxes is not 4.
-  """
-  if scale < 1.0:
-    raise ValueError(
-        'scale is {}, but outer box scale must be greater than or equal to '
-        '1.0.'.format(scale))
-  centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
-  centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
-  box_height = (boxes[..., 2] - boxes[..., 0]) * scale
-  box_width = (boxes[..., 3] - boxes[..., 1]) * scale
-  # Stack on the last axis so the coordinate dimension stays last for inputs
-  # of any rank. With axis=1 this was only correct for rank-2 `boxes`.
-  outer_boxes = tf.stack([
-      centers_y - box_height / 2.0, centers_x - box_width / 2.0,
-      centers_y + box_height / 2.0, centers_x + box_width / 2.0
-  ], axis=-1)
-  outer_boxes = clip_boxes(outer_boxes, image_shape)
-  return outer_boxes
-
-
-def encode_boxes(boxes, anchors, weights=None):
-  """Encodes boxes as (dy, dx, dh, dw) targets relative to anchors.
-
-  Heights and widths use the inclusive `max - min + 1.0` convention. The
-  center offsets are divided by the anchor size and the size ratios are
-  log-encoded.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order. It is cast to `anchors.dtype`.
-    anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
-      representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
-    weights: None or a list of four float numbers used to scale the
-      dy, dx, dh, dw targets respectively.
-
-  Returns:
-    encoded_boxes: a tensor whose shape is the same as `boxes` representing the
-      encoded box targets.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        boxes.shape[-1]))
-
-  def _center_form(yxyx):
-    """Returns (yc, xc, h, w) for yxyx boxes, with `+ 1.0` extents."""
-    y0 = yxyx[..., 0:1]
-    x0 = yxyx[..., 1:2]
-    extent_h = yxyx[..., 2:3] - y0 + 1.0
-    extent_w = yxyx[..., 3:4] - x0 + 1.0
-    return y0 + 0.5 * extent_h, x0 + 0.5 * extent_w, extent_h, extent_w
-
-  with tf.name_scope('encode_boxes'):
-    box_yc, box_xc, box_h, box_w = _center_form(
-        tf.cast(boxes, dtype=anchors.dtype))
-    anchor_yc, anchor_xc, anchor_h, anchor_w = _center_form(anchors)
-
-    targets = [
-        (box_yc - anchor_yc) / anchor_h,
-        (box_xc - anchor_xc) / anchor_w,
-        tf.math.log(box_h / anchor_h),
-        tf.math.log(box_w / anchor_w),
-    ]
-    if weights:
-      targets = [targets[i] * weights[i] for i in range(4)]
-
-    return tf.concat(targets, axis=-1)
-
-
-def decode_boxes(encoded_boxes, anchors, weights=None):
-  """Decodes (dy, dx, dh, dw) targets back into yxyx boxes.
-
-  This inverts `encode_boxes`: the targets are un-weighted, the log-scale
-  size deltas are capped at `BBOX_XFORM_CLIP`, and the result is converted
-  from center/size form back to corner form using the inclusive `+ 1.0`
-  extent convention.
-
-  Args:
-    encoded_boxes: a tensor whose last dimension is 4 representing the
-      encoded targets in dy, dx, dh, dw order. It is cast to `anchors.dtype`.
-    anchors: a tensor whose shape is the same as, or `broadcastable` to
-      `encoded_boxes`, representing the coordinates of anchors in
-      ymin, xmin, ymax, xmax order.
-    weights: None or a list of four float numbers the dy, dx, dh, dw targets
-      are divided by.
-
-  Returns:
-    decoded_boxes: a tensor representing the decoded boxes in
-      ymin, xmin, ymax, xmax order.
-
-  Raises:
-    ValueError: If the last dimension of encoded_boxes is not 4.
-  """
-  if encoded_boxes.shape[-1] != 4:
-    raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
-        encoded_boxes.shape[-1]))
-
-  with tf.name_scope('decode_boxes'):
-    deltas = tf.cast(encoded_boxes, dtype=anchors.dtype)
-    dy, dx = deltas[..., 0:1], deltas[..., 1:2]
-    dh, dw = deltas[..., 2:3], deltas[..., 3:4]
-    if weights:
-      dy = dy / weights[0]
-      dx = dx / weights[1]
-      dh = dh / weights[2]
-      dw = dw / weights[3]
-    # Cap the log-scale deltas before they are exponentiated below.
-    dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
-    dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
-
-    anchor_y0 = anchors[..., 0:1]
-    anchor_x0 = anchors[..., 1:2]
-    anchor_h = anchors[..., 2:3] - anchor_y0 + 1.0
-    anchor_w = anchors[..., 3:4] - anchor_x0 + 1.0
-    anchor_yc = anchor_y0 + 0.5 * anchor_h
-    anchor_xc = anchor_x0 + 0.5 * anchor_w
-
-    center_y = dy * anchor_h + anchor_yc
-    center_x = dx * anchor_w + anchor_xc
-    out_h = tf.math.exp(dh) * anchor_h
-    out_w = tf.math.exp(dw) * anchor_w
-
-    out_ymin = center_y - 0.5 * out_h
-    out_xmin = center_x - 0.5 * out_w
-    out_ymax = out_ymin + out_h - 1.0
-    out_xmax = out_xmin + out_w - 1.0
-
-    return tf.concat([out_ymin, out_xmin, out_ymax, out_xmax], axis=-1)
-
-
-def encode_boxes_lrtb(boxes, anchors, weights=None):
-  """Encode boxes to targets on lrtb (=left,right,top,bottom) format.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates
-      of boxes in ymin, xmin, ymax, xmax order.
-    anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
-      representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
-    weights: None or a list of four float numbers used to scale the
-      left, right, top, bottom targets respectively.
-
-  Returns:
-    encoded_boxes_lrtb: a tensor whose shape is the same as `boxes` representing
-      the encoded box targets. The box targets encode the left, right, top,
-      bottom distances from an anchor location to the four borders of the
-      matched groundtruth bounding box, divided by the anchor width/height.
-    center_targets: centerness targets defined by the left, right, top, and
-      bottom distance targets. The centerness is defined as the deviation of the
-      anchor location from the groundtruth object center. Formally, centerness =
-      sqrt(min(left, right)/max(left, right)*min(top, bottom)/max(top, bottom)).
-      It is 0 wherever any of the four (unweighted) distances is not positive.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    raise ValueError(
-        'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
-
-  with tf.name_scope('encode_boxes_lrtb'):
-    boxes = tf.cast(boxes, dtype=anchors.dtype)
-    ymin = boxes[..., 0:1]
-    xmin = boxes[..., 1:2]
-    ymax = boxes[..., 2:3]
-    xmax = boxes[..., 3:4]
-
-    anchor_ymin = anchors[..., 0:1]
-    anchor_xmin = anchors[..., 1:2]
-    anchor_ymax = anchors[..., 2:3]
-    anchor_xmax = anchors[..., 3:4]
-    anchor_h = anchor_ymax - anchor_ymin
-    anchor_w = anchor_xmax - anchor_xmin
-    # The anchor center is computed from the exact extents, before EPSILON is
-    # added to them.
-    anchor_yc = anchor_ymin + 0.5 * anchor_h
-    anchor_xc = anchor_xmin + 0.5 * anchor_w
-
-    # Guard the divisions below against zero-sized anchors.
-    anchor_h += EPSILON
-    anchor_w += EPSILON
-
-    left = (anchor_xc - xmin) / anchor_w
-    right = (xmax - anchor_xc) / anchor_w
-    top = (anchor_yc - ymin) / anchor_h
-    bottom = (ymax - anchor_yc) / anchor_h
-
-    # Create centerness target. An anchor is a valid match only when all four
-    # distances are positive.
-    lrtb_targets = tf.concat([left, right, top, bottom], axis=-1)
-    valid_match = tf.greater(tf.reduce_min(lrtb_targets, -1), 0.0)
-
-    # Centerness score. Invalid matches are zeroed before the min/max ratios
-    # are taken.
-    left_right = tf.concat([left, right], axis=-1)
-    left_right = tf.where(tf.stack([valid_match, valid_match], -1),
-                          left_right, tf.zeros_like(left_right))
-    top_bottom = tf.concat([top, bottom], axis=-1)
-    top_bottom = tf.where(tf.stack([valid_match, valid_match], -1),
-                          top_bottom, tf.zeros_like(top_bottom))
-    center_targets = tf.sqrt(
-        (tf.reduce_min(left_right, -1) /
-         (tf.reduce_max(left_right, -1) + EPSILON)) *
-        (tf.reduce_min(top_bottom, -1) /
-         (tf.reduce_max(top_bottom, -1) + EPSILON)))
-    center_targets = tf.where(valid_match,
-                              center_targets,
-                              tf.zeros_like(center_targets))
-    # Weights are applied after the centerness targets have been computed, so
-    # they only affect the returned lrtb targets.
-    if weights:
-      left *= weights[0]
-      right *= weights[1]
-      top *= weights[2]
-      bottom *= weights[3]
-
-    encoded_boxes_lrtb = tf.concat(
-        [left, right, top, bottom],
-        axis=-1)
-
-    return encoded_boxes_lrtb, center_targets
-
-
-def decode_boxes_lrtb(encoded_boxes_lrtb, anchors, weights=None):
-  """Decodes left/right/top/bottom targets back into yxyx boxes.
-
-  Each target is un-weighted, multiplied by the anchor width/height and
-  applied as an offset from the anchor center.
-
-  Args:
-    encoded_boxes_lrtb: a tensor whose last dimension is 4 representing the
-      coordinates of encoded boxes in left, right, top, bottom order. It is
-      cast to `anchors.dtype`.
-    anchors: a tensor whose shape is the same as, or `broadcastable` to
-      `encoded_boxes_lrtb`, representing the coordinates of anchors in
-      ymin, xmin, ymax, xmax order.
-    weights: None or a list of four float numbers the left, right, top,
-      bottom targets are divided by.
-
-  Returns:
-    decoded_boxes_lrtb: a tensor holding the decoded boxes in
-      ymin, xmin, ymax, xmax order.
-
-  Raises:
-    ValueError: If the last dimension of encoded_boxes_lrtb is not 4.
-  """
-  if encoded_boxes_lrtb.shape[-1] != 4:
-    raise ValueError(
-        'encoded_boxes_lrtb.shape[-1] is {:d}, but must be 4.'
-        .format(encoded_boxes_lrtb.shape[-1]))
-
-  with tf.name_scope('decode_boxes_lrtb'):
-    targets = tf.cast(encoded_boxes_lrtb, dtype=anchors.dtype)
-    left, right = targets[..., 0:1], targets[..., 1:2]
-    top, bottom = targets[..., 2:3], targets[..., 3:4]
-    if weights:
-      left = left / weights[0]
-      right = right / weights[1]
-      top = top / weights[2]
-      bottom = bottom / weights[3]
-
-    anchor_y0 = anchors[..., 0:1]
-    anchor_x0 = anchors[..., 1:2]
-    anchor_h = anchors[..., 2:3] - anchor_y0
-    anchor_w = anchors[..., 3:4] - anchor_x0
-    # Centers use the exact extents; EPSILON is only added afterwards.
-    anchor_yc = anchor_y0 + 0.5 * anchor_h
-    anchor_xc = anchor_x0 + 0.5 * anchor_w
-    anchor_h = anchor_h + EPSILON
-    anchor_w = anchor_w + EPSILON
-
-    corners = [
-        anchor_yc - top * anchor_h,
-        anchor_xc - left * anchor_w,
-        anchor_yc + bottom * anchor_h,
-        anchor_xc + right * anchor_w,
-    ]
-    return tf.concat(corners, axis=-1)
-
-
-def filter_boxes(boxes, scores, image_shape, min_size_threshold):
-  """Filter and remove boxes that are too small or fall outside the image.
-
-  A box is kept only when both its height and width (computed as
-  `max - min + 1.0`) exceed `max(min_size_threshold, 1.0)` and its center lies
-  strictly inside the image.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
-      representing the original scores of the boxes.
-    image_shape: a list or tuple of [height, width], or a tensor whose shape
-      is the same as, or `broadcastable` to `boxes` except the last dimension,
-      which is 2, representing [height, width] of the scaled image.
-    min_size_threshold: a float representing the minimal box size in each side
-      (w.r.t. the scaled image). Boxes whose sides are smaller than it will be
-      filtered out.
-
-  Returns:
-    filtered_boxes: a tensor whose shape is the same as `boxes` but with
-      the position of the filtered boxes filled with 0.
-    filtered_scores: a tensor whose shape is the same as `scores` but with
-      the position of the filtered boxes filled with 0.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    # Report the dimension that is actually checked (the last one).
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        boxes.shape[-1]))
-
-  with tf.name_scope('filter_boxes'):
-    if isinstance(image_shape, list) or isinstance(image_shape, tuple):
-      height, width = image_shape
-    else:
-      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
-      height = image_shape[..., 0]
-      width = image_shape[..., 1]
-
-    ymin = boxes[..., 0]
-    xmin = boxes[..., 1]
-    ymax = boxes[..., 2]
-    xmax = boxes[..., 3]
-
-    h = ymax - ymin + 1.0
-    w = xmax - xmin + 1.0
-    yc = ymin + 0.5 * h
-    xc = xmin + 0.5 * w
-
-    # The effective size threshold is never below 1.0.
-    min_size = tf.cast(
-        tf.math.maximum(min_size_threshold, 1.0), dtype=boxes.dtype)
-
-    filtered_size_mask = tf.math.logical_and(
-        tf.math.greater(h, min_size), tf.math.greater(w, min_size))
-    filtered_center_mask = tf.logical_and(
-        tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
-        tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
-    filtered_mask = tf.math.logical_and(filtered_size_mask,
-                                        filtered_center_mask)
-
-    filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
-    # Multiplying by the 0/1 mask zeroes out the rejected boxes.
-    filtered_boxes = tf.cast(
-        tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
-
-    return filtered_boxes, filtered_scores
-
-
-def filter_boxes_by_scores(boxes, scores, min_score_threshold):
-  """Filter and remove boxes whose scores are not above the threshold.
-
-  Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates of
-      boxes in ymin, xmin, ymax, xmax order.
-    scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
-      representing the original scores of the boxes.
-    min_score_threshold: a float representing the minimal box score threshold.
-      Boxes whose scores are not strictly greater than it will be filtered
-      out.
-
-  Returns:
-    filtered_boxes: a tensor whose shape is the same as `boxes` but with
-      the position of the filtered boxes filled with 0.
-    filtered_scores: a tensor whose shape is the same as `scores` but with
-      the position of the filtered boxes filled with -1.
-
-  Raises:
-    ValueError: If the last dimension of boxes is not 4.
-  """
-  if boxes.shape[-1] != 4:
-    # Report the dimension that is actually checked (the last one).
-    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
-        boxes.shape[-1]))
-
-  with tf.name_scope('filter_boxes_by_scores'):
-    filtered_mask = tf.math.greater(scores, min_score_threshold)
-    filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
-    # Multiplying by the 0/1 mask zeroes out the rejected boxes.
-    filtered_boxes = tf.cast(
-        tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
-
-    return filtered_boxes, filtered_scores
-
-
-def top_k_boxes(boxes, scores, k):
-  """Selects the k highest-scoring boxes of each image, sorted by score.
-
-  Args:
-    boxes: a tensor of shape [batch_size, N, 4] representing the coordinates
-      of the boxes. N is the number of boxes per image.
-    scores: a tensor of shape [batch_size, N] representing the scores of the
-      boxes.
-    k: an integer or a tensor indicating the top k number.
-
-  Returns:
-    selected_boxes: a tensor of shape [batch_size, k, 4] representing the
-      selected top k box coordinates.
-    selected_scores: a tensor of shape [batch_size, k] representing the selected
-      top k box scores.
-  """
-  with tf.name_scope('top_k_boxes'):
-    selected_scores, best_indices = tf.nn.top_k(scores, k=k, sorted=True)
-
-    static_batch, _ = scores.get_shape().as_list()
-    if static_batch != 1:
-      # Pair every selected index with its batch index for gather_nd.
-      index_shape = tf.shape(best_indices)
-      batch_ids = (
-          tf.expand_dims(tf.range(index_shape[0]), axis=-1) *
-          tf.ones([1, index_shape[-1]], dtype=tf.int32))
-      selected_boxes = tf.gather_nd(
-          boxes, tf.stack([batch_ids, best_indices], axis=-1))
-    else:
-      # With a static batch of one, a plain gather along the box axis is
-      # enough; the extra axis introduced by the index batch is squeezed out.
-      gathered = tf.gather(boxes, best_indices, axis=1)
-      selected_boxes = tf.squeeze(gathered, axis=1)
-
-    return selected_boxes, selected_scores
-
-
-def bbox_overlap(boxes, gt_boxes):
-  """Computes the pairwise IoU between proposals and ground truth boxes.
-
-  Ground truth rows whose largest coordinate is negative are treated as
-  padding; every IoU entry against such a row is set to -1.
-
-  Args:
-    boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
-      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
-      last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
-    gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
-      tensor might have paddings with a negative value.
-
-  Returns:
-    iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
-  """
-  with tf.name_scope('bbox_overlap'):
-    box_ymin, box_xmin, box_ymax, box_xmax = tf.split(
-        value=boxes, num_or_size_splits=4, axis=2)
-    gt_ymin, gt_xmin, gt_ymax, gt_xmax = tf.split(
-        value=gt_boxes, num_or_size_splits=4, axis=2)
-
-    def _to_columns(per_gt):
-      """Moves the ground truth axis last so it broadcasts against boxes."""
-      return tf.transpose(per_gt, [0, 2, 1])
-
-    # Intersection rectangle of every (proposal, ground truth) pair.
-    left = tf.math.maximum(box_xmin, _to_columns(gt_xmin))
-    right = tf.math.minimum(box_xmax, _to_columns(gt_xmax))
-    top = tf.math.maximum(box_ymin, _to_columns(gt_ymin))
-    bottom = tf.math.minimum(box_ymax, _to_columns(gt_ymax))
-    # Non-overlapping pairs yield a negative extent, which is clamped to 0.
-    intersection = tf.math.maximum((right - left), 0) * tf.math.maximum(
-        (bottom - top), 0)
-
-    # Union = area(box) + area(gt) - intersection; the small epsilon avoids a
-    # divide-by-zero for degenerate pairs.
-    box_area = (box_ymax - box_ymin) * (box_xmax - box_xmin)
-    gt_area = (gt_ymax - gt_ymin) * (gt_xmax - gt_xmin)
-    union = box_area + _to_columns(gt_area) - intersection + 1e-8
-
-    iou = intersection / union
-
-    # Overwrites the columns that belong to padded ground truth boxes with -1.
-    # OR-ing with an all-False tensor shaped like the proposals broadcasts the
-    # per-gt flag to the full [batch_size, N, MAX_NUM_INSTANCES] shape.
-    gt_is_padding = tf.less(
-        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
-    padding_mask = tf.logical_or(
-        tf.zeros_like(box_xmin, dtype=tf.bool), _to_columns(gt_is_padding))
-    return tf.where(padding_mask, -tf.ones_like(iou), iou)
-
-
-def get_non_empty_box_indices(boxes):
-  """Returns the row indices of boxes with positive height and width."""
-  # A box is kept only when both (column 2 - column 0) and
-  # (column 3 - column 1) are strictly greater than zero.
-  has_height = tf.greater(boxes[:, 2] - boxes[:, 0], 0)
-  has_width = tf.greater(boxes[:, 3] - boxes[:, 1], 0)
-  non_empty = tf.where(tf.logical_and(has_height, has_width))
-  # tf.where yields one coordinate row per match; keep the row index only.
-  return non_empty[:, 0]
--- a/official/vision/detection/utils/class_utils.py
+++ b/official/vision/detection/utils/class_utils.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utility functions for handling dataset object categories."""
-
-
-def coco_split_class_ids(split_name):
-  """Returns the COCO class ids that belong to the named split.
-
-  Args:
-    split_name: The name of dataset split; one of 'all', 'voc' or 'nonvoc'.
-
-  Returns:
-    class_ids: a python list of integer. The list is empty for 'all'.
-
-  Raises:
-    ValueError: if `split_name` is none of the names above.
-  """
-  if split_name == 'all':
-    return []
-
-  if split_name == 'voc':
-    voc_ids = (
-        1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72,
-    )
-    return list(voc_ids)
-
-  if split_name == 'nonvoc':
-    nonvoc_ids = (
-        8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
-        37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
-        57, 58, 59, 60, 61, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
-        85, 86, 87, 88, 89, 90,
-    )
-    return list(nonvoc_ids)
-
-  raise ValueError('Invalid split name {}!!!'.format(split_name))
--- a/official/vision/detection/utils/dataloader_utils.py
+++ b/official/vision/detection/utils/dataloader_utils.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utility functions for dataloader."""
-
-import tensorflow as tf
-
-from official.vision.detection.utils import input_utils
-
-
-def process_source_id(source_id):
-  """Converts `source_id` to the format used downstream.
-
-  A string id is parsed as a number and cast to int64. An id with zero
-  elements is replaced by the int64 scalar -1; any other id passes through.
-  """
-  if source_id.dtype == tf.string:
-    source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
-
-  def _missing_id():
-    """Placeholder used when the id tensor has no elements."""
-    return tf.cast(tf.constant(-1), tf.int64)
-
-  def _existing_id():
-    """Passes the (possibly converted) id through unchanged."""
-    return tf.identity(source_id)
-
-  with tf.control_dependencies([source_id]):
-    is_empty = tf.equal(tf.size(input=source_id), 0)
-    processed_id = tf.cond(
-        pred=is_empty, true_fn=_missing_id, false_fn=_existing_id)
-  return processed_id
-
-
-def pad_groundtruths_to_fixed_size(gt, n):
-  """Pads the first dimension of every groundtruth field to length `n`.
-
-  The `gt` dictionary is updated in place and also returned.
-  """
-  # (field name, value used to fill the padded rows), in processing order.
-  padded_fields = (
-      ('boxes', -1),
-      ('is_crowds', 0),
-      ('areas', -1),
-      ('classes', -1),
-  )
-  for field, fill_value in padded_fields:
-    gt[field] = input_utils.pad_to_fixed_size(gt[field], n, fill_value)
-  return gt
--- a/official/vision/detection/utils/input_utils.py
+++ b/official/vision/detection/utils/input_utils.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utility functions for input processing."""
-
-import math
-
-import tensorflow as tf
-
-from official.vision.detection.utils import box_utils
-from official.vision.utils.object_detection import preprocessor
-
-
-def pad_to_fixed_size(input_tensor, size, constant_values=0):
-  """Pads data to a fixed length at the first dimension.
-
-  Args:
-    input_tensor: `Tensor` with any dimension.
-    size: `int` number for the first dimension of output Tensor.
-    constant_values: `int` value assigned to the paddings.
-
-  Returns:
-    `Tensor` with the first dimension padded to `size`.
-
-  Raises:
-    tf.errors.InvalidArgumentError: when the length check runs and the first
-      dimension of `input_tensor` is larger than `size`.
-  """
-  input_shape = input_tensor.get_shape().as_list()
-
-  # Number of rows that still have to be added. The check must look at the
-  # raw difference: clamping it to zero first would make the assertion
-  # trivially true and let oversized inputs through unpadded.
-  length_deficit = size - tf.shape(input_tensor)[0]
-  assert_length = tf.Assert(
-      tf.greater_equal(length_deficit, 0), [length_deficit])
-  # The padding length is created inside the dependency scope so that the
-  # assertion is attached to an op that the result depends on.
-  with tf.control_dependencies([assert_length]):
-    padding_length = tf.identity(length_deficit)
-
-  # The padding block has `padding_length` rows and copies the remaining
-  # dimensions from the input.
-  padding_shape = [padding_length]
-  for i in range(1, len(input_shape)):
-    padding_shape.append(tf.shape(input=input_tensor)[i])
-
-  # Pads input tensor to the fixed first dimension.
-  paddings = tf.cast(constant_values * tf.ones(padding_shape),
-                     input_tensor.dtype)
-  padded_tensor = tf.concat([input_tensor, paddings], axis=0)
-
-  # Records the now-known first dimension in the static shape.
-  output_shape = [size] + input_shape[1:]
-  padded_tensor.set_shape(output_shape)
-  return padded_tensor
-
-
-def normalize_image(image,
-                    offset=(0.485, 0.456, 0.406),
-                    scale=(0.229, 0.224, 0.225)):
-  """Subtracts `offset` from the image and divides the result by `scale`.
-
-  The image is first converted to float32 with
-  `tf.image.convert_image_dtype`. `offset` and `scale` are given two leading
-  axes of size 1 so that they broadcast over the leading image axes.
-  """
-
-  def _as_broadcastable(values):
-    """Turns `values` into a constant with two leading singleton axes."""
-    return tf.expand_dims(tf.expand_dims(tf.constant(values), axis=0), axis=0)
-
-  float_image = tf.image.convert_image_dtype(image, dtype=tf.float32)
-  centered = float_image - _as_broadcastable(offset)
-  return centered / _as_broadcastable(scale)
-
-
-def compute_padded_size(desired_size, stride):
-  """Rounds every dimension of `desired_size` up to a multiple of `stride`.
-
-  Each output dimension is the smallest multiple of the stride that is not
-  smaller than the corresponding desired dimension. For example, if
-  desired_size = (100, 200) and stride = 32, the output padded_size =
-  (128, 224).
-
-  Args:
-    desired_size: a `Tensor` or `int` list/tuple of two elements representing
-      [height, width] of the target output image size.
-    stride: an integer, the stride of the backbone network.
-
-  Returns:
-    padded_size: an int32 `Tensor` when `desired_size` is neither a list nor
-      a tuple, otherwise a python list of `int`, representing [height, width]
-      of the padded output image size.
-  """
-  if not isinstance(desired_size, (list, tuple)):
-    # Tensor input: the same rounding, expressed with TensorFlow ops.
-    size_in_strides = tf.math.ceil(
-        tf.cast(desired_size, dtype=tf.float32) / stride)
-    return tf.cast(size_in_strides * stride, tf.int32)
-
-  padded_size = []
-  for dim in desired_size:
-    padded_size.append(int(math.ceil(dim * 1.0 / stride) * stride))
-  return padded_size
-
-
-def resize_and_crop_image(image,
-                          desired_size,
-                          padded_size,
-                          aug_scale_min=1.0,
-                          aug_scale_max=1.0,
-                          seed=1,
-                          method=tf.image.ResizeMethod.BILINEAR):
-  """Resizes the input image to output size.
-
-  Resize and pad images given the desired output size of the image and
-  stride size.
-
-  Here are the preprocessing steps.
-  1. For a given image, keep its aspect ratio and rescale the image to make it
-     the largest rectangle to be bounded by the rectangle specified by the
-     `desired_size`.
-  2. Pad the rescaled image to the padded_size.
-
-  When `aug_scale_min` or `aug_scale_max` differs from 1.0, the target size of
-  step 1 is first multiplied by a random scale and the rescaled image is then
-  cropped to at most `desired_size`, starting at a random offset.
-
-  Args:
-    image: a `Tensor` of shape [height, width, 3] representing an image.
-    desired_size: a `Tensor` or `int` list/tuple of two elements representing
-      [height, width] of the desired actual output image size.
-    padded_size: a `Tensor` or `int` list/tuple of two elements representing
-      [height, width] of the padded output image size. Padding will be applied
-      after scaling the image to the desired_size.
-    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
-      random scale applied to desired_size for training scale jittering.
-    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
-      random scale applied to desired_size for training scale jittering.
-    seed: seed for random scale jittering.
-    method: function to resize input image to scaled image.
-
-  Returns:
-    output_image: `Tensor` of shape [height, width, 3] where [height, width]
-      equals to `padded_size`.
-    image_info: a 2D `Tensor` that encodes the information of the image and the
-      applied preprocessing. It is in the format of
-      [[original_height, original_width], [desired_height, desired_width],
-       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
-      desired_width] is `desired_size` as passed in, and [y_scale, x_scale] is
-      the scaling factor, which is the ratio of
-      scaled dimension / original dimension.
-  """
-  with tf.name_scope('resize_and_crop_image'):
-    image_size = tf.cast(tf.shape(input=image)[0:2], tf.float32)
-
-    # Jittering is active as soon as either bound differs from 1.0.
-    random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
-
-    if random_jittering:
-      random_scale = tf.random.uniform([],
-                                       aug_scale_min,
-                                       aug_scale_max,
-                                       seed=seed)
-      scaled_size = tf.round(random_scale * desired_size)
-    else:
-      scaled_size = desired_size
-
-    # Takes the smaller of the two per-axis ratios so that the aspect ratio is
-    # kept and the rescaled image fits inside the target size.
-    scale = tf.minimum(scaled_size[0] / image_size[0],
-                       scaled_size[1] / image_size[1])
-    scaled_size = tf.round(image_size * scale)
-
-    # Computes 2D image_scale.
-    image_scale = scaled_size / image_size
-
-    # Selects non-zero random offset (x, y) if scaled image is larger than
-    # desired_size.
-    if random_jittering:
-      max_offset = scaled_size - desired_size
-      max_offset = tf.where(
-          tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
-      offset = max_offset * tf.random.uniform([
-          2,
-      ], 0, 1, seed=seed)
-      offset = tf.cast(offset, tf.int32)
-    else:
-      offset = tf.zeros((2,), tf.int32)
-
-    scaled_image = tf.image.resize(
-        image, tf.cast(scaled_size, tf.int32), method=method)
-
-    # Crops a window of at most `desired_size` starting at the random offset.
-    if random_jittering:
-      scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
-                                  offset[1]:offset[1] + desired_size[1], :]
-
-    # Pads on the bottom and right only: the image stays at the top-left.
-    output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
-                                                padded_size[0], padded_size[1])
-
-    # Rows: original size, `desired_size`, per-axis scale, crop offset.
-    image_info = tf.stack([
-        image_size,
-        tf.cast(desired_size, dtype=tf.float32), image_scale,
-        tf.cast(offset, tf.float32)
-    ])
-    return output_image, image_info
-
-
-def resize_and_crop_image_v2(image,
-                             short_side,
-                             long_side,
-                             padded_size,
-                             aug_scale_min=1.0,
-                             aug_scale_max=1.0,
-                             seed=1,
-                             method=tf.image.ResizeMethod.BILINEAR):
-  """Resizes the input image to output size (Faster R-CNN style).
-
-  Resize and pad images given the specified short / long side length and the
-  stride size.
-
-  Here are the preprocessing steps.
-  1. For a given image, keep its aspect ratio and first try to rescale the short
-     side of the original image to `short_side`.
-  2. If the scaled image after 1 has a long side that exceeds `long_side`, keep
-     the aspect ratio and rescale the long side of the image to `long_side`.
-  3. Pad the rescaled image to the padded_size.
-
-  When `aug_scale_min` or `aug_scale_max` differs from 1.0, the size obtained
-  from steps 1-2 is multiplied by a random scale and the rescaled image is
-  cropped to at most the un-jittered size, starting at a random offset.
-
-  Args:
-    image: a `Tensor` of shape [height, width, 3] representing an image.
-    short_side: a scalar `Tensor` or `int` representing the desired short side
-      to be rescaled to.
-    long_side: a scalar `Tensor` or `int` representing the desired long side to
-      be rescaled to.
-    padded_size: a `Tensor` or `int` list/tuple of two elements representing
-      [height, width] of the padded output image size. Padding will be applied
-      after scaling the image to the desired_size.
-    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
-      random scale applied to desired_size for training scale jittering.
-    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
-      random scale applied to desired_size for training scale jittering.
-    seed: seed for random scale jittering.
-    method: function to resize input image to scaled image.
-
-  Returns:
-    output_image: `Tensor` of shape [height, width, 3] where [height, width]
-      equals to `padded_size`.
-    image_info: a 2D `Tensor` that encodes the information of the image and the
-      applied preprocessing. It is in the format of
-      [[original_height, original_width], [desired_height, desired_width],
-       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
-      desired_width] is the scaled image size before jittering, and
-      [y_scale, x_scale] is the scaling factor, which is the ratio of
-      scaled dimension / original dimension.
-  """
-  with tf.name_scope('resize_and_crop_image_v2'):
-    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
-
-    scale_using_short_side = (
-        short_side / tf.math.minimum(image_size[0], image_size[1]))
-    scale_using_long_side = (
-        long_side / tf.math.maximum(image_size[0], image_size[1]))
-
-    # Fits the short side first and falls back to fitting the long side when
-    # the result would exceed `long_side`.
-    scaled_size = tf.math.round(image_size * scale_using_short_side)
-    scaled_size = tf.where(
-        tf.math.greater(
-            tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
-        tf.math.round(image_size * scale_using_long_side), scaled_size)
-    desired_size = scaled_size
-
-    # Jittering is active as soon as either bound differs from 1.0.
-    random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
-
-    if random_jittering:
-      random_scale = tf.random.uniform([],
-                                       aug_scale_min,
-                                       aug_scale_max,
-                                       seed=seed)
-      scaled_size = tf.math.round(random_scale * scaled_size)
-
-    # Computes 2D image_scale.
-    image_scale = scaled_size / image_size
-
-    # Selects non-zero random offset (x, y) if scaled image is larger than
-    # desired_size.
-    if random_jittering:
-      max_offset = scaled_size - desired_size
-      max_offset = tf.where(
-          tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
-      offset = max_offset * tf.random.uniform([
-          2,
-      ], 0, 1, seed=seed)
-      offset = tf.cast(offset, tf.int32)
-    else:
-      offset = tf.zeros((2,), tf.int32)
-
-    scaled_image = tf.image.resize(
-        image, tf.cast(scaled_size, tf.int32), method=method)
-
-    if random_jittering:
-      # `desired_size` is float32 (it comes out of tf.math.round) while
-      # `offset` is int32; the crop bounds must be integers of one dtype, so
-      # the extent is cast before it is added to the offset.
-      crop_size = tf.cast(desired_size, tf.int32)
-      scaled_image = scaled_image[offset[0]:offset[0] + crop_size[0],
-                                  offset[1]:offset[1] + crop_size[1], :]
-
-    # Pads on the bottom and right only: the image stays at the top-left.
-    output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
-                                                padded_size[0], padded_size[1])
-
-    # Rows: original size, un-jittered scaled size, per-axis scale, offset.
-    image_info = tf.stack([
-        image_size,
-        tf.cast(desired_size, dtype=tf.float32), image_scale,
-        tf.cast(offset, tf.float32)
-    ])
-    return output_image, image_info
-
-
-def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
-  """Maps boxes into the frame of the rescaled and cropped image.
-
-  Args:
-    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
-    image_scale: 2D float `Tensor` representing scale factors that apply to
-      [height, width] of input image.
-    output_size: 2D `Tensor` or `int` representing [height, width] of target
-      output image size.
-    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
-      boxes.
-
-  Returns:
-    boxes: `Tensor` of shape [N, 4] representing the scaled boxes, clipped by
-      `box_utils.clip_boxes` against `output_size`.
-  """
-  # Repeats each two-element vector into a [1, 4] row so that it lines up
-  # with the four box coordinates.
-  per_coordinate_scale = tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
-  per_coordinate_offset = tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
-
-  # Scale first, then shift by the crop origin.
-  boxes *= per_coordinate_scale
-  boxes -= per_coordinate_offset
-  return box_utils.clip_boxes(boxes, output_size)
-
-
-def resize_and_crop_masks(masks, image_scale, output_size, offset):
-  """Resizes masks to output size with scale and offset.
-
-  The masks are rescaled with nearest-neighbor interpolation, cropped at
-  `offset` to at most `output_size`, and padded at the bottom/right up to
-  `output_size`.
-
-  Args:
-    masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
-    image_scale: 2D float `Tensor` representing scale factors that apply to
-      [height, width] of input image.
-    output_size: 2D `Tensor` or `int` representing [height, width] of target
-      output image size.
-    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
-      masks.
-
-  Returns:
-    masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
-  """
-  original_hw = tf.shape(input=masks)[1:3]
-  target_hw = tf.cast(
-      image_scale * tf.cast(original_hw, image_scale.dtype), tf.int32)
-  resized = tf.image.resize(
-      masks, target_hw, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
-
-  # Slice bounds have to be integers.
-  crop_origin = tf.cast(offset, tf.int32)
-  y0, x0 = crop_origin[0], crop_origin[1]
-  cropped = resized[:, y0:y0 + output_size[0], x0:x0 + output_size[1], :]
-
-  return tf.image.pad_to_bounding_box(cropped, 0, 0, output_size[0],
-                                      output_size[1])
-
-
-def random_horizontal_flip(image, boxes=None, masks=None):
-  """Forwards to `preprocessor.random_horizontal_flip`; returns its result."""
-  flipped = preprocessor.random_horizontal_flip(image, boxes, masks)
-  return flipped
--- a/official/vision/detection/utils/mask_utils.py
+++ b/official/vision/detection/utils/mask_utils.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utility functions for segmentations."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-import numpy as np
-import cv2
-
-
-def paste_instance_masks(masks, detected_boxes, image_height, image_width):
-  """Pastes box-relative instance masks onto full-size image canvases.
-
-  Args:
-    masks: a numpy array of shape [N, mask_height, mask_width] representing the
-      instance masks w.r.t. the `detected_boxes`.
-    detected_boxes: a numpy array of shape [N, 4] representing the reference
-      bounding boxes.
-    image_height: an integer representing the height of the image.
-    image_width: an integer representing the width of the image.
-
-  Returns:
-    segms: a numpy array of shape [N, image_height, image_width] representing
-      the instance masks *pasted* on the image canvas.
-  """
-
-  def _scale_about_center(boxes, scale):
-    """Scales boxes about their centers and returns them in corner form."""
-    # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227  # pylint: disable=line-too-long
-    # Unlike the reference, which takes [x1, y1, x2, y2] boxes, the input
-    # here is in [x1, y1, w, h] form.
-    half_w = boxes[:, 2] * .5
-    half_h = boxes[:, 3] * .5
-    center_x = boxes[:, 0] + half_w
-    center_y = boxes[:, 1] + half_h
-
-    half_w *= scale
-    half_h *= scale
-
-    corners = np.zeros(boxes.shape)
-    corners[:, 0] = center_x - half_w
-    corners[:, 1] = center_y - half_h
-    corners[:, 2] = center_x + half_w
-    corners[:, 3] = center_y + half_h
-    return corners
-
-  # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812  # pylint: disable=line-too-long
-  # cv2.resize appears to pad with repeated border values, which produces
-  # "top hat" artifacts. As a workaround every mask is zero-padded by one
-  # pixel before it is resized, and the reference boxes are grown by the
-  # matching factor.
-  _, mask_height, mask_width = masks.shape
-  scale = max((mask_width + 2.0) / mask_width,
-              (mask_height + 2.0) / mask_height)
-  ref_boxes = _scale_about_center(detected_boxes, scale).astype(np.int32)
-
-  # One scratch buffer is shared by all masks: its interior is overwritten on
-  # every iteration and its one-pixel border stays zero.
-  padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
-
-  segms = []
-  for mask_ind, mask in enumerate(masks):
-    padded_mask[1:-1, 1:-1] = mask[:, :]
-
-    ref_box = ref_boxes[mask_ind, :]
-    left, top, right, bottom = ref_box[0], ref_box[1], ref_box[2], ref_box[3]
-
-    # Box extent in pixels (corners inclusive), never smaller than 1.
-    w = np.maximum(right - left + 1, 1)
-    h = np.maximum(bottom - top + 1, 1)
-
-    resized = cv2.resize(padded_mask, (w, h))
-    binary = np.array(resized > 0.5, dtype=np.uint8)
-
-    # Clamps the paste window to the image bounds.
-    x_0 = min(max(left, 0), image_width)
-    x_1 = min(max(right + 1, 0), image_width)
-    y_0 = min(max(top, 0), image_height)
-    y_1 = min(max(bottom + 1, 0), image_height)
-
-    im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
-    # The source window is shifted by the box origin so that it addresses the
-    # same pixels inside the resized mask.
-    im_mask[y_0:y_1, x_0:x_1] = binary[(y_0 - top):(y_1 - top),
-                                       (x_0 - left):(x_1 - left)]
-    segms.append(im_mask)
-
-  segms = np.array(segms)
-  assert masks.shape[0] == segms.shape[0]
-  return segms
-
-
-def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
-  """Pastes instance masks onto image canvases with a perspective warp (v2).
-
-  Args:
-    masks: a numpy array of shape [N, mask_height, mask_width] representing the
-      instance masks w.r.t. the `detected_boxes`.
-    detected_boxes: a numpy array of shape [N, 4] representing the reference
-      bounding boxes.
-    image_height: an integer representing the height of the image.
-    image_width: an integer representing the width of the image.
-
-  Returns:
-    segms: a numpy array of shape [N, image_height, image_width] representing
-      the instance masks *pasted* on the image canvas.
-  """
-  _, mask_height, mask_width = masks.shape
-
-  # OpenCV's warpPerspective samples the top-left pixel at (0, 0) whereas it
-  # should be sampled at (0.5, 0.5); these two shifts compensate for that
-  # half-pixel offset.
-  shift_minus_half = np.array(
-      [[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32)
-  shift_plus_half = np.array(
-      [[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32)
-
-  segms = []
-  for i, mask in enumerate(masks):
-    # Each box is read as [xmin, ymin, width, height].
-    box = detected_boxes[i, :]
-    xmin = box[0]
-    ymin = box[1]
-
-    # Integer window on the image grid that covers the box. It may reach
-    # beyond the image; clipping happens after the warp.
-    left = int(math.floor(xmin))
-    right = int(math.ceil(xmin + box[2]))
-    top = int(math.floor(ymin))
-    bottom = int(math.ceil(ymin + box[3]))
-
-    # Size of one mask pixel measured in image pixels.
-    alpha = box[2] / (1.0 * mask_width)
-    beta = box[3] / (1.0 * mask_height)
-
-    # Transformation from mask pixel indices to image coordinate.
-    mask_to_image = np.array(
-        [[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]], dtype=np.float32)
-    # Transformation from image to cropped mask coordinate.
-    image_to_crop = np.array(
-        [[1, 0, -left], [0, 1, -top], [0, 0, 1]], dtype=np.float32)
-    mask_to_crop = np.dot(image_to_crop, mask_to_image)
-    transform = shift_minus_half.dot(mask_to_crop).dot(shift_plus_half)
-
-    warped = cv2.warpPerspective(
-        mask.astype(np.float32), transform, (right - left, bottom - top))
-    warped = np.array(warped > 0.5, dtype=np.uint8)
-
-    # Clamps the paste window to the image and copies the matching part of
-    # the warped mask into it.
-    x0 = max(min(left, image_width), 0)
-    x1 = max(min(right, image_width), 0)
-    y0 = max(min(top, image_height), 0)
-    y1 = max(min(bottom, image_height), 0)
-    img_mask = np.zeros((image_height, image_width))
-    img_mask[y0:y1, x0:x1] = warped[(y0 - top):(y1 - top),
-                                    (x0 - left):(x1 - left)]
-
-    segms.append(img_mask)
-
-  return np.array(segms)