Commit 999fae62 authored by Hongkun Yu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 326286926
parent 94561082
@@ -52,15 +52,15 @@ class RetinanetModel(base_model.Model):
     # Predict function.
     self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
-        params.architecture.min_level,
-        params.architecture.max_level,
+        params.architecture.min_level, params.architecture.max_level,
         params.postprocess)
     self._transpose_input = params.train.transpose_input
     assert not self._transpose_input, 'Transpose input is not supported.'

     # Input layer.
     self._input_layer = tf.keras.layers.Input(
-        shape=(None, None, params.retinanet_parser.num_channels), name='',
+        shape=(None, None, params.retinanet_parser.num_channels),
+        name='',
         dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32)

   def build_outputs(self, inputs, mode):
@@ -141,8 +141,8 @@ class RetinanetModel(base_model.Model):
         raise ValueError('"%s" is missing in outputs, requried %s found %s',
                          field, required_label_fields, labels.keys())

     boxes, scores, classes, valid_detections = self._generate_detections_fn(
-        outputs['box_outputs'], outputs['cls_outputs'],
-        labels['anchor_boxes'], labels['image_info'][:, 1:2, :])
+        outputs['box_outputs'], outputs['cls_outputs'], labels['anchor_boxes'],
+        labels['image_info'][:, 1:2, :])

     # Discards the old output tensors to save memory. The `cls_outputs` and
     # `box_outputs` are pretty big and could potentiall lead to memory issue.
     outputs = {
...
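For context, a minimal sketch of how the reformatted call sites above fit together. The constructor arguments and the indexing of `image_info` are taken from the diff itself; the surrounding variables (`params`, `outputs`, `labels`) are assumed context, not part of this commit.

    from official.vision.detection.ops import postprocess_ops

    # Built once from config, as in __init__ above.
    generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
        params.architecture.min_level, params.architecture.max_level,
        params.postprocess)

    # labels['image_info'][:, 1:2, :] selects the scaled [height, width] row
    # of image_info, used to clip and de-normalize the decoded boxes.
    boxes, scores, classes, valid_detections = generate_detections_fn(
        outputs['box_outputs'], outputs['cls_outputs'], labels['anchor_boxes'],
        labels['image_info'][:, 1:2, :])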
@@ -61,13 +61,11 @@ class ShapeMaskModel(base_model.Model):
         params.shapemask_loss.shape_prior_loss_weight)
     self._coarse_mask_loss_weight = (
         params.shapemask_loss.coarse_mask_loss_weight)
-    self._fine_mask_loss_weight = (
-        params.shapemask_loss.fine_mask_loss_weight)
+    self._fine_mask_loss_weight = (params.shapemask_loss.fine_mask_loss_weight)

     # Predict function.
     self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
-        params.architecture.min_level,
-        params.architecture.max_level,
+        params.architecture.min_level, params.architecture.max_level,
         params.postprocess)

   def build_outputs(self, inputs, mode):
@@ -79,10 +77,8 @@ class ShapeMaskModel(base_model.Model):
     else:
       anchor_boxes = anchor.Anchor(
           self._params.architecture.min_level,
-          self._params.architecture.max_level,
-          self._params.anchor.num_scales,
-          self._params.anchor.aspect_ratios,
-          self._params.anchor.anchor_size,
+          self._params.architecture.max_level, self._params.anchor.num_scales,
+          self._params.anchor.aspect_ratios, self._params.anchor.anchor_size,
           images.get_shape().as_list()[1:3]).multilevel_boxes

     batch_size = tf.shape(images)[0]
@@ -96,8 +92,7 @@ class ShapeMaskModel(base_model.Model):
         fpn_features, is_training=is_training)

     valid_boxes, valid_scores, valid_classes, valid_detections = (
-        self._generate_detections_fn(box_outputs, cls_outputs,
-                                     anchor_boxes,
+        self._generate_detections_fn(box_outputs, cls_outputs, anchor_boxes,
                                      inputs['image_info'][:, 1:2, :]))

     image_size = images.get_shape().as_list()[1:3]
@@ -124,22 +119,18 @@ class ShapeMaskModel(base_model.Model):
         return boxes, classes, outer_boxes

     boxes, classes, outer_boxes = SampledBoxesLayer()(
-        inputs, valid_boxes, valid_classes,
-        valid_outer_boxes, training=is_training)
-
-    instance_features, prior_masks = self._shape_prior_head_fn(fpn_features,
-                                                               boxes,
-                                                               outer_boxes,
-                                                               classes,
-                                                               is_training)
-    coarse_mask_logits = self._coarse_mask_fn(instance_features,
-                                              prior_masks,
-                                              classes,
-                                              is_training)
-    fine_mask_logits = self._fine_mask_fn(instance_features,
-                                          coarse_mask_logits,
-                                          classes,
-                                          is_training)
+        inputs,
+        valid_boxes,
+        valid_classes,
+        valid_outer_boxes,
+        training=is_training)
+
+    instance_features, prior_masks = self._shape_prior_head_fn(
+        fpn_features, boxes, outer_boxes, classes, is_training)
+    coarse_mask_logits = self._coarse_mask_fn(instance_features, prior_masks,
+                                              classes, is_training)
+    fine_mask_logits = self._fine_mask_fn(instance_features, coarse_mask_logits,
+                                          classes, is_training)

     model_outputs = {
         'cls_outputs': cls_outputs,
@@ -177,18 +168,15 @@ class ShapeMaskModel(base_model.Model):
                                         labels['num_positives'])

     # Adds Shapemask model losses.
-    shape_prior_loss = self._shapemask_prior_loss_fn(
-        outputs['prior_masks'],
-        labels['mask_targets'],
-        labels['mask_is_valid'])
-    coarse_mask_loss = self._shapemask_loss_fn(
-        outputs['coarse_mask_logits'],
-        labels['mask_targets'],
-        labels['mask_is_valid'])
-    fine_mask_loss = self._shapemask_loss_fn(
-        outputs['fine_mask_logits'],
-        labels['fine_mask_targets'],
-        labels['mask_is_valid'])
+    shape_prior_loss = self._shapemask_prior_loss_fn(outputs['prior_masks'],
+                                                     labels['mask_targets'],
+                                                     labels['mask_is_valid'])
+    coarse_mask_loss = self._shapemask_loss_fn(outputs['coarse_mask_logits'],
+                                               labels['mask_targets'],
+                                               labels['mask_is_valid'])
+    fine_mask_loss = self._shapemask_loss_fn(outputs['fine_mask_logits'],
+                                             labels['fine_mask_targets'],
+                                             labels['mask_is_valid'])

     model_loss = (
         cls_loss + self._box_loss_weight * box_loss +
@@ -222,43 +210,46 @@ class ShapeMaskModel(base_model.Model):
     if is_training:
       batch_size = params.train.batch_size
       input_layer = {
-          'image': tf.keras.layers.Input(
-              shape=input_shape,
-              batch_size=batch_size,
-              name='image',
-              dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-          'image_info': tf.keras.layers.Input(
-              shape=[4, 2],
-              batch_size=batch_size,
-              name='image_info'),
-          'mask_classes': tf.keras.layers.Input(
-              shape=[params.shapemask_parser.num_sampled_masks],
-              batch_size=batch_size,
-              name='mask_classes',
-              dtype=tf.int64),
-          'mask_outer_boxes': tf.keras.layers.Input(
-              shape=[params.shapemask_parser.num_sampled_masks, 4],
-              batch_size=batch_size,
-              name='mask_outer_boxes',
-              dtype=tf.float32),
-          'mask_boxes': tf.keras.layers.Input(
-              shape=[params.shapemask_parser.num_sampled_masks, 4],
-              batch_size=batch_size,
-              name='mask_boxes',
-              dtype=tf.float32),
+          'image':
+              tf.keras.layers.Input(
+                  shape=input_shape,
+                  batch_size=batch_size,
+                  name='image',
+                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
+          'image_info':
+              tf.keras.layers.Input(
+                  shape=[4, 2], batch_size=batch_size, name='image_info'),
+          'mask_classes':
+              tf.keras.layers.Input(
+                  shape=[params.shapemask_parser.num_sampled_masks],
+                  batch_size=batch_size,
+                  name='mask_classes',
+                  dtype=tf.int64),
+          'mask_outer_boxes':
+              tf.keras.layers.Input(
+                  shape=[params.shapemask_parser.num_sampled_masks, 4],
+                  batch_size=batch_size,
+                  name='mask_outer_boxes',
+                  dtype=tf.float32),
+          'mask_boxes':
+              tf.keras.layers.Input(
+                  shape=[params.shapemask_parser.num_sampled_masks, 4],
+                  batch_size=batch_size,
+                  name='mask_boxes',
+                  dtype=tf.float32),
       }
     else:
       batch_size = params.eval.batch_size
       input_layer = {
-          'image': tf.keras.layers.Input(
-              shape=input_shape,
-              batch_size=batch_size,
-              name='image',
-              dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
-          'image_info': tf.keras.layers.Input(
-              shape=[4, 2],
-              batch_size=batch_size,
-              name='image_info'),
+          'image':
+              tf.keras.layers.Input(
+                  shape=input_shape,
+                  batch_size=batch_size,
+                  name='image',
+                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
+          'image_info':
+              tf.keras.layers.Input(
+                  shape=[4, 2], batch_size=batch_size, name='image_info'),
       }
     return input_layer
@@ -277,9 +268,10 @@ class ShapeMaskModel(base_model.Model):
     return self._keras_model

   def post_processing(self, labels, outputs):
-    required_output_fields = ['num_detections', 'detection_boxes',
-                              'detection_classes', 'detection_masks',
-                              'detection_scores']
+    required_output_fields = [
+        'num_detections', 'detection_boxes', 'detection_classes',
+        'detection_masks', 'detection_scores'
+    ]
     for field in required_output_fields:
       if field not in outputs:
...
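The `input_layer` dict above drives a multi-input Keras functional model. A self-contained toy version of the eval-mode inputs, assuming a placeholder batch size and image shape; the `[4, 2]` rows of `image_info` are original size, desired size, scale, and offset, each a [height, width] pair, matching the `input_utils` changes later in this commit.

    import tensorflow as tf

    batch_size = 8  # placeholder
    input_layer = {
        'image':
            tf.keras.layers.Input(
                shape=(640, 640, 3), batch_size=batch_size, name='image'),
        'image_info':
            tf.keras.layers.Input(
                shape=[4, 2], batch_size=batch_size, name='image_info'),
    }
    # A functional model can then be built from these named inputs.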
@@ -22,7 +22,6 @@ import tensorflow as tf
-
 from official.vision.detection.utils import box_utils

 NMS_TILE_SIZE = 512
@@ -106,9 +105,7 @@ def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
   return boxes, iou_threshold, output_size, idx + 1


-def sorted_non_max_suppression_padded(scores,
-                                      boxes,
-                                      max_output_size,
+def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
                                       iou_threshold):
   """A wrapper that handles non-maximum suppression.
@@ -177,19 +174,18 @@ def sorted_non_max_suppression_padded(scores,
       idx < num_boxes // NMS_TILE_SIZE)

   selected_boxes, _, output_size, _ = tf.while_loop(
-      _loop_cond, _suppression_loop_body, [
-          boxes, iou_threshold,
-          tf.zeros([batch_size], tf.int32),
-          tf.constant(0)
-      ])
+      _loop_cond, _suppression_loop_body,
+      [boxes, iou_threshold,
+       tf.zeros([batch_size], tf.int32),
+       tf.constant(0)])
   idx = num_boxes - tf.cast(
       tf.nn.top_k(
           tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
           tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
       tf.int32)
   idx = tf.minimum(idx, num_boxes - 1)
-  idx = tf.reshape(
-      idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1])
+  idx = tf.reshape(idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]),
+                   [-1])
   boxes = tf.reshape(
       tf.gather(tf.reshape(boxes, [-1, 4]), idx),
       [batch_size, max_output_size, 4])
...
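The `tf.while_loop` above runs one `_suppression_loop_body` step per tile of `NMS_TILE_SIZE` boxes (the loop condition is `idx < num_boxes // NMS_TILE_SIZE`), so callers pad the box count up to a tile multiple. A small arithmetic sketch of that invariant; the box count is made up.

    import math

    NMS_TILE_SIZE = 512

    num_boxes = 1300  # hypothetical pre-NMS box count
    padded_num_boxes = int(math.ceil(num_boxes / NMS_TILE_SIZE)) * NMS_TILE_SIZE
    # padded_num_boxes == 1536, so the suppression loop runs
    # padded_num_boxes // NMS_TILE_SIZE == 3 tile-vs-tile iterations.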
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function

 import functools
+
 import tensorflow as tf

 from official.vision.detection.ops import nms
@@ -202,15 +203,14 @@ def _generate_detections_per_image(boxes,
         scores_i, k=tf.minimum(tf.shape(input=scores_i)[-1], pre_nms_num_boxes))
     boxes_i = tf.gather(boxes_i, indices)

-    (nmsed_indices_i,
-     nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
-         tf.cast(boxes_i, tf.float32),
-         tf.cast(scores_i, tf.float32),
-         max_total_size,
-         iou_threshold=nms_iou_threshold,
-         score_threshold=score_threshold,
-         pad_to_max_output_size=True,
-         name='nms_detections_' + str(i))
+    (nmsed_indices_i, nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
+        tf.cast(boxes_i, tf.float32),
+        tf.cast(scores_i, tf.float32),
+        max_total_size,
+        iou_threshold=nms_iou_threshold,
+        score_threshold=score_threshold,
+        pad_to_max_output_size=True,
+        name='nms_detections_' + str(i))
     nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
     nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
     # Sets scores of invalid boxes to -1.
@@ -235,11 +235,8 @@ def _generate_detections_per_image(boxes,
   return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections


-def _generate_detections_batched(boxes,
-                                 scores,
-                                 max_total_size,
-                                 nms_iou_threshold,
-                                 score_threshold):
+def _generate_detections_batched(boxes, scores, max_total_size,
+                                 nms_iou_threshold, score_threshold):
   """Generates detected boxes with scores and classes for one-stage detector.

   The function takes output of multi-level ConvNets and anchor boxes and
@@ -247,19 +244,20 @@ def _generate_detections_batched(boxes,
   supported on TPU currently.

   Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
-      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
-      is the number of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which
-      stacks class probability on all feature levels. The N is the number of
-      total anchors on all levels. The num_classes is the number of classes
-      predicted by the model. Note that the class_outputs here is the raw score.
+    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
+      N, 1, 4], which box predictions on all feature levels. The N is the number
+      of total anchors on all levels.
+    scores: a tensor with shape [batch_size, N, num_classes], which stacks class
+      probability on all feature levels. The N is the number of total anchors on
+      all levels. The num_classes is the number of classes predicted by the
+      model. Note that the class_outputs here is the raw score.
     max_total_size: a scalar representing maximum number of boxes retained over
       all classes.
     nms_iou_threshold: a float representing the threshold for deciding whether
       boxes overlap too much with respect to IOU.
     score_threshold: a float representing the threshold for deciding when to
       remove boxes based on score.
+
   Returns:
     nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
       representing top detected boxes in [y1, x1, y2, x2].
@@ -285,7 +283,8 @@ def _generate_detections_batched(boxes,
       max_total_size=max_total_size,
       iou_threshold=nms_iou_threshold,
       score_threshold=score_threshold,
-      pad_per_class=False,)
+      pad_per_class=False,
+  )
   # De-normalizes box cooridinates.
   nmsed_boxes *= normalizer
   nmsed_classes = tf.cast(nmsed_classes, tf.int32)
@@ -382,16 +381,13 @@ class GenericDetectionGenerator(object):
     box_outputs = tf.reshape(
         box_outputs,
         tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
-    box_outputs = tf.slice(
-        box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
+    box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
     anchor_boxes = tf.tile(
         tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
-    box_outputs = tf.reshape(
-        box_outputs,
-        tf.stack([batch_size, num_detections, 4], axis=-1))
+    box_outputs = tf.reshape(box_outputs,
+                             tf.stack([batch_size, num_detections, 4], axis=-1))
     anchor_boxes = tf.reshape(
-        anchor_boxes,
-        tf.stack([batch_size, num_detections, 4], axis=-1))
+        anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))

     # Box decoding.
     decoded_boxes = box_utils.decode_boxes(
...
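A self-contained sketch of the padded per-image NMS call reformatted above, using `tf.image.non_max_suppression_padded` with the same argument pattern as `_generate_detections_per_image`; the tensors and thresholds here are placeholders.

    import tensorflow as tf

    boxes_i = tf.random.uniform([1000, 4])   # one image, one class slice
    scores_i = tf.random.uniform([1000])
    nmsed_indices_i, nmsed_num_valid_i = tf.image.non_max_suppression_padded(
        tf.cast(boxes_i, tf.float32),
        tf.cast(scores_i, tf.float32),
        100,                                 # max_total_size
        iou_threshold=0.5,
        score_threshold=0.05,
        pad_to_max_output_size=True,
        name='nms_detections_0')
    nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
    # Scores of the padded slots beyond nmsed_num_valid_i are then masked to
    # -1, as the surrounding code notes.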
@@ -56,8 +56,8 @@ def multilevel_propose_rois(rpn_boxes,
     rpn_scores: a dict with keys representing FPN levels and values representing
       logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
     anchor_boxes: a dict with keys representing FPN levels and values
-      representing anchor box tensors of shape
-      [batch_size, feature_h, feature_w, num_anchors * 4].
+      representing anchor box tensors of shape [batch_size, feature_h,
+      feature_w, num_anchors * 4].
     image_shape: a tensor of shape [batch_size, 2] where the last dimension are
       [height, width] of the scaled image.
     rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
@@ -112,17 +112,14 @@ def multilevel_propose_rois(rpn_boxes,
       this_level_scores = tf.sigmoid(this_level_scores)

       if decode_boxes:
-        this_level_boxes = box_utils.decode_boxes(
-            this_level_boxes, this_level_anchors)
+        this_level_boxes = box_utils.decode_boxes(this_level_boxes,
+                                                  this_level_anchors)
       if clip_boxes:
-        this_level_boxes = box_utils.clip_boxes(
-            this_level_boxes, image_shape)
+        this_level_boxes = box_utils.clip_boxes(this_level_boxes, image_shape)

       if rpn_min_size_threshold > 0.0:
         this_level_boxes, this_level_scores = box_utils.filter_boxes(
-            this_level_boxes,
-            this_level_scores,
-            image_shape,
+            this_level_boxes, this_level_scores, image_shape,
             rpn_min_size_threshold)

       this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
@@ -142,8 +139,9 @@ def multilevel_propose_rois(rpn_boxes,
       else:
         if rpn_score_threshold > 0.0:
           this_level_boxes, this_level_scores = (
-              box_utils.filter_boxes_by_scores(
-                  this_level_boxes, this_level_scores, rpn_score_threshold))
+              box_utils.filter_boxes_by_scores(this_level_boxes,
+                                               this_level_scores,
+                                               rpn_score_threshold))
         this_level_boxes, this_level_scores = box_utils.top_k_boxes(
             this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
         this_level_roi_scores, this_level_rois = (
@@ -154,9 +152,7 @@ def multilevel_propose_rois(rpn_boxes,
               iou_threshold=rpn_nms_threshold))
       else:
         this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
-            this_level_rois,
-            this_level_scores,
-            k=this_level_post_nms_top_k)
+            this_level_rois, this_level_scores, k=this_level_post_nms_top_k)

       rois.append(this_level_rois)
       roi_scores.append(this_level_roi_scores)
@@ -199,8 +195,8 @@ class ROIGenerator(object):
       scores: a dict with keys representing FPN levels and values representing
         logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
       anchor_boxes: a dict with keys representing FPN levels and values
-        representing anchor box tensors of shape
-        [batch_size, feature_h, feature_w, num_anchors * 4].
+        representing anchor box tensors of shape [batch_size, feature_h,
+        feature_w, num_anchors * 4].
       image_shape: a tensor of shape [batch_size, 2] where the last dimension
         are [height, width] of the scaled image.
       is_training: a bool indicating whether it is in training or inference
@@ -220,16 +216,16 @@ class ROIGenerator(object):
         scores,
         anchor_boxes,
         image_shape,
-        rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k if is_training
-                           else self._test_rpn_pre_nms_top_k),
-        rpn_post_nms_top_k=(self._rpn_post_nms_top_k if is_training
-                            else self._test_rpn_post_nms_top_k),
-        rpn_nms_threshold=(self._rpn_nms_threshold if is_training
-                           else self._test_rpn_nms_threshold),
-        rpn_score_threshold=(self._rpn_score_threshold if is_training
-                             else self._test_rpn_score_threshold),
-        rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training
-                                else self._test_rpn_min_size_threshold),
+        rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
+                           if is_training else self._test_rpn_pre_nms_top_k),
+        rpn_post_nms_top_k=(self._rpn_post_nms_top_k
+                            if is_training else self._test_rpn_post_nms_top_k),
+        rpn_nms_threshold=(self._rpn_nms_threshold
+                           if is_training else self._test_rpn_nms_threshold),
+        rpn_score_threshold=(self._rpn_score_threshold if is_training else
+                             self._test_rpn_score_threshold),
+        rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
+                                self._test_rpn_min_size_threshold),
         decode_boxes=True,
         clip_boxes=True,
         use_batched_nms=self._use_batched_nms,
...
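For orientation, a hypothetical single-level condensation of the proposal flow touched above (sigmoid, decode, clip, top-k), reusing the `box_utils` helpers whose call sites appear in the diff. The function name and top-k sizes are stand-ins for the train/test settings that `ROIGenerator` switches on `is_training`; this is not part of the commit.

    import tensorflow as tf

    from official.vision.detection.utils import box_utils


    def propose_rois_one_level(raw_scores, raw_boxes, anchors, image_shape,
                               is_training, train_top_k=2000, test_top_k=1000):
      """Hypothetical per-level proposal step, sketched from the diff."""
      scores = tf.sigmoid(raw_scores)
      boxes = box_utils.decode_boxes(raw_boxes, anchors)
      boxes = box_utils.clip_boxes(boxes, image_shape)
      k = train_top_k if is_training else test_top_k
      return box_utils.top_k_boxes(boxes, scores, k=k)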
@@ -20,7 +20,6 @@ from __future__ import print_function
-
 import tensorflow as tf

 _EPSILON = 1e-8
@@ -30,6 +29,7 @@ def nearest_upsampling(data, scale):
   Args:
     data: A tensor with a shape of [batch, height_in, width_in, channels].
     scale: An integer multiple to scale resolution of input data.
+
   Returns:
     data_up: A tensor with a shape of
       [batch, height_in*scale, width_in*scale, channels]. Same dtype as input
@@ -382,8 +382,7 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
     areas_sqrt = tf.sqrt(box_height * box_width)
     levels = tf.cast(
         tf.math.floordiv(
-            tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) +
-        4.0,
+            tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) + 4.0,
         dtype=tf.int32)
     # Maps levels between [min_level, max_level].
     levels = tf.minimum(max_level, tf.maximum(levels, min_level))
@@ -395,9 +394,12 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
     boxes /= tf.expand_dims(scale_to_level, axis=2)
     box_width /= scale_to_level
     box_height /= scale_to_level
-    boxes = tf.concat([boxes[:, :, 0:2],
-                       tf.expand_dims(box_height, -1),
-                       tf.expand_dims(box_width, -1)], axis=-1)
+    boxes = tf.concat([
+        boxes[:, :, 0:2],
+        tf.expand_dims(box_height, -1),
+        tf.expand_dims(box_width, -1)
+    ],
+                      axis=-1)

     # Maps levels to [0, max_level-min_level].
     levels -= min_level
@@ -464,12 +466,12 @@ def single_level_feature_crop(features, level_boxes, detection_prior_levels,
   Args:
-    features: a float tensor of shape [batch_size, num_levels,
-      max_feature_size, max_feature_size, num_downsample_channels].
-    level_boxes: a float Tensor of the level boxes to crop from.
-      [batch_size, num_instances, 4].
+    features: a float tensor of shape [batch_size, num_levels, max_feature_size,
+      max_feature_size, num_downsample_channels].
+    level_boxes: a float Tensor of the level boxes to crop from. [batch_size,
+      num_instances, 4].
     detection_prior_levels: an int Tensor of instance assigned level of shape
       [batch_size, num_instances].
     min_mask_level: minimum FPN level to crop mask feature from.
     mask_crop_size: an int of mask crop size.
@@ -478,8 +480,8 @@ def single_level_feature_crop(features, level_boxes, detection_prior_levels,
       mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
       instance feature crop.
   """
-  (batch_size, num_levels, max_feature_size,
-   _, num_downsample_channels) = features.get_shape().as_list()
+  (batch_size, num_levels, max_feature_size, _,
+   num_downsample_channels) = features.get_shape().as_list()
   _, num_of_instances, _ = level_boxes.get_shape().as_list()
   level_boxes = tf.cast(level_boxes, tf.int32)
   assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]
@@ -503,32 +505,25 @@ def single_level_feature_crop(features, level_boxes, detection_prior_levels,
   indices = tf.reshape(
       tf.tile(
           tf.reshape(
-              tf.range(batch_size) * batch_dim_size,
-              [batch_size, 1, 1, 1]),
-          [1, num_of_instances,
-           mask_crop_size, mask_crop_size]) +
-      tf.tile(
-          tf.reshape(levels * level_dim_size,
-                     [batch_size, num_of_instances, 1, 1]),
-          [1, 1, mask_crop_size, mask_crop_size]) +
-      tf.tile(
-          tf.reshape(y_indices * height_dim_size,
-                     [batch_size, num_of_instances,
-                      mask_crop_size, 1]),
-          [1, 1, 1, mask_crop_size]) +
+              tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
+          [1, num_of_instances, mask_crop_size, mask_crop_size]) + tf.tile(
+              tf.reshape(levels * level_dim_size,
+                         [batch_size, num_of_instances, 1, 1]),
+              [1, 1, mask_crop_size, mask_crop_size]) + tf.tile(
+                  tf.reshape(y_indices * height_dim_size,
+                             [batch_size, num_of_instances, mask_crop_size, 1]),
+                  [1, 1, 1, mask_crop_size]) +
       tf.tile(
           tf.reshape(x_indices,
-                     [batch_size, num_of_instances,
-                      1, mask_crop_size]),
+                     [batch_size, num_of_instances, 1, mask_crop_size]),
          [1, 1, mask_crop_size, 1]), [-1])

-  features_r2 = tf.reshape(features,
-                           [-1, num_downsample_channels])
+  features_r2 = tf.reshape(features, [-1, num_downsample_channels])
   crop_features = tf.reshape(
-      tf.gather(features_r2, indices),
-      [batch_size * num_of_instances,
-       mask_crop_size, mask_crop_size,
-       num_downsample_channels])
+      tf.gather(features_r2, indices), [
+          batch_size * num_of_instances, mask_crop_size, mask_crop_size,
+          num_downsample_channels
+      ])

   return crop_features
@@ -546,9 +541,9 @@ def crop_mask_in_target_box(masks,
     boxes: a float tensor representing box cooridnates that tightly enclose
       masks with a shape of [batch_size, num_masks, 4] in un-normalized
       coordinates. A box is represented by [ymin, xmin, ymax, xmax].
-    target_boxes: a float tensor representing target box cooridnates for
-      masks with a shape of [batch_size, num_masks, 4] in un-normalized
-      coordinates. A box is represented by [ymin, xmin, ymax, xmax].
+    target_boxes: a float tensor representing target box cooridnates for masks
+      with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A
+      box is represented by [ymin, xmin, ymax, xmax].
     output_size: A scalar to indicate the output crop size. It currently only
       supports to output a square shape outputs.
     sample_offset: a float number in [0, 1] indicates the subpixel sample offset
@@ -561,10 +556,10 @@ def crop_mask_in_target_box(masks,
   """
   with tf.name_scope('crop_mask_in_target_box'):
     batch_size, num_masks, height, width = masks.get_shape().as_list()
-    masks = tf.reshape(masks, [batch_size*num_masks, height, width, 1])
+    masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
     # Pad zeros on the boundary of masks.
     masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
-    masks = tf.reshape(masks, [batch_size, num_masks, height+4, width+4, 1])
+    masks = tf.reshape(masks, [batch_size, num_masks, height + 4, width + 4, 1])

     # Projects target box locations and sizes to corresponding cropped
     # mask coordinates.
@@ -572,10 +567,10 @@ def crop_mask_in_target_box(masks,
         value=boxes, num_or_size_splits=4, axis=2)
     bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
         value=target_boxes, num_or_size_splits=4, axis=2)
-    y_transform = (bb_y_min - gt_y_min) * height / (
-        gt_y_max - gt_y_min + _EPSILON) + 2
-    x_transform = (bb_x_min - gt_x_min) * height / (
-        gt_x_max - gt_x_min + _EPSILON) + 2
+    y_transform = (bb_y_min - gt_y_min) * height / (gt_y_max - gt_y_min +
+                                                    _EPSILON) + 2
+    x_transform = (bb_x_min - gt_x_min) * height / (gt_x_max - gt_x_min +
+                                                    _EPSILON) + 2
     h_transform = (bb_y_max - bb_y_min) * width / (
         gt_y_max - gt_y_min + _EPSILON)
     w_transform = (bb_x_max - bb_x_min) * width / (
@@ -592,8 +587,8 @@ def crop_mask_in_target_box(masks,
     # Reshape tensors to have the right shape for selective_crop_and_resize.
     trasnformed_boxes = tf.concat(
         [y_transform, x_transform, h_transform, w_transform], -1)
-    levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
-                     [batch_size, 1])
+    levels = tf.tile(
+        tf.reshape(tf.range(num_masks), [1, num_masks]), [batch_size, 1])

     cropped_masks = selective_crop_and_resize(
         masks,
...
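The level computation in `multilevel_crop_and_resize` above is the standard FPN assignment, level = floor(log2(sqrt(box_area) / 224)) + 4, clipped to [min_level, max_level]. A hypothetical NumPy check; the level bounds here are placeholders, since the real code derives them from the feature dict.

    import numpy as np


    def assign_fpn_level(box_height, box_width, min_level=2, max_level=5):
      areas_sqrt = np.sqrt(box_height * box_width)
      level = np.floor(np.log2(areas_sqrt / 224.0)) + 4.0
      return int(np.clip(level, min_level, max_level))

    assign_fpn_level(224.0, 224.0)  # -> 4: a 224x224 box reads from level 4
    assign_fpn_level(112.0, 112.0)  # -> 3: halving the scale drops one level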
@@ -87,18 +87,16 @@ def box_matching(boxes, gt_boxes, gt_classes):
                                matched_gt_boxes)
   matched_gt_classes = tf.gather_nd(gt_classes, gather_nd_indices)
-  matched_gt_classes = tf.where(
-      background_box_mask,
-      tf.zeros_like(matched_gt_classes),
-      matched_gt_classes)
+  matched_gt_classes = tf.where(background_box_mask,
+                                tf.zeros_like(matched_gt_classes),
+                                matched_gt_classes)

-  matched_gt_indices = tf.where(
-      background_box_mask,
-      -tf.ones_like(argmax_iou_indices),
-      argmax_iou_indices)
+  matched_gt_indices = tf.where(background_box_mask,
+                                -tf.ones_like(argmax_iou_indices),
+                                argmax_iou_indices)

-  return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
-          matched_iou, iou)
+  return (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
+          iou)


 def assign_and_sample_proposals(proposed_boxes,
@@ -121,22 +119,21 @@ def assign_and_sample_proposals(proposed_boxes,
   returns box_targets, class_targets, and RoIs.

   Args:
-    proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
-      of proposals before groundtruth assignment. The last dimension is the
-      box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
-      format.
-    gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4].
-      The coordinates of gt_boxes are in the pixel coordinates of the scaled
-      image. This tensor might have padding of values -1 indicating the invalid
-      box coordinates.
+    proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number of
+      proposals before groundtruth assignment. The last dimension is the box
+      coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
+    gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
+      coordinates of gt_boxes are in the pixel coordinates of the scaled image.
+      This tensor might have padding of values -1 indicating the invalid box
+      coordinates.
     gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
       tensor might have paddings with values of -1 indicating the invalid
       classes.
     num_samples_per_image: a integer represents RoI minibatch size per image.
     mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes before
       sampling proposals.
-    fg_fraction: a float represents the target fraction of RoI minibatch that
-      is labeled foreground (i.e., class > 0).
+    fg_fraction: a float represents the target fraction of RoI minibatch that is
+      labeled foreground (i.e., class > 0).
     fg_iou_thresh: a float represents the IoU overlap threshold for an RoI to be
       considered foreground (if >= fg_iou_thresh).
     bg_iou_thresh_hi: a float represents the IoU overlap threshold for an RoI to
@@ -163,8 +160,8 @@ def assign_and_sample_proposals(proposed_boxes,
   else:
     boxes = proposed_boxes

-  (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
-   matched_iou, _) = box_matching(boxes, gt_boxes, gt_classes)
+  (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
+   _) = box_matching(boxes, gt_boxes, gt_classes)

   positive_match = tf.greater(matched_iou, fg_iou_thresh)
   negative_match = tf.logical_and(
@@ -173,10 +170,12 @@ def assign_and_sample_proposals(proposed_boxes,
   ignored_match = tf.less(matched_iou, 0.0)

   # re-assign negatively matched boxes to the background class.
-  matched_gt_classes = tf.where(
-      negative_match, tf.zeros_like(matched_gt_classes), matched_gt_classes)
-  matched_gt_indices = tf.where(
-      negative_match, tf.zeros_like(matched_gt_indices), matched_gt_indices)
+  matched_gt_classes = tf.where(negative_match,
+                                tf.zeros_like(matched_gt_classes),
+                                matched_gt_classes)
+  matched_gt_indices = tf.where(negative_match,
+                                tf.zeros_like(matched_gt_indices),
+                                matched_gt_indices)

   sample_candidates = tf.logical_and(
       tf.logical_or(positive_match, negative_match),
@@ -189,8 +188,9 @@ def assign_and_sample_proposals(proposed_boxes,
   batch_size, _ = sample_candidates.get_shape().as_list()
   sampled_indicators = []
   for i in range(batch_size):
-    sampled_indicator = sampler.subsample(
-        sample_candidates[i], num_samples_per_image, positive_match[i])
+    sampled_indicator = sampler.subsample(sample_candidates[i],
+                                          num_samples_per_image,
+                                          positive_match[i])
     sampled_indicators.append(sampled_indicator)
   sampled_indicators = tf.stack(sampled_indicators)
   _, sampled_indices = tf.nn.top_k(
@@ -206,10 +206,8 @@ def assign_and_sample_proposals(proposed_boxes,
   sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
   sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
-  sampled_gt_classes = tf.gather_nd(
-      matched_gt_classes, gather_nd_indices)
-  sampled_gt_indices = tf.gather_nd(
-      matched_gt_indices, gather_nd_indices)
+  sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
+  sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)

   return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
           sampled_gt_indices)
@@ -237,8 +235,8 @@ def sample_and_crop_foreground_masks(candidate_rois,
     candidate_gt_indices: a tensor of shape [batch_size, N], storing the
       corresponding groundtruth instance indices to the `candidate_gt_boxes`,
       i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and
-      gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is the
-      superset of candidate_gt_boxes.
+      gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is
+      the superset of candidate_gt_boxes.
     gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
       containing all the groundtruth masks which sample masks are drawn from.
     num_mask_samples_per_image: an integer which specifies the number of masks
@@ -266,27 +264,29 @@ def sample_and_crop_foreground_masks(candidate_rois,
       tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
       tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
-  gather_nd_instance_indices = tf.stack(
-      [batch_indices, fg_instance_indices], axis=-1)
-  foreground_rois = tf.gather_nd(
-      candidate_rois, gather_nd_instance_indices)
-  foreground_boxes = tf.gather_nd(
-      candidate_gt_boxes, gather_nd_instance_indices)
-  foreground_classes = tf.gather_nd(
-      candidate_gt_classes, gather_nd_instance_indices)
-  foreground_gt_indices = tf.gather_nd(
-      candidate_gt_indices, gather_nd_instance_indices)
+  gather_nd_instance_indices = tf.stack([batch_indices, fg_instance_indices],
+                                        axis=-1)
+  foreground_rois = tf.gather_nd(candidate_rois, gather_nd_instance_indices)
+  foreground_boxes = tf.gather_nd(candidate_gt_boxes,
+                                  gather_nd_instance_indices)
+  foreground_classes = tf.gather_nd(candidate_gt_classes,
+                                    gather_nd_instance_indices)
+  foreground_gt_indices = tf.gather_nd(candidate_gt_indices,
+                                       gather_nd_instance_indices)

   foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
   batch_indices = (
       tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
       tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
-  gather_nd_gt_indices = tf.stack(
-      [batch_indices, foreground_gt_indices], axis=-1)
+  gather_nd_gt_indices = tf.stack([batch_indices, foreground_gt_indices],
+                                  axis=-1)
   foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)

   cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
-      foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
+      foreground_masks,
+      foreground_boxes,
+      foreground_rois,
+      mask_target_size,
       sample_offset=0.5)

   return foreground_rois, foreground_classes, cropped_foreground_masks
@@ -307,12 +307,11 @@ class ROISampler(object):
     """Sample and assign RoIs for training.

     Args:
-      rois: a tensor of shape of [batch_size, N, 4]. N is the number
-        of proposals before groundtruth assignment. The last dimension is the
-        box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
-        format.
-      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4].
-        The coordinates of gt_boxes are in the pixel coordinates of the scaled
+      rois: a tensor of shape of [batch_size, N, 4]. N is the number of
+        proposals before groundtruth assignment. The last dimension is the box
+        coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
+      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
+        coordinates of gt_boxes are in the pixel coordinates of the scaled
         image. This tensor might have padding of values -1 indicating the
         invalid box coordinates.
       gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
@@ -350,12 +349,8 @@ class MaskSampler(object):
     self._mask_target_size = mask_target_size
     self._num_mask_samples_per_image = num_mask_samples_per_image

-  def __call__(self,
-               candidate_rois,
-               candidate_gt_boxes,
-               candidate_gt_classes,
-               candidate_gt_indices,
-               gt_masks):
+  def __call__(self, candidate_rois, candidate_gt_boxes, candidate_gt_classes,
+               candidate_gt_indices, gt_masks):
     """Sample and create mask targets for training.

     Args:
@@ -371,8 +366,8 @@ class MaskSampler(object):
       candidate_gt_indices: a tensor of shape [batch_size, N], storing the
         corresponding groundtruth instance indices to the `candidate_gt_boxes`,
         i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
-        where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N,
-        is the superset of candidate_gt_boxes.
+        where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >=
+        N, is the superset of candidate_gt_boxes.
       gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
         containing all the groundtruth masks which sample masks are drawn from.
         after sampling. The output masks are resized w.r.t the sampled RoIs.
@@ -388,12 +383,9 @@ class MaskSampler(object):
       cropped foreground masks used for training.
     """
     foreground_rois, foreground_classes, cropped_foreground_masks = (
-        sample_and_crop_foreground_masks(
-            candidate_rois,
-            candidate_gt_boxes,
-            candidate_gt_classes,
-            candidate_gt_indices,
-            gt_masks,
-            self._num_mask_samples_per_image,
-            self._mask_target_size))
+        sample_and_crop_foreground_masks(candidate_rois, candidate_gt_boxes,
+                                         candidate_gt_classes,
+                                         candidate_gt_indices, gt_masks,
+                                         self._num_mask_samples_per_image,
+                                         self._mask_target_size))
     return foreground_rois, foreground_classes, cropped_foreground_masks
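The recurring pattern in this file is elementwise re-assignment with `tf.where`. A self-contained toy version of the background re-assignment above; the threshold and values are made up, and the real code derives `negative_match` from `bg_iou_thresh_lo`/`bg_iou_thresh_hi` rather than the fixed 0.5 used here.

    import tensorflow as tf

    matched_iou = tf.constant([0.8, 0.2, -1.0])
    matched_gt_classes = tf.constant([3, 5, 7])

    # Proposals with low-but-valid IoU become background (class 0).
    negative_match = tf.logical_and(
        tf.greater_equal(matched_iou, 0.0), tf.less(matched_iou, 0.5))
    matched_gt_classes = tf.where(negative_match,
                                  tf.zeros_like(matched_gt_classes),
                                  matched_gt_classes)
    # -> [3, 0, 7]; the -1.0 entry is an ignored match and keeps its class.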
@@ -115,8 +115,8 @@ def normalize_boxes(boxes, image_shape):
   """Converts boxes to the normalized coordinates.

   Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates
-      of boxes in ymin, xmin, ymax, xmax order.
+    boxes: a tensor whose last dimension is 4 representing the coordinates of
+      boxes in ymin, xmin, ymax, xmax order.
     image_shape: a list of two integers, a two-element vector or a tensor such
       that all but the last dimensions are `broadcastable` to `boxes`. The last
       dimension is 2, which represents [height, width].
@@ -153,8 +153,8 @@ def denormalize_boxes(boxes, image_shape):
   """Converts boxes normalized by [height, width] to pixel coordinates.

   Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates
-      of boxes in ymin, xmin, ymax, xmax order.
+    boxes: a tensor whose last dimension is 4 representing the coordinates of
+      boxes in ymin, xmin, ymax, xmax order.
     image_shape: a list of two integers, a two-element vector or a tensor such
       that all but the last dimensions are `broadcastable` to `boxes`. The last
       dimension is 2, which represents [height, width].
@@ -187,8 +187,8 @@ def clip_boxes(boxes, image_shape):
   """Clips boxes to image boundaries.

   Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates
-      of boxes in ymin, xmin, ymax, xmax order.
+    boxes: a tensor whose last dimension is 4 representing the coordinates of
+      boxes in ymin, xmin, ymax, xmax order.
     image_shape: a list of two integers, a two-element vector or a tensor such
       that all but the last dimensions are `broadcastable` to `boxes`. The last
       dimension is 2, which represents [height, width].
@@ -255,8 +255,8 @@ def encode_boxes(boxes, anchors, weights=None):
   """Encode boxes to targets.

   Args:
-    boxes: a tensor whose last dimension is 4 representing the coordinates
-      of boxes in ymin, xmin, ymax, xmax order.
+    boxes: a tensor whose last dimension is 4 representing the coordinates of
+      boxes in ymin, xmin, ymax, xmax order.
     anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
       representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
     weights: None or a list of four float numbers used to scale coordinates.
@@ -302,9 +302,8 @@ def encode_boxes(boxes, anchors, weights=None):
     encoded_dh *= weights[2]
     encoded_dw *= weights[3]

-  encoded_boxes = tf.concat(
-      [encoded_dy, encoded_dx, encoded_dh, encoded_dw],
-      axis=-1)
+  encoded_boxes = tf.concat([encoded_dy, encoded_dx, encoded_dh, encoded_dw],
+                            axis=-1)
   return encoded_boxes
@@ -359,10 +358,11 @@ def decode_boxes(encoded_boxes, anchors, weights=None):
   decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h - 1.0
   decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w - 1.0
-  decoded_boxes = tf.concat(
-      [decoded_boxes_ymin, decoded_boxes_xmin,
-       decoded_boxes_ymax, decoded_boxes_xmax],
-      axis=-1)
+  decoded_boxes = tf.concat([
+      decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax,
+      decoded_boxes_xmax
+  ],
+                            axis=-1)
   return decoded_boxes
@@ -546,6 +546,6 @@ def get_non_empty_box_indices(boxes):
   # Selects indices if box height or width is 0.
   height = boxes[:, 2] - boxes[:, 0]
   width = boxes[:, 3] - boxes[:, 1]
-  indices = tf.where(tf.logical_and(tf.greater(height, 0),
-                                    tf.greater(width, 0)))
+  indices = tf.where(
+      tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
   return indices[:, 0]
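A hypothetical round-trip check for the `encode_boxes`/`decode_boxes` pair reformatted above: encoding a box against an anchor and then decoding the result should recover the box. Both signatures appear in the diff; the coordinates below are arbitrary.

    import tensorflow as tf

    from official.vision.detection.utils import box_utils

    anchors = tf.constant([[[10.0, 10.0, 50.0, 50.0]]])  # [1, 1, 4]
    boxes = tf.constant([[[12.0, 14.0, 48.0, 52.0]]])
    encoded = box_utils.encode_boxes(boxes, anchors)
    decoded = box_utils.decode_boxes(encoded, anchors)
    # decoded should be numerically close to the original boxes.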
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
"""Utility functions for input processing.""" """Utility functions for input processing."""
import math import math
import tensorflow as tf import tensorflow as tf
from official.vision.detection.utils import box_utils from official.vision.detection.utils import box_utils
...@@ -91,12 +92,12 @@ def compute_padded_size(desired_size, stride): ...@@ -91,12 +92,12 @@ def compute_padded_size(desired_size, stride):
[height, width] of the padded output image size. [height, width] of the padded output image size.
""" """
if isinstance(desired_size, list) or isinstance(desired_size, tuple): if isinstance(desired_size, list) or isinstance(desired_size, tuple):
padded_size = [int(math.ceil(d * 1.0 / stride) * stride) padded_size = [
for d in desired_size] int(math.ceil(d * 1.0 / stride) * stride) for d in desired_size
]
else: else:
padded_size = tf.cast( padded_size = tf.cast(
tf.math.ceil( tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32) tf.int32)
return padded_size return padded_size
...@@ -158,8 +159,8 @@ def resize_and_crop_image(image, ...@@ -158,8 +159,8 @@ def resize_and_crop_image(image,
else: else:
scaled_size = desired_size scaled_size = desired_size
scale = tf.minimum( scale = tf.minimum(scaled_size[0] / image_size[0],
scaled_size[0] / image_size[0], scaled_size[1] / image_size[1]) scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale) scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale. # Computes 2D image_scale.
...@@ -169,9 +170,8 @@ def resize_and_crop_image(image, ...@@ -169,9 +170,8 @@ def resize_and_crop_image(image,
# desired_size. # desired_size.
if random_jittering: if random_jittering:
max_offset = scaled_size - desired_size max_offset = scaled_size - desired_size
max_offset = tf.where(tf.less(max_offset, 0), max_offset = tf.where(
tf.zeros_like(max_offset), tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
max_offset)
offset = max_offset * tf.random.uniform([ offset = max_offset * tf.random.uniform([
2, 2,
], 0, 1, seed=seed) ], 0, 1, seed=seed)
...@@ -191,9 +191,9 @@ def resize_and_crop_image(image, ...@@ -191,9 +191,9 @@ def resize_and_crop_image(image,
image_info = tf.stack([ image_info = tf.stack([
image_size, image_size,
tf.cast(desired_size, dtype=tf.float32), tf.cast(desired_size, dtype=tf.float32), image_scale,
image_scale, tf.cast(offset, tf.float32)
tf.cast(offset, tf.float32)]) ])
return output_image, image_info return output_image, image_info
...@@ -288,25 +288,21 @@ def resize_and_crop_image_v2(image, ...@@ -288,25 +288,21 @@ def resize_and_crop_image_v2(image,
image, tf.cast(scaled_size, tf.int32), method=method) image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering: if random_jittering:
scaled_image = scaled_image[ scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[0]:offset[0] + desired_size[0], offset[1]:offset[1] + desired_size[1], :]
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box( output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
scaled_image, 0, 0, padded_size[0], padded_size[1]) padded_size[0], padded_size[1])
image_info = tf.stack([ image_info = tf.stack([
image_size, image_size,
tf.cast(desired_size, dtype=tf.float32), tf.cast(desired_size, dtype=tf.float32), image_scale,
image_scale, tf.cast(offset, tf.float32)
tf.cast(offset, tf.float32)]) ])
return output_image, image_info return output_image, image_info
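For context, the `image_info` tensor stacked in both functions is `[4, 2]`. A hypothetical example of unpacking it, with the row order taken from the `tf.stack` call above:

```python
import tensorflow as tf

# Hypothetical image_info; rows follow the stack order in the code above.
image_info = tf.constant([[480., 640.],   # original [height, width]
                          [640., 640.],   # desired output size
                          [1., 1.],       # y/x image scale
                          [0., 0.]])      # y/x crop offset
original_size, desired_size, image_scale, offset = tf.unstack(image_info)
```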
def resize_and_crop_boxes(boxes, def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset. """Resizes boxes to output size with scale and offset.
Args: Args:
...@@ -329,10 +325,7 @@ def resize_and_crop_boxes(boxes, ...@@ -329,10 +325,7 @@ def resize_and_crop_boxes(boxes,
return boxes return boxes
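A standalone sketch of the scale-offset-clip transform that `resize_and_crop_boxes` performs; names and values are illustrative, and the actual clipping lives in `box_utils`:

```python
import tensorflow as tf

boxes = tf.constant([[10., 20., 110., 220.]])   # [ymin, xmin, ymax, xmax]
image_scale = tf.constant([0.5, 0.5])           # y/x scale
offset = tf.constant([0., 0.])                  # y/x crop offset
output_size = tf.constant([64., 128.])          # [height, width]

# Scale coordinates, subtract the crop offset, clip to the output window.
boxes = boxes * tf.tile(image_scale, [2]) - tf.tile(offset, [2])
boxes = tf.clip_by_value(boxes, 0., tf.tile(output_size, [2]))
# -> [[5., 10., 55., 110.]]
```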
def resize_and_crop_masks(masks, def resize_and_crop_masks(masks, image_scale, output_size, offset):
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset. """Resizes boxes to output size with scale and offset.
Args: Args:
......
...@@ -18,14 +18,12 @@ from __future__ import division ...@@ -18,14 +18,12 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import math import math
import numpy as np import numpy as np
import cv2 import cv2
def paste_instance_masks(masks, def paste_instance_masks(masks, detected_boxes, image_height, image_width):
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation results. """Paste instance masks to generate the image segmentation results.
Args: Args:
...@@ -95,10 +93,8 @@ def paste_instance_masks(masks, ...@@ -95,10 +93,8 @@ def paste_instance_masks(masks,
y_0 = min(max(ref_box[1], 0), image_height) y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height) y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[ im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(y_0 - ref_box[1]):(y_1 - ref_box[1]), (x_0 - ref_box[0]):(x_1 - ref_box[0])]
(x_0 - ref_box[0]):(x_1 - ref_box[0])
]
segms.append(im_mask) segms.append(im_mask)
segms = np.array(segms) segms = np.array(segms)
...@@ -106,10 +102,7 @@ def paste_instance_masks(masks, ...@@ -106,10 +102,7 @@ def paste_instance_masks(masks,
return segms return segms
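A toy NumPy sketch of the clipped paste in the hunk above, assuming an 8x8 image and a box that overhangs the border: the destination window is clipped to the image, and the source indices are shifted by the box origin accordingly.

```python
import numpy as np

image = np.zeros((8, 8), dtype=np.uint8)
mask = np.ones((4, 4), dtype=np.uint8)
ref_box = [6, 6, 9, 9]                      # [x0, y0, x1, y1], overhangs image

# Clip the destination window to the image bounds.
x_0, x_1 = max(ref_box[0], 0), min(ref_box[2] + 1, 8)
y_0, y_1 = max(ref_box[1], 0), min(ref_box[3] + 1, 8)
# Shift source indices by the box origin so they line up with the window.
image[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
                               (x_0 - ref_box[0]):(x_1 - ref_box[0])]
```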
def paste_instance_masks_v2(masks, def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation (v2). """Paste instance masks to generate the image segmentation (v2).
Args: Args:
...@@ -146,34 +139,22 @@ def paste_instance_masks_v2(masks, ...@@ -146,34 +139,22 @@ def paste_instance_masks_v2(masks,
beta = box[3] / (1.0 * mask_height) beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name # pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate. # Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array( M_mask_to_image = np.array([[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]],
[[alpha, 0, xmin], dtype=np.float32)
[0, beta, ymin],
[0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate. # Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array( M_image_to_crop = np.array(
[[1, 0, -xmin_int], [[1, 0, -xmin_int], [0, 1, -ymin_int], [0, 0, 1]], dtype=np.float32)
[0, 1, -ymin_int],
[0, 0, 1]],
dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image) M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the # Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled # warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5). # at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot( M = np.dot(
np.dot( np.dot(
np.array([[1, 0, -0.5], np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32), M),
[0, 1, -0.5], np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32))
[0, 0, 1]], np.float32),
M),
np.array([[1, 0, 0.5],
[0, 1, 0.5],
[0, 0, 1]], np.float32))
# pylint: enable=invalid-name # pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective( cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M, mask.astype(np.float32), M, (xmax_int - xmin_int, ymax_int - ymin_int))
(xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8) cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width)) img_mask = np.zeros((image_height, image_width))
...@@ -181,12 +162,10 @@ def paste_instance_masks_v2(masks, ...@@ -181,12 +162,10 @@ def paste_instance_masks_v2(masks,
x1 = max(min(xmax_int, image_width), 0) x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0) y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0) y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[ img_mask[y0:y1, x0:x1] = cropped_mask[(y0 - ymin_int):(y1 - ymin_int),
(y0 - ymin_int):(y1 - ymin_int), (x0 - xmin_int):(x1 - xmin_int)]
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask) segms.append(img_mask)
segms = np.array(segms) segms = np.array(segms)
return segms return segms
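The half-pixel compensation being reflowed above composes three 3x3 transforms. A sketch with an identity stand-in for `M` (in the function, `M` is `M_image_to_crop @ M_mask_to_image`): pre- and post-multiplying by -0.5/+0.5 pixel shifts makes `cv2.warpPerspective` sample pixel centers at (0.5, 0.5) rather than (0, 0).

```python
import numpy as np

shift_neg = np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32)
shift_pos = np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32)
M = np.eye(3, dtype=np.float32)   # stand-in for M_image_to_crop @ M_mask_to_image
M_compensated = shift_neg @ M @ shift_pos
```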
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Class to subsample minibatches by balancing positives and negatives. """Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range Subsamples minibatches based on a pre-specified positive fraction in range
...@@ -92,10 +91,10 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler): ...@@ -92,10 +91,10 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler):
Args: Args:
input_tensor: An int32 tensor of shape [N] to be sliced. input_tensor: An int32 tensor of shape [N] to be sliced.
num_start_samples: Number of examples to be sliced from the beginning num_start_samples: Number of examples to be sliced from the beginning of
of the input tensor. the input tensor.
num_end_samples: Number of examples to be sliced from the end of the num_end_samples: Number of examples to be sliced from the end of the input
input tensor. tensor.
      total_num_samples: Sum of num_start_samples and num_end_samples. This       total_num_samples: Sum of num_start_samples and num_end_samples. This
should be a scalar. should be a scalar.
...@@ -110,13 +109,16 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler): ...@@ -110,13 +109,16 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler):
tf.range(input_length), input_length - num_end_samples) tf.range(input_length), input_length - num_end_samples)
selected_positions = tf.logical_or(start_positions, end_positions) selected_positions = tf.logical_or(start_positions, end_positions)
selected_positions = tf.cast(selected_positions, tf.float32) selected_positions = tf.cast(selected_positions, tf.float32)
indexed_positions = tf.multiply(tf.cumsum(selected_positions), indexed_positions = tf.multiply(
selected_positions) tf.cumsum(selected_positions), selected_positions)
one_hot_selector = tf.one_hot(tf.cast(indexed_positions, tf.int32) - 1, one_hot_selector = tf.one_hot(
total_num_samples, tf.cast(indexed_positions, tf.int32) - 1,
dtype=tf.float32) total_num_samples,
return tf.cast(tf.tensordot(tf.cast(input_tensor, tf.float32), dtype=tf.float32)
one_hot_selector, axes=[0, 0]), tf.int32) return tf.cast(
tf.tensordot(
tf.cast(input_tensor, tf.float32), one_hot_selector, axes=[0, 0]),
tf.int32)
def _static_subsample(self, indicator, batch_size, labels): def _static_subsample(self, indicator, batch_size, labels):
"""Returns subsampled minibatch. """Returns subsampled minibatch.
...@@ -182,13 +184,12 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler): ...@@ -182,13 +184,12 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler):
sorted_signed_indicator_idx = tf.nn.top_k( sorted_signed_indicator_idx = tf.nn.top_k(
signed_indicator_idx, input_length, sorted=True).values signed_indicator_idx, input_length, sorted=True).values
[num_positive_samples, [num_positive_samples, num_negative_samples
num_negative_samples] = self._get_num_pos_neg_samples( ] = self._get_num_pos_neg_samples(sorted_signed_indicator_idx, batch_size)
sorted_signed_indicator_idx, batch_size)
sampled_idx = self._get_values_from_start_and_end( sampled_idx = self._get_values_from_start_and_end(
sorted_signed_indicator_idx, num_positive_samples, sorted_signed_indicator_idx, num_positive_samples, num_negative_samples,
num_negative_samples, batch_size) batch_size)
# Shift the indices to start from 0 and remove any samples that are set as # Shift the indices to start from 0 and remove any samples that are set as
# False. # False.
...@@ -203,11 +204,13 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler): ...@@ -203,11 +204,13 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler):
tf.bool) tf.bool)
# project back the order based on stored permutations # project back the order based on stored permutations
reprojections = tf.one_hot(permutation, depth=input_length, reprojections = tf.one_hot(
dtype=tf.float32) permutation, depth=input_length, dtype=tf.float32)
return tf.cast(tf.tensordot( return tf.cast(
tf.cast(sampled_idx_indicator, tf.float32), tf.tensordot(
reprojections, axes=[0, 0]), tf.bool) tf.cast(sampled_idx_indicator, tf.float32),
reprojections,
axes=[0, 0]), tf.bool)
def subsample(self, indicator, batch_size, labels, scope=None): def subsample(self, indicator, batch_size, labels, scope=None):
"""Returns subsampled minibatch. """Returns subsampled minibatch.
...@@ -218,7 +221,7 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler): ...@@ -218,7 +221,7 @@ class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler):
randomly selects negative samples so that the positive sample fraction randomly selects negative samples so that the positive sample fraction
        matches self._positive_fraction. It cannot be None if is_static is True.         matches self._positive_fraction. It cannot be None if is_static is True.
labels: boolean tensor of shape [N] denoting positive(=True) and negative labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples. (=False) examples.
scope: name scope. scope: name scope.
Returns: Returns:
......
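The cumsum/one-hot gather used by `_get_values_from_start_and_end` is compact but non-obvious. A small illustrative sketch: positions marked `1.` get a 1-based index via cumsum, the one-hot selector turns those indices into a gather matrix, and `tensordot` pulls the marked values out with static shapes throughout.

```python
import tensorflow as tf

input_tensor = tf.constant([7, 8, 9, 10], tf.int32)
selected = tf.constant([1., 0., 1., 0.])              # keep positions 0 and 2
indexed = tf.cumsum(selected) * selected              # [1., 0., 2., 0.]
one_hot = tf.one_hot(tf.cast(indexed, tf.int32) - 1, depth=2)   # [4, 2]
gathered = tf.cast(
    tf.tensordot(tf.cast(input_tensor, tf.float32), one_hot, axes=[0, 0]),
    tf.int32)                                          # -> [7, 9]
```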
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Base box coder. """Base box coder.
Box coders convert between coordinate frames, namely image-centric Box coders convert between coordinate frames, namely image-centric
...@@ -32,7 +31,6 @@ from abc import abstractproperty ...@@ -32,7 +31,6 @@ from abc import abstractproperty
import tensorflow as tf import tensorflow as tf
# Box coder types. # Box coder types.
FASTER_RCNN = 'faster_rcnn' FASTER_RCNN = 'faster_rcnn'
KEYPOINT = 'keypoint' KEYPOINT = 'keypoint'
...@@ -138,11 +136,11 @@ def batch_decode(encoded_boxes, box_coder, anchors): ...@@ -138,11 +136,11 @@ def batch_decode(encoded_boxes, box_coder, anchors):
""" """
encoded_boxes.get_shape().assert_has_rank(3) encoded_boxes.get_shape().assert_has_rank(3)
if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static(): if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static():
raise ValueError('The number of anchors inferred from encoded_boxes' raise ValueError(
' and anchors are inconsistent: shape[1] of encoded_boxes' 'The number of anchors inferred from encoded_boxes'
' %s should be equal to the number of anchors: %s.' % ' and anchors are inconsistent: shape[1] of encoded_boxes'
(encoded_boxes.get_shape()[1].value, ' %s should be equal to the number of anchors: %s.' %
anchors.num_boxes_static())) (encoded_boxes.get_shape()[1].value, anchors.num_boxes_static()))
decoded_boxes = tf.stack([ decoded_boxes = tf.stack([
box_coder.decode(boxes, anchors).get() box_coder.decode(boxes, anchors).get()
......
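For reference, a sketch of the per-batch decode loop above, with `decode_fn` standing in for `box_coder.decode(...).get()`: each `[N, 4]` batch slice is decoded against the shared anchors and the results re-stacked.

```python
import tensorflow as tf

def batch_decode_sketch(encoded_boxes, anchors, decode_fn):
    # Unstack along the batch axis, decode each slice, and re-stack.
    return tf.stack(
        [decode_fn(boxes, anchors) for boxes in tf.unstack(encoded_boxes)])
```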
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Bounding Box List definition. """Bounding Box List definition.
BoxList represents a list of bounding boxes as tensorflow BoxList represents a list of bounding boxes as tensorflow
...@@ -126,8 +125,8 @@ class BoxList(object): ...@@ -126,8 +125,8 @@ class BoxList(object):
it returns the box coordinates. it returns the box coordinates.
Args: Args:
field: this optional string parameter can be used to specify field: this optional string parameter can be used to specify a related
a related field to be accessed. field to be accessed.
Returns: Returns:
a tensor representing the box collection or an associated field. a tensor representing the box collection or an associated field.
...@@ -192,8 +191,8 @@ class BoxList(object): ...@@ -192,8 +191,8 @@ class BoxList(object):
"""Retrieves specified fields as a dictionary of tensors. """Retrieves specified fields as a dictionary of tensors.
Args: Args:
fields: (optional) list of fields to return in the dictionary. fields: (optional) list of fields to return in the dictionary. If None
If None (default), all fields are returned. (default), all fields are returned.
Returns: Returns:
tensor_dict: A dictionary of tensors specified by fields. tensor_dict: A dictionary of tensors specified by fields.
......
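A hypothetical usage of the `BoxList` interface documented above; this assumes `BoxList` and its field accessors are in scope:

```python
import tensorflow as tf

boxes = tf.constant([[0., 0., 1., 1.], [.2, .2, .8, .8]])
boxlist = BoxList(boxes)                       # wrap an [N, 4] box tensor
boxlist.add_field('scores', tf.constant([0.9, 0.4]))
coords = boxlist.get()                         # the [N, 4] box tensor
scores = boxlist.get_field('scores')           # an associated per-box field
```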
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Bounding Box List operations. """Bounding Box List operations.
Example box operations that are supported: Example box operations that are supported:
...@@ -152,8 +151,8 @@ def prune_outside_window(boxlist, window, scope=None): ...@@ -152,8 +151,8 @@ def prune_outside_window(boxlist, window, scope=None):
Args: Args:
boxlist: a BoxList holding M_in boxes. boxlist: a BoxList holding M_in boxes.
window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax] window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax] of
of the window the window
scope: name scope. scope: name scope.
Returns: Returns:
...@@ -166,8 +165,10 @@ def prune_outside_window(boxlist, window, scope=None): ...@@ -166,8 +165,10 @@ def prune_outside_window(boxlist, window, scope=None):
value=boxlist.get(), num_or_size_splits=4, axis=1) value=boxlist.get(), num_or_size_splits=4, axis=1)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
coordinate_violations = tf.concat([ coordinate_violations = tf.concat([
tf.less(y_min, win_y_min), tf.less(x_min, win_x_min), tf.less(y_min, win_y_min),
tf.greater(y_max, win_y_max), tf.greater(x_max, win_x_max) tf.less(x_min, win_x_min),
tf.greater(y_max, win_y_max),
tf.greater(x_max, win_x_max)
], 1) ], 1)
valid_indices = tf.reshape( valid_indices = tf.reshape(
tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1]) tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1])
...@@ -183,8 +184,8 @@ def prune_completely_outside_window(boxlist, window, scope=None): ...@@ -183,8 +184,8 @@ def prune_completely_outside_window(boxlist, window, scope=None):
Args: Args:
boxlist: a BoxList holding M_in boxes. boxlist: a BoxList holding M_in boxes.
window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax] window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax] of
of the window the window
scope: name scope. scope: name scope.
Returns: Returns:
...@@ -198,8 +199,10 @@ def prune_completely_outside_window(boxlist, window, scope=None): ...@@ -198,8 +199,10 @@ def prune_completely_outside_window(boxlist, window, scope=None):
value=boxlist.get(), num_or_size_splits=4, axis=1) value=boxlist.get(), num_or_size_splits=4, axis=1)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
coordinate_violations = tf.concat([ coordinate_violations = tf.concat([
tf.greater_equal(y_min, win_y_max), tf.greater_equal(x_min, win_x_max), tf.greater_equal(y_min, win_y_max),
tf.less_equal(y_max, win_y_min), tf.less_equal(x_max, win_x_min) tf.greater_equal(x_min, win_x_max),
tf.less_equal(y_max, win_y_min),
tf.less_equal(x_max, win_x_min)
], 1) ], 1)
valid_indices = tf.reshape( valid_indices = tf.reshape(
tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1]) tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1])
...@@ -274,8 +277,8 @@ def iou(boxlist1, boxlist2, scope=None): ...@@ -274,8 +277,8 @@ def iou(boxlist1, boxlist2, scope=None):
unions = ( unions = (
tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
return tf.where( return tf.where(
tf.equal(intersections, 0.0), tf.equal(intersections, 0.0), tf.zeros_like(intersections),
tf.zeros_like(intersections), tf.truediv(intersections, unions)) tf.truediv(intersections, unions))
def matched_iou(boxlist1, boxlist2, scope=None): def matched_iou(boxlist1, boxlist2, scope=None):
...@@ -295,8 +298,8 @@ def matched_iou(boxlist1, boxlist2, scope=None): ...@@ -295,8 +298,8 @@ def matched_iou(boxlist1, boxlist2, scope=None):
areas2 = area(boxlist2) areas2 = area(boxlist2)
unions = areas1 + areas2 - intersections unions = areas1 + areas2 - intersections
return tf.where( return tf.where(
tf.equal(intersections, 0.0), tf.equal(intersections, 0.0), tf.zeros_like(intersections),
tf.zeros_like(intersections), tf.truediv(intersections, unions)) tf.truediv(intersections, unions))
def ioa(boxlist1, boxlist2, scope=None): def ioa(boxlist1, boxlist2, scope=None):
...@@ -320,8 +323,10 @@ def ioa(boxlist1, boxlist2, scope=None): ...@@ -320,8 +323,10 @@ def ioa(boxlist1, boxlist2, scope=None):
return tf.truediv(intersections, areas) return tf.truediv(intersections, areas)
def prune_non_overlapping_boxes( def prune_non_overlapping_boxes(boxlist1,
boxlist1, boxlist2, min_overlap=0.0, scope=None): boxlist2,
min_overlap=0.0,
scope=None):
"""Prunes the boxes in boxlist1 that overlap less than thresh with boxlist2. """Prunes the boxes in boxlist1 that overlap less than thresh with boxlist2.
  For each box in boxlist1, we want its IOA to be more than min_overlap with   For each box in boxlist1, we want its IOA to be more than min_overlap with
...@@ -331,7 +336,7 @@ def prune_non_overlapping_boxes( ...@@ -331,7 +336,7 @@ def prune_non_overlapping_boxes(
boxlist1: BoxList holding N boxes. boxlist1: BoxList holding N boxes.
boxlist2: BoxList holding M boxes. boxlist2: BoxList holding M boxes.
min_overlap: Minimum required overlap between boxes, to count them as min_overlap: Minimum required overlap between boxes, to count them as
overlapping. overlapping.
scope: name scope. scope: name scope.
Returns: Returns:
...@@ -361,8 +366,8 @@ def prune_small_boxes(boxlist, min_side, scope=None): ...@@ -361,8 +366,8 @@ def prune_small_boxes(boxlist, min_side, scope=None):
""" """
with tf.name_scope(scope, 'PruneSmallBoxes'): with tf.name_scope(scope, 'PruneSmallBoxes'):
height, width = height_width(boxlist) height, width = height_width(boxlist)
is_valid = tf.logical_and(tf.greater_equal(width, min_side), is_valid = tf.logical_and(
tf.greater_equal(height, min_side)) tf.greater_equal(width, min_side), tf.greater_equal(height, min_side))
return gather(boxlist, tf.reshape(tf.where(is_valid), [-1])) return gather(boxlist, tf.reshape(tf.where(is_valid), [-1]))
...@@ -389,9 +394,10 @@ def change_coordinate_frame(boxlist, window, scope=None): ...@@ -389,9 +394,10 @@ def change_coordinate_frame(boxlist, window, scope=None):
with tf.name_scope(scope, 'ChangeCoordinateFrame'): with tf.name_scope(scope, 'ChangeCoordinateFrame'):
win_height = window[2] - window[0] win_height = window[2] - window[0]
win_width = window[3] - window[1] win_width = window[3] - window[1]
boxlist_new = scale(box_list.BoxList( boxlist_new = scale(
boxlist.get() - [window[0], window[1], window[0], window[1]]), box_list.BoxList(boxlist.get() -
1.0 / win_height, 1.0 / win_width) [window[0], window[1], window[0], window[1]]),
1.0 / win_height, 1.0 / win_width)
boxlist_new = _copy_extra_fields(boxlist_new, boxlist) boxlist_new = _copy_extra_fields(boxlist_new, boxlist)
return boxlist_new return boxlist_new
...@@ -420,13 +426,17 @@ def sq_dist(boxlist1, boxlist2, scope=None): ...@@ -420,13 +426,17 @@ def sq_dist(boxlist1, boxlist2, scope=None):
with tf.name_scope(scope, 'SqDist'): with tf.name_scope(scope, 'SqDist'):
sqnorm1 = tf.reduce_sum(tf.square(boxlist1.get()), 1, keep_dims=True) sqnorm1 = tf.reduce_sum(tf.square(boxlist1.get()), 1, keep_dims=True)
sqnorm2 = tf.reduce_sum(tf.square(boxlist2.get()), 1, keep_dims=True) sqnorm2 = tf.reduce_sum(tf.square(boxlist2.get()), 1, keep_dims=True)
innerprod = tf.matmul(boxlist1.get(), boxlist2.get(), innerprod = tf.matmul(
transpose_a=False, transpose_b=True) boxlist1.get(), boxlist2.get(), transpose_a=False, transpose_b=True)
return sqnorm1 + tf.transpose(sqnorm2) - 2.0 * innerprod return sqnorm1 + tf.transpose(sqnorm2) - 2.0 * innerprod
def boolean_mask(boxlist, indicator, fields=None, scope=None, def boolean_mask(boxlist,
use_static_shapes=False, indicator_sum=None): indicator,
fields=None,
scope=None,
use_static_shapes=False,
indicator_sum=None):
"""Select boxes from BoxList according to indicator and return new BoxList. """Select boxes from BoxList according to indicator and return new BoxList.
`boolean_mask` returns the subset of boxes that are marked as "True" by the `boolean_mask` returns the subset of boxes that are marked as "True" by the
...@@ -463,8 +473,7 @@ def boolean_mask(boxlist, indicator, fields=None, scope=None, ...@@ -463,8 +473,7 @@ def boolean_mask(boxlist, indicator, fields=None, scope=None,
        raise ValueError('`indicator_sum` must be of type int')         raise ValueError('`indicator_sum` must be of type int')
selected_positions = tf.cast(indicator, dtype=tf.float32) selected_positions = tf.cast(indicator, dtype=tf.float32)
indexed_positions = tf.cast( indexed_positions = tf.cast(
tf.multiply( tf.multiply(tf.cumsum(selected_positions), selected_positions),
tf.cumsum(selected_positions), selected_positions),
dtype=tf.int32) dtype=tf.int32)
one_hot_selector = tf.one_hot( one_hot_selector = tf.one_hot(
indexed_positions - 1, indicator_sum, dtype=tf.float32) indexed_positions - 1, indicator_sum, dtype=tf.float32)
...@@ -541,9 +550,8 @@ def concatenate(boxlists, fields=None, scope=None): ...@@ -541,9 +550,8 @@ def concatenate(boxlists, fields=None, scope=None):
Args: Args:
boxlists: list of BoxList objects boxlists: list of BoxList objects
fields: optional list of fields to also concatenate. By default, all fields: optional list of fields to also concatenate. By default, all fields
fields from the first BoxList in the list are included in the from the first BoxList in the list are included in the concatenation.
concatenation.
scope: name scope. scope: name scope.
Returns: Returns:
...@@ -637,8 +645,8 @@ def visualize_boxes_in_image(image, boxlist, normalized=False, scope=None): ...@@ -637,8 +645,8 @@ def visualize_boxes_in_image(image, boxlist, normalized=False, scope=None):
Args: Args:
image: an image tensor with shape [height, width, 3] image: an image tensor with shape [height, width, 3]
boxlist: a BoxList boxlist: a BoxList
normalized: (boolean) specify whether corners are to be interpreted normalized: (boolean) specify whether corners are to be interpreted as
as absolute coordinates in image space or normalized with respect to the absolute coordinates in image space or normalized with respect to the
image size. image size.
scope: name scope. scope: name scope.
...@@ -648,8 +656,7 @@ def visualize_boxes_in_image(image, boxlist, normalized=False, scope=None): ...@@ -648,8 +656,7 @@ def visualize_boxes_in_image(image, boxlist, normalized=False, scope=None):
with tf.name_scope(scope, 'VisualizeBoxesInImage'): with tf.name_scope(scope, 'VisualizeBoxesInImage'):
if not normalized: if not normalized:
height, width, _ = tf.unstack(tf.shape(image)) height, width, _ = tf.unstack(tf.shape(image))
boxlist = scale(boxlist, boxlist = scale(boxlist, 1.0 / tf.cast(height, tf.float32),
1.0 / tf.cast(height, tf.float32),
1.0 / tf.cast(width, tf.float32)) 1.0 / tf.cast(width, tf.float32))
corners = tf.expand_dims(boxlist.get(), 0) corners = tf.expand_dims(boxlist.get(), 0)
image = tf.expand_dims(image, 0) image = tf.expand_dims(image, 0)
...@@ -714,9 +721,8 @@ def filter_greater_than(boxlist, thresh, scope=None): ...@@ -714,9 +721,8 @@ def filter_greater_than(boxlist, thresh, scope=None):
if len(scores.shape.as_list()) == 2 and scores.shape.as_list()[1] != 1: if len(scores.shape.as_list()) == 2 and scores.shape.as_list()[1] != 1:
raise ValueError('Scores should have rank 1 or have shape ' raise ValueError('Scores should have rank 1 or have shape '
'consistent with [None, 1]') 'consistent with [None, 1]')
high_score_indices = tf.cast(tf.reshape( high_score_indices = tf.cast(
tf.where(tf.greater(scores, thresh)), tf.reshape(tf.where(tf.greater(scores, thresh)), [-1]), tf.int32)
[-1]), tf.int32)
return gather(boxlist, high_score_indices) return gather(boxlist, high_score_indices)
...@@ -748,8 +754,10 @@ def non_max_suppression(boxlist, thresh, max_output_size, scope=None): ...@@ -748,8 +754,10 @@ def non_max_suppression(boxlist, thresh, max_output_size, scope=None):
if not boxlist.has_field('scores'): if not boxlist.has_field('scores'):
raise ValueError('input boxlist must have \'scores\' field') raise ValueError('input boxlist must have \'scores\' field')
selected_indices = tf.image.non_max_suppression( selected_indices = tf.image.non_max_suppression(
boxlist.get(), boxlist.get_field('scores'), boxlist.get(),
max_output_size, iou_threshold=thresh) boxlist.get_field('scores'),
max_output_size,
iou_threshold=thresh)
return gather(boxlist, selected_indices) return gather(boxlist, selected_indices)
...@@ -768,8 +776,11 @@ def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from): ...@@ -768,8 +776,11 @@ def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from):
return boxlist_to_copy_to return boxlist_to_copy_to
def to_normalized_coordinates(boxlist, height, width, def to_normalized_coordinates(boxlist,
check_range=True, scope=None): height,
width,
check_range=True,
scope=None):
"""Converts absolute box coordinates to normalized coordinates in [0, 1]. """Converts absolute box coordinates to normalized coordinates in [0, 1].
Usually one uses the dynamic shape of the image or conv-layer tensor: Usually one uses the dynamic shape of the image or conv-layer tensor:
...@@ -797,8 +808,9 @@ def to_normalized_coordinates(boxlist, height, width, ...@@ -797,8 +808,9 @@ def to_normalized_coordinates(boxlist, height, width,
if check_range: if check_range:
max_val = tf.reduce_max(boxlist.get()) max_val = tf.reduce_max(boxlist.get())
max_assert = tf.Assert(tf.greater(max_val, 1.01), max_assert = tf.Assert(
['max value is lower than 1.01: ', max_val]) tf.greater(max_val, 1.01),
['max value is lower than 1.01: ', max_val])
with tf.control_dependencies([max_assert]): with tf.control_dependencies([max_assert]):
width = tf.identity(width) width = tf.identity(width)
...@@ -822,8 +834,8 @@ def to_absolute_coordinates(boxlist, ...@@ -822,8 +834,8 @@ def to_absolute_coordinates(boxlist,
height: Maximum value for height of absolute box coordinates. height: Maximum value for height of absolute box coordinates.
width: Maximum value for width of absolute box coordinates. width: Maximum value for width of absolute box coordinates.
check_range: If True, checks if the coordinates are normalized or not. check_range: If True, checks if the coordinates are normalized or not.
maximum_normalized_coordinate: Maximum coordinate value to be considered maximum_normalized_coordinate: Maximum coordinate value to be considered as
as normalized, default to 1.1. normalized, default to 1.1.
scope: name scope. scope: name scope.
Returns: Returns:
...@@ -838,9 +850,10 @@ def to_absolute_coordinates(boxlist, ...@@ -838,9 +850,10 @@ def to_absolute_coordinates(boxlist,
if check_range: if check_range:
box_maximum = tf.reduce_max(boxlist.get()) box_maximum = tf.reduce_max(boxlist.get())
max_assert = tf.Assert( max_assert = tf.Assert(
tf.greater_equal(maximum_normalized_coordinate, box_maximum), tf.greater_equal(maximum_normalized_coordinate, box_maximum), [
['maximum box coordinate value is larger ' 'maximum box coordinate value is larger '
'than %f: ' % maximum_normalized_coordinate, box_maximum]) 'than %f: ' % maximum_normalized_coordinate, box_maximum
])
with tf.control_dependencies([max_assert]): with tf.control_dependencies([max_assert]):
width = tf.identity(width) width = tf.identity(width)
...@@ -924,13 +937,15 @@ def refine_boxes(pool_boxes, ...@@ -924,13 +937,15 @@ def refine_boxes(pool_boxes,
if not pool_boxes.has_field('scores'): if not pool_boxes.has_field('scores'):
raise ValueError('pool_boxes must have a \'scores\' field') raise ValueError('pool_boxes must have a \'scores\' field')
nms_boxes = non_max_suppression( nms_boxes = non_max_suppression(pool_boxes, nms_iou_thresh,
pool_boxes, nms_iou_thresh, nms_max_detections) nms_max_detections)
return box_voting(nms_boxes, pool_boxes, voting_iou_thresh) return box_voting(nms_boxes, pool_boxes, voting_iou_thresh)
def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5): def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
"""Performs box voting as described in S. Gidaris and N. Komodakis, ICCV 2015. """Performs box voting as described in S. Gidaris and N.
Komodakis, ICCV 2015.
Performs box voting as described in 'Object detection via a multi-region & Performs box voting as described in 'Object detection via a multi-region &
semantic segmentation-aware CNN model', Gidaris and Komodakis, ICCV 2015. For semantic segmentation-aware CNN model', Gidaris and Komodakis, ICCV 2015. For
...@@ -972,9 +987,10 @@ def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5): ...@@ -972,9 +987,10 @@ def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
# match to any boxes in pool_boxes. For such boxes without any matches, we # match to any boxes in pool_boxes. For such boxes without any matches, we
# should return the original boxes without voting. # should return the original boxes without voting.
match_assert = tf.Assert( match_assert = tf.Assert(
tf.reduce_all(tf.greater(num_matches, 0)), tf.reduce_all(tf.greater(num_matches, 0)), [
['Each box in selected_boxes must match with at least one box ' 'Each box in selected_boxes must match with at least one box '
'in pool_boxes.']) 'in pool_boxes.'
])
scores = tf.expand_dims(pool_boxes.get_field('scores'), 1) scores = tf.expand_dims(pool_boxes.get_field('scores'), 1)
scores_assert = tf.Assert( scores_assert = tf.Assert(
...@@ -993,9 +1009,7 @@ def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5): ...@@ -993,9 +1009,7 @@ def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
return averaged_boxes return averaged_boxes
def get_minimal_coverage_box(boxlist, def get_minimal_coverage_box(boxlist, default_box=None, scope=None):
default_box=None,
scope=None):
"""Creates a single bounding box which covers all boxes in the boxlist. """Creates a single bounding box which covers all boxes in the boxlist.
Args: Args:
...@@ -1045,9 +1059,9 @@ def sample_boxes_by_jittering(boxlist, ...@@ -1045,9 +1059,9 @@ def sample_boxes_by_jittering(boxlist,
boxlist: A boxlist containing N boxes in normalized coordinates. boxlist: A boxlist containing N boxes in normalized coordinates.
num_boxes_to_sample: A positive integer containing the number of boxes to num_boxes_to_sample: A positive integer containing the number of boxes to
sample. sample.
stddev: Standard deviation. This is used to draw random offsets for the stddev: Standard deviation. This is used to draw random offsets for the box
box corners from a normal distribution. The offset is multiplied by the corners from a normal distribution. The offset is multiplied by the box
box size so will be larger in terms of pixels for larger boxes. size so will be larger in terms of pixels for larger boxes.
scope: Name scope. scope: Name scope.
Returns: Returns:
...@@ -1056,11 +1070,10 @@ def sample_boxes_by_jittering(boxlist, ...@@ -1056,11 +1070,10 @@ def sample_boxes_by_jittering(boxlist,
""" """
with tf.name_scope(scope, 'SampleBoxesByJittering'): with tf.name_scope(scope, 'SampleBoxesByJittering'):
num_boxes = boxlist.num_boxes() num_boxes = boxlist.num_boxes()
box_indices = tf.random_uniform( box_indices = tf.random_uniform([num_boxes_to_sample],
[num_boxes_to_sample], minval=0,
minval=0, maxval=num_boxes,
maxval=num_boxes, dtype=tf.int32)
dtype=tf.int32)
sampled_boxes = tf.gather(boxlist.get(), box_indices) sampled_boxes = tf.gather(boxlist.get(), box_indices)
sampled_boxes_height = sampled_boxes[:, 2] - sampled_boxes[:, 0] sampled_boxes_height = sampled_boxes[:, 2] - sampled_boxes[:, 0]
sampled_boxes_width = sampled_boxes[:, 3] - sampled_boxes[:, 1] sampled_boxes_width = sampled_boxes[:, 3] - sampled_boxes[:, 1]
......
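Several of the reflowed ops above guard against dividing by a zero intersection. A self-contained pairwise-IoU sketch using the same guard (illustrative, not the module's exact implementation):

```python
import tensorflow as tf

def pairwise_iou(boxes1, boxes2):
    # boxes*: [N, 4] / [M, 4] in [ymin, xmin, ymax, xmax] order.
    ymin1, xmin1, ymax1, xmax1 = tf.split(boxes1, 4, axis=1)
    ymin2, xmin2, ymax2, xmax2 = tf.split(boxes2, 4, axis=1)
    # Pairwise intersection extents, floored at zero.
    inter_h = tf.maximum(
        0., tf.minimum(ymax1, tf.transpose(ymax2)) -
        tf.maximum(ymin1, tf.transpose(ymin2)))
    inter_w = tf.maximum(
        0., tf.minimum(xmax1, tf.transpose(xmax2)) -
        tf.maximum(xmin1, tf.transpose(xmin2)))
    inter = inter_h * inter_w
    area1 = (ymax1 - ymin1) * (xmax1 - xmin1)
    area2 = (ymax2 - ymin2) * (xmax2 - xmin2)
    union = area1 + tf.transpose(area2) - inter
    # Same guard as `iou` above: return 0 where the intersection is empty.
    return tf.where(tf.equal(inter, 0.), tf.zeros_like(inter),
                    tf.truediv(inter, union))
```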
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Faster RCNN box coder. """Faster RCNN box coder.
Faster RCNN box coder follows the coding schema described below: Faster RCNN box coder follows the coding schema described below:
...@@ -43,9 +42,9 @@ class FasterRcnnBoxCoder(box_coder.BoxCoder): ...@@ -43,9 +42,9 @@ class FasterRcnnBoxCoder(box_coder.BoxCoder):
"""Constructor for FasterRcnnBoxCoder. """Constructor for FasterRcnnBoxCoder.
Args: Args:
scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. If
If set to None, does not perform scaling. For Faster RCNN, set to None, does not perform scaling. For Faster RCNN, the open-source
the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0]. implementation recommends using [10.0, 10.0, 5.0, 5.0].
""" """
if scale_factors: if scale_factors:
assert len(scale_factors) == 4 assert len(scale_factors) == 4
......
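The coding schema referenced by the docstring is the standard Faster R-CNN one. A NumPy sketch of the encode direction under that assumption, using the recommended scale factors; box layout is `[ymin, xmin, ymax, xmax]`:

```python
import numpy as np

def encode(box, anchor, scales=(10., 10., 5., 5.)):
    # Anchor center and size.
    ya, xa = (anchor[0] + anchor[2]) / 2, (anchor[1] + anchor[3]) / 2
    ha, wa = anchor[2] - anchor[0], anchor[3] - anchor[1]
    # Ground-truth center and size.
    y, x = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
    h, w = box[2] - box[0], box[3] - box[1]
    # Standard Faster R-CNN targets: normalized offsets and log size ratios.
    ty, tx = (y - ya) / ha, (x - xa) / wa
    th, tw = np.log(h / ha), np.log(w / wa)
    return np.array([ty, tx, th, tw]) * np.array(scales)
```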
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Matcher interface and Match class. """Matcher interface and Match class.
This module defines the Matcher interface and the Match object. The job of the This module defines the Matcher interface and the Match object. The job of the
...@@ -49,9 +48,9 @@ class Match(object): ...@@ -49,9 +48,9 @@ class Match(object):
Args: Args:
match_results: Integer tensor of shape [N] with (1) match_results[i]>=0, match_results: Integer tensor of shape [N] with (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i]. meaning that column i is matched with row match_results[i]. (2)
(2) match_results[i]=-1, meaning that column i is not matched. match_results[i]=-1, meaning that column i is not matched. (3)
(3) match_results[i]=-2, meaning that column i is ignored. match_results[i]=-2, meaning that column i is ignored.
Raises: Raises:
ValueError: if match_results does not have rank 1 or is not an ValueError: if match_results does not have rank 1 or is not an
...@@ -168,8 +167,7 @@ class Match(object): ...@@ -168,8 +167,7 @@ class Match(object):
def _reshape_and_cast(self, t): def _reshape_and_cast(self, t):
return tf.cast(tf.reshape(t, [-1]), tf.int32) return tf.cast(tf.reshape(t, [-1]), tf.int32)
def gather_based_on_match(self, input_tensor, unmatched_value, def gather_based_on_match(self, input_tensor, unmatched_value, ignored_value):
ignored_value):
"""Gathers elements from `input_tensor` based on match results. """Gathers elements from `input_tensor` based on match results.
For columns that are matched to a row, gathered_tensor[col] is set to For columns that are matched to a row, gathered_tensor[col] is set to
...@@ -190,16 +188,15 @@ class Match(object): ...@@ -190,16 +188,15 @@ class Match(object):
The shape of the gathered tensor is [match_results.shape[0]] + The shape of the gathered tensor is [match_results.shape[0]] +
input_tensor.shape[1:]. input_tensor.shape[1:].
""" """
input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]), input_tensor = tf.concat(
input_tensor], axis=0) [tf.stack([ignored_value, unmatched_value]), input_tensor], axis=0)
gather_indices = tf.maximum(self.match_results + 2, 0) gather_indices = tf.maximum(self.match_results + 2, 0)
gathered_tensor = tf.gather(input_tensor, gather_indices) gathered_tensor = tf.gather(input_tensor, gather_indices)
return gathered_tensor return gathered_tensor
class Matcher(object): class Matcher(object):
"""Abstract base class for matcher. """Abstract base class for matcher."""
"""
__metaclass__ = ABCMeta __metaclass__ = ABCMeta
def match(self, similarity_matrix, scope=None, **params): def match(self, similarity_matrix, scope=None, **params):
...@@ -212,8 +209,8 @@ class Matcher(object): ...@@ -212,8 +209,8 @@ class Matcher(object):
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
where higher value means more similar. where higher value means more similar.
scope: Op scope name. Defaults to 'Match' if None. scope: Op scope name. Defaults to 'Match' if None.
**params: Additional keyword arguments for specific implementations of **params: Additional keyword arguments for specific implementations of the
the Matcher. Matcher.
Returns: Returns:
A Match object with the results of matching. A Match object with the results of matching.
...@@ -230,8 +227,8 @@ class Matcher(object): ...@@ -230,8 +227,8 @@ class Matcher(object):
Args: Args:
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
where higher value means more similar. where higher value means more similar.
**params: Additional keyword arguments for specific implementations of **params: Additional keyword arguments for specific implementations of the
the Matcher. Matcher.
Returns: Returns:
match_results: Integer tensor of shape [M]: match_results[i]>=0 means match_results: Integer tensor of shape [M]: match_results[i]>=0 means
......
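The `+2` index shift in `gather_based_on_match` maps the `-2`/`-1` sentinels onto the two prepended values. An illustrative sketch:

```python
import tensorflow as tf

match_results = tf.constant([-2, -1, 0, 1])   # ignored, unmatched, rows 0 and 1
input_tensor = tf.constant([10., 20.])
ignored_value, unmatched_value = 0., -1.

# Prepend the sentinels, then shift: -2 -> index 0, -1 -> index 1, k -> k + 2.
padded = tf.concat([tf.stack([ignored_value, unmatched_value]), input_tensor],
                   axis=0)                    # [0., -1., 10., 20.]
gathered = tf.gather(padded, tf.maximum(match_results + 2, 0))
# -> [0., -1., 10., 20.]
```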
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Base minibatch sampler module. """Base minibatch sampler module.
The job of the minibatch_sampler is to subsample a minibatch based on some The job of the minibatch_sampler is to subsample a minibatch based on some
...@@ -53,8 +52,8 @@ class MinibatchSampler(object): ...@@ -53,8 +52,8 @@ class MinibatchSampler(object):
Args: Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled. indicator: boolean tensor of shape [N] whose True entries can be sampled.
batch_size: desired batch size. batch_size: desired batch size.
**params: additional keyword arguments for specific implementations of **params: additional keyword arguments for specific implementations of the
the MinibatchSampler. MinibatchSampler.
Returns: Returns:
sample_indicator: boolean tensor of shape [N] whose True entries have been sample_indicator: boolean tensor of shape [N] whose True entries have been
...@@ -72,8 +71,8 @@ class MinibatchSampler(object): ...@@ -72,8 +71,8 @@ class MinibatchSampler(object):
is returned. is returned.
Args: Args:
indicator: a 1-dimensional boolean tensor indicating which elements indicator: a 1-dimensional boolean tensor indicating which elements are
are allowed to be sampled and which are not. allowed to be sampled and which are not.
num_samples: int32 scalar tensor num_samples: int32 scalar tensor
Returns: Returns:
......
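A standalone sketch of indicator subsampling as described above (not the class's exact implementation): randomly keep at most `num_samples` of the True entries and return a boolean mask of the same shape.

```python
import tensorflow as tf

def subsample_indicator(indicator, num_samples):
    indices = tf.reshape(tf.where(indicator), [-1])      # positions of True
    indices = tf.random.shuffle(indices)[:num_samples]   # random subset
    # Scatter ones back into a dense vector and cast to a boolean mask.
    hits = tf.scatter_nd(
        tf.expand_dims(indices, -1),
        tf.ones_like(indices, dtype=tf.int32),
        shape=tf.shape(indicator, out_type=tf.int64))
    return tf.cast(hits, tf.bool)
```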
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""A module for helper tensorflow ops. """A module for helper tensorflow ops.
This is originally implemented in TensorFlow Object Detection API. This is originally implemented in TensorFlow Object Detection API.
...@@ -37,7 +36,7 @@ def indices_to_dense_vector(indices, ...@@ -37,7 +36,7 @@ def indices_to_dense_vector(indices,
Args: Args:
indices: 1d Tensor with integer indices which are to be set to indices: 1d Tensor with integer indices which are to be set to
      indices_value.       indices_value.
size: scalar with size (integer) of output Tensor. size: scalar with size (integer) of output Tensor.
indices_value: values of elements specified by indices in the output vector indices_value: values of elements specified by indices in the output vector
default_value: values of other elements in the output vector. default_value: values of other elements in the output vector.
...@@ -61,10 +60,10 @@ def matmul_gather_on_zeroth_axis(params, indices, scope=None): ...@@ -61,10 +60,10 @@ def matmul_gather_on_zeroth_axis(params, indices, scope=None):
TODO(rathodv, jonathanhuang): enable sparse matmul option. TODO(rathodv, jonathanhuang): enable sparse matmul option.
Args: Args:
params: A float32 Tensor. The tensor from which to gather values. params: A float32 Tensor. The tensor from which to gather values. Must be at
Must be at least rank 1. least rank 1.
indices: A Tensor. Must be one of the following types: int32, int64. indices: A Tensor. Must be one of the following types: int32, int64. Must be
Must be in range [0, params.shape[0]) in range [0, params.shape[0])
scope: A name for the operation (optional). scope: A name for the operation (optional).
Returns: Returns:
......
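A sketch of `indices_to_dense_vector`'s documented behavior using `tf.tensor_scatter_nd_update` (an assumption for illustration; the module's own implementation may differ): fill a length-`size` vector with `default_value`, then set the given indices to `indices_value`.

```python
import tensorflow as tf

indices = tf.constant([1, 3], tf.int32)
size, indices_value, default_value = 5, 1., 0.

dense = tf.ones([size]) * default_value
dense = tf.tensor_scatter_nd_update(
    dense, tf.expand_dims(indices, -1),
    tf.fill(tf.shape(indices), indices_value))
# -> [0., 1., 0., 1., 0.]
```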
...@@ -50,10 +50,9 @@ def _flip_boxes_left_right(boxes): ...@@ -50,10 +50,9 @@ def _flip_boxes_left_right(boxes):
"""Left-right flip the boxes. """Left-right flip the boxes.
Args: Args:
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. Boxes
Boxes are in normalized form meaning their coordinates vary are in normalized form meaning their coordinates vary between [0, 1]. Each
between [0, 1]. row is in the form of [ymin, xmin, ymax, xmax].
Each row is in the form of [ymin, xmin, ymax, xmax].
Returns: Returns:
Flipped boxes. Flipped boxes.
...@@ -69,8 +68,8 @@ def _flip_masks_left_right(masks): ...@@ -69,8 +68,8 @@ def _flip_masks_left_right(masks):
"""Left-right flip masks. """Left-right flip masks.
Args: Args:
masks: rank 3 float32 tensor with shape masks: rank 3 float32 tensor with shape [num_instances, height, width]
[num_instances, height, width] representing instance masks. representing instance masks.
Returns: Returns:
flipped masks: rank 3 float32 tensor with shape flipped masks: rank 3 float32 tensor with shape
...@@ -79,7 +78,9 @@ def _flip_masks_left_right(masks): ...@@ -79,7 +78,9 @@ def _flip_masks_left_right(masks):
return masks[:, :, ::-1] return masks[:, :, ::-1]
def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation, def keypoint_flip_horizontal(keypoints,
flip_point,
flip_permutation,
scope=None): scope=None):
"""Flips the keypoints horizontally around the flip_point. """Flips the keypoints horizontally around the flip_point.
...@@ -91,9 +92,9 @@ def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation, ...@@ -91,9 +92,9 @@ def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation,
flip_point: (float) scalar tensor representing the x coordinate to flip the flip_point: (float) scalar tensor representing the x coordinate to flip the
keypoints around. keypoints around.
flip_permutation: rank 1 int32 tensor containing the keypoint flip flip_permutation: rank 1 int32 tensor containing the keypoint flip
permutation. This specifies the mapping from original keypoint indices permutation. This specifies the mapping from original keypoint indices to
to the flipped keypoint indices. This is used primarily for keypoints the flipped keypoint indices. This is used primarily for keypoints that
that are not reflection invariant. E.g. Suppose there are 3 keypoints are not reflection invariant. E.g. Suppose there are 3 keypoints
representing ['head', 'right_eye', 'left_eye'], then a logical choice for representing ['head', 'right_eye', 'left_eye'], then a logical choice for
flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye' flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye'
and 'right_eye' after a horizontal flip. and 'right_eye' after a horizontal flip.
...@@ -190,19 +191,16 @@ def random_horizontal_flip(image, ...@@ -190,19 +191,16 @@ def random_horizontal_flip(image,
Args: Args:
image: rank 3 float32 tensor with shape [height, width, channels]. image: rank 3 float32 tensor with shape [height, width, channels].
boxes: (optional) rank 2 float32 tensor with shape [N, 4] boxes: (optional) rank 2 float32 tensor with shape [N, 4] containing the
containing the bounding boxes. bounding boxes. Boxes are in normalized form meaning their coordinates
Boxes are in normalized form meaning their coordinates vary vary between [0, 1]. Each row is in the form of [ymin, xmin, ymax, xmax].
between [0, 1]. masks: (optional) rank 3 float32 tensor with shape [num_instances, height,
Each row is in the form of [ymin, xmin, ymax, xmax]. width] containing instance masks. The masks are of the same height, width
masks: (optional) rank 3 float32 tensor with shape as the input `image`.
[num_instances, height, width] containing instance masks. The masks keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
are of the same height, width as the input `image`. num_keypoints, 2]. The keypoints are in y-x normalized coordinates.
keypoints: (optional) rank 3 float32 tensor with shape
[num_instances, num_keypoints, 2]. The keypoints are in y-x
normalized coordinates.
keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip
permutation. permutation.
seed: random seed seed: random seed
Returns: Returns:
...@@ -369,20 +367,19 @@ def resize_to_range(image, ...@@ -369,20 +367,19 @@ def resize_to_range(image,
Args: Args:
image: A 3D tensor of shape [height, width, channels] image: A 3D tensor of shape [height, width, channels]
masks: (optional) rank 3 float32 tensor with shape masks: (optional) rank 3 float32 tensor with shape [num_instances, height,
[num_instances, height, width] containing instance masks. width] containing instance masks.
min_dimension: (optional) (scalar) desired size of the smaller image min_dimension: (optional) (scalar) desired size of the smaller image
dimension. dimension.
max_dimension: (optional) (scalar) maximum allowed size max_dimension: (optional) (scalar) maximum allowed size of the larger image
of the larger image dimension. dimension.
method: (optional) interpolation method used in resizing. Defaults to method: (optional) interpolation method used in resizing. Defaults to
BILINEAR. BILINEAR.
align_corners: bool. If true, exactly align all 4 corners of the input align_corners: bool. If true, exactly align all 4 corners of the input and
and output. Defaults to False. output. Defaults to False.
pad_to_max_dimension: Whether to resize the image and pad it with zeros pad_to_max_dimension: Whether to resize the image and pad it with zeros so
so the resulting image is of the spatial size the resulting image is of the spatial size [max_dimension, max_dimension].
[max_dimension, max_dimension]. If masks are included they are padded If masks are included they are padded similarly.
similarly.
Returns: Returns:
Note that the position of the resized_image_shape changes based on whether Note that the position of the resized_image_shape changes based on whether
...@@ -410,8 +407,8 @@ def resize_to_range(image, ...@@ -410,8 +407,8 @@ def resize_to_range(image,
new_image = tf.image.resize(image, new_size[:-1], method=method) new_image = tf.image.resize(image, new_size[:-1], method=method)
if pad_to_max_dimension: if pad_to_max_dimension:
new_image = tf.image.pad_to_bounding_box( new_image = tf.image.pad_to_bounding_box(new_image, 0, 0, max_dimension,
new_image, 0, 0, max_dimension, max_dimension) max_dimension)
result = [new_image] result = [new_image]
if masks is not None: if masks is not None:
...@@ -422,8 +419,8 @@ def resize_to_range(image, ...@@ -422,8 +419,8 @@ def resize_to_range(image,
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
new_masks = tf.squeeze(new_masks, 3) new_masks = tf.squeeze(new_masks, 3)
if pad_to_max_dimension: if pad_to_max_dimension:
new_masks = tf.image.pad_to_bounding_box( new_masks = tf.image.pad_to_bounding_box(new_masks, 0, 0, max_dimension,
new_masks, 0, 0, max_dimension, max_dimension) max_dimension)
result.append(new_masks) result.append(new_masks)
result.append(new_size) result.append(new_size)
...@@ -500,11 +497,10 @@ def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None): ...@@ -500,11 +497,10 @@ def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
Args: Args:
image: A 3D float32 tensor of shape [height, width, channels]. image: A 3D float32 tensor of shape [height, width, channels].
boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
boxes in normalized coordinates. Each row is of the form boxes in normalized coordinates. Each row is of the form [ymin, xmin,
[ymin, xmin, ymax, xmax]. ymax, xmax].
keypoints: (optional) rank 3 float32 tensor with shape keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
[num_instances, num_keypoints, 2]. The keypoints are in y-x normalized num_keypoints, 2]. The keypoints are in y-x normalized coordinates.
coordinates.
Returns: Returns:
image: unchanged input image. image: unchanged input image.
......
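The left-right box flip documented at the top of this file maps `xmin` to `1 - xmax` and `xmax` to `1 - xmin` for normalized boxes; a minimal sketch:

```python
import tensorflow as tf

def flip_boxes_left_right(boxes):
    # boxes: [N, 4] normalized [ymin, xmin, ymax, xmax].
    ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=1)
    return tf.concat([ymin, 1. - xmax, ymax, 1. - xmin], axis=1)
```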
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Region Similarity Calculators for BoxLists. """Region Similarity Calculators for BoxLists.
Region Similarity Calculators compare a pairwise measure of similarity Region Similarity Calculators compare a pairwise measure of similarity
......