Commit d3d2ad3d authored by TF Object Detection Team

Merge pull request #8746 from syiming:add_multilevel_crop_and_resize

PiperOrigin-RevId: 322214979
parents 52515dc3 f7d74d68
@@ -39,6 +39,7 @@ from object_detection.protos import losses_pb2
 from object_detection.protos import model_pb2
 from object_detection.utils import label_map_util
 from object_detection.utils import ops
+from object_detection.utils import spatial_transform_ops as spatial_ops
 from object_detection.utils import tf_version
 ## Feature Extractors for TF
@@ -656,8 +657,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
       second_stage_localization_loss_weight)
   crop_and_resize_fn = (
-      ops.matmul_crop_and_resize if frcnn_config.use_matmul_crop_and_resize
-      else ops.native_crop_and_resize)
+      spatial_ops.multilevel_matmul_crop_and_resize
+      if frcnn_config.use_matmul_crop_and_resize
+      else spatial_ops.multilevel_native_crop_and_resize)
   clip_anchors_to_image = (
       frcnn_config.clip_anchors_to_image)
...
@@ -324,7 +324,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
       A float32 Tensor with shape [K, new_height, new_width, depth].
     """
     box_features = self._crop_and_resize_fn(
-        features_to_crop, proposal_boxes_normalized,
+        [features_to_crop], proposal_boxes_normalized, None,
         [self._initial_crop_size, self._initial_crop_size])
     attention_features = self._context_feature_extract_fn(
...
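The crop function's signature changed here: it now takes a list of feature maps plus a `box_levels` argument. A minimal annotated sketch of the new call shape (names taken from this diff); passing `None` for the levels keeps the old single-level behavior via the fallback added in spatial_transform_ops further down:

# Sketch of the new calling convention used throughout this commit.
box_features = self._crop_and_resize_fn(
    [features_to_crop],             # list of feature maps, one per level
    proposal_boxes_normalized,      # [batch, num_boxes, 4] normalized boxes
    None,                           # box_levels: None => single-level path
    [self._initial_crop_size, self._initial_crop_size])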
@@ -41,7 +41,7 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
 from object_detection.protos import box_predictor_pb2
 from object_detection.protos import hyperparams_pb2
 from object_detection.protos import post_processing_pb2
-from object_detection.utils import ops
+from object_detection.utils import spatial_transform_ops as spatial_ops
 from object_detection.utils import test_case
 from object_detection.utils import test_utils
 from object_detection.utils import tf_version
@@ -363,8 +363,9 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
         max_negatives_per_positive=None)
     crop_and_resize_fn = (
-        ops.matmul_crop_and_resize
-        if use_matmul_crop_and_resize else ops.native_crop_and_resize)
+        spatial_ops.multilevel_matmul_crop_and_resize
+        if use_matmul_crop_and_resize
+        else spatial_ops.multilevel_native_crop_and_resize)
     common_kwargs = {
         'is_training':
             is_training,
...
@@ -1948,9 +1948,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
     Returns:
       A float32 tensor with shape [K, new_height, new_width, depth].
     """
+    features_to_crop = [features_to_crop]
+    num_levels = len(features_to_crop)
+    box_levels = None
+    if num_levels != 1:
+      # If there are multiple feature levels, assign each box to a level.
+      box_levels = ops.fpn_feature_levels(num_levels, num_levels - 1,
+                                          1.0/224, proposal_boxes_normalized)
     cropped_regions = self._flatten_first_two_dimensions(
         self._crop_and_resize_fn(
-            features_to_crop, proposal_boxes_normalized,
+            features_to_crop, proposal_boxes_normalized, box_levels,
             [self._initial_crop_size, self._initial_crop_size]))
     return self._maxpool_layer(cropped_regions)
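The `ops.fpn_feature_levels` call above assigns each proposal to a pyramid level based on its scale. As an illustration only (not the exact implementation of `ops.fpn_feature_levels`), the standard FPN heuristic from Lin et al. (2017) looks roughly like this, assuming normalized boxes and a 224-pixel canonical scale:

import tensorflow.compat.v1 as tf

def fpn_levels_sketch(boxes, image_height, image_width,
                      min_level=0, max_level=3, canonical_size=224.0):
  # Illustrative sketch of the FPN level-assignment heuristic; the exact
  # math in ops.fpn_feature_levels may differ. boxes: [batch, num_boxes, 4]
  # in normalized [y1, x1, y2, x2] form.
  ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
  # Square root of each box's absolute pixel area.
  sqrt_area = tf.sqrt((ymax - ymin) * image_height *
                      (xmax - xmin) * image_width)
  sqrt_area = tf.maximum(sqrt_area, 1e-6)  # guard degenerate boxes
  # k = k0 + log2(sqrt(area) / 224), with k0 taken to be the top level here,
  # clipped to the available range of levels.
  levels = tf.floor(max_level + tf.log(sqrt_area / canonical_size) /
                    tf.log(2.0))
  return tf.clip_by_value(tf.cast(levels, tf.int32), min_level, max_level)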
@@ -2517,8 +2524,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
             image_shape[1], image_shape[2], check_range=False).get()
     flat_cropped_gt_mask = self._crop_and_resize_fn(
-        tf.expand_dims(flat_gt_masks, -1),
-        tf.expand_dims(flat_normalized_proposals, axis=1),
+        [tf.expand_dims(flat_gt_masks, -1)],
+        tf.expand_dims(flat_normalized_proposals, axis=1), None,
         [mask_height, mask_width])
     # Without stopping gradients into cropped groundtruth masks the
     # performance with 100-padded groundtruth masks when batch size > 1 is
...
@@ -34,7 +34,7 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
 from object_detection.protos import box_predictor_pb2
 from object_detection.protos import hyperparams_pb2
 from object_detection.protos import post_processing_pb2
-from object_detection.utils import ops
+from object_detection.utils import spatial_transform_ops as spatial_ops
 from object_detection.utils import test_case
 from object_detection.utils import test_utils
 from object_detection.utils import tf_version
@@ -377,8 +377,9 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         max_negatives_per_positive=None)
     crop_and_resize_fn = (
-        ops.matmul_crop_and_resize
-        if use_matmul_crop_and_resize else ops.native_crop_and_resize)
+        spatial_ops.multilevel_matmul_crop_and_resize
+        if use_matmul_crop_and_resize
+        else spatial_ops.multilevel_native_crop_and_resize)
     common_kwargs = {
         'is_training':
             is_training,
...
@@ -411,6 +411,56 @@ def multilevel_roi_align(features, boxes, box_levels, output_size,
   return features_per_box
+
+
+def multilevel_native_crop_and_resize(images, boxes, box_levels,
+                                      crop_size, scope=None):
+  """Multilevel native crop and resize.
+
+  Same as `multilevel_matmul_crop_and_resize` but uses
+  tf.image.crop_and_resize.
+
+  Args:
+    images: A list of 4-D tensors of shape
+      [batch, image_height, image_width, depth] representing feature maps of
+      different sizes.
+    boxes: A `Tensor` of type `float32`.
+      A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified
+      in normalized coordinates and are of the form `[y1, x1, y2, x2]`. A
+      normalized coordinate value of `y` is mapped to the image coordinate at
+      `y * (image_height - 1)`, so that the `[0, 1]` interval of normalized
+      image height is mapped to `[0, image_height - 1]` in image height
+      coordinates. We do allow y1 > y2, in which case the sampled crop is an
+      up-down flipped version of the original image. The width dimension is
+      treated similarly. Normalized coordinates outside the `[0, 1]` range
+      are allowed, in which case the input image values are extrapolated.
+    box_levels: A 2-D tensor of shape [batch, num_boxes] representing the
+      feature level of each box, or None to crop everything from images[0].
+    crop_size: A list of two integers `[crop_height, crop_width]`. All
+      cropped image patches are resized to this size. The aspect ratio of the
+      image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    scope: A name for the operation (optional).
+
+  Returns:
+    A 5-D float tensor of shape `[batch, num_boxes, crop_height, crop_width,
+    depth]`.
+  """
+  if box_levels is None:
+    return native_crop_and_resize(images[0], boxes, crop_size, scope)
+  with tf.name_scope('MultiLevelNativeCropAndResize'):
+    cropped_feature_list = []
+    for level, image in enumerate(images):
+      # For each level, crop all boxes from that level's feature map, then
+      # zero out the crops of boxes that do not belong to this level.
+      # A more efficient way of computing the cropped features may exist.
+      cropped = native_crop_and_resize(image, boxes, crop_size, scope)
+      cond = tf.tile(
+          tf.equal(box_levels, level)[:, :, tf.newaxis],
+          [1, 1] + [tf.math.reduce_prod(cropped.shape.as_list()[2:])])
+      cond = tf.reshape(cond, cropped.shape)
+      cropped_final = tf.where(cond, cropped, tf.zeros_like(cropped))
+      cropped_feature_list.append(cropped_final)
+    return tf.math.reduce_sum(cropped_feature_list, axis=0)
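A minimal usage sketch for the new op (the shapes and values here are my own example, not from the diff): two feature levels of different resolutions, with each box routed to the level named in `box_levels`:

import tensorflow.compat.v1 as tf
from object_detection.utils import spatial_transform_ops as spatial_ops

# Two feature levels for one image; two boxes, one per level.
features = [tf.random.uniform([1, 32, 32, 8]),   # level 0
            tf.random.uniform([1, 16, 16, 8])]   # level 1
boxes = tf.constant([[[0.0, 0.0, 0.5, 0.5],
                      [0.25, 0.25, 1.0, 1.0]]])  # [1, 2, 4], normalized
box_levels = tf.constant([[0, 1]], dtype=tf.int32)

crops = spatial_ops.multilevel_native_crop_and_resize(
    features, boxes, box_levels, crop_size=[7, 7])
# crops has shape [1, 2, 7, 7, 8]; each box is sampled only from its own
# level, since the other levels' crops are zeroed out before the final sum.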
 def native_crop_and_resize(image, boxes, crop_size, scope=None):
   """Same as `matmul_crop_and_resize` but uses tf.image.crop_and_resize."""
   def get_box_inds(proposals):
@@ -431,6 +481,50 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None):
   return tf.reshape(cropped_regions, final_shape)
+
+
+def multilevel_matmul_crop_and_resize(images, boxes, box_levels, crop_size,
+                                      extrapolation_value=0.0, scope=None):
+  """Multilevel matmul crop and resize.
+
+  Same as `matmul_crop_and_resize` but crops images according to box levels.
+
+  Args:
+    images: A list of 4-D tensors of shape
+      [batch, image_height, image_width, depth] representing feature maps of
+      different sizes.
+    boxes: A `Tensor` of type `float32` or `bfloat16`.
+      A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified
+      in normalized coordinates and are of the form `[y1, x1, y2, x2]`. A
+      normalized coordinate value of `y` is mapped to the image coordinate at
+      `y * (image_height - 1)`, so that the `[0, 1]` interval of normalized
+      image height is mapped to `[0, image_height - 1]` in image height
+      coordinates. We do allow y1 > y2, in which case the sampled crop is an
+      up-down flipped version of the original image. The width dimension is
+      treated similarly. Normalized coordinates outside the `[0, 1]` range
+      are allowed, in which case we use `extrapolation_value` to extrapolate
+      the input image values.
+    box_levels: A 2-D tensor of shape [batch, num_boxes] representing the
+      feature level of each box, or None to place all boxes on level 0.
+    crop_size: A list of two integers `[crop_height, crop_width]`. All
+      cropped image patches are resized to this size. The aspect ratio of the
+      image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    extrapolation_value: A float value to use for extrapolation.
+    scope: A name for the operation (optional).
+
+  Returns:
+    A 5-D float tensor of shape `[batch, num_boxes, crop_height, crop_width,
+    depth]`.
+  """
+  with tf.name_scope(scope, 'MultiLevelMatMulCropAndResize'):
+    if box_levels is None:
+      box_levels = tf.zeros(tf.shape(boxes)[:2], dtype=tf.int32)
+    return multilevel_roi_align(images,
+                                boxes,
+                                box_levels,
+                                crop_size,
+                                align_corners=True,
+                                extrapolation_value=extrapolation_value)
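A quick sanity-check sketch (my example; the equivalence is an expectation worth asserting in a test, not something this diff states): with a single feature map and `box_levels=None`, the multilevel op defaults every box to level 0 and should reproduce the single-level `matmul_crop_and_resize` result. The matmul formulation exists because `tf.image.crop_and_resize` has historically not been supported on TPU:

import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.utils import spatial_transform_ops as spatial_ops

image = tf.constant(np.random.rand(1, 8, 8, 3), dtype=tf.float32)
boxes = tf.constant([[[0.1, 0.1, 0.9, 0.9]]], dtype=tf.float32)

multi = spatial_ops.multilevel_matmul_crop_and_resize(
    [image], boxes, box_levels=None, crop_size=[4, 4])
single = spatial_ops.matmul_crop_and_resize(image, boxes, crop_size=[4, 4])
# Expected to match: with box_levels=None, every box lands on level 0.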
 def matmul_crop_and_resize(image, boxes, crop_size, extrapolation_value=0.0,
                            scope=None):
   """Matrix multiplication based implementation of the crop and resize op.
...
@@ -512,6 +512,38 @@ class MatMulCropAndResizeTest(test_case.TestCase):
     crop_output = self.execute(graph_fn, [image, boxes])
     self.assertAllClose(crop_output, expected_output)
+
+  def testMultilevelMatMulCropAndResize(self):
+
+    def graph_fn(image1, image2, boxes, box_levels):
+      return spatial_ops.multilevel_matmul_crop_and_resize([image1, image2],
+                                                           boxes,
+                                                           box_levels,
+                                                           crop_size=[2, 2])
+
+    image = [np.array([[[[1, 0], [2, 0], [3, 0]],
+                        [[4, 0], [5, 0], [6, 0]],
+                        [[7, 0], [8, 0], [9, 0]]],
+                       [[[1, 0], [2, 0], [3, 0]],
+                        [[4, 0], [5, 0], [6, 0]],
+                        [[7, 0], [8, 0], [9, 0]]]], dtype=np.float32),
+             np.array([[[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]],
+                       [[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]]], dtype=np.float32)]
+    boxes = np.array([[[1, 1, 0, 0],
+                       [.5, .5, 0, 0]],
+                      [[0, 0, 1, 1],
+                       [0, 0, .5, .5]]], dtype=np.float32)
+    box_levels = np.array([[0, 1], [1, 1]], dtype=np.int32)
+    expected_output = [[[[[9, 0], [7, 0]], [[3, 0], [1, 0]]],
+                        [[[5, 4], [4, 3]], [[2, 1], [1, 0]]]],
+                       [[[[1, 0], [3, 2]], [[7, 6], [9, 8]]],
+                        [[[1, 0], [2, 1]], [[4, 3], [5, 4]]]]]
+    crop_output = self.execute(graph_fn, image + [boxes, box_levels])
+    self.assertAllClose(crop_output, expected_output)
 class NativeCropAndResizeTest(test_case.TestCase):
@@ -537,6 +569,35 @@ class NativeCropAndResizeTest(test_case.TestCase):
     crop_output = self.execute_cpu(graph_fn, [image, boxes])
     self.assertAllClose(crop_output, expected_output)
+
+  def testMultilevelBatchCropAndResize3x3To2x2_2Channels(self):
+
+    def graph_fn(image1, image2, boxes, box_levels):
+      return spatial_ops.multilevel_native_crop_and_resize([image1, image2],
+                                                           boxes,
+                                                           box_levels,
+                                                           crop_size=[2, 2])
+
+    image = [np.array([[[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]],
+                       [[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]]], dtype=np.float32),
+             np.array([[[[1, 0], [2, 1]],
+                        [[4, 3], [5, 4]]],
+                       [[[1, 0], [2, 1]],
+                        [[4, 3], [5, 4]]]], dtype=np.float32)]
+    boxes = np.array([[[0, 0, 1, 1],
+                       [0, 0, .5, .5]],
+                      [[1, 1, 0, 0],
+                       [.5, .5, 0, 0]]], dtype=np.float32)
+    box_levels = np.array([[0, 1], [0, 0]], dtype=np.int32)
+    expected_output = [[[[[1, 0], [3, 2]], [[7, 6], [9, 8]]],
+                        [[[1, 0], [1.5, 0.5]], [[2.5, 1.5], [3, 2]]]],
+                       [[[[9, 8], [7, 6]], [[3, 2], [1, 0]]],
+                        [[[5, 4], [4, 3]], [[2, 1], [1, 0]]]]]
+    crop_output = self.execute_cpu(graph_fn, image + [boxes, box_levels])
+    self.assertAllClose(crop_output, expected_output)
 if __name__ == '__main__':
   tf.test.main()
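One note on the test fixtures above: boxes such as [1, 1, 0, 0] have y1 > y2 and x1 > x2, so, per the docstrings, the sampled crop is the up-down and left-right flipped image, which is why the expected outputs run backwards. A small standalone illustration with plain TensorFlow (my example, not part of the commit):

import numpy as np
import tensorflow.compat.v1 as tf

image = np.arange(9, dtype=np.float32).reshape(1, 3, 3, 1)
flipped = tf.image.crop_and_resize(image,
                                   boxes=[[1.0, 1.0, 0.0, 0.0]],
                                   box_ind=[0],
                                   crop_size=[3, 3])
# flipped[0, :, :, 0] equals image[0, ::-1, ::-1, 0].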