Commit d3d2ad3d authored by TF Object Detection Team

Merge pull request #8746 from syiming:add_multilevel_crop_and_resize

PiperOrigin-RevId: 322214979
parents 52515dc3 f7d74d68
@@ -39,6 +39,7 @@ from object_detection.protos import losses_pb2
 from object_detection.protos import model_pb2
 from object_detection.utils import label_map_util
 from object_detection.utils import ops
+from object_detection.utils import spatial_transform_ops as spatial_ops
 from object_detection.utils import tf_version
 ## Feature Extractors for TF
@@ -656,8 +657,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
       second_stage_localization_loss_weight)
   crop_and_resize_fn = (
-      ops.matmul_crop_and_resize if frcnn_config.use_matmul_crop_and_resize
-      else ops.native_crop_and_resize)
+      spatial_ops.multilevel_matmul_crop_and_resize
+      if frcnn_config.use_matmul_crop_and_resize
+      else spatial_ops.multilevel_native_crop_and_resize)
   clip_anchors_to_image = (
       frcnn_config.clip_anchors_to_image)
...
@@ -324,7 +324,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
       A float32 Tensor with shape [K, new_height, new_width, depth].
     """
     box_features = self._crop_and_resize_fn(
-        features_to_crop, proposal_boxes_normalized,
+        [features_to_crop], proposal_boxes_normalized, None,
         [self._initial_crop_size, self._initial_crop_size])
     attention_features = self._context_feature_extract_fn(
...
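The crop function's signature changed here: it now takes a list of feature maps plus a `box_levels` argument. A minimal annotated sketch of the new call shape (names taken from this diff); passing `None` for the levels keeps the old single-level behavior via the fallback added in spatial_transform_ops further down:

# Sketch of the new calling convention used throughout this commit.
box_features = self._crop_and_resize_fn(
    [features_to_crop],             # list of feature maps, one per level
    proposal_boxes_normalized,      # [batch, num_boxes, 4] normalized boxes
    None,                           # box_levels: None => single-level path
    [self._initial_crop_size, self._initial_crop_size])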
@@ -41,7 +41,7 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
 from object_detection.protos import box_predictor_pb2
 from object_detection.protos import hyperparams_pb2
 from object_detection.protos import post_processing_pb2
-from object_detection.utils import ops
+from object_detection.utils import spatial_transform_ops as spatial_ops
 from object_detection.utils import test_case
 from object_detection.utils import test_utils
 from object_detection.utils import tf_version
@@ -363,8 +363,9 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
         max_negatives_per_positive=None)
     crop_and_resize_fn = (
-        ops.matmul_crop_and_resize
-        if use_matmul_crop_and_resize else ops.native_crop_and_resize)
+        spatial_ops.multilevel_matmul_crop_and_resize
+        if use_matmul_crop_and_resize
+        else spatial_ops.multilevel_native_crop_and_resize)
     common_kwargs = {
         'is_training':
             is_training,
...
@@ -1948,9 +1948,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
     Returns:
       A float32 tensor with shape [K, new_height, new_width, depth].
     """
+    features_to_crop = [features_to_crop]
+    num_levels = len(features_to_crop)
+    box_levels = None
+    if num_levels != 1:
+      # If there are multiple feature levels, assign each box to a level.
+      box_levels = ops.fpn_feature_levels(num_levels, num_levels - 1,
+                                          1.0/224, proposal_boxes_normalized)
     cropped_regions = self._flatten_first_two_dimensions(
         self._crop_and_resize_fn(
-            features_to_crop, proposal_boxes_normalized,
+            features_to_crop, proposal_boxes_normalized, box_levels,
             [self._initial_crop_size, self._initial_crop_size]))
     return self._maxpool_layer(cropped_regions)
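The `ops.fpn_feature_levels` call above assigns each proposal to a pyramid level based on its scale. As an illustration only (not the exact implementation of `ops.fpn_feature_levels`), the standard FPN heuristic from Lin et al. (2017) looks roughly like this, assuming normalized boxes and a 224-pixel canonical scale:

import tensorflow.compat.v1 as tf

def fpn_levels_sketch(boxes, image_height, image_width,
                      min_level=0, max_level=3, canonical_size=224.0):
  # Illustrative sketch of the FPN level-assignment heuristic; the exact
  # math in ops.fpn_feature_levels may differ. boxes: [batch, num_boxes, 4]
  # in normalized [y1, x1, y2, x2] form.
  ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
  # Square root of each box's absolute pixel area.
  sqrt_area = tf.sqrt((ymax - ymin) * image_height *
                      (xmax - xmin) * image_width)
  sqrt_area = tf.maximum(sqrt_area, 1e-6)  # guard degenerate boxes
  # k = k0 + log2(sqrt(area) / 224), with k0 taken to be the top level here,
  # clipped to the available range of levels.
  levels = tf.floor(max_level + tf.log(sqrt_area / canonical_size) /
                    tf.log(2.0))
  return tf.clip_by_value(tf.cast(levels, tf.int32), min_level, max_level)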
@@ -2517,8 +2524,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
             image_shape[1], image_shape[2], check_range=False).get()
     flat_cropped_gt_mask = self._crop_and_resize_fn(
-        tf.expand_dims(flat_gt_masks, -1),
-        tf.expand_dims(flat_normalized_proposals, axis=1),
+        [tf.expand_dims(flat_gt_masks, -1)],
+        tf.expand_dims(flat_normalized_proposals, axis=1), None,
         [mask_height, mask_width])
     # Without stopping gradients into cropped groundtruth masks the
     # performance with 100-padded groundtruth masks when batch size > 1 is
...
@@ -34,7 +34,7 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
 from object_detection.protos import box_predictor_pb2
 from object_detection.protos import hyperparams_pb2
 from object_detection.protos import post_processing_pb2
-from object_detection.utils import ops
+from object_detection.utils import spatial_transform_ops as spatial_ops
 from object_detection.utils import test_case
 from object_detection.utils import test_utils
 from object_detection.utils import tf_version
@@ -377,8 +377,9 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         max_negatives_per_positive=None)
     crop_and_resize_fn = (
-        ops.matmul_crop_and_resize
-        if use_matmul_crop_and_resize else ops.native_crop_and_resize)
+        spatial_ops.multilevel_matmul_crop_and_resize
+        if use_matmul_crop_and_resize
+        else spatial_ops.multilevel_native_crop_and_resize)
     common_kwargs = {
         'is_training':
             is_training,
...
@@ -411,6 +411,56 @@ def multilevel_roi_align(features, boxes, box_levels, output_size,
   return features_per_box
+
+
+def multilevel_native_crop_and_resize(images, boxes, box_levels,
+                                      crop_size, scope=None):
+  """Multilevel native crop and resize.
+
+  Same as `multilevel_matmul_crop_and_resize` but uses
+  tf.image.crop_and_resize.
+
+  Args:
+    images: A list of 4-D tensors of shape
+      [batch, image_height, image_width, depth] representing feature maps of
+      different sizes.
+    boxes: A `Tensor` of type `float32`.
+      A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified
+      in normalized coordinates and are of the form `[y1, x1, y2, x2]`. A
+      normalized coordinate value of `y` is mapped to the image coordinate at
+      `y * (image_height - 1)`, so that the `[0, 1]` interval of normalized
+      image height is mapped to `[0, image_height - 1]` in image height
+      coordinates. We do allow y1 > y2, in which case the sampled crop is an
+      up-down flipped version of the original image. The width dimension is
+      treated similarly. Normalized coordinates outside the `[0, 1]` range
+      are allowed, in which case the input image values are extrapolated.
+    box_levels: A 2-D tensor of shape [batch, num_boxes] representing the
+      feature level of each box, or None to crop everything from images[0].
+    crop_size: A list of two integers `[crop_height, crop_width]`. All
+      cropped image patches are resized to this size. The aspect ratio of the
+      image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    scope: A name for the operation (optional).
+
+  Returns:
+    A 5-D float tensor of shape `[batch, num_boxes, crop_height, crop_width,
+    depth]`.
+  """
+  if box_levels is None:
+    return native_crop_and_resize(images[0], boxes, crop_size, scope)
+  with tf.name_scope('MultiLevelNativeCropAndResize'):
+    cropped_feature_list = []
+    for level, image in enumerate(images):
+      # For each level, crop all boxes from that level's feature map, then
+      # zero out the crops of boxes that do not belong to this level.
+      # A more efficient way of computing the cropped features may exist.
+      cropped = native_crop_and_resize(image, boxes, crop_size, scope)
+      cond = tf.tile(
+          tf.equal(box_levels, level)[:, :, tf.newaxis],
+          [1, 1] + [tf.math.reduce_prod(cropped.shape.as_list()[2:])])
+      cond = tf.reshape(cond, cropped.shape)
+      cropped_final = tf.where(cond, cropped, tf.zeros_like(cropped))
+      cropped_feature_list.append(cropped_final)
+    return tf.math.reduce_sum(cropped_feature_list, axis=0)
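A minimal usage sketch for the new op (the shapes and values here are my own example, not from the diff): two feature levels of different resolutions, with each box routed to the level named in `box_levels`:

import tensorflow.compat.v1 as tf
from object_detection.utils import spatial_transform_ops as spatial_ops

# Two feature levels for one image; two boxes, one per level.
features = [tf.random.uniform([1, 32, 32, 8]),   # level 0
            tf.random.uniform([1, 16, 16, 8])]   # level 1
boxes = tf.constant([[[0.0, 0.0, 0.5, 0.5],
                      [0.25, 0.25, 1.0, 1.0]]])  # [1, 2, 4], normalized
box_levels = tf.constant([[0, 1]], dtype=tf.int32)

crops = spatial_ops.multilevel_native_crop_and_resize(
    features, boxes, box_levels, crop_size=[7, 7])
# crops has shape [1, 2, 7, 7, 8]; each box is sampled only from its own
# level, since the other levels' crops are zeroed out before the final sum.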
 def native_crop_and_resize(image, boxes, crop_size, scope=None):
   """Same as `matmul_crop_and_resize` but uses tf.image.crop_and_resize."""
   def get_box_inds(proposals):
@@ -431,6 +481,50 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None):
   return tf.reshape(cropped_regions, final_shape)
+
+
+def multilevel_matmul_crop_and_resize(images, boxes, box_levels, crop_size,
+                                      extrapolation_value=0.0, scope=None):
+  """Multilevel matmul crop and resize.
+
+  Same as `matmul_crop_and_resize` but crops images according to box levels.
+
+  Args:
+    images: A list of 4-D tensors of shape
+      [batch, image_height, image_width, depth] representing feature maps of
+      different sizes.
+    boxes: A `Tensor` of type `float32` or `bfloat16`.
+      A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified
+      in normalized coordinates and are of the form `[y1, x1, y2, x2]`. A
+      normalized coordinate value of `y` is mapped to the image coordinate at
+      `y * (image_height - 1)`, so that the `[0, 1]` interval of normalized
+      image height is mapped to `[0, image_height - 1]` in image height
+      coordinates. We do allow y1 > y2, in which case the sampled crop is an
+      up-down flipped version of the original image. The width dimension is
+      treated similarly. Normalized coordinates outside the `[0, 1]` range
+      are allowed, in which case we use `extrapolation_value` to extrapolate
+      the input image values.
+    box_levels: A 2-D tensor of shape [batch, num_boxes] representing the
+      feature level of each box, or None to place all boxes on level 0.
+    crop_size: A list of two integers `[crop_height, crop_width]`. All
+      cropped image patches are resized to this size. The aspect ratio of the
+      image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    extrapolation_value: A float value to use for extrapolation.
+    scope: A name for the operation (optional).
+
+  Returns:
+    A 5-D float tensor of shape `[batch, num_boxes, crop_height, crop_width,
+    depth]`.
+  """
+  with tf.name_scope(scope, 'MultiLevelMatMulCropAndResize'):
+    if box_levels is None:
+      box_levels = tf.zeros(tf.shape(boxes)[:2], dtype=tf.int32)
+    return multilevel_roi_align(images,
+                                boxes,
+                                box_levels,
+                                crop_size,
+                                align_corners=True,
+                                extrapolation_value=extrapolation_value)
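A quick sanity-check sketch (my example; the equivalence is an expectation worth asserting in a test, not something this diff states): with a single feature map and `box_levels=None`, the multilevel op defaults every box to level 0 and should reproduce the single-level `matmul_crop_and_resize` result. The matmul formulation exists because `tf.image.crop_and_resize` has historically not been supported on TPU:

import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.utils import spatial_transform_ops as spatial_ops

image = tf.constant(np.random.rand(1, 8, 8, 3), dtype=tf.float32)
boxes = tf.constant([[[0.1, 0.1, 0.9, 0.9]]], dtype=tf.float32)

multi = spatial_ops.multilevel_matmul_crop_and_resize(
    [image], boxes, box_levels=None, crop_size=[4, 4])
single = spatial_ops.matmul_crop_and_resize(image, boxes, crop_size=[4, 4])
# Expected to match: with box_levels=None, every box lands on level 0.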
 def matmul_crop_and_resize(image, boxes, crop_size, extrapolation_value=0.0,
                            scope=None):
   """Matrix multiplication based implementation of the crop and resize op.
...
@@ -512,6 +512,38 @@ class MatMulCropAndResizeTest(test_case.TestCase):
     crop_output = self.execute(graph_fn, [image, boxes])
     self.assertAllClose(crop_output, expected_output)
+
+  def testMultilevelMatMulCropAndResize(self):
+
+    def graph_fn(image1, image2, boxes, box_levels):
+      return spatial_ops.multilevel_matmul_crop_and_resize([image1, image2],
+                                                           boxes,
+                                                           box_levels,
+                                                           crop_size=[2, 2])
+
+    image = [np.array([[[[1, 0], [2, 0], [3, 0]],
+                        [[4, 0], [5, 0], [6, 0]],
+                        [[7, 0], [8, 0], [9, 0]]],
+                       [[[1, 0], [2, 0], [3, 0]],
+                        [[4, 0], [5, 0], [6, 0]],
+                        [[7, 0], [8, 0], [9, 0]]]], dtype=np.float32),
+             np.array([[[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]],
+                       [[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]]], dtype=np.float32)]
+    boxes = np.array([[[1, 1, 0, 0],
+                       [.5, .5, 0, 0]],
+                      [[0, 0, 1, 1],
+                       [0, 0, .5, .5]]], dtype=np.float32)
+    box_levels = np.array([[0, 1], [1, 1]], dtype=np.int32)
+    expected_output = [[[[[9, 0], [7, 0]], [[3, 0], [1, 0]]],
+                        [[[5, 4], [4, 3]], [[2, 1], [1, 0]]]],
+                       [[[[1, 0], [3, 2]], [[7, 6], [9, 8]]],
+                        [[[1, 0], [2, 1]], [[4, 3], [5, 4]]]]]
+    crop_output = self.execute(graph_fn, image + [boxes, box_levels])
+    self.assertAllClose(crop_output, expected_output)
 class NativeCropAndResizeTest(test_case.TestCase):
@@ -537,6 +569,35 @@ class NativeCropAndResizeTest(test_case.TestCase):
     crop_output = self.execute_cpu(graph_fn, [image, boxes])
     self.assertAllClose(crop_output, expected_output)
+
+  def testMultilevelBatchCropAndResize3x3To2x2_2Channels(self):
+
+    def graph_fn(image1, image2, boxes, box_levels):
+      return spatial_ops.multilevel_native_crop_and_resize([image1, image2],
+                                                           boxes,
+                                                           box_levels,
+                                                           crop_size=[2, 2])
+
+    image = [np.array([[[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]],
+                       [[[1, 0], [2, 1], [3, 2]],
+                        [[4, 3], [5, 4], [6, 5]],
+                        [[7, 6], [8, 7], [9, 8]]]], dtype=np.float32),
+             np.array([[[[1, 0], [2, 1]],
+                        [[4, 3], [5, 4]]],
+                       [[[1, 0], [2, 1]],
+                        [[4, 3], [5, 4]]]], dtype=np.float32)]
+    boxes = np.array([[[0, 0, 1, 1],
+                       [0, 0, .5, .5]],
+                      [[1, 1, 0, 0],
+                       [.5, .5, 0, 0]]], dtype=np.float32)
+    box_levels = np.array([[0, 1], [0, 0]], dtype=np.int32)
+    expected_output = [[[[[1, 0], [3, 2]], [[7, 6], [9, 8]]],
+                        [[[1, 0], [1.5, 0.5]], [[2.5, 1.5], [3, 2]]]],
+                       [[[[9, 8], [7, 6]], [[3, 2], [1, 0]]],
+                        [[[5, 4], [4, 3]], [[2, 1], [1, 0]]]]]
+    crop_output = self.execute_cpu(graph_fn, image + [boxes, box_levels])
+    self.assertAllClose(crop_output, expected_output)
 if __name__ == '__main__':
   tf.test.main()
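One note on the test fixtures above: boxes such as [1, 1, 0, 0] have y1 > y2 and x1 > x2, so, per the docstrings, the sampled crop is the up-down and left-right flipped image, which is why the expected outputs run backwards. A small standalone illustration with plain TensorFlow (my example, not part of the commit):

import numpy as np
import tensorflow.compat.v1 as tf

image = np.arange(9, dtype=np.float32).reshape(1, 3, 3, 1)
flipped = tf.image.crop_and_resize(image,
                                   boxes=[[1.0, 1.0, 0.0, 0.0]],
                                   box_ind=[0],
                                   crop_size=[3, 3])
# flipped[0, :, :, 0] equals image[0, ::-1, ::-1, 0].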