Commit 30b1c958 authored by Sara Beery, committed by TF Object Detection Team


Context R-CNN Updates: Added capabilities to use multi-headed or multi-layered attention, to place the attention heads pre- or post-second-stage feature extraction, and to work with embedded features in the context feature bank drawn from either pre- or post-second-stage feature extraction. Added an option for RPN feature map crops to be piped through to the model outputs.

PiperOrigin-RevId: 345777219
parent e7dfe641
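
For orientation, the core operation these updates generalize is a single temperature-scaled attention head from per-box queries into the context feature bank. The sketch below is illustrative only: plain matmul projections stand in for the library's project_features, and the dimensions and temperature are made-up values, not defaults.

```python
# Minimal sketch of one attention head from box features into context features,
# the operation this commit extends to multiple heads/layers and positions.
# Plain projections stand in for project_features; all values are illustrative.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch, num_boxes, box_dim = 2, 8, 16
context_size, context_dim, bottleneck = 5, 12, 10
temperature = 0.01

box_features = tf.random_normal([batch, num_boxes, box_dim])
context_features = tf.random_normal([batch, context_size, context_dim])

q_proj = tf.get_variable('q_proj', [box_dim, bottleneck])
k_proj = tf.get_variable('k_proj', [context_dim, bottleneck])
v_proj = tf.get_variable('v_proj', [context_dim, bottleneck])

queries = tf.tensordot(box_features, q_proj, axes=1)     # [batch, boxes, bottleneck]
keys = tf.tensordot(context_features, k_proj, axes=1)    # [batch, context, bottleneck]
values = tf.tensordot(context_features, v_proj, axes=1)  # [batch, context, bottleneck]

weights = tf.matmul(queries, keys, transpose_b=True)     # [batch, boxes, context]
weights = tf.nn.softmax(weights / temperature)
attended_features = tf.matmul(weights, values)           # [batch, boxes, bottleneck]
```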
......@@ -756,7 +756,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
'return_raw_detections_during_predict':
frcnn_config.return_raw_detections_during_predict,
'output_final_box_features':
frcnn_config.output_final_box_features
frcnn_config.output_final_box_features,
'output_final_box_rpn_features':
frcnn_config.output_final_box_rpn_features,
}
if ((not is_keras and isinstance(second_stage_box_predictor,
......@@ -773,7 +775,19 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
'attention_bottleneck_dimension':
context_config.attention_bottleneck_dimension,
'attention_temperature':
context_config.attention_temperature
context_config.attention_temperature,
'use_self_attention':
context_config.use_self_attention,
'use_long_term_attention':
context_config.use_long_term_attention,
'self_attention_in_sequence':
context_config.self_attention_in_sequence,
'num_attention_heads':
context_config.num_attention_heads,
'num_attention_layers':
context_config.num_attention_layers,
'attention_position':
context_config.attention_position
})
return context_rcnn_meta_arch.ContextRCNNMetaArch(
initial_crop_size=initial_crop_size,
......
......@@ -67,10 +67,13 @@ def filter_weight_value(weights, values, valid_mask):
# Force the invalid weights to be very negative so they won't contribute to
# the softmax.
weights += tf.transpose(
tf.cast(tf.math.logical_not(valid_mask), weights.dtype) *
_NEGATIVE_PADDING_VALUE,
perm=[0, 2, 1])
very_negative_mask = tf.ones(
weights.shape, dtype=weights.dtype) * _NEGATIVE_PADDING_VALUE
valid_weight_mask = tf.tile(tf.transpose(valid_mask, perm=[0, 2, 1]),
[1, weights.shape[1], 1])
weights = tf.where(valid_weight_mask,
x=weights, y=very_negative_mask)
# Force the invalid values to be 0.
values *= tf.cast(valid_mask, values.dtype)
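
The hunk above replaces the old additive negative-padding trick with an explicit tf.where overwrite, which is why the test expectations further down now show padded entries as exactly _NEGATIVE_PADDING_VALUE rather than _NEGATIVE_PADDING_VALUE + 4. A minimal, self-contained sketch of the effect follows; the constant and shapes are illustrative, not the library's exact values.

```python
# Sketch of the masking behavior: invalid key positions are overwritten with a
# very negative constant, so the softmax assigns them ~0 weight.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

_NEGATIVE_PADDING_VALUE = -100000  # illustrative stand-in for the module constant

weights = tf.constant([[[2.0, 1.0, 3.0],
                        [0.5, 0.5, 0.5]]])       # [batch, num_queries, context_size]
valid_mask = tf.constant([[True, True, False]])  # [batch, context_size]

very_negative_mask = tf.ones_like(weights) * _NEGATIVE_PADDING_VALUE
valid_weight_mask = tf.tile(valid_mask[:, tf.newaxis, :],
                            [1, tf.shape(weights)[1], 1])
weights = tf.where(valid_weight_mask, x=weights, y=very_negative_mask)
probs = tf.nn.softmax(weights)  # the padded column is numerically ~0 for every query

with tf.Session() as sess:
  print(sess.run(probs))
```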
......@@ -140,8 +143,9 @@ def project_features(features, projection_dimension, is_training, normalize):
def attention_block(input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask,
is_training):
output_dimension, attention_temperature,
keys_values_valid_mask, queries_valid_mask,
is_training, block_name="AttentionBlock"):
"""Generic attention block.
Args:
......@@ -156,14 +160,18 @@ def attention_block(input_features, context_features, bottleneck_dimension,
attention_temperature: A float Tensor. It controls the temperature of the
softmax for the weights calculation. The formula for the calculation is as follows:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
valid_mask: A boolean Tensor of shape [batch_size, context_size].
keys_values_valid_mask: A boolean Tensor of shape
[batch_size, context_size].
queries_valid_mask: A boolean Tensor of shape
[batch_size, max_num_proposals].
is_training: A boolean Tensor (affecting batch normalization).
block_name: A string to specify names for different attention blocks
Returns:
A float Tensor of shape [batch_size, input_size, output_dimension].
"""
with tf.variable_scope("AttentionBlock"):
with tf.variable_scope(block_name):
queries = project_features(
input_features, bottleneck_dimension, is_training, normalize=True)
keys = project_features(
......@@ -171,27 +179,42 @@ def attention_block(input_features, context_features, bottleneck_dimension,
values = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
weights = tf.matmul(queries, keys, transpose_b=True)
# Mask out any keys or queries that are padding.
keys *= tf.cast(keys_values_valid_mask[..., tf.newaxis], keys.dtype)
queries *= tf.cast(queries_valid_mask[..., tf.newaxis], queries.dtype)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values,
keys_values_valid_mask)
weights, values = filter_weight_value(weights, values, valid_mask)
weights = tf.identity(tf.nn.softmax(weights / attention_temperature),
name=block_name+"AttentionWeights")
weights = tf.nn.softmax(weights / attention_temperature)
features = tf.matmul(weights, values)
features = tf.matmul(weights, values)
output_features = project_features(
features, output_dimension, is_training, normalize=False)
return output_features
def compute_box_context_attention(box_features, context_features,
valid_context_size, bottleneck_dimension,
attention_temperature, is_training):
def _compute_box_context_attention(box_features, num_proposals,
context_features, valid_context_size,
bottleneck_dimension,
attention_temperature, is_training,
max_num_proposals,
use_self_attention=False,
use_long_term_attention=True,
self_attention_in_sequence=False,
num_attention_heads=1,
num_attention_layers=1):
"""Computes the attention feature from the context given a batch of box.
Args:
box_features: A float Tensor of shape [batch_size, max_num_proposals,
box_features: A float Tensor of shape [batch_size * max_num_proposals,
height, width, channels]. It is pooled features from first stage
proposals.
num_proposals: The number of valid box proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
......@@ -201,22 +224,78 @@ def compute_box_context_attention(box_features, context_features,
softmax for the weights calculation. The formula for the calculation is as follows:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
is_training: A boolean Tensor (affecting batch normalization).
max_num_proposals: The number of box proposals for each image.
use_self_attention: Whether to use an attention block across the
first stage predicted box features for the input image.
use_long_term_attention: Whether to use an attention block into the context
features.
self_attention_in_sequence: Whether self-attention and long term attention
should be in sequence or parallel.
num_attention_heads: Number of heads for multi-headed attention.
num_attention_layers: Number of layers for multi-layered attention.
Returns:
A float Tensor of shape [batch_size, max_num_proposals, 1, 1, channels].
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
context_valid_mask = compute_valid_mask(valid_context_size, context_size)
total_proposals, height, width, channels = box_features.shape
batch_size = total_proposals // max_num_proposals
box_features = tf.reshape(
box_features,
[batch_size,
max_num_proposals,
height,
width,
channels])
channels = box_features.shape[-1]
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
output_features = attention_block(box_features, context_features,
bottleneck_dimension, channels.value,
attention_temperature, valid_mask,
is_training)
box_valid_mask = compute_valid_mask(
num_proposals,
box_features.shape[1])
if use_self_attention:
self_attention_box_features = attention_block(
box_features, box_features, bottleneck_dimension, channels.value,
attention_temperature, keys_values_valid_mask=box_valid_mask,
queries_valid_mask=box_valid_mask, is_training=is_training,
block_name="SelfAttentionBlock")
if use_long_term_attention:
if use_self_attention and self_attention_in_sequence:
input_features = tf.add(self_attention_box_features, box_features)
input_features = tf.divide(input_features, 2)
else:
input_features = box_features
original_input_features = input_features
for jdx in range(num_attention_layers):
layer_features = tf.zeros_like(input_features)
for idx in range(num_attention_heads):
block_name = "AttentionBlock" + str(idx) + "_AttentionLayer" +str(jdx)
attention_features = attention_block(
input_features,
context_features,
bottleneck_dimension,
channels.value,
attention_temperature,
keys_values_valid_mask=context_valid_mask,
queries_valid_mask=box_valid_mask,
is_training=is_training,
block_name=block_name)
layer_features = tf.add(layer_features, attention_features)
layer_features = tf.divide(layer_features, num_attention_heads)
input_features = tf.add(input_features, layer_features)
output_features = tf.add(input_features, original_input_features)
if not self_attention_in_sequence and use_self_attention:
output_features = tf.add(self_attention_box_features, output_features)
elif use_self_attention:
output_features = self_attention_box_features
else:
output_features = tf.zeros(self_attention_box_features.shape)
# Expands the dimension back to match with the original feature map.
output_features = output_features[:, :, tf.newaxis, tf.newaxis, :]
......
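
Condensing the new head/layer loop above: within each layer the head outputs are averaged, the averaged layer output is added residually to the running input, and the original input is added back at the end. The sketch below mirrors that combination with a toy stand-in for attention_block (no projections, masking, or self-attention), so treat it as schematic rather than the actual implementation.

```python
# Schematic sketch of the head/layer combination in _compute_box_context_attention.
import tensorflow.compat.v1 as tf

def toy_attention(features, head_index):
  # Stand-in for attention_block: any map [batch, boxes, channels] -> same shape.
  return features * (0.1 * (head_index + 1))

def combine_heads_and_layers(input_features, num_attention_heads=2,
                             num_attention_layers=2):
  original_input_features = input_features
  for _ in range(num_attention_layers):
    layer_features = tf.zeros_like(input_features)
    for head in range(num_attention_heads):
      layer_features += toy_attention(input_features, head)
    layer_features /= num_attention_heads           # average the heads in a layer
    input_features += layer_features                # residual between layers
  return input_features + original_input_features  # residual back to block input

output_features = combine_heads_and_layers(tf.ones([2, 3, 4]))
```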
......@@ -50,9 +50,9 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant([[[4, 4], [4, 4], [4, 4]],
[[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4]]])
[[4, _NEGATIVE_PADDING_VALUE],
[4, _NEGATIVE_PADDING_VALUE],
[4, _NEGATIVE_PADDING_VALUE]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[1, 1, 1, 1], [0, 0, 0, 0]]])
......@@ -66,9 +66,9 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
weights, values, valid_mask)
expected_weights = tf.constant(
[[[4, 4], [4, 4], [4, 4]],
[[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4]]])
[[_NEGATIVE_PADDING_VALUE, _NEGATIVE_PADDING_VALUE],
[_NEGATIVE_PADDING_VALUE, _NEGATIVE_PADDING_VALUE],
[_NEGATIVE_PADDING_VALUE, _NEGATIVE_PADDING_VALUE]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[0, 0, 0, 0], [0, 0, 0, 0]]])
......@@ -100,27 +100,67 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
input_features = tf.ones([2, 3, 4], tf.float32)
context_features = tf.ones([2, 2, 3], tf.float32)
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
box_valid_mask = tf.constant([[True, True, True], [False, False, False]],
tf.bool)
is_training = False
output_features = context_rcnn_lib.attention_block(
input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask, is_training)
output_dimension, attention_temperature,
keys_values_valid_mask=valid_mask,
queries_valid_mask=box_valid_mask,
is_training=is_training)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape, [2, 3, output_dimension])
@parameterized.parameters(True, False)
def test_compute_box_context_attention(self, is_training):
box_features = tf.ones([2, 3, 4, 4, 4], tf.float32)
box_features = tf.ones([2 * 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
num_proposals = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib.compute_box_context_attention(
box_features, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training)
attention_features = context_rcnn_lib._compute_box_context_attention(
box_features, num_proposals, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training,
max_num_proposals=3)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
@parameterized.parameters(True, False)
def test_compute_box_context_attention_with_self_attention(self, is_training):
box_features = tf.ones([2 * 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
num_proposals = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib._compute_box_context_attention(
box_features, num_proposals, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training,
max_num_proposals=3,
use_self_attention=True)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
@parameterized.parameters(True, False)
def test_compute_box_context_attention_with_layers_and_heads(
self, is_training):
box_features = tf.ones([2 * 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
num_proposals = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib._compute_box_context_attention(
box_features, num_proposals, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training,
max_num_proposals=3,
num_attention_layers=3,
num_attention_heads=3)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
if __name__ == '__main__':
tf.test.main()
......@@ -51,7 +51,8 @@ class AttentionBlock(tf.keras.layers.Layer):
def __init__(self, bottleneck_dimension, attention_temperature,
output_dimension=None, is_training=False,
name='AttentionBlock', **kwargs):
name='AttentionBlock', max_num_proposals=100,
**kwargs):
"""Constructs an attention block.
Args:
......@@ -64,6 +65,7 @@ class AttentionBlock(tf.keras.layers.Layer):
output feature.
is_training: A boolean Tensor (affecting batch normalization).
name: A string describing what to name the variables in this block.
max_num_proposals: The number of box proposals for each image
**kwargs: Additional keyword arguments.
"""
......@@ -75,6 +77,7 @@ class AttentionBlock(tf.keras.layers.Layer):
self._bottleneck_dimension = bottleneck_dimension
self._is_training = is_training
self._output_dimension = output_dimension
self._max_num_proposals = max_num_proposals
if self._output_dimension:
self._feature_proj = ContextProjection(self._output_dimension)
super(AttentionBlock, self).__init__(name=name, **kwargs)
......@@ -89,15 +92,18 @@ class AttentionBlock(tf.keras.layers.Layer):
self._output_dimension = input_shapes[-1]
self._feature_proj = ContextProjection(self._output_dimension)
def call(self, box_features, context_features, valid_context_size):
def call(self, box_features, context_features, valid_context_size,
num_proposals):
"""Handles a call by performing attention.
Args:
box_features: A float Tensor of shape [batch_size, input_size, height,
box_features: A float Tensor of shape [batch_size * input_size, height,
width, num_input_features].
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
num_proposals: A [batch_size] int32 Tensor specifying the number of valid
proposals per image in the batch.
Returns:
A float Tensor with shape [batch_size, input_size, num_input_features]
......@@ -105,12 +111,26 @@ class AttentionBlock(tf.keras.layers.Layer):
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
keys_values_valid_mask = compute_valid_mask(
valid_context_size, context_size)
total_proposals, height, width, channels = box_features.shape
batch_size = total_proposals // self._max_num_proposals
box_features = tf.reshape(
box_features,
[batch_size,
self._max_num_proposals,
height,
width,
channels])
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
queries_valid_mask = compute_valid_mask(num_proposals,
box_features.shape[1])
queries = project_features(
box_features, self._bottleneck_dimension, self._is_training,
self._query_proj, normalize=True)
......@@ -121,8 +141,13 @@ class AttentionBlock(tf.keras.layers.Layer):
context_features, self._bottleneck_dimension, self._is_training,
self._val_proj, normalize=True)
# Mask out any keys or queries that are padding.
keys *= tf.cast(keys_values_valid_mask[..., tf.newaxis], keys.dtype)
queries *= tf.cast(queries_valid_mask[..., tf.newaxis], queries.dtype)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values, valid_mask)
weights, values = filter_weight_value(weights, values,
keys_values_valid_mask)
weights = tf.nn.softmax(weights / self._attention_temperature)
features = tf.matmul(weights, values)
......
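
The Keras AttentionBlock now receives the flattened crops directly and un-flattens them itself before attending. Below is a short sketch of that reshape / average-pool / query-mask sequence; tf.sequence_mask stands in for the library's compute_valid_mask, and all shapes and values are illustrative.

```python
# Sketch of the new reshape, spatial pooling, and query masking in
# AttentionBlock.call(). Shapes and values are illustrative.
import tensorflow as tf

batch_size, max_num_proposals = 2, 8
height, width, channels = 3, 3, 16

# Cropped proposal features arrive flattened over batch and proposal dims.
box_features = tf.ones([batch_size * max_num_proposals, height, width, channels])
box_features = tf.reshape(
    box_features, [batch_size, max_num_proposals, height, width, channels])
# Average-pool the spatial dims -> [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])

# Only the first num_proposals[i] boxes per image are real; the rest are padding
# and are excluded from the queries.
num_proposals = tf.constant([6, 3])
queries_valid_mask = tf.sequence_mask(num_proposals, maxlen=max_num_proposals)
```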
......@@ -97,19 +97,21 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase):
)
def test_attention_block(self, bottleneck_dimension, output_dimension,
attention_temperature):
input_features = tf.ones([2, 8, 3, 3, 3], tf.float32)
input_features = tf.ones([2 * 8, 3, 3, 3], tf.float32)
context_features = tf.ones([2, 20, 10], tf.float32)
num_proposals = tf.convert_to_tensor([6, 3])
attention_block = context_rcnn_lib.AttentionBlock(
bottleneck_dimension,
attention_temperature,
output_dimension=output_dimension,
is_training=False)
is_training=False,
max_num_proposals=8)
valid_context_size = tf.random_uniform((2,),
minval=0,
maxval=10,
dtype=tf.int32)
output_features = attention_block(input_features, context_features,
valid_context_size)
valid_context_size, num_proposals)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape,
......
......@@ -25,12 +25,19 @@ from __future__ import print_function
import functools
import tensorflow.compat.v1 as tf
from object_detection.core import box_predictor
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.meta_architectures import context_rcnn_lib_tf2
from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.protos import faster_rcnn_pb2
from object_detection.utils import ops
from object_detection.utils import tf_version
_UNINITIALIZED_FEATURE_EXTRACTOR = '__uninitialized__'
class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Context R-CNN Meta-architecture definition."""
......@@ -76,8 +83,17 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False,
output_final_box_rpn_features=False,
attention_bottleneck_dimension=None,
attention_temperature=None):
attention_temperature=None,
use_self_attention=False,
use_long_term_attention=True,
self_attention_in_sequence=False,
num_attention_heads=1,
num_attention_layers=1,
attention_position=(
faster_rcnn_pb2.AttentionPosition.POST_BOX_CLASSIFIER)
):
"""ContextRCNNMetaArch Constructor.
Args:
......@@ -210,11 +226,25 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feauture map based on the final box prediction and returns
in the dict as detection_features.
it crops the feature map based on the final box prediction and returns
it in the output dict as detection_features.
output_final_box_rpn_features: Whether to output rpn box features. If
true, it crops the rpn feature map based on the final box prediction and
returns it in the output dict as detection_features.
attention_bottleneck_dimension: A single integer. The bottleneck feature
dimension of the attention block.
attention_temperature: A single float. The attention temperature.
use_self_attention: Whether to use self-attention within the box features
in the current frame.
use_long_term_attention: Whether to use attention into the context
features.
self_attention_in_sequence: Whether self attention and long term attention
are in sequence or parallel.
num_attention_heads: The number of attention heads to use.
num_attention_layers: The number of attention layers to use.
attention_position: Whether attention should occur post RPN or post box
classifier. Options are specified in the faster_rcnn proto; the default is
post box classifier.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
......@@ -264,19 +294,40 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
output_final_box_features=output_final_box_features,
output_final_box_rpn_features=output_final_box_rpn_features)
self._attention_position = attention_position
if tf_version.is_tf1():
self._context_feature_extract_fn = functools.partial(
context_rcnn_lib.compute_box_context_attention,
context_rcnn_lib._compute_box_context_attention,
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
is_training=is_training,
max_num_proposals=self.max_num_proposals,
use_self_attention=use_self_attention,
use_long_term_attention=use_long_term_attention,
self_attention_in_sequence=self_attention_in_sequence,
num_attention_heads=num_attention_heads,
num_attention_layers=num_attention_layers)
else:
if use_self_attention:
raise NotImplementedError
if self_attention_in_sequence:
raise NotImplementedError
if not use_long_term_attention:
raise NotImplementedError
if num_attention_heads > 1:
raise NotImplementedError
if num_attention_layers > 1:
raise NotImplementedError
self._context_feature_extract_fn = context_rcnn_lib_tf2.AttentionBlock(
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
is_training=is_training,
max_num_proposals=self.max_num_proposals)
@staticmethod
def get_side_inputs(features):
......@@ -298,8 +349,8 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
if (fields.InputDataFields.context_features not in features or
fields.InputDataFields.valid_context_size not in features):
raise ValueError(
"Please make sure context_features and valid_context_size are in the "
"features")
'Please make sure context_features and valid_context_size are in the '
'features')
return {
fields.InputDataFields.context_features:
......@@ -308,9 +359,189 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
features[fields.InputDataFields.valid_context_size]
}
def _predict_second_stage(self, rpn_box_encodings,
rpn_objectness_predictions_with_background,
rpn_features_to_crop, anchors, image_shape,
true_image_shapes, **side_inputs):
"""Predicts the output tensors from second stage of Faster R-CNN.
Args:
rpn_box_encodings: 3-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes.
rpn_objectness_predictions_with_background: 2-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors. Note that this
tensor *includes* background class predictions (at class index 0).
rpn_features_to_crop: A list of 4-D float32 or bfloat16 tensors with shape
[batch_size, height_i, width_i, depth] representing image features to
crop using the proposal boxes predicted by the RPN.
anchors: 2-D float tensor of shape
[num_anchors, self._box_coder.code_size].
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
**side_inputs: additional tensors that are required by the network.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D float32 tensor with shape
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D float32 tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
Note that this tensor *includes* background class predictions
(at class index 0).
3) num_proposals: An int32 tensor of shape [batch_size] representing the
number of proposals generated by the RPN. `num_proposals` allows us
to keep track of which entries are to be treated as zero paddings and
which are not since we always pad the number of proposals to be
`self.max_num_proposals` for each image.
4) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes in absolute coordinates.
5) proposal_boxes_normalized: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing decoded proposal
bounding boxes in normalized coordinates. Can be used to override the
boxes proposed by the RPN, thus enabling one to extract features and
get box classification and prediction for externally selected areas
of the image.
6) box_classifier_features: a 4-D float32/bfloat16 tensor
representing the features for each proposal.
If self._return_raw_detections_during_predict is True, the dictionary
will also contain:
7) raw_detection_boxes: a 4-D float32 tensor with shape
[batch_size, self.max_num_proposals, num_classes, 4] in normalized
coordinates.
8) raw_detection_feature_map_indices: a 3-D int32 tensor with shape
[batch_size, self.max_num_proposals, num_classes].
"""
proposal_boxes_normalized, num_proposals = self._proposal_postprocess(
rpn_box_encodings, rpn_objectness_predictions_with_background, anchors,
image_shape, true_image_shapes)
prediction_dict = self._box_prediction(rpn_features_to_crop,
proposal_boxes_normalized,
image_shape, true_image_shapes,
num_proposals,
**side_inputs)
prediction_dict['num_proposals'] = num_proposals
return prediction_dict
def _box_prediction(self, rpn_features_to_crop, proposal_boxes_normalized,
image_shape, true_image_shapes, num_proposals,
**side_inputs):
"""Predicts the output tensors from second stage of Faster R-CNN.
Args:
rpn_features_to_crop: A list of 4-D float32 or bfloat16 tensors with shape
[batch_size, height_i, width_i, depth] representing image features to
crop using the proposal boxes predicted by the RPN.
proposal_boxes_normalized: A float tensor with shape [batch_size,
max_num_proposals, 4] representing the (potentially zero padded)
proposal boxes for all images in the batch. These boxes are represented
as normalized coordinates.
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
num_proposals: The number of valid box proposals.
**side_inputs: additional tensors that are required by the network.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D float32 tensor with shape
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D float32 tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
Note that this tensor *includes* background class predictions
(at class index 0).
3) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes in absolute coordinates.
4) proposal_boxes_normalized: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing decoded proposal
bounding boxes in normalized coordinates. Can be used to override the
boxes proposed by the RPN, thus enabling one to extract features and
get box classification and prediction for externally selected areas
of the image.
5) box_classifier_features: a 4-D float32/bfloat16 tensor
representing the features for each proposal.
If self._return_raw_detections_during_predict is True, the dictionary
will also contain:
6) raw_detection_boxes: a 4-D float32 tensor with shape
[batch_size, self.max_num_proposals, num_classes, 4] in normalized
coordinates.
7) raw_detection_feature_map_indices: a 3-D int32 tensor with shape
[batch_size, self.max_num_proposals, num_classes].
8) final_anchors: a 3-D float tensor of shape [batch_size,
self.max_num_proposals, 4] containing the reference anchors for raw
detection boxes in normalized coordinates.
"""
flattened_proposal_feature_maps = (
self._compute_second_stage_input_feature_maps(
rpn_features_to_crop, proposal_boxes_normalized,
image_shape, num_proposals, **side_inputs))
box_classifier_features = self._extract_box_classifier_features(
flattened_proposal_feature_maps, num_proposals, **side_inputs)
if self._mask_rcnn_box_predictor.is_keras_model:
box_predictions = self._mask_rcnn_box_predictor(
[box_classifier_features],
prediction_stage=2)
else:
box_predictions = self._mask_rcnn_box_predictor.predict(
[box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
prediction_stage=2)
refined_box_encodings = tf.squeeze(
box_predictions[box_predictor.BOX_ENCODINGS],
axis=1, name='all_refined_box_encodings')
class_predictions_with_background = tf.squeeze(
box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
axis=1, name='all_class_predictions_with_background')
absolute_proposal_boxes = ops.normalized_to_image_coordinates(
proposal_boxes_normalized, image_shape, self._parallel_iterations)
prediction_dict = {
'refined_box_encodings': tf.cast(refined_box_encodings,
dtype=tf.float32),
'class_predictions_with_background':
tf.cast(class_predictions_with_background, dtype=tf.float32),
'proposal_boxes': absolute_proposal_boxes,
'box_classifier_features': box_classifier_features,
'proposal_boxes_normalized': proposal_boxes_normalized,
'final_anchors': proposal_boxes_normalized
}
if self._return_raw_detections_during_predict:
prediction_dict.update(self._raw_detections_and_feature_map_inds(
refined_box_encodings, absolute_proposal_boxes, true_image_shapes))
return prediction_dict
def _compute_second_stage_input_feature_maps(self, features_to_crop,
proposal_boxes_normalized,
image_shape,
num_proposals,
context_features,
valid_context_size):
"""Crops to a set of proposals from the feature map for a batch of images.
......@@ -326,6 +557,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
num_proposals, box_code_size] containing proposal boxes in normalized
coordinates.
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
num_proposals: The number of valid box proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
......@@ -338,14 +570,55 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
features_to_crop, proposal_boxes_normalized, None,
[self._initial_crop_size, self._initial_crop_size])
attention_features = self._context_feature_extract_fn(
box_features=box_features,
context_features=context_features,
valid_context_size=valid_context_size)
flattened_box_features = self._flatten_first_two_dimensions(box_features)
flattened_box_features = self._maxpool_layer(flattened_box_features)
if self._attention_position == (
faster_rcnn_pb2.AttentionPosition.POST_RPN):
attention_features = self._context_feature_extract_fn(
box_features=flattened_box_features,
num_proposals=num_proposals,
context_features=context_features,
valid_context_size=valid_context_size)
# Adds box features with attention features.
flattened_box_features += self._flatten_first_two_dimensions(
attention_features)
return flattened_box_features
def _extract_box_classifier_features(
self, flattened_box_features, num_proposals, context_features,
valid_context_size,
attention_position=(
faster_rcnn_pb2.AttentionPosition.POST_BOX_CLASSIFIER)):
if self._feature_extractor_for_box_classifier_features == (
_UNINITIALIZED_FEATURE_EXTRACTOR):
self._feature_extractor_for_box_classifier_features = (
self._feature_extractor.get_box_classifier_feature_extractor_model(
name=self.second_stage_feature_extractor_scope))
if self._feature_extractor_for_box_classifier_features:
box_classifier_features = (
self._feature_extractor_for_box_classifier_features(
flattened_box_features))
else:
box_classifier_features = (
self._feature_extractor.extract_box_classifier_features(
flattened_box_features,
scope=self.second_stage_feature_extractor_scope))
# Adds box features with attention features.
box_features += attention_features
if self._attention_position == (
faster_rcnn_pb2.AttentionPosition.POST_BOX_CLASSIFIER):
attention_features = self._context_feature_extract_fn(
box_features=box_classifier_features,
num_proposals=num_proposals,
context_features=context_features,
valid_context_size=valid_context_size)
flattened_feature_maps = self._flatten_first_two_dimensions(box_features)
# Adds box features with attention features.
box_classifier_features += self._flatten_first_two_dimensions(
attention_features)
return self._maxpool_layer(flattened_feature_maps)
return box_classifier_features
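
A condensed, runnable sketch of the placement logic above: with POST_RPN the context attention output is added to the pooled RPN crops before the second-stage feature extractor, and with POST_BOX_CLASSIFIER it is added to the extractor's output. The attend and box_classifier callables below are identity-like stand-ins for _compute_box_context_attention and the box classifier feature extractor; the real methods also reshape, mask, and max-pool.

```python
# Simplified stand-in for the attention placement handled by
# _compute_second_stage_input_feature_maps / _extract_box_classifier_features.
import tensorflow.compat.v1 as tf

POST_RPN = 'POST_RPN'
POST_BOX_CLASSIFIER = 'POST_BOX_CLASSIFIER'

def attend(features):
  return 0.1 * features            # stand-in for the context attention block

def box_classifier(features):
  return features + 1.0            # stand-in for the second-stage extractor

def second_stage(rpn_crop_features, attention_position):
  box_features = rpn_crop_features
  if attention_position == POST_RPN:
    box_features += attend(box_features)            # attend before the extractor
  box_classifier_features = box_classifier(box_features)
  if attention_position == POST_BOX_CLASSIFIER:
    box_classifier_features += attend(box_classifier_features)  # attend after
  return box_classifier_features

crops = tf.ones([2 * 8, 7, 7, 4])
post_rpn_out = second_stage(crops, POST_RPN)
post_classifier_out = second_stage(crops, POST_BOX_CLASSIFIER)
```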
......@@ -293,7 +293,6 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
first_stage_nms_score_threshold = -1.0
first_stage_nms_iou_threshold = 1.0
first_stage_max_proposals = first_stage_max_proposals
first_stage_non_max_suppression_fn = functools.partial(
post_processing.batch_multiclass_non_max_suppression,
score_thresh=first_stage_nms_score_threshold,
......@@ -444,7 +443,7 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
def test_prediction_mock_tf1(self, mock_context_rcnn_lib_v1):
"""Mocks the context_rcnn_lib_v1 module to test the prediction.
Using mock object so that we can ensure compute_box_context_attention is
Using mock object so that we can ensure _compute_box_context_attention is
called inside the prediction function.
Args:
......@@ -457,7 +456,7 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
num_classes=42)
mock_tensor = tf.ones([2, 8, 3, 3, 3], tf.float32)
mock_context_rcnn_lib_v1.compute_box_context_attention.return_value = mock_tensor
mock_context_rcnn_lib_v1._compute_box_context_attention.return_value = mock_tensor
inputs_shape = (2, 20, 20, 3)
inputs = tf.cast(
tf.random_uniform(inputs_shape, minval=0, maxval=255, dtype=tf.int32),
......@@ -479,7 +478,7 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
side_inputs = model.get_side_inputs(features)
_ = model.predict(preprocessed_inputs, true_image_shapes, **side_inputs)
mock_context_rcnn_lib_v1.compute_box_context_attention.assert_called_once()
mock_context_rcnn_lib_v1._compute_box_context_attention.assert_called_once()
@parameterized.named_parameters(
{'testcase_name': 'static_shapes', 'static_shapes': True},
......
......@@ -304,7 +304,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
resize_masks=True,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False):
output_final_box_features=False,
output_final_box_rpn_features=False):
"""FasterRCNNMetaArch Constructor.
Args:
......@@ -437,8 +438,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feauture map based on the final box prediction and returns
in the dict as detection_features.
it crops the rpn feature map and passes it through box_classifier then
returns in the output dict as `detection_features`.
output_final_box_rpn_features: Whether to output rpn box features. If
true, it crops the rpn feature map and returns in the output dict as
`detection_features`.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
......@@ -604,6 +608,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._return_raw_detections_during_predict = (
return_raw_detections_during_predict)
self._output_final_box_features = output_final_box_features
self._output_final_box_rpn_features = output_final_box_rpn_features
@property
def first_stage_feature_extractor_scope(self):
......@@ -821,7 +826,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
prediction_dict['rpn_objectness_predictions_with_background'],
prediction_dict['rpn_features_to_crop'],
prediction_dict['anchors'], prediction_dict['image_shape'],
true_image_shapes, **side_inputs))
true_image_shapes,
**side_inputs))
if self._number_of_stages == 3:
prediction_dict = self._predict_third_stage(prediction_dict,
......@@ -1059,7 +1065,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
image_shape, **side_inputs))
box_classifier_features = self._extract_box_classifier_features(
flattened_proposal_feature_maps)
flattened_proposal_feature_maps, **side_inputs)
if self._mask_rcnn_box_predictor.is_keras_model:
box_predictions = self._mask_rcnn_box_predictor(
......@@ -1547,10 +1553,22 @@ class FasterRCNNMetaArch(model.DetectionModel):
'Please make sure rpn_features_to_crop is in the prediction_dict.'
)
detections_dict[
'detection_features'] = self._add_detection_features_output_node(
'detection_features'] = (
self._add_detection_box_boxclassifier_features_output_node(
detections_dict[
fields.DetectionResultFields.detection_boxes],
prediction_dict['rpn_features_to_crop'],
prediction_dict['image_shape']))
if self._output_final_box_rpn_features:
if 'rpn_features_to_crop' not in prediction_dict:
raise ValueError(
'Please make sure rpn_features_to_crop is in the prediction_dict.'
)
detections_dict['cropped_rpn_box_features'] = (
self._add_detection_box_rpn_features_output_node(
detections_dict[fields.DetectionResultFields.detection_boxes],
prediction_dict['rpn_features_to_crop'],
prediction_dict['image_shape'])
prediction_dict['image_shape']))
return detections_dict
......@@ -1566,8 +1584,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
prediction_dict.pop(k)
return prediction_dict
def _add_detection_features_output_node(self, detection_boxes,
rpn_features_to_crop, image_shape):
def _add_detection_box_boxclassifier_features_output_node(
self, detection_boxes, rpn_features_to_crop, image_shape):
"""Add detection features to outputs.
This function extracts box features for each box in rpn_features_to_crop.
......@@ -1606,6 +1624,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
reshaped_detection_features_pool = tf.identity(
reshaped_detection_features_pool, 'pooled_detection_features')
# TODO(sbeery) add node to extract rpn features here!!
reshaped_detection_features = tf.reshape(
detection_features_unpooled,
[batch_size, max_detections,
......@@ -1615,6 +1635,44 @@ class FasterRCNNMetaArch(model.DetectionModel):
return reshaped_detection_features
def _add_detection_box_rpn_features_output_node(self, detection_boxes,
rpn_features_to_crop,
image_shape):
"""Add detection features to outputs.
This function extracts box features for each box in rpn_features_to_crop.
It returns the extracted box features, reshaped to
[batch size, max_detections, height, width, depth]
Args:
detection_boxes: a 3-D float32 tensor of shape
[batch_size, max_detections, 4] which represents the bounding boxes.
rpn_features_to_crop: A list of 4-D float32 tensors with shape
[batch, height, width, depth] representing image features to crop using
the proposal boxes.
image_shape: a 1-D tensor of shape [4] representing the image shape.
Returns:
detection_features: a 5-D float32 tensor of shape
[batch_size, max_detections, height, width, depth] representing
cropped image features
"""
with tf.name_scope('FirstStageDetectionFeaturesExtract'):
flattened_detected_feature_maps = (
self._compute_second_stage_input_feature_maps(
rpn_features_to_crop, detection_boxes, image_shape))
batch_size = tf.shape(detection_boxes)[0]
max_detections = tf.shape(detection_boxes)[1]
reshaped_detection_features = tf.reshape(
flattened_detected_feature_maps,
[batch_size, max_detections,
tf.shape(flattened_detected_feature_maps)[1],
tf.shape(flattened_detected_feature_maps)[2],
tf.shape(flattened_detected_feature_maps)[3]])
return reshaped_detection_features
def _postprocess_rpn(self,
rpn_box_encodings_batch,
rpn_objectness_predictions_with_background_batch,
......
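
The new _add_detection_box_rpn_features_output_node recovers a per-image, per-detection tensor from the flattened crops using dynamic shapes. A small illustrative sketch of just that reshape (the shapes here are made up):

```python
# Sketch of the dynamic reshape used to turn flattened per-box crops back into
# [batch_size, max_detections, height, width, depth]. Shapes are illustrative.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch_size, max_detections = 2, 5
flattened_detected_feature_maps = tf.ones([batch_size * max_detections, 4, 4, 8])

reshaped_detection_features = tf.reshape(
    flattened_detected_feature_maps,
    [batch_size, max_detections,
     tf.shape(flattened_detected_feature_maps)[1],
     tf.shape(flattened_detected_feature_maps)[2],
     tf.shape(flattened_detected_feature_maps)[3]])
```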
......@@ -84,7 +84,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
resize_masks=False,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False):
output_final_box_features=False,
output_final_box_rpn_features=False):
"""RFCNMetaArch Constructor.
Args:
......@@ -194,8 +195,11 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feauture map based on the final box prediction and returns
in the dict as detection_features.
it crops the feature map based on the final box prediction and returns
it in the dict as detection_features.
output_final_box_rpn_features: Whether to output rpn box features. If
true, it crops the rpn feature map based on the final box prediction and
returns it in the dict as detection_features.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`
......@@ -245,7 +249,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
output_final_box_features=output_final_box_features,
output_final_box_rpn_features=output_final_box_rpn_features)
self._rfcn_box_predictor = second_stage_rfcn_box_predictor
......
......@@ -18,6 +18,7 @@ import "object_detection/protos/fpn.proto";
// (or RPN) and a second stage box classifier. We thus use the prefixes
// `first_stage_` and `second_stage_` to indicate the stage to which each
// parameter pertains when relevant.
message FasterRcnn {
// Whether to construct only the Region Proposal Network (RPN).
optional int32 number_of_stages = 1 [default = 2];
......@@ -176,17 +177,30 @@ message FasterRcnn {
// Whether to use tf.image.combined_non_max_suppression.
optional bool use_combined_nms_in_first_stage = 40 [default = false];
// Whether to output final box feature. If true, it will crop the feature map
// in the postprocess() method based on the final predictions.
// Whether to output final box feature. If true, it will crop the rpn feature
// map based on the final prediction boxes, then pass the crops through the
// box_classifier to compute the final features in the postprocess() method.
optional bool output_final_box_features = 42 [default = false];
// Whether to output final box rpn features. If true, it will crop the rpn
// feature map in the postprocess() method based on the final prediction
// boxes.
optional bool output_final_box_rpn_features = 43 [default = false];
// Configs for context model.
optional Context context_config = 41;
}
// Input type format: whether inputs are TfExamples or TfSequenceExamples.
enum AttentionPosition {
ATTENTION_DEFAULT = 0; // Default, currently post box classifier
POST_BOX_CLASSIFIER = 1; // Post box classifier
POST_RPN = 2; // Post RPN, pre box classifier
}
message Context {
// Configuration proto for Context .
// Next id: 4
// Configuration proto for Context R-CNN.
// Next id: 12
// The maximum number of contextual features per-image, used for padding
optional int32 max_num_context_features = 1 [default = 2000];
......@@ -199,6 +213,30 @@ message Context {
// The context feature length.
optional int32 context_feature_length = 4 [default = 2057];
// Whether to use self-attention from box proposals to themselves, TF1 only.
optional bool use_self_attention = 6 [default = false];
// Whether to use attention into the context features; setting this to false
// is only implemented in TF1.
optional bool use_long_term_attention = 7 [default = true];
// Whether the self-attention block and the long term attention block should
// be in sequence or parallel, i.e. whether the outputs of the self-attention
// block should be the inputs into the long term attention block (sequence)
// or whether the self attention block and long term attention block should
// happen in parallel, with outputs summed.
optional bool self_attention_in_sequence = 8 [default = false];
// Number of attention heads
optional int32 num_attention_heads = 9 [default = 1];
// Number of attention layers
optional int32 num_attention_layers = 11 [default = 1];
// Where the attention block is placed: post RPN (before the box classifier)
// or post box classifier.
optional AttentionPosition attention_position = 10 [
default = POST_BOX_CLASSIFIER];
}
message FasterRcnnFeatureExtractor {
......
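
Putting the new proto fields together, here is a hedged example of a Context configuration a pipeline might use, built with the standard protobuf text_format API. The field names follow the message above; the numeric values are illustrative, not recommended defaults.

```python
# Hypothetical Context config exercising the options added in this commit.
from google.protobuf import text_format
from object_detection.protos import faster_rcnn_pb2

context_config = text_format.Parse(
    """
    attention_bottleneck_dimension: 2048
    attention_temperature: 0.01
    use_self_attention: true
    self_attention_in_sequence: false
    num_attention_heads: 4
    num_attention_layers: 2
    attention_position: POST_RPN
    """,
    faster_rcnn_pb2.Context())

# The enum is also accessible directly, mirroring the comparisons in
# context_rcnn_meta_arch.py above.
assert context_config.attention_position == faster_rcnn_pb2.AttentionPosition.POST_RPN
```

Note that several of these options (self-attention, multiple heads or layers, disabling long-term attention) are rejected with NotImplementedError by the TF2 code path in this commit, so they are effectively TF1-only.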