Commit 30b1c958 authored by Sara Beery, committed by TF Object Detection Team


Context R-CNN Updates: Added capabilities to use multi-headed or multi-layered attention, to place the attention heads pre- or post-second-stage feature extraction, and to work with embedded features in the context feature bank drawn from either pre- or post-second-stage feature extraction. Added an option for RPN feature map crops to be piped through to the model outputs.

PiperOrigin-RevId: 345777219
parent e7dfe641
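
For orientation, the core operation these updates generalize is a single temperature-scaled attention head from per-box queries into the context feature bank. The sketch below is illustrative only: plain matmul projections stand in for the library's project_features, and the dimensions and temperature are made-up values, not defaults.

```python
# Minimal sketch of one attention head from box features into context features,
# the operation this commit extends to multiple heads/layers and positions.
# Plain projections stand in for project_features; all values are illustrative.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch, num_boxes, box_dim = 2, 8, 16
context_size, context_dim, bottleneck = 5, 12, 10
temperature = 0.01

box_features = tf.random_normal([batch, num_boxes, box_dim])
context_features = tf.random_normal([batch, context_size, context_dim])

q_proj = tf.get_variable('q_proj', [box_dim, bottleneck])
k_proj = tf.get_variable('k_proj', [context_dim, bottleneck])
v_proj = tf.get_variable('v_proj', [context_dim, bottleneck])

queries = tf.tensordot(box_features, q_proj, axes=1)     # [batch, boxes, bottleneck]
keys = tf.tensordot(context_features, k_proj, axes=1)    # [batch, context, bottleneck]
values = tf.tensordot(context_features, v_proj, axes=1)  # [batch, context, bottleneck]

weights = tf.matmul(queries, keys, transpose_b=True)     # [batch, boxes, context]
weights = tf.nn.softmax(weights / temperature)
attended_features = tf.matmul(weights, values)           # [batch, boxes, bottleneck]
```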
......@@ -756,7 +756,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
'return_raw_detections_during_predict':
frcnn_config.return_raw_detections_during_predict,
'output_final_box_features':
frcnn_config.output_final_box_features
frcnn_config.output_final_box_features,
'output_final_box_rpn_features':
frcnn_config.output_final_box_rpn_features,
}
if ((not is_keras and isinstance(second_stage_box_predictor,
......@@ -773,7 +775,19 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
'attention_bottleneck_dimension':
context_config.attention_bottleneck_dimension,
'attention_temperature':
context_config.attention_temperature
context_config.attention_temperature,
'use_self_attention':
context_config.use_self_attention,
'use_long_term_attention':
context_config.use_long_term_attention,
'self_attention_in_sequence':
context_config.self_attention_in_sequence,
'num_attention_heads':
context_config.num_attention_heads,
'num_attention_layers':
context_config.num_attention_layers,
'attention_position':
context_config.attention_position
})
return context_rcnn_meta_arch.ContextRCNNMetaArch(
initial_crop_size=initial_crop_size,
......
......@@ -67,10 +67,13 @@ def filter_weight_value(weights, values, valid_mask):
# Force the invalid weights to be very negative so they won't contribute to
# the softmax.
weights += tf.transpose(
tf.cast(tf.math.logical_not(valid_mask), weights.dtype) *
_NEGATIVE_PADDING_VALUE,
perm=[0, 2, 1])
very_negative_mask = tf.ones(
weights.shape, dtype=weights.dtype) * _NEGATIVE_PADDING_VALUE
valid_weight_mask = tf.tile(tf.transpose(valid_mask, perm=[0, 2, 1]),
[1, weights.shape[1], 1])
weights = tf.where(valid_weight_mask,
x=weights, y=very_negative_mask)
# Force the invalid values to be 0.
values *= tf.cast(valid_mask, values.dtype)
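
The hunk above replaces the old additive negative-padding trick with an explicit tf.where overwrite, which is why the test expectations further down now show padded entries as exactly _NEGATIVE_PADDING_VALUE rather than _NEGATIVE_PADDING_VALUE + 4. A minimal, self-contained sketch of the effect follows; the constant and shapes are illustrative, not the library's exact values.

```python
# Sketch of the masking behavior: invalid key positions are overwritten with a
# very negative constant, so the softmax assigns them ~0 weight.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

_NEGATIVE_PADDING_VALUE = -100000  # illustrative stand-in for the module constant

weights = tf.constant([[[2.0, 1.0, 3.0],
                        [0.5, 0.5, 0.5]]])       # [batch, num_queries, context_size]
valid_mask = tf.constant([[True, True, False]])  # [batch, context_size]

very_negative_mask = tf.ones_like(weights) * _NEGATIVE_PADDING_VALUE
valid_weight_mask = tf.tile(valid_mask[:, tf.newaxis, :],
                            [1, tf.shape(weights)[1], 1])
weights = tf.where(valid_weight_mask, x=weights, y=very_negative_mask)
probs = tf.nn.softmax(weights)  # the padded column is numerically ~0 for every query

with tf.Session() as sess:
  print(sess.run(probs))
```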
......@@ -140,8 +143,9 @@ def project_features(features, projection_dimension, is_training, normalize):
def attention_block(input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask,
is_training):
output_dimension, attention_temperature,
keys_values_valid_mask, queries_valid_mask,
is_training, block_name="AttentionBlock"):
"""Generic attention block.
Args:
......@@ -156,14 +160,18 @@ def attention_block(input_features, context_features, bottleneck_dimension,
attention_temperature: A float Tensor. It controls the temperature of the
softmax for the weights calculation. The formula for the calculation is as follows:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
valid_mask: A boolean Tensor of shape [batch_size, context_size].
keys_values_valid_mask: A boolean Tensor of shape
[batch_size, context_size].
queries_valid_mask: A boolean Tensor of shape
[batch_size, max_num_proposals].
is_training: A boolean Tensor (affecting batch normalization).
block_name: A string to specify names for different attention blocks
Returns:
A float Tensor of shape [batch_size, input_size, output_dimension].
"""
with tf.variable_scope("AttentionBlock"):
with tf.variable_scope(block_name):
queries = project_features(
input_features, bottleneck_dimension, is_training, normalize=True)
keys = project_features(
......@@ -171,27 +179,42 @@ def attention_block(input_features, context_features, bottleneck_dimension,
values = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
weights = tf.matmul(queries, keys, transpose_b=True)
# Mask out any keys or queries that are padding.
keys *= tf.cast(keys_values_valid_mask[..., tf.newaxis], keys.dtype)
queries *= tf.cast(queries_valid_mask[..., tf.newaxis], queries.dtype)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values,
keys_values_valid_mask)
weights, values = filter_weight_value(weights, values, valid_mask)
weights = tf.identity(tf.nn.softmax(weights / attention_temperature),
name=block_name+"AttentionWeights")
weights = tf.nn.softmax(weights / attention_temperature)
features = tf.matmul(weights, values)
features = tf.matmul(weights, values)
output_features = project_features(
features, output_dimension, is_training, normalize=False)
return output_features
def compute_box_context_attention(box_features, context_features,
valid_context_size, bottleneck_dimension,
attention_temperature, is_training):
def _compute_box_context_attention(box_features, num_proposals,
context_features, valid_context_size,
bottleneck_dimension,
attention_temperature, is_training,
max_num_proposals,
use_self_attention=False,
use_long_term_attention=True,
self_attention_in_sequence=False,
num_attention_heads=1,
num_attention_layers=1):
"""Computes the attention feature from the context given a batch of box.
Args:
box_features: A float Tensor of shape [batch_size, max_num_proposals,
box_features: A float Tensor of shape [batch_size * max_num_proposals,
height, width, channels]. It is pooled features from first stage
proposals.
num_proposals: The number of valid box proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
......@@ -201,22 +224,78 @@ def compute_box_context_attention(box_features, context_features,
softmax for the weights calculation. The formula for the calculation is as follows:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
is_training: A boolean Tensor (affecting batch normalization).
max_num_proposals: The number of box proposals for each image.
use_self_attention: Whether to use an attention block across the
first stage predicted box features for the input image.
use_long_term_attention: Whether to use an attention block into the context
features.
self_attention_in_sequence: Whether self-attention and long term attention
should be in sequence or parallel.
num_attention_heads: Number of heads for multi-headed attention.
num_attention_layers: Number of layers for multi-layered attention.
Returns:
A float Tensor of shape [batch_size, max_num_proposals, 1, 1, channels].
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
context_valid_mask = compute_valid_mask(valid_context_size, context_size)
total_proposals, height, width, channels = box_features.shape
batch_size = total_proposals // max_num_proposals
box_features = tf.reshape(
box_features,
[batch_size,
max_num_proposals,
height,
width,
channels])
channels = box_features.shape[-1]
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
output_features = attention_block(box_features, context_features,
bottleneck_dimension, channels.value,
attention_temperature, valid_mask,
is_training)
box_valid_mask = compute_valid_mask(
num_proposals,
box_features.shape[1])
if use_self_attention:
self_attention_box_features = attention_block(
box_features, box_features, bottleneck_dimension, channels.value,
attention_temperature, keys_values_valid_mask=box_valid_mask,
queries_valid_mask=box_valid_mask, is_training=is_training,
block_name="SelfAttentionBlock")
if use_long_term_attention:
if use_self_attention and self_attention_in_sequence:
input_features = tf.add(self_attention_box_features, box_features)
input_features = tf.divide(input_features, 2)
else:
input_features = box_features
original_input_features = input_features
for jdx in range(num_attention_layers):
layer_features = tf.zeros_like(input_features)
for idx in range(num_attention_heads):
block_name = "AttentionBlock" + str(idx) + "_AttentionLayer" +str(jdx)
attention_features = attention_block(
input_features,
context_features,
bottleneck_dimension,
channels.value,
attention_temperature,
keys_values_valid_mask=context_valid_mask,
queries_valid_mask=box_valid_mask,
is_training=is_training,
block_name=block_name)
layer_features = tf.add(layer_features, attention_features)
layer_features = tf.divide(layer_features, num_attention_heads)
input_features = tf.add(input_features, layer_features)
output_features = tf.add(input_features, original_input_features)
if not self_attention_in_sequence and use_self_attention:
output_features = tf.add(self_attention_box_features, output_features)
elif use_self_attention:
output_features = self_attention_box_features
else:
output_features = tf.zeros(self_attention_box_features.shape)
# Expands the dimension back to match with the original feature map.
output_features = output_features[:, :, tf.newaxis, tf.newaxis, :]
......
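
Condensing the new head/layer loop above: within each layer the head outputs are averaged, the averaged layer output is added residually to the running input, and the original input is added back at the end. The sketch below mirrors that combination with a toy stand-in for attention_block (no projections, masking, or self-attention), so treat it as schematic rather than the actual implementation.

```python
# Schematic sketch of the head/layer combination in _compute_box_context_attention.
import tensorflow.compat.v1 as tf

def toy_attention(features, head_index):
  # Stand-in for attention_block: any map [batch, boxes, channels] -> same shape.
  return features * (0.1 * (head_index + 1))

def combine_heads_and_layers(input_features, num_attention_heads=2,
                             num_attention_layers=2):
  original_input_features = input_features
  for _ in range(num_attention_layers):
    layer_features = tf.zeros_like(input_features)
    for head in range(num_attention_heads):
      layer_features += toy_attention(input_features, head)
    layer_features /= num_attention_heads           # average the heads in a layer
    input_features += layer_features                # residual between layers
  return input_features + original_input_features  # residual back to block input

output_features = combine_heads_and_layers(tf.ones([2, 3, 4]))
```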
......@@ -50,9 +50,9 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant([[[4, 4], [4, 4], [4, 4]],
[[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4]]])
[[4, _NEGATIVE_PADDING_VALUE],
[4, _NEGATIVE_PADDING_VALUE],
[4, _NEGATIVE_PADDING_VALUE]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[1, 1, 1, 1], [0, 0, 0, 0]]])
......@@ -66,9 +66,9 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
weights, values, valid_mask)
expected_weights = tf.constant(
[[[4, 4], [4, 4], [4, 4]],
[[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4]]])
[[_NEGATIVE_PADDING_VALUE, _NEGATIVE_PADDING_VALUE],
[_NEGATIVE_PADDING_VALUE, _NEGATIVE_PADDING_VALUE],
[_NEGATIVE_PADDING_VALUE, _NEGATIVE_PADDING_VALUE]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[0, 0, 0, 0], [0, 0, 0, 0]]])
......@@ -100,27 +100,67 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
input_features = tf.ones([2, 3, 4], tf.float32)
context_features = tf.ones([2, 2, 3], tf.float32)
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
box_valid_mask = tf.constant([[True, True, True], [False, False, False]],
tf.bool)
is_training = False
output_features = context_rcnn_lib.attention_block(
input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask, is_training)
output_dimension, attention_temperature,
keys_values_valid_mask=valid_mask,
queries_valid_mask=box_valid_mask,
is_training=is_training)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape, [2, 3, output_dimension])
@parameterized.parameters(True, False)
def test_compute_box_context_attention(self, is_training):
box_features = tf.ones([2, 3, 4, 4, 4], tf.float32)
box_features = tf.ones([2 * 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
num_proposals = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib.compute_box_context_attention(
box_features, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training)
attention_features = context_rcnn_lib._compute_box_context_attention(
box_features, num_proposals, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training,
max_num_proposals=3)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
@parameterized.parameters(True, False)
def test_compute_box_context_attention_with_self_attention(self, is_training):
box_features = tf.ones([2 * 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
num_proposals = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib._compute_box_context_attention(
box_features, num_proposals, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training,
max_num_proposals=3,
use_self_attention=True)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
@parameterized.parameters(True, False)
def test_compute_box_context_attention_with_layers_and_heads(
self, is_training):
box_features = tf.ones([2 * 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
num_proposals = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib._compute_box_context_attention(
box_features, num_proposals, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training,
max_num_proposals=3,
num_attention_layers=3,
num_attention_heads=3)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
if __name__ == '__main__':
tf.test.main()
......@@ -51,7 +51,8 @@ class AttentionBlock(tf.keras.layers.Layer):
def __init__(self, bottleneck_dimension, attention_temperature,
output_dimension=None, is_training=False,
name='AttentionBlock', **kwargs):
name='AttentionBlock', max_num_proposals=100,
**kwargs):
"""Constructs an attention block.
Args:
......@@ -64,6 +65,7 @@ class AttentionBlock(tf.keras.layers.Layer):
output feature.
is_training: A boolean Tensor (affecting batch normalization).
name: A string describing what to name the variables in this block.
max_num_proposals: The number of box proposals for each image
**kwargs: Additional keyword arguments.
"""
......@@ -75,6 +77,7 @@ class AttentionBlock(tf.keras.layers.Layer):
self._bottleneck_dimension = bottleneck_dimension
self._is_training = is_training
self._output_dimension = output_dimension
self._max_num_proposals = max_num_proposals
if self._output_dimension:
self._feature_proj = ContextProjection(self._output_dimension)
super(AttentionBlock, self).__init__(name=name, **kwargs)
......@@ -89,15 +92,18 @@ class AttentionBlock(tf.keras.layers.Layer):
self._output_dimension = input_shapes[-1]
self._feature_proj = ContextProjection(self._output_dimension)
def call(self, box_features, context_features, valid_context_size):
def call(self, box_features, context_features, valid_context_size,
num_proposals):
"""Handles a call by performing attention.
Args:
box_features: A float Tensor of shape [batch_size, input_size, height,
box_features: A float Tensor of shape [batch_size * input_size, height,
width, num_input_features].
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
num_proposals: A [batch_size] int32 Tensor specifying the number of valid
proposals per image in the batch.
Returns:
A float Tensor with shape [batch_size, input_size, num_input_features]
......@@ -105,12 +111,26 @@ class AttentionBlock(tf.keras.layers.Layer):
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
keys_values_valid_mask = compute_valid_mask(
valid_context_size, context_size)
total_proposals, height, width, channels = box_features.shape
batch_size = total_proposals // self._max_num_proposals
box_features = tf.reshape(
box_features,
[batch_size,
self._max_num_proposals,
height,
width,
channels])
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
queries_valid_mask = compute_valid_mask(num_proposals,
box_features.shape[1])
queries = project_features(
box_features, self._bottleneck_dimension, self._is_training,
self._query_proj, normalize=True)
......@@ -121,8 +141,13 @@ class AttentionBlock(tf.keras.layers.Layer):
context_features, self._bottleneck_dimension, self._is_training,
self._val_proj, normalize=True)
# Mask out any keys or queries that are padding.
keys *= tf.cast(keys_values_valid_mask[..., tf.newaxis], keys.dtype)
queries *= tf.cast(queries_valid_mask[..., tf.newaxis], queries.dtype)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values, valid_mask)
weights, values = filter_weight_value(weights, values,
keys_values_valid_mask)
weights = tf.nn.softmax(weights / self._attention_temperature)
features = tf.matmul(weights, values)
......
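
The Keras AttentionBlock now receives the flattened crops directly and un-flattens them itself before attending. Below is a short sketch of that reshape / average-pool / query-mask sequence; tf.sequence_mask stands in for the library's compute_valid_mask, and all shapes and values are illustrative.

```python
# Sketch of the new reshape, spatial pooling, and query masking in
# AttentionBlock.call(). Shapes and values are illustrative.
import tensorflow as tf

batch_size, max_num_proposals = 2, 8
height, width, channels = 3, 3, 16

# Cropped proposal features arrive flattened over batch and proposal dims.
box_features = tf.ones([batch_size * max_num_proposals, height, width, channels])
box_features = tf.reshape(
    box_features, [batch_size, max_num_proposals, height, width, channels])
# Average-pool the spatial dims -> [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])

# Only the first num_proposals[i] boxes per image are real; the rest are padding
# and are excluded from the queries.
num_proposals = tf.constant([6, 3])
queries_valid_mask = tf.sequence_mask(num_proposals, maxlen=max_num_proposals)
```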
......@@ -97,19 +97,21 @@ class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase):
)
def test_attention_block(self, bottleneck_dimension, output_dimension,
attention_temperature):
input_features = tf.ones([2, 8, 3, 3, 3], tf.float32)
input_features = tf.ones([2 * 8, 3, 3, 3], tf.float32)
context_features = tf.ones([2, 20, 10], tf.float32)
num_proposals = tf.convert_to_tensor([6, 3])
attention_block = context_rcnn_lib.AttentionBlock(
bottleneck_dimension,
attention_temperature,
output_dimension=output_dimension,
is_training=False)
is_training=False,
max_num_proposals=8)
valid_context_size = tf.random_uniform((2,),
minval=0,
maxval=10,
dtype=tf.int32)
output_features = attention_block(input_features, context_features,
valid_context_size)
valid_context_size, num_proposals)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape,
......
......@@ -25,12 +25,19 @@ from __future__ import print_function
import functools
import tensorflow.compat.v1 as tf
from object_detection.core import box_predictor
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.meta_architectures import context_rcnn_lib_tf2
from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.protos import faster_rcnn_pb2
from object_detection.utils import ops
from object_detection.utils import tf_version
_UNINITIALIZED_FEATURE_EXTRACTOR = '__uninitialized__'
class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Context R-CNN Meta-architecture definition."""
......@@ -76,8 +83,17 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False,
output_final_box_rpn_features=False,
attention_bottleneck_dimension=None,
attention_temperature=None):
attention_temperature=None,
use_self_attention=False,
use_long_term_attention=True,
self_attention_in_sequence=False,
num_attention_heads=1,
num_attention_layers=1,
attention_position=(
faster_rcnn_pb2.AttentionPosition.POST_BOX_CLASSIFIER)
):
"""ContextRCNNMetaArch Constructor.
Args:
......@@ -210,11 +226,25 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feauture map based on the final box prediction and returns
in the dict as detection_features.
it crops the feature map based on the final box prediction and returns
it in the output dict as detection_features.
output_final_box_rpn_features: Whether to output rpn box features. If
true, it crops the rpn feature map based on the final box prediction and
returns it in the output dict as detection_features.
attention_bottleneck_dimension: A single integer. The bottleneck feature
dimension of the attention block.
attention_temperature: A single float. The attention temperature.
use_self_attention: Whether to use self-attention within the box features
in the current frame.
use_long_term_attention: Whether to use attention into the context
features.
self_attention_in_sequence: Whether self attention and long term attention
are in sequence or parallel.
num_attention_heads: The number of attention heads to use.
num_attention_layers: The number of attention layers to use.
attention_position: Whether attention should occur post RPN or post box
classifier. Options are specified in the faster_rcnn proto; the default is
post box classifier.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
......@@ -264,19 +294,40 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
output_final_box_features=output_final_box_features,
output_final_box_rpn_features=output_final_box_rpn_features)
self._attention_position = attention_position
if tf_version.is_tf1():
self._context_feature_extract_fn = functools.partial(
context_rcnn_lib.compute_box_context_attention,
context_rcnn_lib._compute_box_context_attention,
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
is_training=is_training,
max_num_proposals=self.max_num_proposals,
use_self_attention=use_self_attention,
use_long_term_attention=use_long_term_attention,
self_attention_in_sequence=self_attention_in_sequence,
num_attention_heads=num_attention_heads,
num_attention_layers=num_attention_layers)
else:
if use_self_attention:
raise NotImplementedError
if self_attention_in_sequence:
raise NotImplementedError
if not use_long_term_attention:
raise NotImplementedError
if num_attention_heads > 1:
raise NotImplementedError
if num_attention_layers > 1:
raise NotImplementedError
self._context_feature_extract_fn = context_rcnn_lib_tf2.AttentionBlock(
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
is_training=is_training,
max_num_proposals=self.max_num_proposals)
@staticmethod
def get_side_inputs(features):
......@@ -298,8 +349,8 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
if (fields.InputDataFields.context_features not in features or
fields.InputDataFields.valid_context_size not in features):
raise ValueError(
"Please make sure context_features and valid_context_size are in the "
"features")
'Please make sure context_features and valid_context_size are in the '
'features')
return {
fields.InputDataFields.context_features:
......@@ -308,9 +359,189 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
features[fields.InputDataFields.valid_context_size]
}
def _predict_second_stage(self, rpn_box_encodings,
rpn_objectness_predictions_with_background,
rpn_features_to_crop, anchors, image_shape,
true_image_shapes, **side_inputs):
"""Predicts the output tensors from second stage of Faster R-CNN.
Args:
rpn_box_encodings: 3-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes.
rpn_objectness_predictions_with_background: 2-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors. Note that this
tensor *includes* background class predictions (at class index 0).
rpn_features_to_crop: A list of 4-D float32 or bfloat16 tensors with shape
[batch_size, height_i, width_i, depth] representing image features to
crop using the proposal boxes predicted by the RPN.
anchors: 2-D float tensor of shape
[num_anchors, self._box_coder.code_size].
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
**side_inputs: additional tensors that are required by the network.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D float32 tensor with shape
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D float32 tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
Note that this tensor *includes* background class predictions
(at class index 0).
3) num_proposals: An int32 tensor of shape [batch_size] representing the
number of proposals generated by the RPN. `num_proposals` allows us
to keep track of which entries are to be treated as zero paddings and
which are not since we always pad the number of proposals to be
`self.max_num_proposals` for each image.
4) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes in absolute coordinates.
5) proposal_boxes_normalized: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing decoded proposal
bounding boxes in normalized coordinates. Can be used to override the
boxes proposed by the RPN, thus enabling one to extract features and
get box classification and prediction for externally selected areas
of the image.
6) box_classifier_features: a 4-D float32/bfloat16 tensor
representing the features for each proposal.
If self._return_raw_detections_during_predict is True, the dictionary
will also contain:
7) raw_detection_boxes: a 4-D float32 tensor with shape
[batch_size, self.max_num_proposals, num_classes, 4] in normalized
coordinates.
8) raw_detection_feature_map_indices: a 3-D int32 tensor with shape
[batch_size, self.max_num_proposals, num_classes].
"""
proposal_boxes_normalized, num_proposals = self._proposal_postprocess(
rpn_box_encodings, rpn_objectness_predictions_with_background, anchors,
image_shape, true_image_shapes)
prediction_dict = self._box_prediction(rpn_features_to_crop,
proposal_boxes_normalized,
image_shape, true_image_shapes,
num_proposals,
**side_inputs)
prediction_dict['num_proposals'] = num_proposals
return prediction_dict
def _box_prediction(self, rpn_features_to_crop, proposal_boxes_normalized,
image_shape, true_image_shapes, num_proposals,
**side_inputs):
"""Predicts the output tensors from second stage of Faster R-CNN.
Args:
rpn_features_to_crop: A list of 4-D float32 or bfloat16 tensors with shape
[batch_size, height_i, width_i, depth] representing image features to
crop using the proposal boxes predicted by the RPN.
proposal_boxes_normalized: A float tensor with shape [batch_size,
max_num_proposals, 4] representing the (potentially zero padded)
proposal boxes for all images in the batch. These boxes are represented
as normalized coordinates.
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
num_proposals: The number of valid box proposals.
**side_inputs: additional tensors that are required by the network.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D float32 tensor with shape
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D float32 tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
Note that this tensor *includes* background class predictions
(at class index 0).
3) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes in absolute coordinates.
4) proposal_boxes_normalized: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing decoded proposal
bounding boxes in normalized coordinates. Can be used to override the
boxes proposed by the RPN, thus enabling one to extract features and
get box classification and prediction for externally selected areas
of the image.
5) box_classifier_features: a 4-D float32/bfloat16 tensor
representing the features for each proposal.
If self._return_raw_detections_during_predict is True, the dictionary
will also contain:
6) raw_detection_boxes: a 4-D float32 tensor with shape
[batch_size, self.max_num_proposals, num_classes, 4] in normalized
coordinates.
7) raw_detection_feature_map_indices: a 3-D int32 tensor with shape
[batch_size, self.max_num_proposals, num_classes].
8) final_anchors: a 3-D float tensor of shape [batch_size,
self.max_num_proposals, 4] containing the reference anchors for raw
detection boxes in normalized coordinates.
"""
flattened_proposal_feature_maps = (
self._compute_second_stage_input_feature_maps(
rpn_features_to_crop, proposal_boxes_normalized,
image_shape, num_proposals, **side_inputs))
box_classifier_features = self._extract_box_classifier_features(
flattened_proposal_feature_maps, num_proposals, **side_inputs)
if self._mask_rcnn_box_predictor.is_keras_model:
box_predictions = self._mask_rcnn_box_predictor(
[box_classifier_features],
prediction_stage=2)
else:
box_predictions = self._mask_rcnn_box_predictor.predict(
[box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
prediction_stage=2)
refined_box_encodings = tf.squeeze(
box_predictions[box_predictor.BOX_ENCODINGS],
axis=1, name='all_refined_box_encodings')
class_predictions_with_background = tf.squeeze(
box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
axis=1, name='all_class_predictions_with_background')
absolute_proposal_boxes = ops.normalized_to_image_coordinates(
proposal_boxes_normalized, image_shape, self._parallel_iterations)
prediction_dict = {
'refined_box_encodings': tf.cast(refined_box_encodings,
dtype=tf.float32),
'class_predictions_with_background':
tf.cast(class_predictions_with_background, dtype=tf.float32),
'proposal_boxes': absolute_proposal_boxes,
'box_classifier_features': box_classifier_features,
'proposal_boxes_normalized': proposal_boxes_normalized,
'final_anchors': proposal_boxes_normalized
}
if self._return_raw_detections_during_predict:
prediction_dict.update(self._raw_detections_and_feature_map_inds(
refined_box_encodings, absolute_proposal_boxes, true_image_shapes))
return prediction_dict
def _compute_second_stage_input_feature_maps(self, features_to_crop,
proposal_boxes_normalized,
image_shape,
num_proposals,
context_features,
valid_context_size):
"""Crops to a set of proposals from the feature map for a batch of images.
......@@ -326,6 +557,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
num_proposals, box_code_size] containing proposal boxes in normalized
coordinates.
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
num_proposals: The number of valid box proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
......@@ -338,14 +570,55 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
features_to_crop, proposal_boxes_normalized, None,
[self._initial_crop_size, self._initial_crop_size])
attention_features = self._context_feature_extract_fn(
box_features=box_features,
context_features=context_features,
valid_context_size=valid_context_size)
flattened_box_features = self._flatten_first_two_dimensions(box_features)
flattened_box_features = self._maxpool_layer(flattened_box_features)
if self._attention_position == (
faster_rcnn_pb2.AttentionPosition.POST_RPN):
attention_features = self._context_feature_extract_fn(
box_features=flattened_box_features,
num_proposals=num_proposals,
context_features=context_features,
valid_context_size=valid_context_size)
# Adds box features with attention features.
flattened_box_features += self._flatten_first_two_dimensions(
attention_features)
return flattened_box_features
def _extract_box_classifier_features(
self, flattened_box_features, num_proposals, context_features,
valid_context_size,
attention_position=(
faster_rcnn_pb2.AttentionPosition.POST_BOX_CLASSIFIER)):
if self._feature_extractor_for_box_classifier_features == (
_UNINITIALIZED_FEATURE_EXTRACTOR):
self._feature_extractor_for_box_classifier_features = (
self._feature_extractor.get_box_classifier_feature_extractor_model(
name=self.second_stage_feature_extractor_scope))
if self._feature_extractor_for_box_classifier_features:
box_classifier_features = (
self._feature_extractor_for_box_classifier_features(
flattened_box_features))
else:
box_classifier_features = (
self._feature_extractor.extract_box_classifier_features(
flattened_box_features,
scope=self.second_stage_feature_extractor_scope))
# Adds box features with attention features.
box_features += attention_features
if self._attention_position == (
faster_rcnn_pb2.AttentionPosition.POST_BOX_CLASSIFIER):
attention_features = self._context_feature_extract_fn(
box_features=box_classifier_features,
num_proposals=num_proposals,
context_features=context_features,
valid_context_size=valid_context_size)
flattened_feature_maps = self._flatten_first_two_dimensions(box_features)
# Adds box features with attention features.
box_classifier_features += self._flatten_first_two_dimensions(
attention_features)
return self._maxpool_layer(flattened_feature_maps)
return box_classifier_features
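
A condensed, runnable sketch of the placement logic above: with POST_RPN the context attention output is added to the pooled RPN crops before the second-stage feature extractor, and with POST_BOX_CLASSIFIER it is added to the extractor's output. The attend and box_classifier callables below are identity-like stand-ins for _compute_box_context_attention and the box classifier feature extractor; the real methods also reshape, mask, and max-pool.

```python
# Simplified stand-in for the attention placement handled by
# _compute_second_stage_input_feature_maps / _extract_box_classifier_features.
import tensorflow.compat.v1 as tf

POST_RPN = 'POST_RPN'
POST_BOX_CLASSIFIER = 'POST_BOX_CLASSIFIER'

def attend(features):
  return 0.1 * features            # stand-in for the context attention block

def box_classifier(features):
  return features + 1.0            # stand-in for the second-stage extractor

def second_stage(rpn_crop_features, attention_position):
  box_features = rpn_crop_features
  if attention_position == POST_RPN:
    box_features += attend(box_features)            # attend before the extractor
  box_classifier_features = box_classifier(box_features)
  if attention_position == POST_BOX_CLASSIFIER:
    box_classifier_features += attend(box_classifier_features)  # attend after
  return box_classifier_features

crops = tf.ones([2 * 8, 7, 7, 4])
post_rpn_out = second_stage(crops, POST_RPN)
post_classifier_out = second_stage(crops, POST_BOX_CLASSIFIER)
```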
......@@ -293,7 +293,6 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
first_stage_nms_score_threshold = -1.0
first_stage_nms_iou_threshold = 1.0
first_stage_max_proposals = first_stage_max_proposals
first_stage_non_max_suppression_fn = functools.partial(
post_processing.batch_multiclass_non_max_suppression,
score_thresh=first_stage_nms_score_threshold,
......@@ -444,7 +443,7 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
def test_prediction_mock_tf1(self, mock_context_rcnn_lib_v1):
"""Mocks the context_rcnn_lib_v1 module to test the prediction.
Using mock object so that we can ensure compute_box_context_attention is
Using mock object so that we can ensure _compute_box_context_attention is
called inside the prediction function.
Args:
......@@ -457,7 +456,7 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
num_classes=42)
mock_tensor = tf.ones([2, 8, 3, 3, 3], tf.float32)
mock_context_rcnn_lib_v1.compute_box_context_attention.return_value = mock_tensor
mock_context_rcnn_lib_v1._compute_box_context_attention.return_value = mock_tensor
inputs_shape = (2, 20, 20, 3)
inputs = tf.cast(
tf.random_uniform(inputs_shape, minval=0, maxval=255, dtype=tf.int32),
......@@ -479,7 +478,7 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
side_inputs = model.get_side_inputs(features)
_ = model.predict(preprocessed_inputs, true_image_shapes, **side_inputs)
mock_context_rcnn_lib_v1.compute_box_context_attention.assert_called_once()
mock_context_rcnn_lib_v1._compute_box_context_attention.assert_called_once()
@parameterized.named_parameters(
{'testcase_name': 'static_shapes', 'static_shapes': True},
......
......@@ -304,7 +304,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
resize_masks=True,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False):
output_final_box_features=False,
output_final_box_rpn_features=False):
"""FasterRCNNMetaArch Constructor.
Args:
......@@ -437,8 +438,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feauture map based on the final box prediction and returns
in the dict as detection_features.
it crops the rpn feature map and passes it through box_classifier then
returns in the output dict as `detection_features`.
output_final_box_rpn_features: Whether to output rpn box features. If
true, it crops the rpn feature map and returns in the output dict as
`detection_features`.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
......@@ -604,6 +608,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._return_raw_detections_during_predict = (
return_raw_detections_during_predict)
self._output_final_box_features = output_final_box_features
self._output_final_box_rpn_features = output_final_box_rpn_features
@property
def first_stage_feature_extractor_scope(self):
......@@ -821,7 +826,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
prediction_dict['rpn_objectness_predictions_with_background'],
prediction_dict['rpn_features_to_crop'],
prediction_dict['anchors'], prediction_dict['image_shape'],
true_image_shapes, **side_inputs))
true_image_shapes,
**side_inputs))
if self._number_of_stages == 3:
prediction_dict = self._predict_third_stage(prediction_dict,
......@@ -1059,7 +1065,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
image_shape, **side_inputs))
box_classifier_features = self._extract_box_classifier_features(
flattened_proposal_feature_maps)
flattened_proposal_feature_maps, **side_inputs)
if self._mask_rcnn_box_predictor.is_keras_model:
box_predictions = self._mask_rcnn_box_predictor(
......@@ -1547,10 +1553,22 @@ class FasterRCNNMetaArch(model.DetectionModel):
'Please make sure rpn_features_to_crop is in the prediction_dict.'
)
detections_dict[
'detection_features'] = self._add_detection_features_output_node(
'detection_features'] = (
self._add_detection_box_boxclassifier_features_output_node(
detections_dict[
fields.DetectionResultFields.detection_boxes],
prediction_dict['rpn_features_to_crop'],
prediction_dict['image_shape']))
if self._output_final_box_rpn_features:
if 'rpn_features_to_crop' not in prediction_dict:
raise ValueError(
'Please make sure rpn_features_to_crop is in the prediction_dict.'
)
detections_dict['cropped_rpn_box_features'] = (
self._add_detection_box_rpn_features_output_node(
detections_dict[fields.DetectionResultFields.detection_boxes],
prediction_dict['rpn_features_to_crop'],
prediction_dict['image_shape'])
prediction_dict['image_shape']))
return detections_dict
......@@ -1566,8 +1584,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
prediction_dict.pop(k)
return prediction_dict
def _add_detection_features_output_node(self, detection_boxes,
rpn_features_to_crop, image_shape):
def _add_detection_box_boxclassifier_features_output_node(
self, detection_boxes, rpn_features_to_crop, image_shape):
"""Add detection features to outputs.
This function extracts box features for each box in rpn_features_to_crop.
......@@ -1606,6 +1624,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
reshaped_detection_features_pool = tf.identity(
reshaped_detection_features_pool, 'pooled_detection_features')
# TODO(sbeery) add node to extract rpn features here!!
reshaped_detection_features = tf.reshape(
detection_features_unpooled,
[batch_size, max_detections,
......@@ -1615,6 +1635,44 @@ class FasterRCNNMetaArch(model.DetectionModel):
return reshaped_detection_features
def _add_detection_box_rpn_features_output_node(self, detection_boxes,
rpn_features_to_crop,
image_shape):
"""Add detection features to outputs.
This function extracts box features for each box in rpn_features_to_crop.
It returns the extracted box features, reshaped to
[batch size, max_detections, height, width, depth]
Args:
detection_boxes: a 3-D float32 tensor of shape
[batch_size, max_detections, 4] which represents the bounding boxes.
rpn_features_to_crop: A list of 4-D float32 tensors with shape
[batch, height, width, depth] representing image features to crop using
the proposal boxes.
image_shape: a 1-D tensor of shape [4] representing the image shape.
Returns:
detection_features: a 5-D float32 tensor of shape
[batch_size, max_detections, height, width, depth] representing
cropped image features
"""
with tf.name_scope('FirstStageDetectionFeaturesExtract'):
flattened_detected_feature_maps = (
self._compute_second_stage_input_feature_maps(
rpn_features_to_crop, detection_boxes, image_shape))
batch_size = tf.shape(detection_boxes)[0]
max_detections = tf.shape(detection_boxes)[1]
reshaped_detection_features = tf.reshape(
flattened_detected_feature_maps,
[batch_size, max_detections,
tf.shape(flattened_detected_feature_maps)[1],
tf.shape(flattened_detected_feature_maps)[2],
tf.shape(flattened_detected_feature_maps)[3]])
return reshaped_detection_features
def _postprocess_rpn(self,
rpn_box_encodings_batch,
rpn_objectness_predictions_with_background_batch,
......
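
The new _add_detection_box_rpn_features_output_node recovers a per-image, per-detection tensor from the flattened crops using dynamic shapes. A small illustrative sketch of just that reshape (the shapes here are made up):

```python
# Sketch of the dynamic reshape used to turn flattened per-box crops back into
# [batch_size, max_detections, height, width, depth]. Shapes are illustrative.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch_size, max_detections = 2, 5
flattened_detected_feature_maps = tf.ones([batch_size * max_detections, 4, 4, 8])

reshaped_detection_features = tf.reshape(
    flattened_detected_feature_maps,
    [batch_size, max_detections,
     tf.shape(flattened_detected_feature_maps)[1],
     tf.shape(flattened_detected_feature_maps)[2],
     tf.shape(flattened_detected_feature_maps)[3]])
```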
......@@ -84,7 +84,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
resize_masks=False,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False):
output_final_box_features=False,
output_final_box_rpn_features=False):
"""RFCNMetaArch Constructor.
Args:
......@@ -194,8 +195,11 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feauture map based on the final box prediction and returns
in the dict as detection_features.
it crops the feature map based on the final box prediction and returns
it in the dict as detection_features.
output_final_box_rpn_features: Whether to output rpn box features. If
true, it crops the rpn feature map based on the final box prediction and
returns it in the dict as detection_features.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`
......@@ -245,7 +249,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
output_final_box_features=output_final_box_features,
output_final_box_rpn_features=output_final_box_rpn_features)
self._rfcn_box_predictor = second_stage_rfcn_box_predictor
......
......@@ -18,6 +18,7 @@ import "object_detection/protos/fpn.proto";
// (or RPN) and a second stage box classifier. We thus use the prefixes
// `first_stage_` and `second_stage_` to indicate the stage to which each
// parameter pertains when relevant.
message FasterRcnn {
// Whether to construct only the Region Proposal Network (RPN).
optional int32 number_of_stages = 1 [default = 2];
......@@ -176,17 +177,30 @@ message FasterRcnn {
// Whether to use tf.image.combined_non_max_suppression.
optional bool use_combined_nms_in_first_stage = 40 [default = false];
// Whether to output final box feature. If true, it will crop the feature map
// in the postprocess() method based on the final predictions.
// Whether to output final box feature. If true, it will crop the rpn feature
// map based on the final prediction boxes, then pass the crops through the
// box_classifier to compute the final features in the postprocess() method.
optional bool output_final_box_features = 42 [default = false];
// Whether to output final box rpn features. If true, it will crop the rpn
// feature map in the postprocess() method based on the final prediction
// boxes.
optional bool output_final_box_rpn_features = 43 [default = false];
// Configs for context model.
optional Context context_config = 41;
}
// Input type format: whether inputs are TfExamples or TfSequenceExamples.
enum AttentionPosition {
ATTENTION_DEFAULT = 0; // Default, currently post box classifier
POST_BOX_CLASSIFIER = 1; // Post box classifier
POST_RPN = 2; // Post RPN, pre box classifier
}
message Context {
// Configuration proto for Context .
// Next id: 4
// Configuration proto for Context R-CNN.
// Next id: 12
// The maximum number of contextual features per-image, used for padding
optional int32 max_num_context_features = 1 [default = 2000];
......@@ -199,6 +213,30 @@ message Context {
// The context feature length.
optional int32 context_feature_length = 4 [default = 2057];
// Whether to use self-attention from box proposals to themselves, TF1 only.
optional bool use_self_attention = 6 [default = false];
// Whether to use attention into the context features; setting this to false
// is only implemented in TF1.
optional bool use_long_term_attention = 7 [default = true];
// Whether the self-attention block and the long term attention block should
// be in sequence or parallel, i.e. whether the outputs of the self-attention
// block should be the inputs into the long term attention block (sequence)
// or whether the self attention block and long term attention block should
// happen in parallel, with outputs summed.
optional bool self_attention_in_sequence = 8 [default = false];
// Number of attention heads
optional int32 num_attention_heads = 9 [default = 1];
// Number of attention layers
optional int32 num_attention_layers = 11 [default = 1];
// Where the attention block is placed: post RPN (before the box classifier)
// or post box classifier.
optional AttentionPosition attention_position = 10 [
default = POST_BOX_CLASSIFIER];
}
message FasterRcnnFeatureExtractor {
......
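
Putting the new proto fields together, here is a hedged example of a Context configuration a pipeline might use, built with the standard protobuf text_format API. The field names follow the message above; the numeric values are illustrative, not recommended defaults.

```python
# Hypothetical Context config exercising the options added in this commit.
from google.protobuf import text_format
from object_detection.protos import faster_rcnn_pb2

context_config = text_format.Parse(
    """
    attention_bottleneck_dimension: 2048
    attention_temperature: 0.01
    use_self_attention: true
    self_attention_in_sequence: false
    num_attention_heads: 4
    num_attention_layers: 2
    attention_position: POST_RPN
    """,
    faster_rcnn_pb2.Context())

# The enum is also accessible directly, mirroring the comparisons in
# context_rcnn_meta_arch.py above.
assert context_config.attention_position == faster_rcnn_pb2.AttentionPosition.POST_RPN
```

Note that several of these options (self-attention, multiple heads or layers, disabling long-term attention) are rejected with NotImplementedError by the TF2 code path in this commit, so they are effectively TF1-only.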