Commit 0ba5a72b authored by TF Object Detection Team

Merge pull request #8895 from syiming:adjust_frcnn_meta_arch_to_multilevel_rpn_feature

PiperOrigin-RevId: 325370846
parents 80a6318b 18d95442
@@ -524,9 +524,31 @@ def _build_faster_rcnn_keras_feature_extractor(
         feature_type))
   feature_extractor_class = FASTER_RCNN_KERAS_FEATURE_EXTRACTOR_CLASS_MAP[
       feature_type]
+  kwargs = {}
+
+  if feature_extractor_config.HasField('conv_hyperparams'):
+    kwargs.update({
+        'conv_hyperparams':
+            hyperparams_builder.KerasLayerHyperparams(
+                feature_extractor_config.conv_hyperparams),
+        'override_base_feature_extractor_hyperparams':
+            feature_extractor_config.override_base_feature_extractor_hyperparams
+    })
+
+  if feature_extractor_config.HasField('fpn'):
+    kwargs.update({
+        'fpn_min_level':
+            feature_extractor_config.fpn.min_level,
+        'fpn_max_level':
+            feature_extractor_config.fpn.max_level,
+        'additional_layer_depth':
+            feature_extractor_config.fpn.additional_layer_depth,
+    })
+
   return feature_extractor_class(
       is_training, first_stage_features_stride,
-      batch_norm_trainable)
+      batch_norm_trainable, **kwargs)

 def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
...
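The builder change above forwards optional config fields to the feature extractor constructor as keyword arguments, so extractors that accept FPN arguments receive them while legacy extractors are untouched. A standalone sketch of that dispatch pattern (illustrative only; `FpnConfig` and `StubExtractor` stand in for the real protobuf config and registered extractor classes):

```python
class FpnConfig(object):

  def __init__(self, min_level=3, max_level=7, additional_layer_depth=256):
    self.min_level = min_level
    self.max_level = max_level
    self.additional_layer_depth = additional_layer_depth


class StubExtractor(object):

  def __init__(self, is_training, first_stage_features_stride,
               batch_norm_trainable, fpn_min_level=None, fpn_max_level=None,
               additional_layer_depth=None):
    self.fpn_min_level = fpn_min_level
    self.fpn_max_level = fpn_max_level
    self.additional_layer_depth = additional_layer_depth


def build_extractor(fpn_config=None):
  kwargs = {}
  if fpn_config is not None:  # stands in for HasField('fpn')
    kwargs.update({
        'fpn_min_level': fpn_config.min_level,
        'fpn_max_level': fpn_config.max_level,
        'additional_layer_depth': fpn_config.additional_layer_depth,
    })
  # Without an fpn config, **kwargs is empty and extractors that do not
  # accept FPN arguments keep working unchanged.
  return StubExtractor(True, 16, False, **kwargs)


print(build_extractor(FpnConfig()).fpn_min_level)  # 3
print(build_extractor().fpn_min_level)             # None
```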
@@ -310,6 +310,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
   def _compute_second_stage_input_feature_maps(self, features_to_crop,
                                                proposal_boxes_normalized,
+                                               image_shape,
                                                context_features,
                                                valid_context_size):
     """Crops to a set of proposals from the feature map for a batch of images.
@@ -324,6 +325,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
       proposal_boxes_normalized: A float32 Tensor with shape [batch_size,
         num_proposals, box_code_size] containing proposal boxes in normalized
         coordinates.
+      image_shape: A 1D int32 tensor of size [4] containing the image shape.
       context_features: A float Tensor of shape [batch_size, context_size,
         num_context_features].
       valid_context_size: A int32 Tensor of shape [batch_size].
@@ -331,9 +333,9 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
     Returns:
       A float32 Tensor with shape [K, new_height, new_width, depth].
     """
+    del image_shape
     box_features = self._crop_and_resize_fn(
-        [features_to_crop], proposal_boxes_normalized, None,
+        features_to_crop, proposal_boxes_normalized, None,
         [self._initial_crop_size, self._initial_crop_size])
     attention_features = self._context_feature_extract_fn(
...
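Context R-CNN can drop its explicit `[features_to_crop]` wrapping because `_crop_and_resize_fn` now takes a list of feature maps plus optional per-box level indices. A rough sketch of what such a multi-level crop does (ours, not the OD API's actual implementation, which is more efficient):

```python
import tensorflow.compat.v1 as tf


def multilevel_crop_and_resize(features, boxes, box_levels, crop_size):
  """Crops each box from the pyramid level it is assigned to (sketch).

  features: list of [batch, h_l, w_l, depth] tensors, one per level.
  boxes: [batch, num_boxes, 4] boxes in normalized coordinates.
  box_levels: [batch, num_boxes] int32 level per box, or None for level 0.
  crop_size: [crop_height, crop_width].
  """
  if box_levels is None:
    box_levels = tf.zeros(tf.shape(boxes)[:2], dtype=tf.int32)
  batch = tf.shape(boxes)[0]
  num_boxes = tf.shape(boxes)[1]
  flat_boxes = tf.reshape(boxes, [-1, 4])
  flat_levels = tf.reshape(box_levels, [-1])
  box_indices = tf.reshape(
      tf.tile(tf.range(batch)[:, tf.newaxis], [1, num_boxes]), [-1])
  # Naive strategy: crop every box from every level, then gather each box's
  # crop from its assigned level. O(levels * boxes), fine for a sketch.
  per_level = [
      tf.image.crop_and_resize(f, flat_boxes, box_indices, crop_size)
      for f in features
  ]
  stacked = tf.stack(per_level, axis=0)  # [levels, batch*num_boxes, h, w, d]
  gather_indices = tf.stack(
      [flat_levels, tf.range(batch * num_boxes)], axis=1)
  return tf.gather_nd(stacked, gather_indices)  # [batch*num_boxes, h, w, d]
```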
@@ -529,7 +529,8 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
     (rpn_box_predictor_features, rpn_box_encodings, refined_box_encodings,
      proposal_boxes_normalized, proposal_boxes) = execute_fn(graph_fn, [],
                                                              graph=g)
-    self.assertAllEqual(rpn_box_predictor_features.shape, [2, 20, 20, 512])
+    self.assertAllEqual(len(rpn_box_predictor_features), 1)
+    self.assertAllEqual(rpn_box_predictor_features[0].shape, [2, 20, 20, 512])
     self.assertAllEqual(rpn_box_encodings.shape, [2, 3600, 4])
     self.assertAllEqual(refined_box_encodings.shape, [16, 42, 4])
     self.assertAllEqual(proposal_boxes_normalized.shape, [2, 8, 4])
...
@@ -99,7 +99,6 @@ import functools
 import tensorflow.compat.v1 as tf
 import tf_slim as slim

-from object_detection.anchor_generators import grid_anchor_generator
 from object_detection.builders import box_predictor_builder
 from object_detection.builders import hyperparams_builder
 from object_detection.core import box_list
@@ -451,11 +450,6 @@ class FasterRCNNMetaArch(model.DetectionModel):
     # in the future.
     super(FasterRCNNMetaArch, self).__init__(num_classes=num_classes)

-    if not isinstance(first_stage_anchor_generator,
-                      grid_anchor_generator.GridAnchorGenerator):
-      raise ValueError('first_stage_anchor_generator must be of type '
-                       'grid_anchor_generator.GridAnchorGenerator.')
-
     self._is_training = is_training
     self._image_resizer_fn = image_resizer_fn
     self._resize_masks = resize_masks
@@ -492,9 +486,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
                   hyperparams_builder.KerasLayerHyperparams):
       num_anchors_per_location = (
           self._first_stage_anchor_generator.num_anchors_per_location())
-      if len(num_anchors_per_location) != 1:
-        raise ValueError('anchor_generator is expected to generate anchors '
-                         'corresponding to a single feature map.')
+
       conv_hyperparams = (
           first_stage_box_predictor_arg_scope_fn)
       self._first_stage_box_predictor_first_conv = (
@@ -533,11 +525,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
     else:
       self._first_stage_box_predictor_arg_scope_fn = (
           first_stage_box_predictor_arg_scope_fn)

-      def rpn_box_predictor_feature_extractor(rpn_features_to_crop):
+      def rpn_box_predictor_feature_extractor(single_rpn_features_to_crop):
         with slim.arg_scope(self._first_stage_box_predictor_arg_scope_fn()):
-          reuse = tf.get_variable_scope().reuse
           return slim.conv2d(
-              rpn_features_to_crop,
+              single_rpn_features_to_crop,
               self._first_stage_box_predictor_depth,
               kernel_size=[
                   self._first_stage_box_predictor_kernel_size,
@@ -546,7 +537,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
               rate=self._first_stage_atrous_rate,
               activation_fn=tf.nn.relu6,
               scope='Conv',
-              reuse=reuse)
+              reuse=tf.AUTO_REUSE)
       self._first_stage_box_predictor_first_conv = (
           rpn_box_predictor_feature_extractor)
     self._first_stage_box_predictor = (
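Replacing the captured `reuse` flag with `tf.AUTO_REUSE` is what allows the same slim conv to be invoked once per feature level while sharing its variables. A minimal, self-contained demonstration of the mechanism (not the meta-arch code itself):

```python
import tensorflow.compat.v1 as tf
import tf_slim as slim

tf.disable_v2_behavior()


def rpn_first_conv(features):
  # AUTO_REUSE creates 'Conv/weights' and 'Conv/biases' on the first call
  # and silently reuses them afterwards, so all pyramid levels share one
  # kernel instead of raising a variable-already-exists error.
  return slim.conv2d(features, 512, [3, 3], activation_fn=tf.nn.relu6,
                     scope='Conv', reuse=tf.AUTO_REUSE)


level_3 = tf.placeholder(tf.float32, [1, 64, 64, 256])
level_4 = tf.placeholder(tf.float32, [1, 32, 32, 256])
out_3 = rpn_first_conv(level_3)
out_4 = rpn_first_conv(level_4)
print([v.name for v in tf.global_variables()])
# ['Conv/weights:0', 'Conv/biases:0'] (one variable set despite two calls)
```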
@@ -762,10 +753,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
     Returns:
       prediction_dict: a dictionary holding "raw" prediction tensors:
-        1) rpn_box_predictor_features: A 4-D float32 tensor with shape
-          [batch_size, height, width, depth] to be used for predicting proposal
-          boxes and corresponding objectness scores.
-        2) rpn_features_to_crop: A 4-D float32 tensor with shape
+        1) rpn_box_predictor_features: A list of 4-D float32 tensors with shape
+          [batch_size, height_i, width_i, depth] to be used for predicting
+          proposal boxes and corresponding objectness scores.
+        2) rpn_features_to_crop: A list of 4-D float32 tensors with shape
           [batch_size, height, width, depth] representing image features to crop
           using the proposal boxes predicted by the RPN.
         3) image_shape: a 1-D tensor of shape [4] representing the input
@@ -850,12 +841,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
     Returns:
       prediction_dict: a dictionary holding "raw" prediction tensors:
-        1) rpn_box_predictor_features: A 4-D float32/bfloat16 tensor with shape
-          [batch_size, height, width, depth] to be used for predicting proposal
-          boxes and corresponding objectness scores.
-        2) rpn_features_to_crop: A 4-D float32/bfloat16 tensor with shape
-          [batch_size, height, width, depth] representing image features to crop
-          using the proposal boxes predicted by the RPN.
+        1) rpn_box_predictor_features: A list of 4-D float32/bfloat16 tensors
+          with shape [batch_size, height_i, width_i, depth] to be used for
+          predicting proposal boxes and corresponding objectness scores.
+        2) rpn_features_to_crop: A list of 4-D float32/bfloat16 tensors with
+          shape [batch_size, height, width, depth] representing image features
+          to crop using the proposal boxes predicted by the RPN.
         3) image_shape: a 1-D tensor of shape [4] representing the input
           image shape.
         4) rpn_box_encodings: 3-D float32 tensor of shape
...@@ -911,7 +902,7 @@ class FasterRCNNMetaArch(model.DetectionModel): ...@@ -911,7 +902,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
dtype=tf.float32), dtype=tf.float32),
'anchors': 'anchors':
anchors_boxlist.data['boxes'], anchors_boxlist.data['boxes'],
fields.PredictionFields.feature_maps: [rpn_features_to_crop] fields.PredictionFields.feature_maps: rpn_features_to_crop
} }
return prediction_dict return prediction_dict
@@ -947,9 +938,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
         [batch_size, num_valid_anchors, 2] containing class
         predictions (logits) for each of the anchors.  Note that this
         tensor *includes* background class predictions (at class index 0).
-      rpn_features_to_crop: A 4-D float32 or bfloat16 tensor with shape
-        [batch_size, height, width, depth] representing image features to crop
-        using the proposal boxes predicted by the RPN.
+      rpn_features_to_crop: A list of 4-D float32 or bfloat16 tensors with
+        shape [batch_size, height_i, width_i, depth] representing image
+        features to crop using the proposal boxes predicted by the RPN.
       anchors: 2-D float tensor of shape
         [num_anchors, self._box_coder.code_size].
       image_shape: A 1D int32 tensors of size [4] containing the image shape.
@@ -1012,9 +1003,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
     """Predicts the output tensors from second stage of Faster R-CNN.

     Args:
-      rpn_features_to_crop: A 4-D float32 or bfloat16 tensor with shape
-        [batch_size, height, width, depth] representing image features to crop
-        using the proposal boxes predicted by the RPN.
+      rpn_features_to_crop: A list of 4-D float32 or bfloat16 tensors with
+        shape [batch_size, height_i, width_i, depth] representing image
+        features to crop using the proposal boxes predicted by the RPN.
       proposal_boxes_normalized: A float tensor with shape [batch_size,
         max_num_proposals, 4] representing the (potentially zero padded)
         proposal boxes for all images in the batch.  These boxes are represented
@@ -1064,7 +1055,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
     """
     flattened_proposal_feature_maps = (
         self._compute_second_stage_input_feature_maps(
-            rpn_features_to_crop, proposal_boxes_normalized, **side_inputs))
+            rpn_features_to_crop, proposal_boxes_normalized,
+            image_shape, **side_inputs))

     box_classifier_features = self._extract_box_classifier_features(
         flattened_proposal_feature_maps)
@@ -1196,6 +1188,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
           decoded proposal bounding boxes in absolute coordinates.
         5) box_classifier_features: a 4-D float32 tensor representing the
           features for each proposal.
+        6) image_shape: a 1-D tensor of shape [4] representing the input
+          image shape.
       image_shapes: A 2-D int32 tensors of shape [batch_size, 3] containing
         shapes of images in the batch.
@@ -1234,11 +1228,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
       detection_classes = detections_dict[
           fields.DetectionResultFields.detection_classes]
       rpn_features_to_crop = prediction_dict['rpn_features_to_crop']
+      image_shape = prediction_dict['image_shape']
       batch_size = tf.shape(detection_boxes)[0]
       max_detection = tf.shape(detection_boxes)[1]
       flattened_detected_feature_maps = (
           self._compute_second_stage_input_feature_maps(
-              rpn_features_to_crop, detection_boxes))
+              rpn_features_to_crop, detection_boxes, image_shape))
       curr_box_classifier_features = self._extract_box_classifier_features(
           flattened_detected_feature_maps)
@@ -1302,13 +1297,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
       preprocessed_inputs: a [batch, height, width, channels] image tensor.

     Returns:
-      rpn_box_predictor_features: A 4-D float32 tensor with shape
-        [batch, height, width, depth] to be used for predicting proposal boxes
-        and corresponding objectness scores.
-      rpn_features_to_crop: A 4-D float32 tensor with shape
+      rpn_box_predictor_features: A list of 4-D float32 tensors with shape
+        [batch, height_i, width_i, depth] to be used for predicting proposal
+        boxes and corresponding objectness scores.
+      rpn_features_to_crop: A list of 4-D float32 tensors with shape
         [batch, height, width, depth] representing image features to crop using
         the proposals boxes.
-      anchors: A BoxList representing anchors (for the RPN) in
+      anchors: A BoxList representing anchors from all levels (for the RPN) in
         absolute coordinates.
       image_shape: A 1-D tensor representing the input image shape.
     """
@@ -1317,12 +1312,21 @@ class FasterRCNNMetaArch(model.DetectionModel):
     rpn_features_to_crop, self.endpoints = self._extract_proposal_features(
         preprocessed_inputs)

-    feature_map_shape = tf.shape(rpn_features_to_crop)
-    anchors = box_list_ops.concatenate(
-        self._first_stage_anchor_generator.generate([(feature_map_shape[1],
-                                                      feature_map_shape[2])]))
-    rpn_box_predictor_features = (
-        self._first_stage_box_predictor_first_conv(rpn_features_to_crop))
+    # If rpn_features_to_crop is not a list, make it one so that single-level
+    # and multi-level feature extractors are handled uniformly.
+    if not isinstance(rpn_features_to_crop, list):
+      rpn_features_to_crop = [rpn_features_to_crop]
+
+    feature_map_shapes = []
+    rpn_box_predictor_features = []
+    for single_rpn_features_to_crop in rpn_features_to_crop:
+      single_shape = tf.shape(single_rpn_features_to_crop)
+      feature_map_shapes.append((single_shape[1], single_shape[2]))
+      single_rpn_box_predictor_features = (
+          self._first_stage_box_predictor_first_conv(
+              single_rpn_features_to_crop))
+      rpn_box_predictor_features.append(single_rpn_box_predictor_features)
+    anchors = box_list_ops.concatenate(
+        self._first_stage_anchor_generator.generate(feature_map_shapes))
+
     return (rpn_box_predictor_features, rpn_features_to_crop,
             anchors, image_shape)
@@ -1349,9 +1353,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
     Note resulting tensors will not have been postprocessed.

     Args:
-      rpn_box_predictor_features: A 4-D float32 tensor with shape
-        [batch, height, width, depth] to be used for predicting proposal boxes
-        and corresponding objectness scores.
+      rpn_box_predictor_features: A list of 4-D float32 tensors with shape
+        [batch, height_i, width_i, depth] to be used for predicting proposal
+        boxes and corresponding objectness scores.

     Returns:
       box_encodings: 3-D float tensor of shape
@@ -1369,15 +1373,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
     """
     num_anchors_per_location = (
         self._first_stage_anchor_generator.num_anchors_per_location())
-    if len(num_anchors_per_location) != 1:
-      raise RuntimeError('anchor_generator is expected to generate anchors '
-                         'corresponding to a single feature map.')
+
     if self._first_stage_box_predictor.is_keras_model:
       box_predictions = self._first_stage_box_predictor(
-          [rpn_box_predictor_features])
+          rpn_box_predictor_features)
     else:
       box_predictions = self._first_stage_box_predictor.predict(
-          [rpn_box_predictor_features],
+          rpn_box_predictor_features,
           num_anchors_per_location,
           scope=self.first_stage_box_predictor_scope)
...@@ -1547,7 +1549,8 @@ class FasterRCNNMetaArch(model.DetectionModel): ...@@ -1547,7 +1549,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
detections_dict[ detections_dict[
'detection_features'] = self._add_detection_features_output_node( 'detection_features'] = self._add_detection_features_output_node(
detections_dict[fields.DetectionResultFields.detection_boxes], detections_dict[fields.DetectionResultFields.detection_boxes],
prediction_dict['rpn_features_to_crop']) prediction_dict['rpn_features_to_crop'],
prediction_dict['image_shape'])
return detections_dict return detections_dict
...@@ -1564,7 +1567,7 @@ class FasterRCNNMetaArch(model.DetectionModel): ...@@ -1564,7 +1567,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
return prediction_dict return prediction_dict
def _add_detection_features_output_node(self, detection_boxes, def _add_detection_features_output_node(self, detection_boxes,
rpn_features_to_crop): rpn_features_to_crop, image_shape):
"""Add detection features to outputs. """Add detection features to outputs.
This function extracts box features for each box in rpn_features_to_crop. This function extracts box features for each box in rpn_features_to_crop.
...@@ -1576,9 +1579,10 @@ class FasterRCNNMetaArch(model.DetectionModel): ...@@ -1576,9 +1579,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args: Args:
detection_boxes: a 3-D float32 tensor of shape detection_boxes: a 3-D float32 tensor of shape
[batch_size, max_detections, 4] which represents the bounding boxes. [batch_size, max_detections, 4] which represents the bounding boxes.
rpn_features_to_crop: A 4-D float32 tensor with shape rpn_features_to_crop: A list of 4-D float32 tensor with shape
[batch, height, width, depth] representing image features to crop using [batch, height, width, depth] representing image features to crop using
the proposals boxes. the proposals boxes.
image_shape: a 1-D tensor of shape [4] representing the image shape.
Returns: Returns:
detection_features: a 4-D float32 tensor of shape detection_features: a 4-D float32 tensor of shape
...@@ -1588,7 +1592,7 @@ class FasterRCNNMetaArch(model.DetectionModel): ...@@ -1588,7 +1592,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
with tf.name_scope('SecondStageDetectionFeaturesExtract'): with tf.name_scope('SecondStageDetectionFeaturesExtract'):
flattened_detected_feature_maps = ( flattened_detected_feature_maps = (
self._compute_second_stage_input_feature_maps( self._compute_second_stage_input_feature_maps(
rpn_features_to_crop, detection_boxes)) rpn_features_to_crop, detection_boxes, image_shape))
detection_features_unpooled = self._extract_box_classifier_features( detection_features_unpooled = self._extract_box_classifier_features(
flattened_detected_feature_maps) flattened_detected_feature_maps)
@@ -1930,6 +1934,7 @@ class FasterRCNNMetaArch(model.DetectionModel):

   def _compute_second_stage_input_feature_maps(self, features_to_crop,
                                                proposal_boxes_normalized,
+                                               image_shape,
                                                **side_inputs):
     """Crops to a set of proposals from the feature map for a batch of images.
@@ -1943,18 +1948,24 @@ class FasterRCNNMetaArch(model.DetectionModel):
       proposal_boxes_normalized: A float32 tensor with shape [batch_size,
         num_proposals, box_code_size] containing proposal boxes in
         normalized coordinates.
+      image_shape: A 1D int32 tensor of size [4] containing the image shape.
       **side_inputs: additional tensors that are required by the network.

     Returns:
       A float32 tensor with shape [K, new_height, new_width, depth].
     """
-    features_to_crop = [features_to_crop]
     num_levels = len(features_to_crop)
     box_levels = None
     if num_levels != 1:
       # If there are multiple levels to select, get the box levels
-      box_levels = ops.fpn_feature_levels(num_levels, num_levels - 1,
-                                          1.0/224, proposal_boxes_normalized)
+      # unit_scale_index: num_levels - 2 is chosen based on section 4.2 of
+      # https://arxiv.org/pdf/1612.03144.pdf and works best for ResNet-based
+      # feature extractors.
+      box_levels = ops.fpn_feature_levels(
+          num_levels, num_levels - 2,
+          tf.sqrt(tf.cast(image_shape[1] * image_shape[2], tf.float32)) / 224.0,
+          proposal_boxes_normalized)
     cropped_regions = self._flatten_first_two_dimensions(
         self._crop_and_resize_fn(
             features_to_crop, proposal_boxes_normalized, box_levels,
...
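The new `fpn_feature_levels` call derives the unit scale from the actual image size: a proposal whose absolute size is 224x224 pixels is assigned to `unit_scale_index = num_levels - 2`, and each doubling or halving of box size moves it up or down one level, per eq. (1) in section 4.2 of the FPN paper (https://arxiv.org/abs/1612.03144). A standalone numpy sketch of that assignment (the real `ops.fpn_feature_levels` may differ in details such as clipping and epsilons):

```python
import numpy as np


def fpn_feature_levels(num_levels, unit_scale_index, image_ratio, boxes):
  # boxes: [batch, num_boxes, 4] in normalized [ymin, xmin, ymax, xmax].
  # image_ratio is sqrt(image_height * image_width) / 224, so
  # image_ratio * sqrt(normalized box area) is the box side length in
  # units of 224 pixels.
  side = np.sqrt((boxes[..., 2] - boxes[..., 0]) *
                 (boxes[..., 3] - boxes[..., 1]))
  levels = np.floor(unit_scale_index + np.log2(image_ratio * side + 1e-8))
  return np.clip(levels, 0, num_levels - 1).astype(np.int32)


# Example: 640x640 input, 4 levels; a box covering 1/4 of each image side
# (160x160 pixels) lands one level below the unit scale.
boxes = np.array([[[0.0, 0.0, 0.25, 0.25]]])
print(fpn_feature_levels(4, 2, np.sqrt(640.0 * 640.0) / 224.0, boxes))
# [[1]]
```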
@@ -484,7 +484,7 @@ class FasterRCNNMetaArchTest(
           'mask_predictions':
               mask_predictions,
           'rpn_features_to_crop':
-              rpn_features_to_crop
+              [rpn_features_to_crop]
       }, true_image_shapes)
     self.assertIn('detection_features', detections)
     return (detections['detection_boxes'], detections['detection_scores'],
...
@@ -23,6 +23,7 @@ import tensorflow.compat.v1 as tf
 from google.protobuf import text_format

 from object_detection.anchor_generators import grid_anchor_generator
+from object_detection.anchor_generators import multiscale_grid_anchor_generator
 from object_detection.builders import box_predictor_builder
 from object_detection.builders import hyperparams_builder
 from object_detection.builders import post_processing_builder
@@ -76,6 +77,36 @@ class FakeFasterRCNNFeatureExtractor(
           proposal_feature_maps, num_outputs=3, kernel_size=1, scope='layer2')

+class FakeFasterRCNNMultiLevelFeatureExtractor(
+    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
+  """Fake multi-level feature extractor to use in tests."""
+
+  def __init__(self):
+    super(FakeFasterRCNNMultiLevelFeatureExtractor, self).__init__(
+        is_training=False,
+        first_stage_features_stride=32,
+        reuse_weights=None,
+        weight_decay=0.0)
+
+  def preprocess(self, resized_inputs):
+    return tf.identity(resized_inputs)
+
+  def _extract_proposal_features(self, preprocessed_inputs, scope):
+    # Two VALID 3x3 convs: each successive level shrinks by 2 pixels per
+    # spatial dimension.
+    with tf.variable_scope('mock_model'):
+      proposal_features_1 = 0 * slim.conv2d(
+          preprocessed_inputs, num_outputs=3, kernel_size=3, scope='layer1',
+          padding='VALID')
+      proposal_features_2 = 0 * slim.conv2d(
+          proposal_features_1, num_outputs=3, kernel_size=3, scope='layer2',
+          padding='VALID')
+      return [proposal_features_1, proposal_features_2], {}
+
+  def _extract_box_classifier_features(self, proposal_feature_maps, scope):
+    with tf.variable_scope('mock_model'):
+      return 0 * slim.conv2d(
+          proposal_feature_maps, num_outputs=3, kernel_size=1, scope='layer3')

 class FakeFasterRCNNKerasFeatureExtractor(
     faster_rcnn_meta_arch.FasterRCNNKerasFeatureExtractor):
   """Fake feature extractor to use in tests."""
@@ -112,6 +143,42 @@ class FakeFasterRCNNKerasFeatureExtractor(
             3, kernel_size=1, padding='SAME', name=name + '_layer2')])

+class FakeFasterRCNNKerasMultilevelFeatureExtractor(
+    faster_rcnn_meta_arch.FasterRCNNKerasFeatureExtractor):
+  """Fake multi-level Keras feature extractor to use in tests."""
+
+  def __init__(self):
+    super(FakeFasterRCNNKerasMultilevelFeatureExtractor, self).__init__(
+        is_training=False,
+        first_stage_features_stride=32,
+        weight_decay=0.0)
+
+  def preprocess(self, resized_inputs):
+    return tf.identity(resized_inputs)
+
+  def get_proposal_feature_extractor_model(self, name):
+
+    class ProposalFeatureExtractor(tf.keras.Model):
+      """Dummy proposal feature extraction, returning two levels."""
+
+      def __init__(self, name):
+        super(ProposalFeatureExtractor, self).__init__(name=name)
+        self.conv = None
+
+      def build(self, input_shape):
+        self.conv = tf.keras.layers.Conv2D(
+            3, kernel_size=3, name='layer1')
+        self.conv_1 = tf.keras.layers.Conv2D(
+            3, kernel_size=3, name='layer2')
+
+      def call(self, inputs):
+        output_1 = self.conv(inputs)
+        output_2 = self.conv_1(output_1)
+        return [output_1, output_2]
+
+    return ProposalFeatureExtractor(name=name)

 class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
   """Base class to test Faster R-CNN and R-FCN meta architectures."""
@@ -234,7 +301,8 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
                    calibration_mapping_value=None,
                    share_box_across_classes=False,
                    return_raw_detections_during_predict=False,
-                   output_final_box_features=False):
+                   output_final_box_features=False,
+                   multi_level=False):
     use_keras = tf_version.is_tf2()
     def image_resizer_fn(image, masks=None):
       """Fake image resizer function."""
@@ -260,6 +328,19 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
     # anchors in this test are designed so that a subset of anchors are inside
     # the image and a subset of anchors are outside.
-    first_stage_anchor_scales = (0.001, 0.005, 0.1)
-    first_stage_anchor_aspect_ratios = (0.5, 1.0, 2.0)
-    first_stage_anchor_strides = (1, 1)
+    first_stage_anchor_generator = None
+    if multi_level:
+      min_level = 0
+      max_level = 1
+      anchor_scale = 0.1
+      aspect_ratios = [1.0, 2.0, 0.5]
+      scales_per_octave = 2
+      normalize_coordinates = False
+      (first_stage_anchor_generator
+      ) = multiscale_grid_anchor_generator.MultiscaleGridAnchorGenerator(
+          min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave,
+          normalize_coordinates)
+    else:
+      first_stage_anchor_scales = (0.001, 0.005, 0.1)
+      first_stage_anchor_aspect_ratios = (0.5, 1.0, 2.0)
+      first_stage_anchor_strides = (1, 1)
@@ -273,7 +354,13 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         use_matmul_gather=use_matmul_gather_in_matcher)

     if use_keras:
-      fake_feature_extractor = FakeFasterRCNNKerasFeatureExtractor()
+      if multi_level:
+        fake_feature_extractor = FakeFasterRCNNKerasMultilevelFeatureExtractor()
+      else:
+        fake_feature_extractor = FakeFasterRCNNKerasFeatureExtractor()
     else:
-      fake_feature_extractor = FakeFasterRCNNFeatureExtractor()
+      if multi_level:
+        fake_feature_extractor = FakeFasterRCNNMultiLevelFeatureExtractor()
+      else:
+        fake_feature_extractor = FakeFasterRCNNFeatureExtractor()
...@@ -479,8 +566,8 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase): ...@@ -479,8 +566,8 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
preprocessed_inputs, true_image_shapes = model.preprocess(images) preprocessed_inputs, true_image_shapes = model.preprocess(images)
prediction_dict = model.predict(preprocessed_inputs, true_image_shapes) prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
return (prediction_dict['rpn_box_predictor_features'], return (prediction_dict['rpn_box_predictor_features'][0],
prediction_dict['rpn_features_to_crop'], prediction_dict['rpn_features_to_crop'][0],
prediction_dict['image_shape'], prediction_dict['image_shape'],
prediction_dict['rpn_box_encodings'], prediction_dict['rpn_box_encodings'],
prediction_dict['rpn_objectness_predictions_with_background'], prediction_dict['rpn_objectness_predictions_with_background'],
@@ -529,6 +616,92 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
       self.assertTrue(np.all(np.less_equal(anchors[:, 2], height)))
       self.assertTrue(np.all(np.less_equal(anchors[:, 3], width)))

+  @parameterized.parameters(
+      {'use_static_shapes': False},
+      {'use_static_shapes': True},
+  )
+  def test_predict_shape_in_inference_mode_first_stage_only_multi_level(
+      self, use_static_shapes):
+    batch_size = 2
+    height = 50
+    width = 52
+    input_image_shape = (batch_size, height, width, 3)
+
+    with test_utils.GraphContextOrNone() as g:
+      model = self._build_model(
+          is_training=False,
+          number_of_stages=1,
+          second_stage_batch_size=2,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes,
+          multi_level=True)
+
+    def graph_fn(images):
+      """Function to construct tf graph for the test."""
+      preprocessed_inputs, true_image_shapes = model.preprocess(images)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      return (prediction_dict['rpn_box_predictor_features'][0],
+              prediction_dict['rpn_box_predictor_features'][1],
+              prediction_dict['rpn_features_to_crop'][0],
+              prediction_dict['rpn_features_to_crop'][1],
+              prediction_dict['image_shape'],
+              prediction_dict['rpn_box_encodings'],
+              prediction_dict['rpn_objectness_predictions_with_background'],
+              prediction_dict['anchors'])
+
+    images = np.zeros(input_image_shape, dtype=np.float32)
+
+    # In inference mode, anchors are clipped to the image window, but not
+    # pruned.  The fake multi-level extractor applies two VALID 3x3 convs,
+    # so the two levels have shapes (height-2, width-2) and (height-4,
+    # width-4), with 3 aspect ratios x 2 scales per octave = 6 anchors per
+    # location.
+    expected_num_anchors = ((height-2) * (width-2) + (height-4) * (width-4)) * 6
+    expected_output_shapes = {
+        'rpn_box_predictor_features_0': (batch_size, height-2, width-2, 512),
+        'rpn_box_predictor_features_1': (batch_size, height-4, width-4, 512),
+        'rpn_features_to_crop_0': (batch_size, height-2, width-2, 3),
+        'rpn_features_to_crop_1': (batch_size, height-4, width-4, 3),
+        'rpn_box_encodings': (batch_size, expected_num_anchors, 4),
+        'rpn_objectness_predictions_with_background':
+            (batch_size, expected_num_anchors, 2),
+    }
+
+    if use_static_shapes:
+      expected_output_shapes['anchors'] = (expected_num_anchors, 4)
+    else:
+      expected_output_shapes['anchors'] = (18300, 4)
+
+    if use_static_shapes:
+      results = self.execute(graph_fn, [images], graph=g)
+    else:
+      results = self.execute_cpu(graph_fn, [images], graph=g)
+
+    self.assertAllEqual(results[0].shape,
+                        expected_output_shapes['rpn_box_predictor_features_0'])
+    self.assertAllEqual(results[1].shape,
+                        expected_output_shapes['rpn_box_predictor_features_1'])
+    self.assertAllEqual(results[2].shape,
+                        expected_output_shapes['rpn_features_to_crop_0'])
+    self.assertAllEqual(results[3].shape,
+                        expected_output_shapes['rpn_features_to_crop_1'])
+    self.assertAllEqual(results[4], input_image_shape)
+    self.assertAllEqual(results[5].shape,
+                        expected_output_shapes['rpn_box_encodings'])
+    self.assertAllEqual(
+        results[6].shape,
+        expected_output_shapes['rpn_objectness_predictions_with_background'])
+    self.assertAllEqual(results[7].shape,
+                        expected_output_shapes['anchors'])
+
+    # Check that anchors are clipped to window.
+    anchors = results[7]
+    self.assertTrue(np.all(np.greater_equal(anchors, 0)))
+    self.assertTrue(np.all(np.less_equal(anchors[:, 0], height)))
+    self.assertTrue(np.all(np.less_equal(anchors[:, 1], width)))
+    self.assertTrue(np.all(np.less_equal(anchors[:, 2], height)))
+    self.assertTrue(np.all(np.less_equal(anchors[:, 3], width)))

   def test_regularization_losses(self):
     with test_utils.GraphContextOrNone() as g:
       model = self._build_model(
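As a quick sanity check of the anchor count in the test above (ours, not part of the commit): the fake extractor's two VALID 3x3 convs shrink a 50x52 input to 48x50 and then 46x48, and the multiscale generator places len(aspect_ratios) * scales_per_octave = 3 * 2 = 6 anchors per location:

```python
height, width = 50, 52
levels = [(height - 2, width - 2), (height - 4, width - 4)]  # VALID 3x3 convs
anchors_per_location = 3 * 2  # 3 aspect ratios x 2 scales per octave
print(sum(h * w for h, w in levels) * anchors_per_location)  # 27648
```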
@@ -601,9 +774,9 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
     def compare_results(results, expected_output_shapes):
       """Checks that the shapes of the predictions are as expected."""
-      self.assertAllEqual(results[0].shape,
+      self.assertAllEqual(results[0][0].shape,
                           expected_output_shapes['rpn_box_predictor_features'])
-      self.assertAllEqual(results[1].shape,
+      self.assertAllEqual(results[1][0].shape,
                           expected_output_shapes['rpn_features_to_crop'])
       self.assertAllEqual(results[2].shape,
                           expected_output_shapes['image_shape'])
@@ -746,8 +919,8 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
           result_tensor_dict['anchors'],
           result_tensor_dict['rpn_box_encodings'],
           result_tensor_dict['rpn_objectness_predictions_with_background'],
-          result_tensor_dict['rpn_features_to_crop'],
-          result_tensor_dict['rpn_box_predictor_features'],
+          result_tensor_dict['rpn_features_to_crop'][0],
+          result_tensor_dict['rpn_box_predictor_features'][0],
           result_tensor_dict['final_anchors'],
       )
...
@@ -265,7 +265,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
         [batch_size, num_valid_anchors, 2] containing class
         predictions (logits) for each of the anchors.  Note that this
         tensor *includes* background class predictions (at class index 0).
-      rpn_features: A 4-D float32 tensor with shape
+      rpn_features: A list containing a single 4-D float32 tensor with shape
         [batch_size, height, width, depth] representing image features from the
         RPN.
       anchors: 2-D float tensor of shape
@@ -313,6 +313,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
                                  rpn_objectness_predictions_with_background,
                                  anchors, image_shape_2d, true_image_shapes)

+    rpn_features = rpn_features[0]
     box_classifier_features = (
         self._extract_box_classifier_features(rpn_features))
...