Commit afd5579f authored by Kaushik Shivakumar

Merge remote-tracking branch 'upstream/master' into context_tf2

parents dcd96e02 567bd18d
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""MobileNet V2[1] feature extractor for CenterNet[2] meta architecture.
[1]: https://arxiv.org/abs/1801.04381
[2]: https://arxiv.org/abs/1904.07850
"""
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.models.keras_models import mobilenet_v2 as mobilenetv2
class CenterNetMobileNetV2FeatureExtractor(
center_net_meta_arch.CenterNetFeatureExtractor):
"""The MobileNet V2 feature extractor for CenterNet."""
def __init__(self,
mobilenet_v2_net,
channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.),
bgr_ordering=False):
"""Intializes the feature extractor.
Args:
mobilenet_v2_net: The underlying mobilenet_v2 network to use.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetMobileNetV2FeatureExtractor, self).__init__(
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
self._network = mobilenet_v2_net
output = self._network(self._network.input)
# TODO(nkhadke): Try out MobileNet+FPN next (skip connections are cheap and
# should help with performance).
# MobileNet by itself transforms a 224x224x3 volume into a 7x7x1280, which
# leads to a stride of 32. We perform upsampling to get it to a target
# stride of 4.
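# Each of the three stride-2 transpose convolutions below doubles the
# spatial resolution: 7x7 -> 14x14 -> 28x28 -> 56x56 for a 224x224 input,
# which gives the target output stride of 224 / 56 = 4.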
for num_filters in [256, 128, 64]:
# 1. We use a simple convolution instead of a deformable convolution
conv = tf.keras.layers.Conv2D(
filters=num_filters, kernel_size=1, strides=1, padding='same')
output = conv(output)
output = tf.keras.layers.BatchNormalization()(output)
output = tf.keras.layers.ReLU()(output)
# 2. We use the default initialization for the convolution layers
# instead of initializing it to do bilinear upsampling.
conv_transpose = tf.keras.layers.Conv2DTranspose(
filters=num_filters, kernel_size=3, strides=2, padding='same')
output = conv_transpose(output)
output = tf.keras.layers.BatchNormalization()(output)
output = tf.keras.layers.ReLU()(output)
self._network = tf.keras.models.Model(
inputs=self._network.input, outputs=output)
def preprocess(self, resized_inputs):
resized_inputs = super(CenterNetMobileNetV2FeatureExtractor,
self).preprocess(resized_inputs)
return tf.keras.applications.mobilenet_v2.preprocess_input(resized_inputs)
def load_feature_extractor_weights(self, path):
self._network.load_weights(path)
def get_base_model(self):
return self._network
def call(self, inputs):
return [self._network(inputs)]
@property
def out_stride(self):
"""The stride in the output image of the network."""
return 4
@property
def num_feature_outputs(self):
"""The number of feature outputs returned by the feature extractor."""
return 1
def get_model(self):
return self._network
def mobilenet_v2(channel_means, channel_stds, bgr_ordering):
"""The MobileNetV2 backbone for CenterNet."""
# We set 'is_training' to True for now.
network = mobilenetv2.mobilenet_v2(True, include_top=False)
return CenterNetMobileNetV2FeatureExtractor(
network,
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
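For reference, a minimal usage sketch of the factory above (the shapes follow the test below; the normalization values are placeholders):

import tensorflow.compat.v1 as tf
from object_detection.models import center_net_mobilenet_v2_feature_extractor

extractor = center_net_mobilenet_v2_feature_extractor.mobilenet_v2(
    channel_means=(0., 0., 0.), channel_stds=(1., 1., 1.),
    bgr_ordering=False)
images = tf.zeros((1, 224, 224, 3), dtype=tf.float32)
features = extractor(extractor.preprocess(images))[0]  # -> (1, 56, 56, 64)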
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Testing mobilenet_v2 feature extractor for CenterNet."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.models import center_net_mobilenet_v2_feature_extractor
from object_detection.models.keras_models import mobilenet_v2
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMobileNetV2FeatureExtractorTest(test_case.TestCase):
def test_center_net_mobilenet_v2_feature_extractor(self):
net = mobilenet_v2.mobilenet_v2(True, include_top=False)
model = center_net_mobilenet_v2_feature_extractor.CenterNetMobileNetV2FeatureExtractor(
net)
def graph_fn():
img = np.zeros((8, 224, 224, 3), dtype=np.float32)
processed_img = model.preprocess(img)
return model(processed_img)
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs.shape, (8, 56, 56, 64))
if __name__ == '__main__':
tf.test.main()
...@@ -21,9 +21,14 @@
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures.center_net_meta_arch import CenterNetFeatureExtractor
from object_detection.models.keras_models import resnet_v1
_RESNET_MODEL_OUTPUT_LAYERS = {
'resnet_v1_18': ['conv2_block2_out', 'conv3_block2_out',
'conv4_block2_out', 'conv5_block2_out'],
'resnet_v1_34': ['conv2_block3_out', 'conv3_block4_out',
'conv4_block6_out', 'conv5_block3_out'],
'resnet_v1_50': ['conv2_block3_out', 'conv3_block4_out',
'conv4_block6_out', 'conv5_block3_out'],
'resnet_v1_101': ['conv2_block3_out', 'conv3_block4_out',
...@@ -69,6 +74,10 @@ class CenterNetResnetV1FpnFeatureExtractor(CenterNetFeatureExtractor):
self._base_model = tf.keras.applications.ResNet50(weights=None)
elif resnet_type == 'resnet_v1_101':
self._base_model = tf.keras.applications.ResNet101(weights=None)
elif resnet_type == 'resnet_v1_18':
self._base_model = resnet_v1.resnet_v1_18(weights=None)
elif resnet_type == 'resnet_v1_34':
self._base_model = resnet_v1.resnet_v1_34(weights=None)
else:
raise ValueError('Unknown Resnet Model {}'.format(resnet_type))
output_layers = _RESNET_MODEL_OUTPUT_LAYERS[resnet_type]
...@@ -174,3 +183,24 @@ def resnet_v1_50_fpn(channel_means, channel_stds, bgr_ordering):
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
def resnet_v1_34_fpn(channel_means, channel_stds, bgr_ordering):
"""The ResNet v1 34 FPN feature extractor."""
return CenterNetResnetV1FpnFeatureExtractor(
resnet_type='resnet_v1_34',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering
)
def resnet_v1_18_fpn(channel_means, channel_stds, bgr_ordering):
"""The ResNet v1 18 FPN feature extractor."""
return CenterNetResnetV1FpnFeatureExtractor(
resnet_type='resnet_v1_18',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
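For reference, a minimal sketch of building one of the new backbones (same calling convention as resnet_v1_50_fpn above; the normalization values are placeholders):

extractor = resnet_v1_18_fpn(
    channel_means=(0., 0., 0.),
    channel_stds=(1., 1., 1.),
    bgr_ordering=False)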
...@@ -31,6 +31,8 @@ class CenterNetResnetV1FpnFeatureExtractorTest(test_case.TestCase,
@parameterized.parameters(
{'resnet_type': 'resnet_v1_50'},
{'resnet_type': 'resnet_v1_101'},
{'resnet_type': 'resnet_v1_18'},
{'resnet_type': 'resnet_v1_34'},
)
def test_correct_output_size(self, resnet_type):
"""Verify that shape of features returned by the backbone is correct."""
......
...@@ -21,6 +21,7 @@ from __future__ import print_function
import tensorflow.compat.v1 as tf
from tensorflow.python.keras.applications import resnet
from object_detection.core import freezable_batch_norm
from object_detection.models.keras_models import model_utils
...@@ -95,11 +96,11 @@ class _LayersOverride(object):
self.regularizer = tf.keras.regularizers.l2(weight_decay)
self.initializer = tf.variance_scaling_initializer()
def _FixedPaddingLayer(self, kernel_size, rate=1):  # pylint: disable=invalid-name
return tf.keras.layers.Lambda(
lambda x: _fixed_padding(x, kernel_size, rate))
def Conv2D(self, filters, kernel_size, **kwargs):  # pylint: disable=invalid-name
"""Builds a Conv2D layer according to the current Object Detection config.
Overrides the Keras Resnet application's convolutions with ones that
...@@ -141,7 +142,7 @@ class _LayersOverride(object):
else:
return tf.keras.layers.Conv2D(filters, kernel_size, **kwargs)
def Activation(self, *args, **kwargs):  # pylint: disable=unused-argument,invalid-name
"""Builds an activation layer.
Overrides the Keras application Activation layer specified by the
...@@ -163,7 +164,7 @@ class _LayersOverride(object):
else:
return tf.keras.layers.Lambda(tf.nn.relu, name=name)
def BatchNormalization(self, **kwargs):  # pylint: disable=invalid-name
"""Builds a normalization layer.
Overrides the Keras application batch norm with the norm specified by the
...@@ -191,7 +192,7 @@ class _LayersOverride(object):
momentum=self._default_batchnorm_momentum,
**kwargs)
def Input(self, shape):  # pylint: disable=invalid-name
"""Builds an Input layer.
Overrides the Keras application Input layer with one that uses a
...@@ -219,7 +220,7 @@ class _LayersOverride(object):
input=input_tensor, shape=[None] + shape)
return model_utils.input_layer(shape, placeholder_with_default)
def MaxPooling2D(self, pool_size, **kwargs):  # pylint: disable=invalid-name
"""Builds a MaxPooling2D layer with default padding as 'SAME'.
This is specified by the default resnet arg_scope in slim.
...@@ -237,7 +238,7 @@ class _LayersOverride(object):
# Add alias as Keras also has it.
MaxPool2D = MaxPooling2D  # pylint: disable=invalid-name
def ZeroPadding2D(self, padding, **kwargs):  # pylint: disable=unused-argument,invalid-name
"""Replaces explicit padding in the Keras application with a no-op.
Args:
...@@ -395,3 +396,146 @@ def resnet_v1_152(batchnorm_training,
return tf.keras.applications.resnet.ResNet152(
layers=layers_override, **kwargs)
# pylint: enable=invalid-name
# The following codes are based on the existing keras ResNet model pattern:
# google3/third_party/tensorflow/python/keras/applications/resnet.py
def block_basic(x,
filters,
kernel_size=3,
stride=1,
conv_shortcut=False,
name=None):
"""A residual block for ResNet18/34.
Arguments:
x: input tensor.
filters: integer, filters of the bottleneck layer.
kernel_size: default 3, kernel size of the bottleneck layer.
stride: default 1, stride of the first layer.
conv_shortcut: default False, use convolution shortcut if True, otherwise
identity shortcut.
name: string, block label.
Returns:
Output tensor for the residual block.
"""
layers = tf.keras.layers
bn_axis = 3 if tf.keras.backend.image_data_format() == 'channels_last' else 1
preact = layers.BatchNormalization(
axis=bn_axis, epsilon=1.001e-5, name=name + '_preact_bn')(
x)
preact = layers.Activation('relu', name=name + '_preact_relu')(preact)
if conv_shortcut:
shortcut = layers.Conv2D(
filters, 1, strides=1, name=name + '_0_conv')(
preact)
else:
shortcut = layers.MaxPooling2D(1, strides=stride)(x) if stride > 1 else x
x = layers.ZeroPadding2D(
padding=((1, 1), (1, 1)), name=name + '_1_pad')(
preact)
x = layers.Conv2D(
filters, kernel_size, strides=1, use_bias=False, name=name + '_1_conv')(
x)
x = layers.BatchNormalization(
axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(
x)
x = layers.Activation('relu', name=name + '_1_relu')(x)
x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name=name + '_2_pad')(x)
x = layers.Conv2D(
filters,
kernel_size,
strides=stride,
use_bias=False,
name=name + '_2_conv')(
x)
x = layers.BatchNormalization(
axis=bn_axis, epsilon=1.001e-5, name=name + '_2_bn')(
x)
x = layers.Activation('relu', name=name + '_2_relu')(x)
x = layers.Add(name=name + '_out')([shortcut, x])
return x
def stack_basic(x, filters, blocks, stride1=2, name=None):
"""A set of stacked residual blocks for ResNet18/34.
Arguments:
x: input tensor.
filters: integer, filters of the bottleneck layer in a block.
blocks: integer, blocks in the stacked blocks.
stride1: default 2, stride of the first layer in the first block.
name: string, stack label.
Returns:
Output tensor for the stacked blocks.
"""
x = block_basic(x, filters, conv_shortcut=True, name=name + '_block1')
for i in range(2, blocks):
x = block_basic(x, filters, name=name + '_block' + str(i))
x = block_basic(
x, filters, stride=stride1, name=name + '_block' + str(blocks))
return x
def resnet_v1_18(include_top=True,
weights='imagenet',
input_tensor=None,
input_shape=None,
pooling=None,
classes=1000,
classifier_activation='softmax'):
"""Instantiates the ResNet18 architecture."""
def stack_fn(x):
x = stack_basic(x, 64, 2, stride1=1, name='conv2')
x = stack_basic(x, 128, 2, name='conv3')
x = stack_basic(x, 256, 2, name='conv4')
return stack_basic(x, 512, 2, name='conv5')
return resnet.ResNet(
stack_fn,
True,
True,
'resnet18',
include_top,
weights,
input_tensor,
input_shape,
pooling,
classes,
classifier_activation=classifier_activation)
def resnet_v1_34(include_top=True,
weights='imagenet',
input_tensor=None,
input_shape=None,
pooling=None,
classes=1000,
classifier_activation='softmax'):
"""Instantiates the ResNet34 architecture."""
def stack_fn(x):
x = stack_basic(x, 64, 3, stride1=1, name='conv2')
x = stack_basic(x, 128, 4, name='conv3')
x = stack_basic(x, 256, 6, name='conv4')
return stack_basic(x, 512, 3, name='conv5')
return resnet.ResNet(
stack_fn,
True,
True,
'resnet34',
include_top,
weights,
input_tensor,
input_shape,
pooling,
classes,
classifier_activation=classifier_activation)
...@@ -20,12 +20,13 @@ object detection. To verify the consistency of the two models, we compare:
2. Number of global variables.
"""
import unittest
from absl.testing import parameterized
import numpy as np
from six.moves import zip
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.models.keras_models import resnet_v1
from object_detection.protos import hyperparams_pb2
...@@ -180,5 +181,46 @@ class ResnetV1Test(test_case.TestCase):
self.assertEqual(len(variables), var_num)
class ResnetShapeTest(test_case.TestCase, parameterized.TestCase):
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
@parameterized.parameters(
{
'resnet_type':
'resnet_v1_34',
'output_layer_names': [
'conv2_block3_out', 'conv3_block4_out', 'conv4_block6_out',
'conv5_block3_out'
]
}, {
'resnet_type':
'resnet_v1_18',
'output_layer_names': [
'conv2_block2_out', 'conv3_block2_out', 'conv4_block2_out',
'conv5_block2_out'
]
})
def test_output_shapes(self, resnet_type, output_layer_names):
if resnet_type == 'resnet_v1_34':
model = resnet_v1.resnet_v1_34(weights=None)
else:
model = resnet_v1.resnet_v1_18(weights=None)
outputs = [
model.get_layer(output_layer_name).output
for output_layer_name in output_layer_names
]
resnet_model = tf.keras.models.Model(inputs=model.input, outputs=outputs)
outputs = resnet_model(np.zeros((2, 64, 64, 3), dtype=np.float32))
# Check the shapes of the four output layers (strides 4, 8, 16, and 32;
# the layer names differ between resnet_v1_18 and resnet_v1_34):
self.assertEqual(outputs[0].shape, [2, 16, 16, 64])
self.assertEqual(outputs[1].shape, [2, 8, 8, 128])
self.assertEqual(outputs[2].shape, [2, 4, 4, 256])
self.assertEqual(outputs[3].shape, [2, 2, 2, 512])
if __name__ == '__main__':
tf.test.main()
...@@ -31,7 +31,7 @@ enum InputType {
TF_SEQUENCE_EXAMPLE = 2; // TfSequenceExample Input
}
// Next id: 33
message InputReader {
// Name of input reader. Typically used to describe the dataset that is read
// by this input reader.
...@@ -133,6 +133,10 @@ message InputReader {
// Whether input data type is tf.Examples or tf.SequenceExamples
optional InputType input_type = 30 [default = TF_EXAMPLE];
// Which frame to choose from the input if the input is a SequenceExample.
// -1 indicates a random choice.
optional int32 frame_index = 32 [default = -1];
oneof input_reader {
TFRecordInputReader tf_record_input_reader = 8;
ExternalInputReader external_input_reader = 9;
......
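A minimal sketch of setting the new field from Python via the generated message (a hedged example; only input_type and frame_index are taken from this proto):

from google.protobuf import text_format
from object_detection.protos import input_reader_pb2

# frame_index defaults to -1, i.e. a random frame per SequenceExample.
config = text_format.Parse(
    'input_type: TF_SEQUENCE_EXAMPLE frame_index: 0',
    input_reader_pb2.InputReader())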
...@@ -411,6 +411,56 @@ def multilevel_roi_align(features, boxes, box_levels, output_size,
return features_per_box
def multilevel_native_crop_and_resize(images, boxes, box_levels,
crop_size, scope=None):
"""Multilevel native crop and resize.
Same as `multilevel_matmul_crop_and_resize` but uses tf.image.crop_and_resize.
Args:
images: A list of 4-D tensor of shape
[batch, image_height, image_width, depth] representing features of
different size.
boxes: A `Tensor` of type `float32`.
A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified in
normalized coordinates and are of the form `[y1, x1, y2, x2]`. A
normalized coordinate value of `y` is mapped to the image coordinate at
`y * (image_height - 1)`, so that the `[0, 1]` interval of normalized image
height is mapped to `[0, image_height - 1]` in image height coordinates.
We do allow y1 > y2, in which case the sampled crop is an up-down flipped
version of the original image. The width dimension is treated similarly.
Normalized coordinates outside the `[0, 1]` range are allowed, in which
case we use `extrapolation_value` to extrapolate the input image values.
box_levels: A 2-D tensor of shape [batch, num_boxes] representing the level
of the box.
crop_size: A list of two integers `[crop_height, crop_width]`. All
cropped image patches are resized to this size. The aspect ratio of the
image content is not preserved. Both `crop_height` and `crop_width` need
to be positive.
scope: A name for the operation (optional).
Returns:
A 5-D float tensor of shape `[batch, num_boxes, crop_height, crop_width,
depth]`
"""
if box_levels is None:
return native_crop_and_resize(images[0], boxes, crop_size, scope)
with tf.name_scope('MultiLevelNativeCropAndResize'):
cropped_feature_list = []
for level, image in enumerate(images):
# For each level, crop the features for all boxes, then zero out the
# crops whose boxes are not assigned to this level.
# Consider a more efficient way of computing the cropped features.
cropped = native_crop_and_resize(image, boxes, crop_size, scope)
cond = tf.tile(
tf.equal(box_levels, level)[:, :, tf.newaxis],
[1, 1] + [tf.math.reduce_prod(cropped.shape.as_list()[2:])])
cond = tf.reshape(cond, cropped.shape)
cropped_final = tf.where(cond, cropped, tf.zeros_like(cropped))
cropped_feature_list.append(cropped_final)
return tf.math.reduce_sum(cropped_feature_list, axis=0)
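A minimal eager-mode sketch of the function above (the module path object_detection.utils.spatial_transform_ops is an assumption inferred from the tests further down):

import tensorflow.compat.v1 as tf
from object_detection.utils import spatial_transform_ops as spatial_ops

# Two feature levels of different resolution, one image, two boxes.
features = [tf.ones((1, 8, 8, 16)), tf.ones((1, 4, 4, 16))]
boxes = tf.constant([[[0., 0., 1., 1.], [0., 0., .5, .5]]])
box_levels = tf.constant([[0, 1]], dtype=tf.int32)
crops = spatial_ops.multilevel_native_crop_and_resize(
    features, boxes, box_levels, crop_size=[2, 2])
# crops has shape (1, 2, 2, 2, 16): each box is cropped from its own level,
# and the other levels' contributions are zeroed out before summation.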
def native_crop_and_resize(image, boxes, crop_size, scope=None):
"""Same as `matmul_crop_and_resize` but uses tf.image.crop_and_resize."""
def get_box_inds(proposals):
...@@ -431,6 +481,50 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None):
return tf.reshape(cropped_regions, final_shape)
def multilevel_matmul_crop_and_resize(images, boxes, box_levels, crop_size,
extrapolation_value=0.0, scope=None):
"""Multilevel matmul crop and resize.
Same as `matmul_crop_and_resize` but crop images according to box levels.
Args:
images: A list of 4-D tensor of shape
[batch, image_height, image_width, depth] representing features of
different size.
boxes: A `Tensor` of type `float32` or 'bfloat16'.
A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified in
normalized coordinates and are of the form `[y1, x1, y2, x2]`. A
normalized coordinate value of `y` is mapped to the image coordinate at
`y * (image_height - 1)`, so that the `[0, 1]` interval of normalized image
height is mapped to `[0, image_height - 1]` in image height coordinates.
We do allow y1 > y2, in which case the sampled crop is an up-down flipped
version of the original image. The width dimension is treated similarly.
Normalized coordinates outside the `[0, 1]` range are allowed, in which
case we use `extrapolation_value` to extrapolate the input image values.
box_levels: A 2-D tensor of shape [batch, num_boxes] representing the level
of the box.
crop_size: A list of two integers `[crop_height, crop_width]`. All
cropped image patches are resized to this size. The aspect ratio of the
image content is not preserved. Both `crop_height` and `crop_width` need
to be positive.
extrapolation_value: A float value to use for extrapolation.
scope: A name for the operation (optional).
Returns:
A 5-D float tensor of shape `[batch, num_boxes, crop_height, crop_width,
depth]`
"""
with tf.name_scope(scope, 'MultiLevelMatMulCropAndResize'):
if box_levels is None:
box_levels = tf.zeros(tf.shape(boxes)[:2], dtype=tf.int32)
return multilevel_roi_align(images,
boxes,
box_levels,
crop_size,
align_corners=True,
extrapolation_value=extrapolation_value)
def matmul_crop_and_resize(image, boxes, crop_size, extrapolation_value=0.0,
scope=None):
"""Matrix multiplication based implementation of the crop and resize op.
......
...@@ -512,6 +512,38 @@ class MatMulCropAndResizeTest(test_case.TestCase):
crop_output = self.execute(graph_fn, [image, boxes])
self.assertAllClose(crop_output, expected_output)
def testMultilevelMatMulCropAndResize(self):
def graph_fn(image1, image2, boxes, box_levels):
return spatial_ops.multilevel_matmul_crop_and_resize([image1, image2],
boxes,
box_levels,
crop_size=[2, 2])
image = [np.array([[[[1, 0], [2, 0], [3, 0]],
[[4, 0], [5, 0], [6, 0]],
[[7, 0], [8, 0], [9, 0]]],
[[[1, 0], [2, 0], [3, 0]],
[[4, 0], [5, 0], [6, 0]],
[[7, 0], [8, 0], [9, 0]]]], dtype=np.float32),
np.array([[[[1, 0], [2, 1], [3, 2]],
[[4, 3], [5, 4], [6, 5]],
[[7, 6], [8, 7], [9, 8]]],
[[[1, 0], [2, 1], [3, 2]],
[[4, 3], [5, 4], [6, 5]],
[[7, 6], [8, 7], [9, 8]]]], dtype=np.float32)]
boxes = np.array([[[1, 1, 0, 0],
[.5, .5, 0, 0]],
[[0, 0, 1, 1],
[0, 0, .5, .5]]], dtype=np.float32)
box_levels = np.array([[0, 1], [1, 1]], dtype=np.int32)
expected_output = [[[[[9, 0], [7, 0]], [[3, 0], [1, 0]]],
[[[5, 4], [4, 3]], [[2, 1], [1, 0]]]],
[[[[1, 0], [3, 2]], [[7, 6], [9, 8]]],
[[[1, 0], [2, 1]], [[4, 3], [5, 4]]]]]
crop_output = self.execute(graph_fn, image + [boxes, box_levels])
self.assertAllClose(crop_output, expected_output)
class NativeCropAndResizeTest(test_case.TestCase):
...@@ -537,6 +569,35 @@ class NativeCropAndResizeTest(test_case.TestCase):
crop_output = self.execute_cpu(graph_fn, [image, boxes])
self.assertAllClose(crop_output, expected_output)
def testMultilevelBatchCropAndResize3x3To2x2_2Channels(self):
def graph_fn(image1, image2, boxes, box_levels):
return spatial_ops.multilevel_native_crop_and_resize([image1, image2],
boxes,
box_levels,
crop_size=[2, 2])
image = [np.array([[[[1, 0], [2, 1], [3, 2]],
[[4, 3], [5, 4], [6, 5]],
[[7, 6], [8, 7], [9, 8]]],
[[[1, 0], [2, 1], [3, 2]],
[[4, 3], [5, 4], [6, 5]],
[[7, 6], [8, 7], [9, 8]]]], dtype=np.float32),
np.array([[[[1, 0], [2, 1]],
[[4, 3], [5, 4]]],
[[[1, 0], [2, 1]],
[[4, 3], [5, 4]]]], dtype=np.float32)]
boxes = np.array([[[0, 0, 1, 1],
[0, 0, .5, .5]],
[[1, 1, 0, 0],
[.5, .5, 0, 0]]], dtype=np.float32)
box_levels = np.array([[0, 1], [0, 0]], dtype=np.float32)
expected_output = [[[[[1, 0], [3, 2]], [[7, 6], [9, 8]]],
[[[1, 0], [1.5, 0.5]], [[2.5, 1.5], [3, 2]]]],
[[[[9, 8], [7, 6]], [[3, 2], [1, 0]]],
[[[5, 4], [4, 3]], [[2, 1], [1, 0]]]]]
crop_output = self.execute_cpu(graph_fn, image + [boxes, box_levels])
self.assertAllClose(crop_output, expected_output)
if __name__ == '__main__':
tf.test.main()