Commit 27b4acd4 authored by Aman Gupta

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD MobilenetV2 FPN Feature Extractor."""
import copy
import functools
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import context_manager
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets.mobilenet import mobilenet
from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim
# A modified config of mobilenet v2 that makes it more detection friendly.
def _create_modified_mobilenet_config():
  """Creates a detection-friendly copy of the MobileNet v2 conv defs.

  Replaces the final op of the architecture with a stride-1 1x1 convolution
  producing 256 channels, which is a better fit for the FPN feature extractor
  than the original final layer.

  Returns:
    A conv_defs dict (same structure as `mobilenet_v2.V2_DEF`) with the last
    entry of `conv_defs['spec']` replaced.
  """
  # Use deepcopy: copy.copy is shallow, so the 'spec' list would still be
  # shared with mobilenet_v2.V2_DEF and the assignment below would mutate
  # the library's module-level default architecture for every other user.
  conv_defs = copy.deepcopy(mobilenet_v2.V2_DEF)
  conv_defs['spec'][-1] = mobilenet.op(
      slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256)
  return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 FPN features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams_fn,
               fpn_min_level=3,
               fpn_max_level=7,
               additional_layer_depth=256,
               reuse_weights=None,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False):
    """SSD FPN feature extractor based on Mobilenet v2 architecture.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
        and separable_conv2d ops in the layers that are added on top of the base
        feature extractor.
      fpn_min_level: the highest resolution feature map to use in FPN. The valid
        values are {2, 3, 4, 5} which map to MobileNet v2 layers
        {layer_4, layer_7, layer_14, layer_19}, respectively.
      fpn_max_level: the smallest resolution feature map to construct or use in
        FPN. FPN constructions uses features maps starting from fpn_min_level
        upto the fpn_max_level. In the case that there are not enough feature
        maps in the backbone network, additional feature maps are created by
        applying stride 2 convolutions until we get the desired number of fpn
        levels.
      additional_layer_depth: additional feature map layer channel depth.
      reuse_weights: whether to reuse variables. Default is None.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams_fn`.
    """
    super(SSDMobileNetV2FpnFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams_fn=conv_hyperparams_fn,
        reuse_weights=reuse_weights,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams)
    # FPN-specific configuration, consumed in extract_features below.
    self._fpn_min_level = fpn_min_level
    self._fpn_max_level = fpn_max_level
    self._additional_layer_depth = additional_layer_depth

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    # Linear rescale from [0, 255] pixel values to [-1, 1].
    return (2.0 / 255.0) * resized_inputs - 1.0

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    # Reject images smaller than 33 pixels on a side (minimum supported by
    # this extractor; see the corresponding invalid-image-size test).
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      # is_training=None leaves the batch-norm training flag to be configured
      # elsewhere; bn_decay overrides the backbone's batch-norm decay.
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        # Optionally override the backbone's hyperparams with the ones
        # supplied via conv_hyperparams_fn; otherwise use a no-op context.
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              # The modified conv defs are only used with depthwise convs.
              conv_defs=_CONV_DEFS if self._use_depthwise else None,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      # Scales a channel depth by the depth multiplier, never dropping below
      # the configured minimum depth.
      depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope('fpn', reuse=self._reuse_weights):
          # Backbone endpoints corresponding to FPN levels 2..5, in order.
          feature_blocks = [
              'layer_4', 'layer_7', 'layer_14', 'layer_19'
          ]
          # The backbone only provides endpoints up to level 5; any coarser
          # levels are synthesized further below with stride-2 convolutions.
          base_fpn_max_level = min(self._fpn_max_level, 5)
          feature_block_list = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            # feature_blocks[0] corresponds to level 2, hence the -2 offset.
            feature_block_list.append(feature_blocks[level - 2])
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key]) for key in feature_block_list],
              depth=depth_fn(self._additional_layer_depth),
              use_depthwise=self._use_depthwise)
          # Collect the top-down FPN outputs, finest to coarsest.
          feature_maps = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_maps.append(fpn_features['top_down_{}'.format(
                feature_blocks[level - 2])])
          last_feature_map = fpn_features['top_down_{}'.format(
              feature_blocks[base_fpn_max_level - 2])]
          # Construct coarse features by repeatedly downsampling the coarsest
          # FPN output with stride-2 convs until fpn_max_level is reached.
          for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
            if self._use_depthwise:
              conv_op = functools.partial(
                  slim.separable_conv2d, depth_multiplier=1)
            else:
              conv_op = slim.conv2d
            last_feature_map = conv_op(
                last_feature_map,
                num_outputs=depth_fn(self._additional_layer_depth),
                kernel_size=[3, 3],
                stride=2,
                padding='SAME',
                # Numbering continues past the 19 backbone layers, so the
                # first extra map is bottom_up_Conv2d_20.
                scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
            feature_maps.append(last_feature_map)
    return feature_maps
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ssd_mobilenet_v2_fpn_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_mobilenet_v2_fpn_feature_extractor
slim = tf.contrib.slim
class SsdMobilenetV2FpnFeatureExtractorTest(
    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
  """Tests for SSDMobileNetV2FpnFeatureExtractor."""

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, use_explicit_padding=False):
    """Constructs a new feature extractor.

    Args:
      depth_multiplier: float depth multiplier for feature extractor
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      is_training: whether the network is in training mode.
      use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
        inputs so that the output dimensions are the same as if 'SAME' padding
        were used.

    Returns:
      an ssd_meta_arch.SSDFeatureExtractor object.
    """
    min_depth = 32
    return (ssd_mobilenet_v2_fpn_feature_extractor.
            SSDMobileNetV2FpnFeatureExtractor(
                is_training,
                depth_multiplier,
                min_depth,
                pad_to_multiple,
                self.conv_hyperparams_fn,
                use_explicit_padding=use_explicit_padding))

  def test_extract_features_returns_correct_shapes_256(self):
    """256x256 input yields FPN maps from 32x32 down to 2x2, depth 256."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
                                  (2, 8, 8, 256), (2, 4, 4, 256),
                                  (2, 2, 2, 256)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  # NOTE(review): the method name says 384 but the test uses 320x320 inputs
  # (and the expected shapes match 320) — consider renaming to ..._320.
  def test_extract_features_returns_correct_shapes_384(self):
    """320x320 input yields FPN maps from 40x40 down to 3x3, depth 256."""
    image_height = 320
    image_width = 320
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 40, 40, 256), (2, 20, 20, 256),
                                  (2, 10, 10, 256), (2, 5, 5, 256),
                                  (2, 3, 3, 256)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_with_dynamic_image_shape(self):
    """Shapes also hold when the input height/width are dynamic."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
                                  (2, 8, 8, 256), (2, 4, 4, 256),
                                  (2, 2, 2, 256)]
    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
    """299x299 input padded up to multiples of 32 behaves like 320x320."""
    image_height = 299
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 32
    expected_feature_map_shape = [(2, 40, 40, 256), (2, 20, 20, 256),
                                  (2, 10, 10, 256), (2, 5, 5, 256),
                                  (2, 3, 3, 256)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
    """A tiny depth multiplier is clamped to the min_depth of 32 channels."""
    image_height = 256
    image_width = 256
    depth_multiplier = 0.5**12
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 32), (2, 16, 16, 32),
                                  (2, 8, 8, 32), (2, 4, 4, 32),
                                  (2, 2, 2, 32)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_raises_error_with_invalid_image_size(self):
    """Inputs smaller than the 33-pixel minimum are rejected."""
    image_height = 32
    image_width = 32
    depth_multiplier = 1.0
    pad_to_multiple = 1
    self.check_extract_features_raises_error_with_invalid_image_size(
        image_height, image_width, depth_multiplier, pad_to_multiple)

  def test_preprocess_returns_correct_value_range(self):
    """preprocess maps [0, 1) pixel values into [-1, 1]."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    test_image = np.random.rand(2, image_height, image_width, 3)
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(test_image)
    self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))

  def test_variables_only_created_in_scope(self):
    """All variables live under the 'MobilenetV2' scope."""
    depth_multiplier = 1
    pad_to_multiple = 1
    scope_name = 'MobilenetV2'
    self.check_feature_extractor_variables_under_scope(
        depth_multiplier, pad_to_multiple, scope_name)

  def test_fused_batchnorm(self):
    """Batch norm is lowered to the fused FusedBatchNorm op."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    image_placeholder = tf.placeholder(tf.float32,
                                       [1, image_height, image_width, 3])
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(image_placeholder)
    _ = feature_extractor.extract_features(preprocessed_image)
    self.assertTrue(
        any(op.type == 'FusedBatchNorm'
            for op in tf.get_default_graph().get_operations()))

  def test_get_expected_feature_map_variable_names(self):
    """Checks a sample of backbone and FPN variable names exist."""
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_maps_variables = set([
        # Mobilenet V2 feature maps
        'MobilenetV2/expanded_conv_4/depthwise/depthwise_weights',
        'MobilenetV2/expanded_conv_7/depthwise/depthwise_weights',
        'MobilenetV2/expanded_conv_14/depthwise/depthwise_weights',
        'MobilenetV2/Conv_1/weights',
        # FPN layers
        'MobilenetV2/fpn/bottom_up_Conv2d_20/weights',
        'MobilenetV2/fpn/bottom_up_Conv2d_21/weights',
        'MobilenetV2/fpn/smoothing_1/weights',
        'MobilenetV2/fpn/smoothing_2/weights',
        'MobilenetV2/fpn/projection_1/weights',
        'MobilenetV2/fpn/projection_2/weights',
        'MobilenetV2/fpn/projection_3/weights',
    ])
    g = tf.Graph()
    with g.as_default():
      preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
      feature_extractor = self._create_feature_extractor(
          depth_multiplier, pad_to_multiple)
      feature_extractor.extract_features(preprocessed_inputs)
      actual_variable_set = set([
          var.op.name for var in g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
      ])
      # The expected names must all be present (intersection equality).
      variable_intersection = expected_feature_maps_variables.intersection(
          actual_variable_set)
      self.assertSetEqual(expected_feature_maps_variables,
                          variable_intersection)
# Run all test cases when this module is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for MobilenetV2 features."""
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.models.keras_applications import mobilenet_v2
from object_detection.utils import ops
from object_detection.utils import shape_utils
class SSDMobileNetV2KerasFeatureExtractor(
    ssd_meta_arch.SSDKerasFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False,
               name=None):
    """MobileNetV2 Feature Extractor for SSD Models.

    Builds the backbone (a truncated Keras MobileNetV2) and the multi
    resolution feature map generator eagerly, at construction time.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor (Functions
        as a width multiplier for the mobilenet_v2 network itself).
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams: `hyperparams_builder.KerasLayerHyperparams` object
        containing convolution hyperparameters for the layers added on top of
        the base feature extractor.
      freeze_batchnorm: Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      inplace_batchnorm_update: Whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams`.
      name: A string name scope to assign to the model. If 'None', Keras
        will auto-generate one from the class name.
    """
    super(SSDMobileNetV2KerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams,
        name=name)
    # Two maps come from backbone endpoints ('' entries are generated on top
    # with the listed channel depths; -1 keeps the endpoint's own depth).
    feature_map_layout = {
        'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_depthwise': self._use_depthwise,
        'use_explicit_padding': self._use_explicit_padding,
    }
    with tf.name_scope('MobilenetV2'):
      full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
          # Batch norm only trains when training AND not frozen.
          batchnorm_training=(is_training and not freeze_batchnorm),
          conv_hyperparams=(conv_hyperparams
                            if self._override_base_feature_extractor_hyperparams
                            else None),
          weights=None,
          use_explicit_padding=use_explicit_padding,
          alpha=self._depth_multiplier,
          min_depth=self._min_depth,
          include_top=False)
      # NOTE(review): the local names say conv2d_11/13_pointwise, but the
      # layers fetched are 'block_13_expand_relu' and 'out_relu' — the names
      # look inherited from the MobileNet v1 extractor; confirm/rename.
      conv2d_11_pointwise = full_mobilenet_v2.get_layer(
          name='block_13_expand_relu').output
      conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
      # Truncated backbone exposing exactly the two endpoints needed above.
      self.mobilenet_v2 = tf.keras.Model(
          inputs=full_mobilenet_v2.inputs,
          outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
      self.feature_map_generator = (
          feature_map_generators.KerasMultiResolutionFeatureMaps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              is_training=is_training,
              conv_hyperparams=conv_hyperparams,
              freeze_batchnorm=freeze_batchnorm,
              name='FeatureMaps'))

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    # Linear rescale from [0, 255] pixel values to [-1, 1].
    return (2.0 / 255.0) * resized_inputs - 1.0

  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    # Outputs arrive in the order declared when building self.mobilenet_v2:
    # [block_13_expand_relu, out_relu].
    image_features = self.mobilenet_v2(
        ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple))
    feature_maps = self.feature_map_generator({
        'layer_15/expansion_output': image_features[0],
        'layer_19': image_features[1]})
    # The generator maps names to tensors; return just the tensors
    # (presumably in feature_map_layout order — insertion-ordered mapping).
    return feature_maps.values()
......@@ -43,6 +43,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_scope_name,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -72,6 +73,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of fpn
levels.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -104,6 +106,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_scope_name = fpn_scope_name
self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth
def preprocess(self, resized_inputs):
"""SSD preprocessing.
......@@ -177,7 +180,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
feature_block_list.append('block{}'.format(level - 1))
fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list],
depth=256)
depth=self._additional_layer_depth)
feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(
......@@ -188,7 +191,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
for i in range(base_fpn_max_level, self._fpn_max_level):
last_feature_map = slim.conv2d(
last_feature_map,
num_outputs=256,
num_outputs=self._additional_layer_depth,
kernel_size=[3, 3],
stride=2,
padding='SAME',
......@@ -208,6 +211,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -226,6 +230,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -245,6 +250,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn',
fpn_min_level,
fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
......@@ -263,6 +269,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -281,6 +288,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -300,6 +308,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn',
fpn_min_level,
fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
......@@ -318,6 +327,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -336,6 +346,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -355,6 +366,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn',
fpn_min_level,
fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
......
......@@ -36,7 +36,6 @@
},
"outputs": [],
"source": [
"from distutils.version import StrictVersion\n",
"import numpy as np\n",
"import os\n",
"import six.moves.urllib as urllib\n",
......@@ -45,6 +44,7 @@
"import tensorflow as tf\n",
"import zipfile\n",
"\n",
"from distutils.version import StrictVersion\n",
"from collections import defaultdict\n",
"from io import StringIO\n",
"from matplotlib import pyplot as plt\n",
......@@ -166,9 +166,7 @@
"PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'\n",
"\n",
"# List of the strings that is used to add correct label for each box.\n",
"PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')\n",
"\n",
"NUM_CLASSES = 90"
"PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')"
]
},
{
......@@ -265,9 +263,7 @@
},
"outputs": [],
"source": [
"label_map = label_map_util.load_labelmap(PATH_TO_LABELS)\n",
"categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)\n",
"category_index = label_map_util.create_category_index(categories)"
"category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)"
]
},
{
......
......@@ -14,6 +14,7 @@
# ==============================================================================
"""Convolutional Box Predictors with and without weight sharing."""
import functools
import tensorflow as tf
from object_detection.core import box_predictor
from object_detection.utils import static_shape
......@@ -163,7 +164,7 @@ class ConvolutionalBoxPredictor(box_predictor.BoxPredictor):
else:
head_obj = self._other_heads[head_name]
prediction = head_obj.predict(
features=image_feature,
features=net,
num_predictions_per_location=num_predictions_per_location)
predictions[head_name].append(prediction)
return predictions
......@@ -203,7 +204,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
num_layers_before_predictor,
kernel_size=3,
apply_batch_norm=False,
share_prediction_tower=False):
share_prediction_tower=False,
use_depthwise=False):
"""Constructor.
Args:
......@@ -226,6 +228,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
this predictor.
share_prediction_tower: Whether to share the multi-layer tower between box
prediction and class prediction heads.
use_depthwise: Whether to use depthwise separable conv2d instead of
regular conv2d.
"""
super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training,
num_classes)
......@@ -238,6 +242,7 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
self._kernel_size = kernel_size
self._apply_batch_norm = apply_batch_norm
self._share_prediction_tower = share_prediction_tower
self._use_depthwise = use_depthwise
@property
def num_classes(self):
......@@ -270,7 +275,11 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
inserted_layer_counter):
net = image_feature
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
net = conv_op(
net,
self._depth, [self._kernel_size, self._kernel_size],
stride=1,
......
......@@ -234,6 +234,40 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_no_dangling_outputs(self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4,
use_dropout=True,
use_depthwise=True))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
tf.concat(
box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
tf.concat(
box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
axis=1)
bad_dangling_ops = []
types_safe_to_dangle = set(['Assign', 'Mul', 'Const'])
for op in tf.get_default_graph().get_operations():
if (not op.outputs) or (not op.outputs[0].consumers()):
if 'BoxPredictor' in op.name:
if op.type not in types_safe_to_dangle:
bad_dangling_ops.append(op)
self.assertEqual(bad_dangling_ops, [])
class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
......@@ -545,6 +579,79 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_predictions_multiple_feature_maps_share_weights_with_depthwise(
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
apply_batch_norm=False,
use_depthwise=True))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
scope='BoxPredictor')
box_encodings = tf.concat(
box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
class_predictions_with_background = tf.concat(
box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
axis=1)
return (box_encodings, class_predictions_with_background)
with self.test_session(graph=tf.Graph()):
graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32),
tf.random_uniform([4, 16, 16, 3], dtype=tf.float32))
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
expected_variable_set = set([
# Box prediction tower
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_0/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_0/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_1/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_1/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_1/biases'),
# Box prediction head
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictor/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictor/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictor/biases'),
# Class prediction tower
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/biases'),
# Class prediction head
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_no_batchnorm_params_when_batchnorm_is_not_configured(self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
......
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convolutional Box Predictors with and without weight sharing."""
import collections
import tensorflow as tf
from object_detection.core import box_predictor
from object_detection.utils import static_shape
keras = tf.keras.layers
BOX_ENCODINGS = box_predictor.BOX_ENCODINGS
CLASS_PREDICTIONS_WITH_BACKGROUND = (
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND)
MASK_PREDICTIONS = box_predictor.MASK_PREDICTIONS
class _NoopVariableScope(object):
  """No-op context manager used where a variable scope is optional.

  Entering yields nothing and exiting never suppresses exceptions, so an
  instance can stand in wherever a real variable-scope context is expected
  but no scope should actually be pushed.
  """

  def __enter__(self):
    # Nothing to push; callers receive no scope object.
    return None

  def __exit__(self, exc_type, exc_value, traceback):
    # Returning False lets any exception raised inside the block propagate.
    return False
class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
  """Convolutional Keras Box Predictor.

  Optionally add an intermediate 1x1 convolutional layer after features and
  predict in parallel branches box_encodings and
  class_predictions_with_background.

  Currently this box predictor assumes that predictions are "shared" across
  classes --- that is each anchor makes box predictions which do not depend
  on class.
  """

  def __init__(self,
               is_training,
               num_classes,
               box_prediction_heads,
               class_prediction_heads,
               other_heads,
               conv_hyperparams,
               num_layers_before_predictor,
               min_depth,
               max_depth,
               freeze_batchnorm,
               inplace_batchnorm_update,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: number of classes.  Note that num_classes *does not*
        include the background category, so if groundtruth labels take values
        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
        assigned classification targets can range from {0,... K}).
      box_prediction_heads: A list of heads that predict the boxes.
      class_prediction_heads: A list of heads that predict the classes.
      other_heads: A dictionary mapping head names to lists of convolutional
        heads.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      num_layers_before_predictor: Number of the additional conv layers before
        the predictor.
      min_depth: Minimum feature depth prior to predicting box encodings
        and class predictions.
      max_depth: Maximum feature depth prior to predicting box encodings
        and class predictions. If max_depth is set to 0, no additional
        feature map will be inserted before location and class predictions.
      freeze_batchnorm: Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      inplace_batchnorm_update: Whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.

    Raises:
      ValueError: if min_depth > max_depth, or if the head lists do not all
        have the same length.
    """
    super(ConvolutionalBoxPredictor, self).__init__(
        is_training, num_classes, freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        name=name)
    if min_depth > max_depth:
      raise ValueError('min_depth should be less than or equal to max_depth')
    if len(box_prediction_heads) != len(class_prediction_heads):
      raise ValueError('All lists of heads must be the same length.')
    for other_head_list in other_heads.values():
      if len(box_prediction_heads) != len(other_head_list):
        raise ValueError('All lists of heads must be the same length.')

    self._prediction_heads = {
        BOX_ENCODINGS: box_prediction_heads,
        CLASS_PREDICTIONS_WITH_BACKGROUND: class_prediction_heads,
    }
    if other_heads:
      self._prediction_heads.update(other_heads)

    self._conv_hyperparams = conv_hyperparams
    self._min_depth = min_depth
    self._max_depth = max_depth
    self._num_layers_before_predictor = num_layers_before_predictor

    # One shared tf.keras.Sequential of pre-head conv layers per feature map;
    # populated lazily in build().
    self._shared_nets = []

  def build(self, input_shapes):
    """Creates the variables of the layer.

    Args:
      input_shapes: A list of input shapes, one per input feature map (must
        match the number of heads this predictor was constructed with).

    Raises:
      ValueError: if the number of input shapes does not match the number of
        heads.
    """
    if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]):
      # BUGFIX: the two implicitly-concatenated string fragments previously
      # rendered as "heads,but" with no separating space.
      raise ValueError('This box predictor was constructed with %d heads, '
                       'but there are %d inputs.' %
                       (len(self._prediction_heads[BOX_ENCODINGS]),
                        len(input_shapes)))
    for stack_index, input_shape in enumerate(input_shapes):
      net = tf.keras.Sequential(name='PreHeadConvolutions_%d' % stack_index)
      self._shared_nets.append(net)

      # Add additional conv layers before the class predictor; the depth is
      # the feature depth clamped to [min_depth, max_depth].
      features_depth = static_shape.get_depth(input_shape)
      depth = max(min(features_depth, self._max_depth), self._min_depth)
      tf.logging.info(
          'depth of additional conv before box predictor: {}'.format(depth))

      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net.add(keras.Conv2D(depth, [1, 1],
                               name='Conv2d_%d_1x1_%d' % (i, depth),
                               padding='SAME',
                               **self._conv_hyperparams.params()))
          net.add(self._conv_hyperparams.build_batch_norm(
              training=(self._is_training and not self._freeze_batchnorm),
              name='Conv2d_%d_1x1_%d_norm' % (i, depth)))
          net.add(self._conv_hyperparams.build_activation_layer(
              name='Conv2d_%d_1x1_%d_activation' % (i, depth),
          ))
    self.built = True

  def _predict(self, image_features):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.
    """
    predictions = collections.defaultdict(list)
    for (index, image_feature) in enumerate(image_features):
      # Apply shared conv layers before the head predictors.
      net = self._shared_nets[index](image_feature)
      for head_name in self._prediction_heads:
        head_obj = self._prediction_heads[head_name][index]
        prediction = head_obj(net)
        predictions[head_name].append(prediction)
    return predictions
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.convolutional_keras_box_predictor."""
import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
from object_detection.predictors import convolutional_keras_box_predictor as box_predictor
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasBoxPredictorTest(test_case.TestCase):
  """Tests for convolutional_keras_box_predictor.ConvolutionalBoxPredictor."""

  def _build_conv_hyperparams(self):
    """Builds KerasLayerHyperparams from a minimal RELU_6 text-proto config."""
    conv_hyperparams = hyperparams_pb2.Hyperparams()
    conv_hyperparams_text_proto = """
      activation: RELU_6
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
    return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)

  def test_get_boxes_for_five_aspect_ratios_per_location(self):
    """Checks output shapes with 5 predictions per spatial location."""
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=0,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[5],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      objectness_predictions = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, objectness_predictions)
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                           [image_features])
    # 8 * 8 spatial cells * 5 predictions per location = 320 anchors.
    self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
    self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])

  def test_get_boxes_for_one_aspect_ratio_per_location(self):
    """Checks output shapes with a single prediction per spatial location."""
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=0,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[1],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      objectness_predictions = tf.concat(box_predictions[
          box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1)
      return (box_encodings, objectness_predictions)
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                           [image_features])
    # 8 * 8 spatial cells * 1 prediction per location = 64 anchors.
    self.assertAllEqual(box_encodings.shape, [4, 64, 1, 4])
    self.assertAllEqual(objectness_predictions.shape, [4, 64, 1])

  def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
      self):
    """Checks class-prediction shape when num_classes > 0."""
    num_classes_without_background = 6
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=num_classes_without_background,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[5],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      class_predictions_with_background = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, class_predictions_with_background)
    (box_encodings,
     class_predictions_with_background) = self.execute(graph_fn,
                                                       [image_features])
    self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
    # +1 for the background class slot.
    self.assertAllEqual(class_predictions_with_background.shape,
                        [4, 320, num_classes_without_background+1])

  def test_get_predictions_with_feature_maps_of_dynamic_shape(
      self):
    """Graph-mode test feeding a feature map with unknown spatial dims."""
    image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
    conv_box_predictor = (
        box_predictor_builder.build_convolutional_keras_box_predictor(
            is_training=False,
            num_classes=0,
            conv_hyperparams=self._build_conv_hyperparams(),
            freeze_batchnorm=False,
            inplace_batchnorm_update=False,
            num_predictions_per_location_list=[5],
            min_depth=0,
            max_depth=32,
            num_layers_before_predictor=1,
            use_dropout=True,
            dropout_keep_prob=0.8,
            kernel_size=1,
            box_code_size=4
        ))
    box_predictions = conv_box_predictor([image_features])
    box_encodings = tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    objectness_predictions = tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1)
    init_op = tf.global_variables_initializer()

    resolution = 32
    expected_num_anchors = resolution*resolution*5
    with self.test_session() as sess:
      sess.run(init_op)
      (box_encodings_shape,
       objectness_predictions_shape) = sess.run(
           [tf.shape(box_encodings), tf.shape(objectness_predictions)],
           feed_dict={image_features:
                      np.random.rand(4, resolution, resolution, 64)})
      actual_variable_set = set(
          [var.op.name for var in tf.trainable_variables()])
    self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
    self.assertAllEqual(objectness_predictions_shape,
                        [4, expected_num_anchors, 1])
    # Variable names are fixed by the Keras layer names chosen in the
    # predictor's build(); the set equality pins the exact graph structure.
    expected_variable_set = set([
        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/bias',
        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/kernel',
        'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/bias',
        'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/kernel',
        'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/bias',
        'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/kernel'])
    self.assertEqual(expected_variable_set, actual_variable_set)
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
......@@ -19,6 +19,7 @@ Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a predict function that receives the
`features` as the first argument and returns `box_encodings`.
"""
import functools
import tensorflow as tf
from object_detection.predictors.heads import head
......@@ -196,18 +197,22 @@ class WeightSharedConvolutionalBoxHead(head.Head):
def __init__(self,
box_code_size,
kernel_size=3,
class_prediction_bias_init=0.0):
use_depthwise=False,
box_encodings_clip_range=None):
"""Constructor.
Args:
box_code_size: Size of encoding for each box.
kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
use_depthwise: Whether to use depthwise convolutions for prediction steps.
Default is False.
box_encodings_clip_range: Min and max values for clipping box_encodings.
"""
super(WeightSharedConvolutionalBoxHead, self).__init__()
self._box_code_size = box_code_size
self._kernel_size = kernel_size
self._use_depthwise = use_depthwise
self._box_encodings_clip_range = box_encodings_clip_range
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
......@@ -224,7 +229,11 @@ class WeightSharedConvolutionalBoxHead(head.Head):
the objects.
"""
box_encodings_net = features
box_encodings = slim.conv2d(
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
box_encodings = conv_op(
box_encodings_net,
num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
......@@ -234,6 +243,11 @@ class WeightSharedConvolutionalBoxHead(head.Head):
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
# Clipping the box encodings to make the inference graph TPU friendly.
if self._box_encodings_clip_range is not None:
box_encodings = tf.clip_by_value(
box_encodings, self._box_encodings_clip_range.min,
self._box_encodings_clip_range.max)
box_encodings = tf.reshape(box_encodings,
[batch_size, -1, self._box_code_size])
return box_encodings
......@@ -19,6 +19,7 @@ Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background.
"""
import functools
import tensorflow as tf
from object_detection.predictors.heads import head
......@@ -211,7 +212,9 @@ class WeightSharedConvolutionalClassHead(head.Head):
kernel_size=3,
class_prediction_bias_init=0.0,
use_dropout=False,
dropout_keep_prob=0.8):
dropout_keep_prob=0.8,
use_depthwise=False,
score_converter_fn=tf.identity):
"""Constructor.
Args:
......@@ -224,6 +227,10 @@ class WeightSharedConvolutionalClassHead(head.Head):
conv2d layer before class prediction.
use_dropout: Whether to apply dropout to class prediction head.
dropout_keep_prob: Probability of keeping activiations.
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
score_converter_fn: Callable elementwise nonlinearity (that takes tensors
as inputs and returns tensors).
"""
super(WeightSharedConvolutionalClassHead, self).__init__()
self._num_classes = num_classes
......@@ -231,6 +238,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
self._class_prediction_bias_init = class_prediction_bias_init
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
self._use_depthwise = use_depthwise
self._score_converter_fn = score_converter_fn
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
......@@ -252,7 +261,11 @@ class WeightSharedConvolutionalClassHead(head.Head):
if self._use_dropout:
class_predictions_net = slim.dropout(
class_predictions_net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d(
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
class_predictions_with_background = conv_op(
class_predictions_net,
num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
......@@ -264,6 +277,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
class_predictions_with_background = self._score_converter_fn(
class_predictions_with_background)
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
return class_predictions_with_background
......@@ -36,6 +36,8 @@ Mask RCNN box predictor.
"""
from abc import abstractmethod
import tensorflow as tf
class Head(object):
"""Mask RCNN head base class."""
......@@ -57,3 +59,23 @@ class Head(object):
A tf.float32 tensor.
"""
pass
class KerasHead(tf.keras.Model):
  """Keras head base class.

  Subclasses implement `_predict`; the Keras `call` entry point simply
  delegates to it.
  """

  def call(self, features):
    """The Keras model call will delegate to the `_predict` method."""
    return self._predict(features)

  # NOTE(review): this class does not use ABCMeta, so @abstractmethod is
  # documentation-only here — instantiating a subclass that forgets to
  # implement `_predict` is not blocked at construction time; confirm.
  @abstractmethod
  def _predict(self, features):
    """Returns the head's predictions.

    Args:
      features: A float tensor of features.

    Returns:
      A tf.float32 tensor.
    """
    pass
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Box Head.
Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a _predict function that receives the
`features` as the first argument and returns `box_encodings`.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalBoxHead(head.KerasHead):
  """Convolutional box prediction head.

  Predicts box encodings either with a single conv layer or, when
  `use_depthwise` is set, with a depthwise-separable stack
  (depthwise conv -> batch norm -> activation -> 1x1 conv).
  """

  def __init__(self,
               is_training,
               box_code_size,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               use_depthwise=True,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      box_code_size: Size of encoding for each box.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Defaults to True in this signature. NOTE(review): the previous
        docstring said the default is False, and the sibling
        ConvolutionalClassHead defaults to False — confirm which default is
        intended before relying on it.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalBoxHead, self).__init__(name=name)
    self._is_training = is_training
    self._box_code_size = box_code_size
    self._kernel_size = kernel_size
    self._num_predictions_per_location = num_predictions_per_location
    self._use_depthwise = use_depthwise

    # Layers are applied in list order by _predict.
    self._box_encoder_layers = []
    if self._use_depthwise:
      # Depthwise-separable path: depthwise KxK conv, then batch norm and
      # activation, then a 1x1 pointwise conv producing the encodings.
      self._box_encoder_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='BoxEncodingPredictor_depthwise',
              **conv_hyperparams.params()))
      self._box_encoder_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='BoxEncodingPredictor_depthwise_batchnorm'))
      self._box_encoder_layers.append(
          conv_hyperparams.build_activation_layer(
              name='BoxEncodingPredictor_depthwise_activation'))
      self._box_encoder_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._box_code_size, [1, 1],
              name='BoxEncodingPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Plain path: a single KxK conv with no activation on the output.
      self._box_encoder_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._box_code_size,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='BoxEncodingPredictor',
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts boxes.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      box_encodings: A float tensor of shape
        [batch_size, num_anchors, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes.
    """
    box_encodings = features
    for layer in self._box_encoder_layers:
      box_encodings = layer(box_encodings)
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      # Fall back to the dynamic batch size when it is not known statically.
      batch_size = tf.shape(features)[0]
    # q is hard-coded to 1 here: box predictions are shared across classes.
    box_encodings = tf.reshape(box_encodings,
                               [batch_size, -1, 1, self._box_code_size])
    return box_encodings
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.box_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_box_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasBoxHeadTest(test_case.TestCase):
  """Tests for keras_box_head.ConvolutionalBoxHead."""

  def _build_conv_hyperparams(self):
    """Builds KerasLayerHyperparams from a small text-proto config."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_format.Merge("""
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def test_prediction_size_depthwise_false(self):
    """Output is [batch, anchors, 1, code_size] for the plain-conv head."""
    box_head = keras_box_head.ConvolutionalBoxHead(
        is_training=True,
        box_code_size=4,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    encodings = box_head(features)
    # 17 * 19 spatial cells * 1 prediction per location = 323 anchors.
    self.assertAllEqual([64, 323, 1, 4], encodings.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class Head.
Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalClassHead(head.KerasHead):
  """Convolutional class prediction head.

  Predicts per-anchor class scores (including a background slot) either with
  a single conv layer or, when `use_depthwise` is set, with a
  depthwise-separable stack (depthwise conv -> batch norm -> activation ->
  1x1 conv).
  """

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               class_prediction_bias_init=0.0,
               use_depthwise=False,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes (not counting the background class; one
        extra background slot is added internally).
      use_dropout: Option to use dropout or not. Note that a single dropout
        op is applied here prior to both box and class predictions, which stands
        in contrast to the ConvolutionalBoxPredictor below.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      class_prediction_bias_init: constant value to initialize bias of the last
        conv2d layer before class prediction. Only applied on the
        non-depthwise path (see the review note in the depthwise branch).
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalClassHead, self).__init__(name=name)
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._class_prediction_bias_init = class_prediction_bias_init
    self._use_depthwise = use_depthwise
    # One extra slot for the background class.
    self._num_class_slots = self._num_classes + 1

    # Layers are applied in list order by _predict.
    self._class_predictor_layers = []
    if self._use_dropout:
      self._class_predictor_layers.append(
          # The Dropout layer's `training` parameter for the call method must
          # be set implicitly by the Keras set_learning_phase. The object
          # detection training code takes care of this.
          tf.keras.layers.Dropout(rate=1.0 - self._dropout_keep_prob))
    if self._use_depthwise:
      self._class_predictor_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='ClassPredictor_depthwise',
              **conv_hyperparams.params()))
      self._class_predictor_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='ClassPredictor_depthwise_batchnorm'))
      self._class_predictor_layers.append(
          conv_hyperparams.build_activation_layer(
              name='ClassPredictor_depthwise_activation'))
      # NOTE(review): unlike the non-depthwise branch below, this final 1x1
      # conv does not set bias_initializer from class_prediction_bias_init —
      # confirm whether that asymmetry is intentional.
      self._class_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._num_class_slots, [1, 1],
              name='ClassPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      self._class_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._num_class_slots,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='ClassPredictor',
              bias_initializer=tf.constant_initializer(
                  self._class_prediction_bias_init),
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts class scores (with background) for each anchor.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      class_predictions_with_background: A float tensor of shape
        [batch_size, num_anchors, num_classes + 1] representing the class
        predictions for the proposals.
    """
    # Add a slot for the background class.
    class_predictions_with_background = features
    for layer in self._class_predictor_layers:
      class_predictions_with_background = layer(
          class_predictions_with_background)
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      # Fall back to the dynamic batch size when it is not known statically.
      batch_size = tf.shape(features)[0]
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [batch_size, -1, self._num_class_slots])
    return class_predictions_with_background
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.class_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_class_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
  """Tests for keras_class_head.ConvolutionalClassHead."""

  def _build_conv_hyperparams(self):
    """Builds KerasLayerHyperparams from a small text-proto config."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_format.Merge("""
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def test_prediction_size_depthwise_false(self):
    """Output is [batch, anchors, num_classes + 1] for the plain-conv head."""
    class_head = keras_class_head.ConvolutionalClassHead(
        is_training=True,
        num_classes=20,
        use_dropout=True,
        dropout_keep_prob=0.5,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = class_head(features)
    # 17 * 19 spatial cells; 20 classes plus 1 background slot.
    self.assertAllEqual([64, 323, 21],
                        predictions.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TF test entry point: discovers and runs the test cases above.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras Mask Heads.
Contains Mask prediction head classes for different meta architectures.
All the mask prediction heads have a predict function that receives the
`features` as the first argument and returns `mask_predictions`.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalMaskHead(head.KerasHead):
  """Convolutional mask prediction head.

  Builds a small stack of Keras layers (optional dropout, then either a
  depthwise-separable or a regular convolution) that maps image features to
  per-anchor instance mask logits.
  """

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               use_depthwise=False,
               mask_height=7,
               mask_width=7,
               masks_are_class_agnostic=False,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes.
      use_dropout: Option to use dropout or not. Note that a single dropout
        op is applied here prior to the mask prediction.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      mask_height: Desired output mask height. The default value is 7.
      mask_width: Desired output mask width. The default value is 7.
      masks_are_class_agnostic: Boolean determining if the mask-head is
        class-agnostic or not.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalMaskHead, self).__init__(name=name)
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._num_predictions_per_location = num_predictions_per_location
    self._use_depthwise = use_depthwise
    self._mask_height = mask_height
    self._mask_width = mask_width
    self._masks_are_class_agnostic = masks_are_class_agnostic
    self._mask_predictor_layers = []
    # A class-agnostic head predicts a single shared mask; otherwise one mask
    # is predicted per class.
    if self._masks_are_class_agnostic:
      self._num_masks = 1
    else:
      self._num_masks = self._num_classes
    # (num_masks, mask_height, mask_width) is flattened into the channel
    # dimension of the final convolution and un-flattened in _predict.
    num_mask_channels = self._num_masks * self._mask_height * self._mask_width
    if self._use_dropout:
      self._mask_predictor_layers.append(
          # The Dropout layer's `training` parameter for the call method must
          # be set implicitly by the Keras set_learning_phase. The object
          # detection training code takes care of this.
          tf.keras.layers.Dropout(rate=1.0 - self._dropout_keep_prob))
    if self._use_depthwise:
      # Depthwise kxk conv + batchnorm + activation, followed by a 1x1
      # pointwise conv (no activation) that produces the mask channels.
      self._mask_predictor_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='MaskPredictor_depthwise',
              **conv_hyperparams.params()))
      self._mask_predictor_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='MaskPredictor_depthwise_batchnorm'))
      self._mask_predictor_layers.append(
          conv_hyperparams.build_activation_layer(
              name='MaskPredictor_depthwise_activation'))
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * num_mask_channels, [1, 1],
              name='MaskPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Single kxk conv (no activation) producing all mask channels at once.
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * num_mask_channels,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='MaskPredictor',
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts instance masks.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      mask_predictions: A float tensor of shape
        [batch_size, num_anchors, num_masks, mask_height, mask_width]
        representing the mask predictions for the proposals.
    """
    mask_predictions = features
    for layer in self._mask_predictor_layers:
      mask_predictions = layer(mask_predictions)
    # Prefer the statically-known batch size; fall back to the dynamic shape
    # when the batch dimension is unknown at graph-construction time.
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    mask_predictions = tf.reshape(
        mask_predictions,
        [batch_size, -1, self._num_masks, self._mask_height, self._mask_width])
    return mask_predictions
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.mask_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_mask_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalMaskPredictorTest(test_case.TestCase):
  """Tests for keras_mask_head.ConvolutionalMaskHead."""

  def _build_conv_hyperparams(self):
    """Returns KerasLayerHyperparams parsed from a minimal text proto."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    hyperparams_text_proto = """
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(hyperparams_text_proto, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def _build_mask_head(self, masks_are_class_agnostic):
    """Builds a ConvolutionalMaskHead with the shared test configuration."""
    return keras_mask_head.ConvolutionalMaskHead(
        is_training=True,
        num_classes=20,
        use_dropout=True,
        dropout_keep_prob=0.5,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False,
        mask_height=7,
        mask_width=7,
        masks_are_class_agnostic=masks_are_class_agnostic)

  def test_prediction_size_use_depthwise_false(self):
    """Per-class head yields one 7x7 mask per class for each anchor."""
    mask_head = self._build_mask_head(masks_are_class_agnostic=False)
    image_feature = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = mask_head(image_feature)
    self.assertAllEqual([64, 323, 20, 7, 7],
                        predictions.get_shape().as_list())

  def test_class_agnostic_prediction_size_use_depthwise_false(self):
    """Class-agnostic head yields a single 7x7 mask for each anchor."""
    mask_head = self._build_mask_head(masks_are_class_agnostic=True)
    image_feature = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = mask_head(image_feature)
    self.assertAllEqual([64, 323, 1, 7, 7],
                        predictions.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TF test entry point: discovers and runs the test cases above.
if __name__ == '__main__':
  tf.test.main()
......@@ -148,6 +148,7 @@ class MaskRCNNMaskHead(head.Head):
upsampled_features,
num_outputs=num_masks,
activation_fn=None,
normalizer_fn=None,
kernel_size=[3, 3])
return tf.expand_dims(
tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
......
......@@ -15,7 +15,21 @@ message BoxPredictor {
}
}
// Configuration proto for MaskHead in predictors.
// Next id: 4
message MaskHead {
// The height and the width of the predicted mask. Only used when
// predict_instance_masks is true.
optional int32 mask_height = 1 [default = 15];
optional int32 mask_width = 2 [default = 15];
// Whether to predict class agnostic masks. Only used when
// predict_instance_masks is true.
optional bool masks_are_class_agnostic = 3 [default = true];
}
// Configuration proto for Convolutional box predictor.
// Next id: 13
message ConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
......@@ -55,9 +69,13 @@ message ConvolutionalBoxPredictor {
// Whether to use depthwise separable convolution for box predictor layers.
optional bool use_depthwise = 11 [default = false];
// Configs for a mask prediction head.
optional MaskHead mask_head = 12;
}
// Configuration proto for weight shared convolutional box predictor.
// Next id: 18
message WeightSharedConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
......@@ -85,12 +103,37 @@ message WeightSharedConvolutionalBoxPredictor {
// Whether to use dropout for class prediction.
optional bool use_dropout = 11 [default = false];
// Keep probability for dropout
// Keep probability for dropout.
optional float dropout_keep_probability = 12 [default = 0.8];
// Whether to share the multi-layer tower between box prediction and class
// prediction heads.
optional bool share_prediction_tower = 13 [default = false];
// Whether to use depthwise separable convolution for box predictor layers.
optional bool use_depthwise = 14 [default = false];
// Configs for a mask prediction head.
optional MaskHead mask_head = 15;
// Enum to specify how to convert the detection scores at inference time.
enum ScoreConverter {
// Input scores equals output scores.
IDENTITY = 0;
// Applies a sigmoid on input scores.
SIGMOID = 1;
}
// Callable elementwise score converter at inference time.
optional ScoreConverter score_converter = 16 [default = IDENTITY];
// If specified, apply clipping to box encodings.
message BoxEncodingsClipRange {
optional float min = 1;
optional float max = 2;
}
optional BoxEncodingsClipRange box_encodings_clip_range = 17;
}
// TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment