"vscode:/vscode.git/clone" did not exist on "2e41d8ca790fada5a98b46883f80f7a2d89d2790"
Commit e00e0e13 authored by dreamdragon's avatar dreamdragon
Browse files

Merge remote-tracking branch 'upstream/master'

parents b915db4e 402b561b
@@ -85,41 +85,44 @@ class SSDMobileNetV2KerasFeatureExtractor(
         override_base_feature_extractor_hyperparams=
         override_base_feature_extractor_hyperparams,
         name=name)
-    feature_map_layout = {
+    self._feature_map_layout = {
         'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
         'layer_depth': [-1, -1, 512, 256, 256, 128],
         'use_depthwise': self._use_depthwise,
         'use_explicit_padding': self._use_explicit_padding,
     }
-    with tf.name_scope('MobilenetV2'):
-      full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
-          batchnorm_training=(is_training and not freeze_batchnorm),
-          conv_hyperparams=(conv_hyperparams
-                            if self._override_base_feature_extractor_hyperparams
-                            else None),
-          weights=None,
-          use_explicit_padding=use_explicit_padding,
-          alpha=self._depth_multiplier,
-          min_depth=self._min_depth,
-          include_top=False)
-      conv2d_11_pointwise = full_mobilenet_v2.get_layer(
-          name='block_13_expand_relu').output
-      conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
-      self.mobilenet_v2 = tf.keras.Model(
-          inputs=full_mobilenet_v2.inputs,
-          outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
-      self.feature_map_generator = (
-          feature_map_generators.KerasMultiResolutionFeatureMaps(
-              feature_map_layout=feature_map_layout,
-              depth_multiplier=self._depth_multiplier,
-              min_depth=self._min_depth,
-              insert_1x1_conv=True,
-              is_training=is_training,
-              conv_hyperparams=conv_hyperparams,
-              freeze_batchnorm=freeze_batchnorm,
-              name='FeatureMaps'))
+    self.mobilenet_v2 = None
+    self.feature_map_generator = None
+
+  def build(self, input_shape):
+    full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
+        batchnorm_training=(self._is_training and not self._freeze_batchnorm),
+        conv_hyperparams=(self._conv_hyperparams
+                          if self._override_base_feature_extractor_hyperparams
+                          else None),
+        weights=None,
+        use_explicit_padding=self._use_explicit_padding,
+        alpha=self._depth_multiplier,
+        min_depth=self._min_depth,
+        include_top=False)
+    conv2d_11_pointwise = full_mobilenet_v2.get_layer(
+        name='block_13_expand_relu').output
+    conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
+    self.mobilenet_v2 = tf.keras.Model(
+        inputs=full_mobilenet_v2.inputs,
+        outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
+    self.feature_map_generator = (
+        feature_map_generators.KerasMultiResolutionFeatureMaps(
+            feature_map_layout=self._feature_map_layout,
+            depth_multiplier=self._depth_multiplier,
+            min_depth=self._min_depth,
+            insert_1x1_conv=True,
+            is_training=self._is_training,
+            conv_hyperparams=self._conv_hyperparams,
+            freeze_batchnorm=self._freeze_batchnorm,
+            name='FeatureMaps'))
+    self.built = True

   def preprocess(self, resized_inputs):
     """SSD preprocessing.
...
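The left side of this hunk built every sub-layer eagerly in __init__; the right side defers construction to a Keras-style build() method and records completion via self.built. A minimal sketch of that deferred-construction pattern, with illustrative names that are not part of this commit:

import tensorflow as tf

class DeferredFeatureExtractor(tf.keras.Model):
  """Stores configuration in __init__; creates sub-layers in build()."""

  def __init__(self, depth):
    super(DeferredFeatureExtractor, self).__init__()
    self._depth = depth
    self.conv = None  # Created lazily, once build() runs.

  def build(self, input_shape):
    self.conv = tf.keras.layers.Conv2D(self._depth, [1, 1], padding='SAME')
    self.built = True

  def call(self, inputs):
    return self.conv(inputs)

Keras invokes build() automatically on first call, so configuration errors surface only when real input shapes are available rather than at construction time.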
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for PNASNet features.
Based on PNASNet ImageNet model: https://arxiv.org/abs/1712.00559
"""
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import context_manager
from object_detection.utils import ops
from nets.nasnet import pnasnet
slim = tf.contrib.slim
def pnasnet_large_arg_scope_for_detection(is_batch_norm_training=False):
"""Defines the default arg scope for the PNASNet Large for object detection.
This provides a small edit to switch batch norm training on and off.
Args:
is_batch_norm_training: Boolean indicating whether to train with batch norm.
Default is False.
Returns:
An `arg_scope` to use for the PNASNet Large Model.
"""
imagenet_scope = pnasnet.pnasnet_large_arg_scope()
with slim.arg_scope(imagenet_scope):
with slim.arg_scope([slim.batch_norm],
is_training=is_batch_norm_training) as sc:
return sc
class SSDPNASNetFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""SSD Feature Extractor using PNASNet features."""
def __init__(self,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
override_base_feature_extractor_hyperparams=False):
"""PNASNet Feature Extractor for SSD Models.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding
were used.
use_depthwise: Whether to use depthwise convolutions.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(SSDPNASNetFeatureExtractor, self).__init__(
is_training=is_training,
depth_multiplier=depth_multiplier,
min_depth=min_depth,
pad_to_multiple=pad_to_multiple,
conv_hyperparams_fn=conv_hyperparams_fn,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
override_base_feature_extractor_hyperparams=
override_base_feature_extractor_hyperparams)
def preprocess(self, resized_inputs):
"""SSD preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
def extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
feature_map_layout = {
'from_layer': ['Cell_7', 'Cell_11', '', '', '', ''],
'layer_depth': [-1, -1, 512, 256, 256, 128],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
with slim.arg_scope(
pnasnet_large_arg_scope_for_detection(
is_batch_norm_training=self._is_training)):
with slim.arg_scope([slim.conv2d, slim.batch_norm, slim.separable_conv2d],
reuse=self._reuse_weights):
with (slim.arg_scope(self._conv_hyperparams_fn())
if self._override_base_feature_extractor_hyperparams else
context_manager.IdentityContextManager()):
_, image_features = pnasnet.build_pnasnet_large(
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
num_classes=None,
is_training=self._is_training,
final_endpoint='Cell_11')
with tf.variable_scope('SSD_feature_maps', reuse=self._reuse_weights):
with slim.arg_scope(self._conv_hyperparams_fn()):
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=self._depth_multiplier,
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
"""Returns a map of variables to load from a foreign checkpoint.
Note that this overrides the default implementation in
ssd_meta_arch.SSDFeatureExtractor which does not work for PNASNet
checkpoints.
Args:
feature_extractor_scope: A scope name for the first stage feature
extractor.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
"""
variables_to_restore = {}
for variable in tf.global_variables():
if variable.op.name.startswith(feature_extractor_scope):
var_name = variable.op.name.replace(feature_extractor_scope + '/', '')
var_name += '/ExponentialMovingAverage'
variables_to_restore[var_name] = variable
return variables_to_restore
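The function above keys the returned dict by checkpoint names with an '/ExponentialMovingAverage' suffix, which is exactly the var_list form a TF1 Saver accepts. A minimal usage sketch under that assumption (the helper name and scope handling here are hypothetical):

import tensorflow as tf

def restore_with_ema_names(sess, checkpoint_path, feature_extractor_scope):
  """Restores PNASNet weights from a classification checkpoint (sketch)."""
  # Mirror restore_from_classification_checkpoint_fn: map the EMA-suffixed
  # checkpoint name to the corresponding variable in the current graph.
  variables_to_restore = {}
  for variable in tf.global_variables():
    if variable.op.name.startswith(feature_extractor_scope):
      var_name = variable.op.name.replace(feature_extractor_scope + '/', '')
      variables_to_restore[var_name + '/ExponentialMovingAverage'] = variable
  saver = tf.train.Saver(var_list=variables_to_restore)
  saver.restore(sess, checkpoint_path)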
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ssd_pnas_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_pnasnet_feature_extractor
slim = tf.contrib.slim
class SsdPnasNetFeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
is_training=True, use_explicit_padding=False):
"""Constructs a new feature extractor.
Args:
depth_multiplier: float depth multiplier for feature extractor
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
is_training: whether the network is in training mode.
use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding
were used.
Returns:
an ssd_meta_arch.SSDFeatureExtractor object.
"""
min_depth = 32
return ssd_pnasnet_feature_extractor.SSDPNASNetFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding)
def test_extract_features_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 8, 8, 2160), (2, 4, 4, 4320),
(2, 2, 2, 512), (2, 1, 1, 256),
(2, 1, 1, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_299(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 19, 19, 2160), (2, 10, 10, 4320),
(2, 5, 5, 512), (2, 3, 3, 256),
(2, 2, 2, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
pad_to_multiple = 1
test_image = np.random.rand(2, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
if __name__ == '__main__':
tf.test.main()
@@ -113,6 +113,8 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
     VGG style channel mean subtraction as described here:
     https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-mdnge.
+    Note that if the number of channels is not equal to 3, the mean subtraction
+    will be skipped and the original resized_inputs will be returned.

     Args:
       resized_inputs: a [batch, height, width, channels] float tensor
@@ -122,8 +124,11 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       preprocessed_inputs: a [batch, height, width, channels] float tensor
         representing a batch of images.
     """
-    channel_means = [123.68, 116.779, 103.939]
-    return resized_inputs - [[channel_means]]
+    if resized_inputs.shape.as_list()[3] == 3:
+      channel_means = [123.68, 116.779, 103.939]
+      return resized_inputs - [[channel_means]]
+    else:
+      return resized_inputs

   def _filter_features(self, image_features):
     # TODO(rathodv): Change resnet endpoint to strip scope prefixes instead
...
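Concretely, the new guard subtracts the VGG channel means only when the input has three channels and passes anything else through untouched. A NumPy sketch of the same arithmetic (the function name here is hypothetical):

import numpy as np

def preprocess(resized_inputs):
  """Subtracts VGG channel means, skipping non-RGB inputs (sketch)."""
  if resized_inputs.shape[-1] == 3:
    channel_means = np.array([123.68, 116.779, 103.939])
    return resized_inputs - channel_means
  return resized_inputs

# A mid-gray RGB pixel maps to roughly [3.32, 10.22, 23.06]:
print(preprocess(np.full((1, 1, 1, 3), 127.0)))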
@@ -82,12 +82,15 @@ class SSDResnetFPNFeatureExtractorTestBase(
     image_width = 128
     depth_multiplier = 1
     pad_to_multiple = 1
-    test_image = np.random.rand(4, image_height, image_width, 3)
+    test_image = tf.constant(np.random.rand(4, image_height, image_width, 3))
     feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                        pad_to_multiple)
     preprocessed_image = feature_extractor.preprocess(test_image)
-    self.assertAllClose(preprocessed_image,
-                        test_image - [[123.68, 116.779, 103.939]])
+    with self.test_session() as sess:
+      test_image_out, preprocessed_image_out = sess.run(
+          [test_image, preprocessed_image])
+      self.assertAllClose(preprocessed_image_out,
+                          test_image_out - [[123.68, 116.779, 103.939]])

   def test_variables_only_created_in_scope(self):
     depth_multiplier = 1
@@ -103,5 +106,3 @@ class SSDResnetFPNFeatureExtractorTestBase(
     self.assertTrue(
         variable.name.startswith(self._resnet_scope_name())
         or variable.name.startswith(self._fpn_scope_name()))
@@ -98,6 +98,8 @@ class _SSDResnetPpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
     VGG style channel mean subtraction as described here:
     https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-mdnge.
+    Note that if the number of channels is not equal to 3, the mean subtraction
+    will be skipped and the original resized_inputs will be returned.

     Args:
       resized_inputs: a [batch, height, width, channels] float tensor
@@ -107,8 +109,11 @@ class _SSDResnetPpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       preprocessed_inputs: a [batch, height, width, channels] float tensor
         representing a batch of images.
     """
-    channel_means = [123.68, 116.779, 103.939]
-    return resized_inputs - [[channel_means]]
+    if resized_inputs.shape.as_list()[3] == 3:
+      channel_means = [123.68, 116.779, 103.939]
+      return resized_inputs - [[channel_means]]
+    else:
+      return resized_inputs

   def extract_features(self, preprocessed_inputs):
     """Extract features from preprocessed inputs.
...
@@ -15,6 +15,7 @@
 """Tests for ssd resnet v1 feature extractors."""
 import abc
 import numpy as np
+import tensorflow as tf

 from object_detection.models import ssd_feature_extractor_test
@@ -64,12 +65,15 @@ class SSDResnetPpnFeatureExtractorTestBase(
     image_width = 128
     depth_multiplier = 1
     pad_to_multiple = 1
-    test_image = np.random.rand(4, image_height, image_width, 3)
+    test_image = tf.constant(np.random.rand(4, image_height, image_width, 3))
     feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                        pad_to_multiple)
     preprocessed_image = feature_extractor.preprocess(test_image)
-    self.assertAllClose(preprocessed_image,
-                        test_image - [[123.68, 116.779, 103.939]])
+    with self.test_session() as sess:
+      test_image_out, preprocessed_image_out = sess.run(
+          [test_image, preprocessed_image])
+      self.assertAllClose(preprocessed_image_out,
+                          test_image_out - [[123.68, 116.779, 103.939]])

   def test_variables_only_created_in_scope(self):
     depth_multiplier = 1
...
@@ -134,26 +134,32 @@ class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
           (len(self._prediction_heads[BOX_ENCODINGS]),
            len(input_shapes)))
     for stack_index, input_shape in enumerate(input_shapes):
-      net = tf.keras.Sequential(name='PreHeadConvolutions_%d' % stack_index)
-      self._shared_nets.append(net)
+      net = []

       # Add additional conv layers before the class predictor.
       features_depth = static_shape.get_depth(input_shape)
       depth = max(min(features_depth, self._max_depth), self._min_depth)
       tf.logging.info(
           'depth of additional conv before box predictor: {}'.format(depth))
       if depth > 0 and self._num_layers_before_predictor > 0:
         for i in range(self._num_layers_before_predictor):
-          net.add(keras.Conv2D(depth, [1, 1],
-                               name='Conv2d_%d_1x1_%d' % (i, depth),
-                               padding='SAME',
-                               **self._conv_hyperparams.params()))
-          net.add(self._conv_hyperparams.build_batch_norm(
-              training=(self._is_training and not self._freeze_batchnorm),
-              name='Conv2d_%d_1x1_%d_norm' % (i, depth)))
-          net.add(self._conv_hyperparams.build_activation_layer(
-              name='Conv2d_%d_1x1_%d_activation' % (i, depth),
-          ))
+          net.append(keras.Conv2D(depth, [1, 1],
+                                  name='SharedConvolutions_%d/Conv2d_%d_1x1_%d'
+                                  % (stack_index, i, depth),
+                                  padding='SAME',
+                                  **self._conv_hyperparams.params()))
+          net.append(self._conv_hyperparams.build_batch_norm(
+              training=(self._is_training and not self._freeze_batchnorm),
+              name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm'
+              % (stack_index, i, depth)))
+          net.append(self._conv_hyperparams.build_activation_layer(
+              name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation'
+              % (stack_index, i, depth),
+          ))
+
+      # Until certain bugs are fixed in checkpointable lists,
+      # this net must be appended only once it's been filled with layers
+      self._shared_nets.append(net)

     self.built = True

   def _predict(self, image_features):
@@ -175,10 +181,11 @@ class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
     """
     predictions = collections.defaultdict(list)

-    for (index, image_feature) in enumerate(image_features):
+    for (index, net) in enumerate(image_features):
       # Apply shared conv layers before the head predictors.
-      net = self._shared_nets[index](image_feature)
+      for layer in self._shared_nets[index]:
+        net = layer(net)

       for head_name in self._prediction_heads:
         head_obj = self._prediction_heads[head_name][index]
...
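The predictor now keeps each shared stack as a plain Python list of layers and applies them in a loop, rather than wrapping them in tf.keras.Sequential (which, per the commit's own comment, interacts badly with checkpointable lists). A minimal sketch of the pattern with illustrative layer choices:

import tensorflow as tf

# Build the stack as a plain list instead of tf.keras.Sequential.
shared_net = [
    tf.keras.layers.Conv2D(32, [1, 1], padding='SAME'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
]

def apply_shared_net(features):
  net = features
  for layer in shared_net:  # Apply each layer in order, like Sequential would.
    net = layer(net)
  return net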
@@ -181,8 +181,8 @@ class ConvolutionalKerasBoxPredictorTest(test_case.TestCase):
     self.assertAllEqual(objectness_predictions_shape,
                         [4, expected_num_anchors, 1])
     expected_variable_set = set([
-        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/bias',
-        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/kernel',
+        'BoxPredictor/SharedConvolutions_0/Conv2d_0_1x1_32/bias',
+        'BoxPredictor/SharedConvolutions_0/Conv2d_0_1x1_32/kernel',
         'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/bias',
         'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/kernel',
         'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/bias',
...
@@ -34,16 +34,18 @@ class MaskRCNNClassHead(head.Head):
   https://arxiv.org/abs/1703.06870
   """

-  def __init__(self, is_training, num_classes, fc_hyperparams_fn,
-               use_dropout, dropout_keep_prob):
+  def __init__(self,
+               is_training,
+               num_class_slots,
+               fc_hyperparams_fn,
+               use_dropout,
+               dropout_keep_prob):
     """Constructor.

     Args:
       is_training: Indicates whether the BoxPredictor is in training mode.
-      num_classes: number of classes. Note that num_classes *does not*
-        include the background category, so if groundtruth labels take values
-        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
-        assigned classification targets can range from {0,... K}).
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       fc_hyperparams_fn: A function to generate tf-slim arg_scope with
         hyperparameters for fully connected ops.
       use_dropout: Option to use dropout or not. Note that a single dropout
@@ -54,7 +56,7 @@ class MaskRCNNClassHead(head.Head):
     """
     super(MaskRCNNClassHead, self).__init__()
     self._is_training = is_training
-    self._num_classes = num_classes
+    self._num_class_slots = num_class_slots
     self._fc_hyperparams_fn = fc_hyperparams_fn
     self._use_dropout = use_dropout
     self._dropout_keep_prob = dropout_keep_prob
@@ -70,7 +72,7 @@ class MaskRCNNClassHead(head.Head):
     Returns:
       class_predictions_with_background: A float tensor of shape
-        [batch_size, 1, num_classes + 1] representing the class predictions for
+        [batch_size, 1, num_class_slots] representing the class predictions for
         the proposals.

     Raises:
@@ -91,11 +93,12 @@ class MaskRCNNClassHead(head.Head):
     with slim.arg_scope(self._fc_hyperparams_fn()):
       class_predictions_with_background = slim.fully_connected(
           flattened_roi_pooled_features,
-          self._num_classes + 1,
+          self._num_class_slots,
           activation_fn=None,
           scope='ClassPredictor')
     class_predictions_with_background = tf.reshape(
-        class_predictions_with_background, [-1, 1, self._num_classes + 1])
+        class_predictions_with_background,
+        [-1, 1, self._num_class_slots])
     return class_predictions_with_background
@@ -104,7 +107,7 @@ class ConvolutionalClassHead(head.Head):

   def __init__(self,
                is_training,
-               num_classes,
+               num_class_slots,
                use_dropout,
                dropout_keep_prob,
                kernel_size,
@@ -115,7 +118,8 @@ class ConvolutionalClassHead(head.Head):
     Args:
       is_training: Indicates whether the BoxPredictor is in training mode.
-      num_classes: Number of classes.
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       use_dropout: Option to use dropout or not. Note that a single dropout
         op is applied here prior to both box and class predictions, which stands
         in contrast to the ConvolutionalBoxPredictor below.
@@ -137,7 +141,7 @@ class ConvolutionalClassHead(head.Head):
     """
     super(ConvolutionalClassHead, self).__init__()
     self._is_training = is_training
-    self._num_classes = num_classes
+    self._num_class_slots = num_class_slots
     self._use_dropout = use_dropout
     self._dropout_keep_prob = dropout_keep_prob
     self._kernel_size = kernel_size
@@ -156,12 +160,10 @@ class ConvolutionalClassHead(head.Head):
     Returns:
       class_predictions_with_background: A float tensors of shape
-        [batch_size, num_anchors, num_classes + 1] representing the class
+        [batch_size, num_anchors, num_class_slots] representing the class
         predictions for the proposals.
     """
     net = features
-    # Add a slot for the background class.
-    num_class_slots = self._num_classes + 1
     if self._use_dropout:
       net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
     if self._use_depthwise:
@@ -171,7 +173,7 @@ class ConvolutionalClassHead(head.Head):
           rate=1, scope='ClassPredictor_depthwise')
       class_predictions_with_background = slim.conv2d(
           class_predictions_with_background,
-          num_predictions_per_location * num_class_slots, [1, 1],
+          num_predictions_per_location * self._num_class_slots, [1, 1],
           activation_fn=None,
           normalizer_fn=None,
           normalizer_params=None,
@@ -179,7 +181,7 @@ class ConvolutionalClassHead(head.Head):
     else:
       class_predictions_with_background = slim.conv2d(
           net,
-          num_predictions_per_location * num_class_slots,
+          num_predictions_per_location * self._num_class_slots,
           [self._kernel_size, self._kernel_size],
           activation_fn=None,
           normalizer_fn=None,
@@ -194,7 +196,8 @@ class ConvolutionalClassHead(head.Head):
     if batch_size is None:
       batch_size = tf.shape(features)[0]
     class_predictions_with_background = tf.reshape(
-        class_predictions_with_background, [batch_size, -1, num_class_slots])
+        class_predictions_with_background,
+        [batch_size, -1, self._num_class_slots])
     return class_predictions_with_background
@@ -208,7 +211,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
   """

   def __init__(self,
-               num_classes,
+               num_class_slots,
                kernel_size=3,
                class_prediction_bias_init=0.0,
                use_dropout=False,
@@ -218,10 +221,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
     """Constructor.

     Args:
-      num_classes: number of classes. Note that num_classes *does not*
-        include the background category, so if groundtruth labels take values
-        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
-        assigned classification targets can range from {0,... K}).
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       kernel_size: Size of final convolution kernel.
       class_prediction_bias_init: constant value to initialize bias of the last
         conv2d layer before class prediction.
@@ -233,7 +234,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
       as inputs and returns tensors).
     """
     super(WeightSharedConvolutionalClassHead, self).__init__()
-    self._num_classes = num_classes
+    self._num_class_slots = num_class_slots
     self._kernel_size = kernel_size
     self._class_prediction_bias_init = class_prediction_bias_init
     self._use_dropout = use_dropout
@@ -252,12 +253,10 @@ class WeightSharedConvolutionalClassHead(head.Head):
     Returns:
       class_predictions_with_background: A tensor of shape
-        [batch_size, num_anchors, num_classes + 1] representing the class
+        [batch_size, num_anchors, num_class_slots] representing the class
         predictions for the proposals.
     """
     class_predictions_net = features
-    num_class_slots = self._num_classes + 1
-    # Add a slot for the background class.
     if self._use_dropout:
       class_predictions_net = slim.dropout(
           class_predictions_net, keep_prob=self._dropout_keep_prob)
@@ -267,7 +266,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
       conv_op = slim.conv2d
     class_predictions_with_background = conv_op(
         class_predictions_net,
-        num_predictions_per_location * num_class_slots,
+        num_predictions_per_location * self._num_class_slots,
         [self._kernel_size, self._kernel_size],
         activation_fn=None, stride=1, padding='SAME',
         normalizer_fn=None,
@@ -280,5 +279,6 @@ class WeightSharedConvolutionalClassHead(head.Head):
     class_predictions_with_background = self._score_converter_fn(
         class_predictions_with_background)
     class_predictions_with_background = tf.reshape(
-        class_predictions_with_background, [batch_size, -1, num_class_slots])
+        class_predictions_with_background,
+        [batch_size, -1, self._num_class_slots])
     return class_predictions_with_background
@@ -46,7 +46,7 @@ class MaskRCNNClassHeadTest(test_case.TestCase):
   def test_prediction_size(self):
     class_prediction_head = class_head.MaskRCNNClassHead(
         is_training=False,
-        num_classes=20,
+        num_class_slots=20,
         fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
         use_dropout=True,
         dropout_keep_prob=0.5)
@@ -54,7 +54,7 @@ class MaskRCNNClassHeadTest(test_case.TestCase):
         [64, 7, 7, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
     prediction = class_prediction_head.predict(
         features=roi_pooled_features, num_predictions_per_location=1)
-    self.assertAllEqual([64, 1, 21], prediction.get_shape().as_list())
+    self.assertAllEqual([64, 1, 20], prediction.get_shape().as_list())

 class ConvolutionalClassPredictorTest(test_case.TestCase):
@@ -80,7 +80,7 @@ class ConvolutionalClassPredictorTest(test_case.TestCase):
   def test_prediction_size(self):
     class_prediction_head = class_head.ConvolutionalClassHead(
         is_training=True,
-        num_classes=20,
+        num_class_slots=20,
         use_dropout=True,
         dropout_keep_prob=0.5,
         kernel_size=3)
@@ -89,7 +89,7 @@ class ConvolutionalClassPredictorTest(test_case.TestCase):
     class_predictions = class_prediction_head.predict(
         features=image_feature,
         num_predictions_per_location=1)
-    self.assertAllEqual([64, 323, 21],
+    self.assertAllEqual([64, 323, 20],
                         class_predictions.get_shape().as_list())
@@ -115,13 +115,13 @@ class WeightSharedConvolutionalClassPredictorTest(test_case.TestCase):
   def test_prediction_size(self):
     class_prediction_head = (
-        class_head.WeightSharedConvolutionalClassHead(num_classes=20))
+        class_head.WeightSharedConvolutionalClassHead(num_class_slots=20))
     image_feature = tf.random_uniform(
         [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
     class_predictions = class_prediction_head.predict(
         features=image_feature,
         num_predictions_per_location=1)
-    self.assertAllEqual([64, 323, 21], class_predictions.get_shape().as_list())
+    self.assertAllEqual([64, 323, 20], class_predictions.get_shape().as_list())

 if __name__ == '__main__':
...
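Note what the num_classes to num_class_slots refactor changes at the call site: the tests above now pass num_class_slots=20 and expect 20 output slots, where the old num_classes=20 produced 21 because the head silently appended a background slot. A minimal sketch of the caller-side conversion between the two conventions (variable names illustrative):

num_classes = 20          # Foreground classes only.
add_background_class = True

# Old API: the head computed num_classes + 1 internally.
# New API: the caller passes the total number of slots explicitly.
num_class_slots = num_classes + 1 if add_background_class else num_classes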
@@ -91,7 +91,7 @@ class ConvolutionalBoxHead(head.KerasHead):
           tf.keras.layers.Conv2D(
               num_predictions_per_location * self._box_code_size, [1, 1],
               name='BoxEncodingPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))
     else:
       self._box_encoder_layers.append(
           tf.keras.layers.Conv2D(
@@ -99,7 +99,7 @@ class ConvolutionalBoxHead(head.KerasHead):
               [self._kernel_size, self._kernel_size],
               padding='SAME',
               name='BoxEncodingPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))

   def _predict(self, features):
     """Predicts boxes.
...
@@ -29,7 +29,7 @@ class ConvolutionalClassHead(head.KerasHead):

   def __init__(self,
                is_training,
-               num_classes,
+               num_class_slots,
                use_dropout,
                dropout_keep_prob,
                kernel_size,
@@ -43,7 +43,8 @@ class ConvolutionalClassHead(head.KerasHead):
     Args:
       is_training: Indicates whether the BoxPredictor is in training mode.
-      num_classes: Number of classes.
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       use_dropout: Option to use dropout or not. Note that a single dropout
         op is applied here prior to both box and class predictions, which stands
         in contrast to the ConvolutionalBoxPredictor below.
@@ -73,13 +74,12 @@ class ConvolutionalClassHead(head.KerasHead):
     """
     super(ConvolutionalClassHead, self).__init__(name=name)
     self._is_training = is_training
-    self._num_classes = num_classes
     self._use_dropout = use_dropout
     self._dropout_keep_prob = dropout_keep_prob
     self._kernel_size = kernel_size
     self._class_prediction_bias_init = class_prediction_bias_init
     self._use_depthwise = use_depthwise
-    self._num_class_slots = self._num_classes + 1
+    self._num_class_slots = num_class_slots
     self._class_predictor_layers = []
@@ -110,7 +110,7 @@ class ConvolutionalClassHead(head.KerasHead):
           tf.keras.layers.Conv2D(
               num_predictions_per_location * self._num_class_slots, [1, 1],
               name='ClassPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))
     else:
       self._class_predictor_layers.append(
           tf.keras.layers.Conv2D(
@@ -120,7 +120,7 @@ class ConvolutionalClassHead(head.KerasHead):
               name='ClassPredictor',
               bias_initializer=tf.constant_initializer(
                   self._class_prediction_bias_init),
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))

   def _predict(self, features):
     """Predicts boxes.
@@ -131,7 +131,7 @@ class ConvolutionalClassHead(head.KerasHead):
     Returns:
       class_predictions_with_background: A float tensor of shape
-        [batch_size, num_anchors, num_classes + 1] representing the class
+        [batch_size, num_anchors, num_class_slots] representing the class
         predictions for the proposals.
     """
     # Add a slot for the background class.
...
@@ -45,7 +45,7 @@ class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
     conv_hyperparams = self._build_conv_hyperparams()
     class_prediction_head = keras_class_head.ConvolutionalClassHead(
         is_training=True,
-        num_classes=20,
+        num_class_slots=20,
         use_dropout=True,
         dropout_keep_prob=0.5,
         kernel_size=3,
@@ -56,7 +56,7 @@ class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
     image_feature = tf.random_uniform(
         [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
     class_predictions = class_prediction_head(image_feature,)
-    self.assertAllEqual([64, 323, 21],
+    self.assertAllEqual([64, 323, 20],
                         class_predictions.get_shape().as_list())

 # TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
...
@@ -124,7 +124,7 @@ class ConvolutionalMaskHead(head.KerasHead):
           tf.keras.layers.Conv2D(
               num_predictions_per_location * num_mask_channels, [1, 1],
               name='MaskPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))
     else:
       self._mask_predictor_layers.append(
           tf.keras.layers.Conv2D(
@@ -132,7 +132,7 @@ class ConvolutionalMaskHead(head.KerasHead):
               [self._kernel_size, self._kernel_size],
               padding='SAME',
               name='MaskPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))

   def _predict(self, features):
     """Predicts boxes.
...
@@ -23,6 +23,7 @@ import math
 import tensorflow as tf

 from object_detection.predictors.heads import head
+from object_detection.utils import ops

 slim = tf.contrib.slim
@@ -41,7 +42,8 @@ class MaskRCNNMaskHead(head.Head):
                mask_width=14,
                mask_prediction_num_conv_layers=2,
                mask_prediction_conv_depth=256,
-               masks_are_class_agnostic=False):
+               masks_are_class_agnostic=False,
+               convolve_then_upsample=False):
     """Constructor.

     Args:
@@ -62,6 +64,10 @@ class MaskRCNNMaskHead(head.Head):
         image features.
       masks_are_class_agnostic: Boolean determining if the mask-head is
         class-agnostic or not.
+      convolve_then_upsample: Whether to apply convolutions on mask features
+        before upsampling using nearest neighbor resizing. Otherwise, mask
+        features are resized to [`mask_height`, `mask_width`] using bilinear
+        resizing before applying convolutions.

     Raises:
       ValueError: conv_hyperparams_fn is None.
@@ -74,6 +80,7 @@ class MaskRCNNMaskHead(head.Head):
     self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
     self._mask_prediction_conv_depth = mask_prediction_conv_depth
     self._masks_are_class_agnostic = masks_are_class_agnostic
+    self._convolve_then_upsample = convolve_then_upsample
     if conv_hyperparams_fn is None:
       raise ValueError('conv_hyperparams_fn is None.')
@@ -135,17 +142,30 @@ class MaskRCNNMaskHead(head.Head):
     num_conv_channels = self._get_mask_predictor_conv_depth(
         num_feature_channels, self._num_classes)
     with slim.arg_scope(self._conv_hyperparams_fn()):
-      upsampled_features = tf.image.resize_bilinear(
-          features, [self._mask_height, self._mask_width],
-          align_corners=True)
+      if not self._convolve_then_upsample:
+        features = tf.image.resize_bilinear(
+            features, [self._mask_height, self._mask_width],
+            align_corners=True)
       for _ in range(self._mask_prediction_num_conv_layers - 1):
-        upsampled_features = slim.conv2d(
-            upsampled_features,
+        features = slim.conv2d(
+            features,
+            num_outputs=num_conv_channels,
+            kernel_size=[3, 3])
+      if self._convolve_then_upsample:
+        # Replace Transposed Convolution with a Nearest Neighbor upsampling step
+        # followed by 3x3 convolution.
+        height_scale = self._mask_height / features.shape[1].value
+        width_scale = self._mask_width / features.shape[2].value
+        features = ops.nearest_neighbor_upsampling(
+            features, height_scale=height_scale, width_scale=width_scale)
+        features = slim.conv2d(
+            features,
             num_outputs=num_conv_channels,
             kernel_size=[3, 3])
       num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
       mask_predictions = slim.conv2d(
-          upsampled_features,
+          features,
           num_outputs=num_masks,
           activation_fn=None,
           normalizer_fn=None,
...
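Here ops.nearest_neighbor_upsampling stands in for a transposed convolution: the features are resized by pixel replication, then a 3x3 convolution smooths the result. A minimal sketch of integer-factor nearest neighbor upsampling in TF1-style code (this is not the library's exact implementation):

import tensorflow as tf

def nearest_neighbor_upsample(features, scale):
  """Upsamples a [batch, h, w, c] tensor by an integer factor.

  Each input pixel is replicated into a scale x scale block, which is
  exactly what nearest neighbor resizing does for integer factors.
  """
  shape = tf.shape(features)
  batch, height, width = shape[0], shape[1], shape[2]
  channels = features.get_shape().as_list()[3]
  # Insert singleton axes, tile them, then merge back into height/width.
  out = tf.reshape(features, [batch, height, 1, width, 1, channels])
  out = tf.tile(out, [1, 1, scale, 1, scale, 1])
  return tf.reshape(out, [batch, height * scale, width * scale, channels])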
@@ -58,6 +58,22 @@ class MaskRCNNMaskHeadTest(test_case.TestCase):
         features=roi_pooled_features, num_predictions_per_location=1)
     self.assertAllEqual([64, 1, 20, 14, 14], prediction.get_shape().as_list())

+  def test_prediction_size_with_convolve_then_upsample(self):
+    mask_prediction_head = mask_head.MaskRCNNMaskHead(
+        num_classes=20,
+        conv_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
+        mask_height=28,
+        mask_width=28,
+        mask_prediction_num_conv_layers=2,
+        mask_prediction_conv_depth=256,
+        masks_are_class_agnostic=True,
+        convolve_then_upsample=True)
+    roi_pooled_features = tf.random_uniform(
+        [64, 14, 14, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
+    prediction = mask_prediction_head.predict(
+        features=roi_pooled_features, num_predictions_per_location=1)
+    self.assertAllEqual([64, 1, 1, 28, 28], prediction.get_shape().as_list())

 class ConvolutionalMaskPredictorTest(test_case.TestCase):
...
@@ -138,6 +138,7 @@ message WeightSharedConvolutionalBoxPredictor {

 // TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
 // head easily.
+// Next id: 15
 message MaskRCNNBoxPredictor {
   // Hyperparameters for fully connected ops used in the box predictor.
   optional Hyperparams fc_hyperparams = 1;
@@ -178,6 +179,12 @@ message MaskRCNNBoxPredictor {
   // Whether to use one box for all classes rather than a different box for each
   // class.
   optional bool share_box_across_classes = 13 [default = false];
+
+  // Whether to apply convolutions on mask features before upsampling using
+  // nearest neighbor resizing.
+  // By default, mask features are resized to [`mask_height`, `mask_width`]
+  // before applying convolutions and predicting masks.
+  optional bool convolve_then_upsample_masks = 14 [default = false];
 }

 message RfcnBoxPredictor {
...
@@ -164,6 +164,10 @@ message FasterRcnn {
   // Whether the masks present in groundtruth should be resized in the model to
   // match the image size.
   optional bool resize_masks = 36 [default = true];
+
+  // If True, uses implementation of ops with static shape guarantees when
+  // running evaluation (specifically not is_training if False).
+  optional bool use_static_shapes_for_eval = 37 [default = false];
 }
...
@@ -155,6 +155,9 @@ message RandomCropImage {
   // value, it is removed from the new image.
   optional float overlap_thresh = 6 [default=0.3];

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability of keeping the original image.
   optional float random_coef = 7 [default=0.0];
 }
@@ -194,6 +197,9 @@ message RandomCropPadImage {
   // value, it is removed from the new image.
   optional float overlap_thresh = 6 [default=0.3];

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 11 [default=true];
+
   // Probability of keeping the original image during the crop operation.
   optional float random_coef = 7 [default=0.0];
@@ -217,6 +223,9 @@ message RandomCropToAspectRatio {
   // ratio between a cropped bounding box and the original is less than this
   // value, it is removed from the new image.
   optional float overlap_thresh = 2 [default=0.3];
+
+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 3 [default=true];
 }

 // Randomly adds black square patches to an image.
@@ -285,6 +294,9 @@ message SSDRandomCropOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
 }
@@ -315,6 +327,9 @@ message SSDRandomCropPadOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 13 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
@@ -353,6 +368,9 @@ message SSDRandomCropFixedAspectRatioOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
 }
@@ -387,6 +405,9 @@ message SSDRandomCropPadFixedAspectRatioOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
 }
...
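The new clip_boxes fields become settable from a pipeline config once the protos are regenerated. A hedged sketch of toggling the flag programmatically, assuming the usual generated preprocessor_pb2 bindings expose RandomCropImage under a PreprocessingStep oneof:

from object_detection.protos import preprocessor_pb2

# Configure a random crop that does not clip boxes to the crop window.
step = preprocessor_pb2.PreprocessingStep()
step.random_crop_image.clip_boxes = False
step.random_crop_image.overlap_thresh = 0.3
print(step)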