Commit 3b158095 authored by Ilya Mironov

Merge branch 'master' of https://github.com/ilyamironov/models

parents a90db800 be659c2f
......@@ -23,6 +23,7 @@ Localization losses:
Classification losses:
* WeightedSigmoidClassificationLoss
* WeightedSoftmaxClassificationLoss
* WeightedSoftmaxClassificationAgainstLogitsLoss
* BootstrappedSigmoidClassificationLoss
"""
from abc import ABCMeta
......@@ -317,6 +318,54 @@ class WeightedSoftmaxClassificationLoss(Loss):
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
class WeightedSoftmaxClassificationAgainstLogitsLoss(Loss):
"""Softmax loss function against logits.
Targets are expected to be provided in logits space instead of "one hot" or
"probability distribution" space.
"""
def __init__(self, logit_scale=1.0):
"""Constructor.
Args:
logit_scale: Divisor applied to both target and predicted logits before
  the softmax. When this value is high, the target is "diffused"; when
  it is low, the target is made peakier. (default 1.0)
"""
self._logit_scale = logit_scale
def _scale_and_softmax_logits(self, logits):
"""Scale logits then apply softmax."""
scaled_logits = tf.divide(logits, self._logit_scale, name='scale_logits')
return tf.nn.softmax(scaled_logits, name='convert_scores')
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
Args:
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing the predicted logits for each class
target_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing logit classification targets
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a float tensor of shape [batch_size, num_anchors]
representing the value of the loss function.
"""
num_classes = prediction_tensor.get_shape().as_list()[-1]
target_tensor = self._scale_and_softmax_logits(target_tensor)
prediction_tensor = tf.divide(prediction_tensor, self._logit_scale,
name='scale_logits')
per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits(
labels=tf.reshape(target_tensor, [-1, num_classes]),
logits=tf.reshape(prediction_tensor, [-1, num_classes])))
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
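To make the logit-space targets concrete, here is a minimal numpy sketch (illustrative only, not part of this commit) of how `logit_scale` sharpens or diffuses a target before the cross entropy is taken:

```python
import numpy as np

def softmax(logits):
    # Numerically stable softmax over a 1-D array.
    e = np.exp(logits - logits.max())
    return e / e.sum()

target_logits = np.array([0., 0., -100.])  # two equally likely classes
for scale in (1.0, 100.0):
    print(scale, np.round(softmax(target_logits / scale), 3))
# 1.0   [0.5   0.5   0.   ]  -> peaky target
# 100.0 [0.422 0.422 0.155]  -> diffused target
```

`_compute_loss` applies the same divisor to the prediction logits, so predictions and targets are compared on the same scale.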
class BootstrappedSigmoidClassificationLoss(Loss):
"""Bootstrapped sigmoid cross entropy classification loss function.
......
......@@ -576,6 +576,111 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
self.assertAllClose(loss_output, exp_loss)
class WeightedSoftmaxClassificationAgainstLogitsLossTest(tf.test.TestCase):
def testReturnsCorrectLoss(self):
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
target_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[100, -100, -100],
[-100, -100, 100]],
[[-100, -100, 100],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 1]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationAgainstLogitsLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = - 1.5 * math.log(.5)
with self.test_session() as sess:
loss_output = sess.run(loss)
self.assertAllClose(loss_output, exp_loss)
def testReturnsCorrectAnchorWiseLoss(self):
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
target_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[100, -100, -100],
[-100, -100, 100]],
[[-100, -100, 100],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationAgainstLogitsLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
exp_loss = np.matrix([[0, 0, - 0.5 * math.log(.5), 0],
[-math.log(.5), 0, 0, 0]])
with self.test_session() as sess:
loss_output = sess.run(loss)
self.assertAllClose(loss_output, exp_loss)
def testReturnsCorrectAnchorWiseLossWithLogitScaleSetting(self):
logit_scale = 100.
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
target_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationAgainstLogitsLoss(
logit_scale=logit_scale)
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
# find softmax of the two prediction types above
softmax_pred1 = [np.exp(-1), np.exp(-1), np.exp(1)]
softmax_pred1 /= sum(softmax_pred1)
softmax_pred2 = [np.exp(0), np.exp(0), np.exp(-1)]
softmax_pred2 /= sum(softmax_pred2)
# compute the expected cross entropy for perfect matches
exp_entropy1 = sum(
[-x*np.log(x) for x in softmax_pred1])
exp_entropy2 = sum(
[-x*np.log(x) for x in softmax_pred2])
# weighted expected losses
exp_loss = np.matrix(
[[exp_entropy1, exp_entropy1, exp_entropy2*.5, exp_entropy1],
[exp_entropy2, exp_entropy1, exp_entropy1, 0.]])
with self.test_session() as sess:
loss_output = sess.run(loss)
self.assertAllClose(loss_output, exp_loss)
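As a standalone numeric check of the expected values in this test (plain numpy, outside the TF graph), the two entropies evaluate to roughly 0.666 and 1.017:

```python
import numpy as np

def entropy_of_softmax(logits):
    p = np.exp(logits) / np.sum(np.exp(logits))
    return float(-np.sum(p * np.log(p)))

print(entropy_of_softmax(np.array([-1., -1., 1.])))  # ~0.6656, exp_entropy1
print(entropy_of_softmax(np.array([0., 0., -1.])))   # ~1.0174, exp_entropy2
```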
class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
def testReturnsCorrectLossSoftBootstrapping(self):
......
......@@ -1000,8 +1000,8 @@ def random_adjust_saturation(image,
def random_distort_color(image, color_ordering=0, preprocess_vars_cache=None):
"""Randomly distorts color.
Randomly distorts color using a combination of brightness, hue, contrast
and saturation changes. Makes sure the output image is still between 0 and 255.
Randomly distorts color using a combination of brightness, hue, contrast and
saturation changes. Makes sure the output image is still between 0 and 255.
Args:
image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
......
......@@ -2620,16 +2620,24 @@ class PreprocessorTest(tf.test.TestCase):
distorted_images_rank = tf.rank(distorted_images)
boxes_rank = tf.rank(boxes)
distorted_boxes_rank = tf.rank(distorted_boxes)
multiclass_scores_rank = tf.rank(multiclass_scores)
distorted_multiclass_scores_rank = tf.rank(distorted_multiclass_scores)
with self.test_session() as sess:
(boxes_rank_, distorted_boxes_rank_, images_rank_, distorted_images_rank_,
multiclass_scores_, distorted_multiclass_scores_) = sess.run([
boxes_rank, distorted_boxes_rank, images_rank, distorted_images_rank,
multiclass_scores, distorted_multiclass_scores
(boxes_rank_, distorted_boxes_, distorted_boxes_rank_, images_rank_,
distorted_images_rank_, multiclass_scores_rank_,
distorted_multiclass_scores_,
distorted_multiclass_scores_rank_) = sess.run([
boxes_rank, distorted_boxes, distorted_boxes_rank, images_rank,
distorted_images_rank, multiclass_scores_rank,
distorted_multiclass_scores, distorted_multiclass_scores_rank
])
self.assertAllEqual(boxes_rank_, distorted_boxes_rank_)
self.assertAllEqual(images_rank_, distorted_images_rank_)
self.assertAllEqual(multiclass_scores_, distorted_multiclass_scores_)
self.assertAllEqual(multiclass_scores_rank_,
distorted_multiclass_scores_rank_)
self.assertAllEqual(distorted_boxes_.shape[0],
distorted_multiclass_scores_.shape[0])
def testSSDRandomCropPad(self):
images = self.createTestImages()
......
item {
name: "bend/bow (at the waist)"
id: 1
}
item {
name: "crouch/kneel"
id: 3
}
item {
name: "dance"
id: 4
}
item {
name: "fall down"
id: 5
}
item {
name: "get up"
id: 6
}
item {
name: "jump/leap"
id: 7
}
item {
name: "lie/sleep"
id: 8
}
item {
name: "martial art"
id: 9
}
item {
name: "run/jog"
id: 10
}
item {
name: "sit"
id: 11
}
item {
name: "stand"
id: 12
}
item {
name: "swim"
id: 13
}
item {
name: "walk"
id: 14
}
item {
name: "answer phone"
id: 15
}
item {
name: "carry/hold (an object)"
id: 17
}
item {
name: "climb (e.g., a mountain)"
id: 20
}
item {
name: "close (e.g., a door, a box)"
id: 22
}
item {
name: "cut"
id: 24
}
item {
name: "dress/put on clothing"
id: 26
}
item {
name: "drink"
id: 27
}
item {
name: "drive (e.g., a car, a truck)"
id: 28
}
item {
name: "eat"
id: 29
}
item {
name: "enter"
id: 30
}
item {
name: "hit (an object)"
id: 34
}
item {
name: "lift/pick up"
id: 36
}
item {
name: "listen (e.g., to music)"
id: 37
}
item {
name: "open (e.g., a window, a car door)"
id: 38
}
item {
name: "play musical instrument"
id: 41
}
item {
name: "point to (an object)"
id: 43
}
item {
name: "pull (an object)"
id: 45
}
item {
name: "push (an object)"
id: 46
}
item {
name: "put down"
id: 47
}
item {
name: "read"
id: 48
}
item {
name: "ride (e.g., a bike, a car, a horse)"
id: 49
}
item {
name: "sail boat"
id: 51
}
item {
name: "shoot"
id: 52
}
item {
name: "smoke"
id: 54
}
item {
name: "take a photo"
id: 56
}
item {
name: "text on/look at a cellphone"
id: 57
}
item {
name: "throw"
id: 58
}
item {
name: "touch (an object)"
id: 59
}
item {
name: "turn (e.g., a screwdriver)"
id: 60
}
item {
name: "watch (e.g., TV)"
id: 61
}
item {
name: "work on a computer"
id: 62
}
item {
name: "write"
id: 63
}
item {
name: "fight/hit (a person)"
id: 64
}
item {
name: "give/serve (an object) to (a person)"
id: 65
}
item {
name: "grab (a person)"
id: 66
}
item {
name: "hand clap"
id: 67
}
item {
name: "hand shake"
id: 68
}
item {
name: "hand wave"
id: 69
}
item {
name: "hug (a person)"
id: 70
}
item {
name: "kiss (a person)"
id: 72
}
item {
name: "lift (a person)"
id: 73
}
item {
name: "listen to (a person)"
id: 74
}
item {
name: "push (another person)"
id: 76
}
item {
name: "sing to (e.g., self, a person, a group)"
id: 77
}
item {
name: "take (an object) from (a person)"
id: 78
}
item {
name: "talk to (e.g., self, a person, a group)"
id: 79
}
item {
name: "watch (a person)"
id: 80
}
......@@ -111,7 +111,8 @@ class TfExampleDecoder(data_decoder.DataDecoder):
instance_mask_type=input_reader_pb2.NUMERICAL_MASKS,
label_map_proto_file=None,
use_display_name=False,
dct_method=''):
dct_method='',
num_keypoints=0):
"""Constructor sets keys_to_features and items_to_handlers.
Args:
......@@ -131,6 +132,7 @@ class TfExampleDecoder(data_decoder.DataDecoder):
algorithm used for jpeg decompression. Currently valid values
are ['INTEGER_FAST', 'INTEGER_ACCURATE']. The hint may be ignored if,
for example, the jpeg library does not have that specific option.
num_keypoints: the number of keypoints per object.
Raises:
ValueError: If `instance_mask_type` option is not one of
......@@ -149,9 +151,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
'image/source_id':
tf.FixedLenFeature((), tf.string, default_value=''),
'image/height':
tf.FixedLenFeature((), tf.int64, 1),
tf.FixedLenFeature((), tf.int64, default_value=1),
'image/width':
tf.FixedLenFeature((), tf.int64, 1),
tf.FixedLenFeature((), tf.int64, default_value=1),
# Object boxes and classes.
'image/object/bbox/xmin':
tf.VarLenFeature(tf.float32),
......@@ -209,6 +211,16 @@ class TfExampleDecoder(data_decoder.DataDecoder):
fields.InputDataFields.groundtruth_weights: (
slim_example_decoder.Tensor('image/object/weight')),
}
self._num_keypoints = num_keypoints
if num_keypoints > 0:
self.keys_to_features['image/object/keypoint/x'] = (
tf.VarLenFeature(tf.float32))
self.keys_to_features['image/object/keypoint/y'] = (
tf.VarLenFeature(tf.float32))
self.items_to_handlers[fields.InputDataFields.groundtruth_keypoints] = (
slim_example_decoder.ItemHandlerCallback(
['image/object/keypoint/y', 'image/object/keypoint/x'],
self._reshape_keypoints))
if load_instance_masks:
if instance_mask_type in (input_reader_pb2.DEFAULT,
input_reader_pb2.NUMERICAL_MASKS):
......@@ -286,6 +298,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
[None] indicating if the boxes represent `difficult` instances.
fields.InputDataFields.groundtruth_group_of - 1D bool tensor of shape
[None] indicating if the boxes represent `group_of` instances.
fields.InputDataFields.groundtruth_keypoints - 3D float32 tensor of
shape [None, None, 2] containing keypoints, where the coordinates of
the keypoints are ordered (y, x).
fields.InputDataFields.groundtruth_instance_masks - 3D float32 tensor of
shape [None, None, None] containing instance masks.
"""
......@@ -314,6 +329,31 @@ class TfExampleDecoder(data_decoder.DataDecoder):
default_groundtruth_weights)
return tensor_dict
def _reshape_keypoints(self, keys_to_tensors):
"""Reshape keypoints.
The flattened keypoint coordinates are reshaped to [num_instances,
num_keypoints, 2].
Args:
keys_to_tensors: a dictionary from keys to tensors.
Returns:
A 3-D float tensor of shape [num_instances, num_keypoints, 2] containing
the keypoint coordinates, ordered (y, x).
"""
y = keys_to_tensors['image/object/keypoint/y']
if isinstance(y, tf.SparseTensor):
y = tf.sparse_tensor_to_dense(y)
y = tf.expand_dims(y, 1)
x = keys_to_tensors['image/object/keypoint/x']
if isinstance(x, tf.SparseTensor):
x = tf.sparse_tensor_to_dense(x)
x = tf.expand_dims(x, 1)
keypoints = tf.concat([y, x], 1)
keypoints = tf.reshape(keypoints, [-1, self._num_keypoints, 2])
return keypoints
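A small numpy illustration (shapes taken from the keypoint test below, values invented) of the reshape this handler performs for 2 objects with 3 keypoints each:

```python
import numpy as np

ys = np.array([0., 1., 2., 3., 4., 5.])  # image/object/keypoint/y, flattened
xs = np.array([1., 2., 3., 4., 5., 6.])  # image/object/keypoint/x, flattened
num_keypoints = 3

# Equivalent to the expand_dims + concat + reshape above.
keypoints = np.stack([ys, xs], axis=1).reshape(-1, num_keypoints, 2)
print(keypoints.shape)  # (2, 3, 2)
print(keypoints[0, 1])  # [1. 2.], i.e. (y, x) of object 0, keypoint 1
```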
def _reshape_instance_masks(self, keys_to_tensors):
"""Reshape instance segmentation masks.
......
......@@ -304,6 +304,50 @@ class TfExampleDecoderTest(tf.test.TestCase):
self.assertAllEqual(
2, tensor_dict[fields.InputDataFields.num_groundtruth_boxes])
def testDecodeKeypoint(self):
image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
encoded_jpeg = self._EncodeImage(image_tensor)
bbox_ymins = [0.0, 4.0]
bbox_xmins = [1.0, 5.0]
bbox_ymaxs = [2.0, 6.0]
bbox_xmaxs = [3.0, 7.0]
keypoint_ys = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
keypoint_xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': self._BytesFeature(encoded_jpeg),
'image/format': self._BytesFeature('jpeg'),
'image/object/bbox/ymin': self._FloatFeature(bbox_ymins),
'image/object/bbox/xmin': self._FloatFeature(bbox_xmins),
'image/object/bbox/ymax': self._FloatFeature(bbox_ymaxs),
'image/object/bbox/xmax': self._FloatFeature(bbox_xmaxs),
'image/object/keypoint/y': self._FloatFeature(keypoint_ys),
'image/object/keypoint/x': self._FloatFeature(keypoint_xs),
})).SerializeToString()
example_decoder = tf_example_decoder.TfExampleDecoder(num_keypoints=3)
tensor_dict = example_decoder.decode(tf.convert_to_tensor(example))
self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_boxes].
get_shape().as_list()), [None, 4])
self.assertAllEqual((tensor_dict[fields.InputDataFields.
groundtruth_keypoints].
get_shape().as_list()), [None, 3, 2])
with self.test_session() as sess:
tensor_dict = sess.run(tensor_dict)
expected_boxes = np.vstack([bbox_ymins, bbox_xmins,
bbox_ymaxs, bbox_xmaxs]).transpose()
self.assertAllEqual(expected_boxes,
tensor_dict[fields.InputDataFields.groundtruth_boxes])
self.assertAllEqual(
2, tensor_dict[fields.InputDataFields.num_groundtruth_boxes])
expected_keypoints = (
np.vstack([keypoint_ys, keypoint_xs]).transpose().reshape((2, 3, 2)))
self.assertAllEqual(expected_keypoints,
tensor_dict[
fields.InputDataFields.groundtruth_keypoints])
def testDecodeDefaultGroundtruthWeights(self):
image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
encoded_jpeg = self._EncodeImage(image_tensor)
......
......@@ -91,7 +91,7 @@ Some remarks on frozen inference graphs:
## Kitti-trained models {#kitti-models}
Model name | Speed (ms) | Pascal mAP@0.5 (ms) | Outputs
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
[faster_rcnn_resnet101_kitti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_kitti_2018_01_28.tar.gz) | 79 | 87 | Boxes
......@@ -103,6 +103,13 @@ Model name
[faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid_2018_01_28.tar.gz) | 347 | | Boxes
## AVA v2.1 trained models {#ava-models}
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
[faster_rcnn_resnet101_ava_v2.1](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_ava_v2.1_2018_04_30.tar.gz) | 93 | 11 | Boxes
[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval).
[^2]: This is PASCAL mAP with a slightly different way of true positives computation: see [Open Images evaluation protocol](evaluation_protocols.md#open-images).
......@@ -4,7 +4,7 @@
Tensorflow Object Detection API depends on the following libraries:
* Protobuf 2.6
* Protobuf 3+
* Python-tk
* Pillow 1.0
* lxml
......
......@@ -599,9 +599,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
(and if number_of_stages > 1):
7) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using
a shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
8) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
......@@ -712,9 +714,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
......@@ -791,9 +795,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
......@@ -823,13 +829,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
if self._is_training:
curr_box_classifier_features = prediction_dict['box_classifier_features']
detection_classes = prediction_dict['class_predictions_with_background']
box_predictions = self._mask_rcnn_box_predictor.predict(
mask_predictions = self._mask_rcnn_box_predictor.predict(
[curr_box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
predict_boxes_and_classes=False,
predict_auxiliary_outputs=True)
prediction_dict['mask_predictions'] = tf.squeeze(box_predictions[
prediction_dict['mask_predictions'] = tf.squeeze(mask_predictions[
box_predictor.MASK_PREDICTIONS], axis=1)
else:
detections_dict = self._postprocess_box_classifier(
......@@ -854,14 +860,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
flattened_detected_feature_maps,
scope=self.second_stage_feature_extractor_scope))
box_predictions = self._mask_rcnn_box_predictor.predict(
mask_predictions = self._mask_rcnn_box_predictor.predict(
[curr_box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
predict_boxes_and_classes=False,
predict_auxiliary_outputs=True)
detection_masks = tf.squeeze(box_predictions[
detection_masks = tf.squeeze(mask_predictions[
box_predictor.MASK_PREDICTIONS], axis=1)
_, num_classes, mask_height, mask_width = (
......@@ -1098,8 +1104,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf.to_float(num_proposals),
}
# TODO(jrru): Remove mask_predictions from _post_process_box_classifier.
with tf.name_scope('SecondStagePostprocessor'):
if self._number_of_stages == 2:
if (self._number_of_stages == 2 or
(self._number_of_stages == 3 and self._is_training)):
mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
detections_dict = self._postprocess_box_classifier(
prediction_dict['refined_box_encodings'],
......@@ -1438,8 +1446,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
refined_box_encodings: a 3-D float tensor with shape
[total_num_padded_proposals, num_classes, 4] representing predicted
(final) refined box encodings.
[total_num_padded_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings. If using a shared
box across classes the shape will instead be
[total_num_padded_proposals, 1, self._box_coder.code_size].
class_predictions_with_background: a 3-D tensor float with shape
[total_num_padded_proposals, num_classes + 1] containing class
predictions (logits) for each of the proposals. Note that this tensor
......@@ -1466,10 +1476,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
that a pixel-wise sigmoid score converter is applied to the detection
masks.
"""
refined_box_encodings_batch = tf.reshape(refined_box_encodings,
[-1, self.max_num_proposals,
self.num_classes,
self._box_coder.code_size])
refined_box_encodings_batch = tf.reshape(
refined_box_encodings,
[-1,
self.max_num_proposals,
refined_box_encodings.shape[1],
self._box_coder.code_size])
class_predictions_with_background_batch = tf.reshape(
class_predictions_with_background,
[-1, self.max_num_proposals, self.num_classes + 1]
......@@ -1517,13 +1529,18 @@ class FasterRCNNMetaArch(model.DetectionModel):
box_encodings: a 4-D tensor with shape
[batch_size, num_anchors, num_classes, self._box_coder.code_size]
representing box encodings. If using a shared box across classes the
shape will instead be
[batch_size, num_anchors, 1, self._box_coder.code_size].
anchor_boxes: [batch_size, num_anchors, 4] representing
decoded bounding boxes.
anchor_boxes: [batch_size, num_anchors, self._box_coder.code_size]
representing decoded bounding boxes.
Returns:
decoded_boxes: a [batch_size, num_anchors, num_classes, 4]
float tensor representing bounding box predictions
(for each image in batch, proposal and class).
decoded_boxes: a
[batch_size, num_anchors, num_classes, self._box_coder.code_size]
float tensor representing bounding box predictions (for each image in
batch, proposal and class). If using a shared box across classes the
shape will instead be
[batch_size, num_anchors, 1, self._box_coder.code_size].
"""
combined_shape = shape_utils.combined_static_and_dynamic_shape(
box_encodings)
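For the shape bookkeeping in `_batch_decode_boxes`, a numpy sketch (illustrative, code_size=4 assumed) of tiling anchors across the class dimension so that each per-class encoding has a matching anchor; with a shared box the class dimension is 1 and nothing is duplicated:

```python
import numpy as np

batch_size, num_anchors, num_classes, code_size = 2, 3, 5, 4
box_encodings = np.zeros((batch_size, num_anchors, num_classes, code_size))
anchor_boxes = np.ones((batch_size, num_anchors, code_size))

# One copy of each anchor per class, aligning anchors 1:1 with encodings.
tiled_anchors = np.tile(anchor_boxes[:, :, np.newaxis, :],
                        (1, 1, num_classes, 1))
print(tiled_anchors.shape)  # (2, 3, 5, 4), same leading dims as box_encodings
```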
......@@ -1697,7 +1714,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, box_coder.code_size] representing
predicted (final) refined box encodings.
predicted (final) refined box encodings. If using a shared box across
classes this will instead have shape
[total_num_proposals, 1, box_coder.code_size].
class_predictions_with_background: a 2-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors. Note that this tensor
......@@ -1748,31 +1767,39 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._detector_target_assigner, proposal_boxlists,
groundtruth_boxlists, groundtruth_classes_with_background_list)
# We only predict refined location encodings for the non background
# classes, but we now pad it to make it compatible with the class
# predictions
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
[batch_size, self.max_num_proposals, -1])
flat_cls_targets_with_background = tf.reshape(
batch_cls_targets_with_background,
[batch_size * self.max_num_proposals, -1])
refined_box_encodings_with_background = tf.pad(
refined_box_encodings, [[0, 0], [1, 0], [0, 0]])
# For anchors with multiple labels, picks refined_location_encodings
# for just one class to avoid over-counting for regression loss and
# (optionally) mask loss.
one_hot_flat_cls_targets_with_background = tf.argmax(
flat_cls_targets_with_background, axis=1)
one_hot_flat_cls_targets_with_background = tf.one_hot(
one_hot_flat_cls_targets_with_background,
flat_cls_targets_with_background.get_shape()[1])
refined_box_encodings_masked_by_class_targets = tf.boolean_mask(
refined_box_encodings_with_background,
tf.greater(one_hot_flat_cls_targets_with_background, 0))
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
[batch_size, self.max_num_proposals, -1])
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings_masked_by_class_targets,
[batch_size, -1, 4])
# If using a shared box across classes use directly
if refined_box_encodings.shape[1] == 1:
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings,
[batch_size, self.max_num_proposals, self._box_coder.code_size])
# For anchors with multiple labels, picks refined_location_encodings
# for just one class to avoid over-counting for regression loss and
# (optionally) mask loss.
else:
# We only predict refined location encodings for the non background
# classes, but we now pad it to make it compatible with the class
# predictions
refined_box_encodings_with_background = tf.pad(
refined_box_encodings, [[0, 0], [1, 0], [0, 0]])
refined_box_encodings_masked_by_class_targets = tf.boolean_mask(
refined_box_encodings_with_background,
tf.greater(one_hot_flat_cls_targets_with_background, 0))
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings_masked_by_class_targets,
[batch_size, self.max_num_proposals, self._box_coder.code_size])
second_stage_loc_losses = self._second_stage_localization_loss(
reshaped_refined_box_encodings,
......
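To make the non-shared branch above concrete, a numpy sketch (illustrative, not from the diff) of padding a background slot and picking exactly one refined encoding per proposal via the one-hot class targets:

```python
import numpy as np

num_proposals, num_classes, code_size = 2, 3, 4
refined = np.arange(num_proposals * num_classes * code_size,
                    dtype=np.float32).reshape(num_proposals, num_classes,
                                              code_size)

# Mirrors tf.pad(..., [[0, 0], [1, 0], [0, 0]]): prepend a zero
# "background" slot along the class axis.
refined_with_bg = np.pad(refined, [(0, 0), (1, 0), (0, 0)])

cls_targets = np.array([2, 0])  # argmax class per proposal; 0 is background
one_hot = np.eye(num_classes + 1)[cls_targets] > 0
picked = refined_with_bg[one_hot]  # analogous to tf.boolean_mask
print(picked.shape)  # (2, 4): one encoding per proposal
```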
......@@ -85,6 +85,46 @@ class FasterRCNNMetaArchTest(
self.assertTrue(np.amax(detections_out['detection_masks'] <= 1.0))
self.assertTrue(np.amin(detections_out['detection_masks'] >= 0.0))
def test_postprocess_second_stage_only_inference_mode_with_shared_boxes(self):
model = self._build_model(
is_training=False, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
total_num_padded_proposals = batch_size * model.max_num_proposals
proposal_boxes = tf.constant(
[[[1, 1, 2, 3],
[0, 0, 1, 1],
[.5, .5, .6, .6],
4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
[[2, 3, 6, 8],
[1, 2, 5, 3],
4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]], dtype=tf.float32)
num_proposals = tf.constant([3, 2], dtype=tf.int32)
# This has a single box shared across all classes instead of one per class.
refined_box_encodings = tf.zeros(
[total_num_padded_proposals, 1, 4], dtype=tf.float32)
class_predictions_with_background = tf.ones(
[total_num_padded_proposals, model.num_classes+1], dtype=tf.float32)
image_shape = tf.constant([batch_size, 36, 48, 3], dtype=tf.int32)
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
detections = model.postprocess({
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'num_proposals': num_proposals,
'proposal_boxes': proposal_boxes,
'image_shape': image_shape,
}, true_image_shapes)
with self.test_session() as sess:
detections_out = sess.run(detections)
self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4])
self.assertAllClose(detections_out['detection_scores'],
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
self.assertAllClose(detections_out['detection_classes'],
[[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]])
self.assertAllClose(detections_out['num_detections'], [5, 4])
@parameterized.parameters(
{'masks_are_class_agnostic': False},
{'masks_are_class_agnostic': True},
......
......@@ -1284,6 +1284,106 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/classification_loss'], 0)
def test_loss_full_with_shared_boxes(self):
model = self._build_model(
is_training=True, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
[0, 16, 16, 32],
[16, 0, 32, 16],
[16, 16, 32, 32]], dtype=tf.float32)
rpn_box_encodings = tf.zeros(
[batch_size,
anchors.get_shape().as_list()[0],
BOX_CODE_SIZE], dtype=tf.float32)
# Use different numbers for the objectness category to break ties in the
# order of boxes returned by NMS.
rpn_objectness_predictions_with_background = tf.constant([
[[-10, 13],
[10, -10],
[10, -11],
[-10, 12]],
[[10, -10],
[-10, 13],
[-10, 12],
[10, -11]]], dtype=tf.float32)
image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
num_proposals = tf.constant([6, 6], dtype=tf.int32)
proposal_boxes = tf.constant(
2 * [[[0, 0, 16, 16],
[0, 16, 16, 32],
[16, 0, 32, 16],
[16, 16, 32, 32],
[0, 0, 16, 16],
[0, 16, 16, 32]]], dtype=tf.float32)
refined_box_encodings = tf.zeros(
(batch_size * model.max_num_proposals,
1, # one box shared among all the classes
BOX_CODE_SIZE), dtype=tf.float32)
class_predictions_with_background = tf.constant(
[[-10, 10, -10], # first image
[10, -10, -10],
[10, -10, -10],
[-10, -10, 10],
[-10, 10, -10],
[10, -10, -10],
[10, -10, -10], # second image
[-10, 10, -10],
[-10, 10, -10],
[10, -10, -10],
[10, -10, -10],
[-10, 10, -10]], dtype=tf.float32)
mask_predictions_logits = 20 * tf.ones((batch_size *
model.max_num_proposals,
model.num_classes,
14, 14),
dtype=tf.float32)
groundtruth_boxes_list = [
tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32),
tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)]
groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
# Set all elements of groundtruth mask to 1.0. In this case all proposal
# crops of the groundtruth masks should return a mask that covers the entire
# proposal. Thus, if the mask_predictions_logits elements are all at least
# 20, the loss should be (numerically) zero.
groundtruth_masks_list = [tf.convert_to_tensor(np.ones((2, 32, 32)),
dtype=tf.float32),
tf.convert_to_tensor(np.ones((2, 32, 32)),
dtype=tf.float32)]
prediction_dict = {
'rpn_box_encodings': rpn_box_encodings,
'rpn_objectness_predictions_with_background':
rpn_objectness_predictions_with_background,
'image_shape': image_shape,
'anchors': anchors,
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'proposal_boxes': proposal_boxes,
'num_proposals': num_proposals,
'mask_predictions': mask_predictions_logits
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0)
self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0)
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/localization_loss'], 0)
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/classification_loss'], 0)
self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0)
def test_restore_map_for_classification_ckpt(self):
# Define mock tensorflow classification graph and save variables.
test_graph_classification = tf.Graph()
......
......@@ -203,26 +203,39 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes,
groundtruth_classes, detection_boxes,
detection_scores, detection_classes):
detection_scores, detection_classes,
num_gt_boxes_per_image=None,
num_det_boxes_per_image=None):
"""Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
Note that once value_op is called, the detections and groundtruth added via
update_op are cleared.
This function can take in groundtruth and detections for a batch of images,
or for a single image. For the latter case, the batch dimension for input
tensors need not be present.
Args:
image_id: Unique string/integer identifier for the image.
groundtruth_boxes: float32 tensor of shape [num_boxes, 4] containing
`num_boxes` groundtruth boxes of the format
image_id: string/integer tensor of shape [batch] with unique identifiers
for the images.
groundtruth_boxes: float32 tensor of shape [batch, num_boxes, 4]
containing `num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
groundtruth_classes: int32 tensor of shape [num_boxes] containing
groundtruth_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed groundtruth classes for the boxes.
detection_boxes: float32 tensor of shape [num_boxes, 4] containing
detection_boxes: float32 tensor of shape [batch, num_boxes, 4] containing
`num_boxes` detection boxes of the format [ymin, xmin, ymax, xmax]
in absolute image coordinates.
detection_scores: float32 tensor of shape [num_boxes] containing
detection_scores: float32 tensor of shape [batch, num_boxes] containing
detection scores for the boxes.
detection_classes: int32 tensor of shape [num_boxes] containing
detection_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed detection classes for the boxes.
num_gt_boxes_per_image: int32 tensor of shape [batch] containing the
number of groundtruth boxes per image. If None, will assume no padding
in groundtruth tensors.
num_det_boxes_per_image: int32 tensor of shape [batch] containing the
number of detection boxes per image. If None, will assume no padding in
the detection tensors.
Returns:
a dictionary of metric names to tuple of value_op and update_op that can
......@@ -231,28 +244,68 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
guarantee correct behaviour.
"""
def update_op(
image_id,
groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes):
self.add_single_ground_truth_image_info(
image_id,
{'groundtruth_boxes': groundtruth_boxes,
'groundtruth_classes': groundtruth_classes})
self.add_single_detected_image_info(
image_id,
{'detection_boxes': detection_boxes,
'detection_scores': detection_scores,
'detection_classes': detection_classes})
image_id_batched,
groundtruth_boxes_batched,
groundtruth_classes_batched,
num_gt_boxes_per_image,
detection_boxes_batched,
detection_scores_batched,
detection_classes_batched,
num_det_boxes_per_image):
"""Update operation for adding batch of images to Coco evaluator."""
for (image_id, gt_box, gt_class, num_gt_box, det_box, det_score,
det_class, num_det_box) in zip(
image_id_batched, groundtruth_boxes_batched,
groundtruth_classes_batched, num_gt_boxes_per_image,
detection_boxes_batched, detection_scores_batched,
detection_classes_batched, num_det_boxes_per_image):
self.add_single_ground_truth_image_info(
image_id,
{'groundtruth_boxes': gt_box[:num_gt_box],
'groundtruth_classes': gt_class[:num_gt_box]})
self.add_single_detected_image_info(
image_id,
{'detection_boxes': det_box[:num_det_box],
'detection_scores': det_score[:num_det_box],
'detection_classes': det_class[:num_det_box]})
if not image_id.shape.as_list():
# Apply a batch dimension to all tensors.
image_id = tf.expand_dims(image_id, 0)
groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0)
groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
detection_boxes = tf.expand_dims(detection_boxes, 0)
detection_scores = tf.expand_dims(detection_scores, 0)
detection_classes = tf.expand_dims(detection_classes, 0)
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.shape(groundtruth_boxes)[1:2]
else:
num_gt_boxes_per_image = tf.expand_dims(num_gt_boxes_per_image, 0)
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.shape(detection_boxes)[1:2]
else:
num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0)
else:
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.tile(
tf.shape(groundtruth_boxes)[1:2],
multiples=tf.shape(groundtruth_boxes)[0:1])
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.tile(
tf.shape(detection_boxes)[1:2],
multiples=tf.shape(detection_boxes)[0:1])
update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes,
groundtruth_classes,
num_gt_boxes_per_image,
detection_boxes,
detection_scores,
detection_classes], [])
detection_classes,
num_det_boxes_per_image], [])
metric_names = ['DetectionBoxes_Precision/mAP',
'DetectionBoxes_Precision/mAP@.50IOU',
'DetectionBoxes_Precision/mAP@.75IOU',
......@@ -583,5 +636,3 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
eval_metric_ops[metric_name] = (tf.py_func(
value_func_factory(metric_name), [], np.float32), update_op)
return eval_metric_ops
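The padding contract introduced above boils down to per-image slicing; a numpy sketch (illustrative, data invented) of how `num_gt_boxes_per_image` strips padding rows before they reach the COCO evaluator:

```python
import numpy as np

gt_boxes = np.array([[[100., 100., 200., 200.],
                      [-1., -1., -1., -1.]],   # padding row
                     [[25., 25., 50., 50.],
                      [10., 10., 15., 15.]]])  # two real boxes, no padding
num_gt_boxes_per_image = np.array([1, 2])

for boxes, num_valid in zip(gt_boxes, num_gt_boxes_per_image):
    print(boxes[:num_valid])  # only valid rows are kept per image
```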
......@@ -317,6 +317,230 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self):
category_list = [{
'id': 0,
'name': 'person'
}, {
'id': 1,
'name': 'cat'
}, {
'id': 2,
'name': 'dog'
}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None,))
detection_boxes = tf.placeholder(tf.float32, shape=(None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(None,))
detection_classes = tf.placeholder(tf.float32, shape=(None,))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes, groundtruth_classes, detection_boxes,
detection_scores, detection_classes)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(
update_op,
feed_dict={
image_id:
'image1',
groundtruth_boxes:
np.array([[100., 100., 200., 200.], [-1, -1, -1, -1]]),
groundtruth_classes:
np.array([1, -1]),
detection_boxes:
np.array([[100., 100., 200., 200.], [0., 0., 0., 0.]]),
detection_scores:
np.array([.8, 0.]),
detection_classes:
np.array([1, -1])
})
sess.run(
update_op,
feed_dict={
image_id:
'image2',
groundtruth_boxes:
np.array([[50., 50., 100., 100.], [-1, -1, -1, -1]]),
groundtruth_classes:
np.array([3, -1]),
detection_boxes:
np.array([[50., 50., 100., 100.], [0., 0., 0., 0.]]),
detection_scores:
np.array([.7, 0.]),
detection_classes:
np.array([3, -1])
})
sess.run(
update_op,
feed_dict={
image_id:
'image3',
groundtruth_boxes:
np.array([[25., 25., 50., 50.], [10., 10., 15., 15.]]),
groundtruth_classes:
np.array([2, 2]),
detection_boxes:
np.array([[25., 25., 50., 50.], [10., 10., 15., 15.]]),
detection_scores:
np.array([.95, .9]),
detection_classes:
np.array([2, 2])
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self):
category_list = [{'id': 0, 'name': 'person'},
{'id': 1, 'name': 'cat'},
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(update_op,
feed_dict={
image_id: ['image1', 'image2', 'image3'],
groundtruth_boxes: np.array([[[100., 100., 200., 200.]],
[[50., 50., 100., 100.]],
[[25., 25., 50., 50.]]]),
groundtruth_classes: np.array([[1], [3], [2]]),
detection_boxes: np.array([[[100., 100., 200., 200.]],
[[50., 50., 100., 100.]],
[[25., 25., 50., 50.]]]),
detection_scores: np.array([[.8], [.7], [.9]]),
detection_classes: np.array([[1], [3], [2]])
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPaddedBatches(self):
category_list = [{'id': 0, 'name': 'person'},
{'id': 1, 'name': 'cat'},
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
num_gt_boxes_per_image = tf.placeholder(tf.int32, shape=(None,))
detection_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
num_det_boxes_per_image = tf.placeholder(tf.int32, shape=(None,))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes,
num_gt_boxes_per_image,
num_det_boxes_per_image)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(update_op,
feed_dict={
image_id: ['image1', 'image2', 'image3'],
groundtruth_boxes: np.array([[[100., 100., 200., 200.],
[-1, -1, -1, -1]],
[[50., 50., 100., 100.],
[-1, -1, -1, -1]],
[[25., 25., 50., 50.],
[10., 10., 15., 15.]]]),
groundtruth_classes: np.array([[1, -1], [3, -1], [2, 2]]),
num_gt_boxes_per_image: np.array([1, 1, 2]),
detection_boxes: np.array([[[100., 100., 200., 200.],
[0., 0., 0., 0.]],
[[50., 50., 100., 100.],
[0., 0., 0., 0.]],
[[25., 25., 50., 50.],
[10., 10., 15., 15.]]]),
detection_scores: np.array([[.8, 0.], [.7, 0.], [.95, .9]]),
detection_classes: np.array([[1, -1], [3, -1], [2, 2]]),
num_det_boxes_per_image: np.array([1, 1, 2]),
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
class CocoMaskEvaluationTest(tf.test.TestCase):
......
......@@ -325,16 +325,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
}
eval_metric_ops = None
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
if mode == tf.estimator.ModeKeys.EVAL:
class_agnostic = (fields.DetectionResultFields.detection_classes
not in detections)
groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
use_original_images = fields.InputDataFields.original_image in features
original_images = (
eval_images = (
features[fields.InputDataFields.original_image] if use_original_images
else features[fields.InputDataFields.image])
eval_dict = eval_util.result_dict_for_single_example(
original_images[0:1],
eval_images[0:1],
features[inputs.HASH_KEY][0],
detections,
groundtruth,
......@@ -355,22 +355,21 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
img_summary = tf.summary.image('Detections_Left_Groundtruth_Right',
detection_and_groundtruth)
if mode == tf.estimator.ModeKeys.EVAL:
# Eval metrics on a single example.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict,
include_metrics_per_category=False)
for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars:
eval_metric_ops[var.op.name] = (var, tf.no_op())
if img_summary is not None:
eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
img_summary, tf.no_op())
eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}
# Eval metrics on a single example.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict,
include_metrics_per_category=False)
for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars:
eval_metric_ops[var.op.name] = (var, tf.no_op())
if img_summary is not None:
eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
img_summary, tf.no_op())
eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}
if use_tpu:
return tf.contrib.tpu.TPUEstimatorSpec(
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -36,6 +36,10 @@ flags.DEFINE_string(
'hparams_overrides', None, 'Hyperparameter overrides, '
'represented as a string containing comma-separated '
'hparam_name=value pairs.')
flags.DEFINE_string(
'checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
'`checkpoint_dir` is provided, this binary operates in eval-only mode, '
'writing resulting metrics to `model_dir`.')
FLAGS = flags.FLAGS
......@@ -59,17 +63,23 @@ def main(unused_argv):
train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
train_spec, eval_specs = model_lib.create_train_and_eval_specs(
train_input_fn,
eval_input_fn,
eval_on_train_input_fn,
predict_input_fn,
train_steps,
eval_steps,
eval_on_train_data=False)
if FLAGS.checkpoint_dir:
estimator.evaluate(eval_input_fn,
eval_steps,
checkpoint_path=tf.train.latest_checkpoint(
FLAGS.checkpoint_dir))
else:
train_spec, eval_specs = model_lib.create_train_and_eval_specs(
train_input_fn,
eval_input_fn,
eval_on_train_input_fn,
predict_input_fn,
train_steps,
eval_steps,
eval_on_train_data=False)
# Currently only a single Eval Spec is allowed.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
# Currently only a single Eval Spec is allowed.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
if __name__ == '__main__':
......
......@@ -56,7 +56,9 @@ flags.DEFINE_integer('iterations_per_loop', 100,
# recent checkpoint every 10 minutes by default for train_and_eval
flags.DEFINE_string('mode', 'train',
'Mode to run: train, eval')
flags.DEFINE_integer('train_batch_size', 32 * 8, 'Batch size for training.')
flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If '
'this is not provided, batch size is read from training '
'config.')
flags.DEFINE_string(
'hparams_overrides', None, 'Comma-separated list of '
......@@ -93,6 +95,10 @@ def main(unused_argv):
iterations_per_loop=FLAGS.iterations_per_loop,
num_shards=FLAGS.num_shards))
kwargs = {}
if FLAGS.train_batch_size:
kwargs['batch_size'] = FLAGS.train_batch_size
train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config=config,
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
......@@ -102,7 +108,7 @@ def main(unused_argv):
use_tpu_estimator=True,
use_tpu=FLAGS.use_tpu,
num_shards=FLAGS.num_shards,
batch_size=FLAGS.train_batch_size)
**kwargs)
estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fn = train_and_eval_dict['eval_input_fn']
......
......@@ -14,6 +14,8 @@
# ==============================================================================
"""Mobilenet v1 Faster R-CNN implementation."""
import numpy as np
import tensorflow as tf
from object_detection.meta_architectures import faster_rcnn_meta_arch
......@@ -23,22 +25,31 @@ from nets import mobilenet_v1
slim = tf.contrib.slim
_MOBILENET_V1_100_CONV_NO_LAST_STRIDE_DEFS = [
mobilenet_v1.Conv(kernel=[3, 3], stride=2, depth=32),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=64),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=128),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=128),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=256),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=1024),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
]
def _get_mobilenet_conv_no_last_stride_defs(conv_depth_ratio_in_percentage):
if conv_depth_ratio_in_percentage not in [25, 50, 75, 100]:
raise ValueError(
'Only the following ratio percentages are supported: 25, 50, 75, 100')
conv_depth_ratio_in_percentage = float(conv_depth_ratio_in_percentage) / 100.0
channels = np.array([
32, 64, 128, 128, 256, 256, 512, 512, 512, 512, 512, 512, 1024, 1024
], dtype=np.float32)
channels = (channels * conv_depth_ratio_in_percentage).astype(np.int32)
return [
mobilenet_v1.Conv(kernel=[3, 3], stride=2, depth=channels[0]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[1]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[2]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[3]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[4]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[5]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[6]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[7]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[8]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[9]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[10]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[11]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[12]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[13])
]
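A quick check of the scaled conv defs (assumes the function above plus the `Conv`/`DepthSepConv` namedtuples from slim's `mobilenet_v1`):

```python
defs = _get_mobilenet_conv_no_last_stride_defs(50)
print([d.depth for d in defs])
# [16, 32, 64, 64, 128, 128, 256, 256, 256, 256, 256, 256, 512, 512]
```

The box-classifier head below applies the same ratio to its 1024-deep separable convs, e.g. int(1024 * 0.5) = 512 at 50 percent.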
class FasterRCNNMobilenetV1FeatureExtractor(
......@@ -53,7 +64,8 @@ class FasterRCNNMobilenetV1FeatureExtractor(
weight_decay=0.0,
depth_multiplier=1.0,
min_depth=16,
skip_last_stride=False):
skip_last_stride=False,
conv_depth_ratio_in_percentage=100):
"""Constructor.
Args:
......@@ -65,6 +77,8 @@ class FasterRCNNMobilenetV1FeatureExtractor(
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
skip_last_stride: Skip the last stride if True.
conv_depth_ratio_in_percentage: Conv depth ratio in percentage. Only
applied if skip_last_stride is True.
Raises:
ValueError: If `first_stage_features_stride` is not 8 or 16.
......@@ -74,6 +88,7 @@ class FasterRCNNMobilenetV1FeatureExtractor(
self._depth_multiplier = depth_multiplier
self._min_depth = min_depth
self._skip_last_stride = skip_last_stride
self._conv_depth_ratio_in_percentage = conv_depth_ratio_in_percentage
super(FasterRCNNMobilenetV1FeatureExtractor, self).__init__(
is_training, first_stage_features_stride, batch_norm_trainable,
reuse_weights, weight_decay)
......@@ -124,7 +139,9 @@ class FasterRCNNMobilenetV1FeatureExtractor(
reuse=self._reuse_weights) as scope:
params = {}
if self._skip_last_stride:
params['conv_defs'] = _MOBILENET_V1_100_CONV_NO_LAST_STRIDE_DEFS
params['conv_defs'] = _get_mobilenet_conv_no_last_stride_defs(
conv_depth_ratio_in_percentage=self.
_conv_depth_ratio_in_percentage)
_, activations = mobilenet_v1.mobilenet_v1_base(
preprocessed_inputs,
final_endpoint='Conv2d_11_pointwise',
......@@ -150,6 +167,11 @@ class FasterRCNNMobilenetV1FeatureExtractor(
"""
net = proposal_feature_maps
conv_depth = 1024
if self._skip_last_stride:
conv_depth_ratio = float(self._conv_depth_ratio_in_percentage) / 100.0
conv_depth = int(float(conv_depth) * conv_depth_ratio)
depth = lambda d: max(int(d * 1.0), 16)
with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights):
with slim.arg_scope(
......@@ -160,13 +182,13 @@ class FasterRCNNMobilenetV1FeatureExtractor(
[slim.conv2d, slim.separable_conv2d], padding='SAME'):
net = slim.separable_conv2d(
net,
depth(1024), [3, 3],
depth(conv_depth), [3, 3],
depth_multiplier=1,
stride=2,
scope='Conv2d_12_pointwise')
return slim.separable_conv2d(
net,
depth(1024), [3, 3],
depth(conv_depth), [3, 3],
depth_multiplier=1,
stride=1,
scope='Conv2d_13_pointwise')
......@@ -20,7 +20,7 @@ message ConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
// Minumum feature depth prior to predicting box encodings and class
// Minimum feature depth prior to predicting box encodings and class
// predictions.
optional int32 min_depth = 2 [default = 0];
......@@ -81,6 +81,12 @@ message WeightSharedConvolutionalBoxPredictor {
// training where there are a large number of negative boxes. See
// https://arxiv.org/abs/1708.02002 for details.
optional float class_prediction_bias_init = 10 [default = 0.0];
// Whether to use dropout for class prediction.
optional bool use_dropout = 11 [default = false];
// Keep probability for dropout.
optional float dropout_keep_probability = 12 [default = 0.8];
}
message MaskRCNNBoxPredictor {
......@@ -119,6 +125,10 @@ message MaskRCNNBoxPredictor {
// branch.
optional int32 mask_prediction_num_conv_layers = 11 [default = 2];
optional bool masks_are_class_agnostic = 12 [default = false];
// Whether to use one box for all classes rather than a different box for each
// class.
optional bool share_box_across_classes = 13 [default = false];
}
message RfcnBoxPredictor {
......
......@@ -60,6 +60,9 @@ message InputReader {
// Number of parallel decode ops to apply.
optional uint32 num_parallel_map_calls = 14 [default = 64];
// Number of groundtruth keypoints per object.
optional uint32 num_keypoints = 16 [default = 0];
// Whether to load groundtruth instance masks.
optional bool load_instance_masks = 7 [default = false];
......