Commit 12714f88 authored by Vivek Rathod

update post_processing module, builders, and meta architectures.

parent c46caa56
@@ -28,8 +28,8 @@ def build(post_processing_config):
  configuration.

  Non-max suppression callable takes `boxes`, `scores`, and optionally
-  `clip_window`, `parallel_iterations` and `scope` as inputs. It returns
-  `nms_boxes`, `nms_scores`, `nms_nms_classes` and `num_detections`. See
+  `clip_window`, `parallel_iterations`, `masks` and `scope` as inputs. It returns
+  `nms_boxes`, `nms_scores`, `nms_classes`, `nms_masks` and `num_detections`. See
  post_processing.batch_multiclass_non_max_suppression for the type and shape
  of these tensors.
@@ -55,7 +55,8 @@ def build(post_processing_config):
  non_max_suppressor_fn = _build_non_max_suppressor(
      post_processing_config.batch_non_max_suppression)
  score_converter_fn = _build_score_converter(
-      post_processing_config.score_converter)
+      post_processing_config.score_converter,
+      post_processing_config.logit_scale)
  return non_max_suppressor_fn, score_converter_fn
@@ -87,7 +88,17 @@ def _build_non_max_suppressor(nms_config):
  return non_max_suppressor_fn


-def _build_score_converter(score_converter_config):
+def _score_converter_fn_with_logit_scale(tf_score_converter_fn, logit_scale):
+  """Create a function to scale logits then apply a Tensorflow function."""
+  def score_converter_fn(logits):
+    scaled_logits = tf.divide(logits, logit_scale, name='scale_logits')
+    return tf_score_converter_fn(scaled_logits, name='convert_scores')
+  score_converter_fn.__name__ = '%s_with_logit_scale' % (
+      tf_score_converter_fn.__name__)
+  return score_converter_fn
+
+
+def _build_score_converter(score_converter_config, logit_scale):
  """Builds score converter based on the config.

  Builds one of [tf.identity, tf.sigmoid, tf.softmax] score converters based on
@@ -95,6 +106,7 @@ def _build_score_converter(score_converter_config):
  Args:
    score_converter_config: post_processing_pb2.PostProcessing.score_converter.
+    logit_scale: temperature to use for SOFTMAX score_converter.

  Returns:
    Callable score converter op.
@@ -103,9 +115,9 @@ def _build_score_converter(score_converter_config):
    ValueError: On unknown score converter.
  """
  if score_converter_config == post_processing_pb2.PostProcessing.IDENTITY:
-    return tf.identity
+    return _score_converter_fn_with_logit_scale(tf.identity, logit_scale)
  if score_converter_config == post_processing_pb2.PostProcessing.SIGMOID:
-    return tf.sigmoid
+    return _score_converter_fn_with_logit_scale(tf.sigmoid, logit_scale)
  if score_converter_config == post_processing_pb2.PostProcessing.SOFTMAX:
-    return tf.nn.softmax
+    return _score_converter_fn_with_logit_scale(tf.nn.softmax, logit_scale)
  raise ValueError('Unknown score converter.')
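For orientation, a minimal usage sketch of the updated builder (illustrative, not part of this commit; it assumes the new logit_scale field exists on the PostProcessing proto and the usual object_detection import paths):

from google.protobuf import text_format
from object_detection.builders import post_processing_builder
from object_detection.protos import post_processing_pb2

config = post_processing_pb2.PostProcessing()
text_format.Merge("""
  score_converter: SOFTMAX
  logit_scale: 2.0
""", config)
nms_fn, score_converter_fn = post_processing_builder.build(config)
# The returned callable divides logits by 2.0 before applying tf.nn.softmax.
print(score_converter_fn.__name__)  # softmax_with_logit_scale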
@@ -48,7 +48,31 @@ class PostProcessingBuilderTest(tf.test.TestCase):
    post_processing_config = post_processing_pb2.PostProcessing()
    text_format.Merge(post_processing_text_proto, post_processing_config)
    _, score_converter = post_processing_builder.build(post_processing_config)
-    self.assertEqual(score_converter, tf.identity)
+    self.assertEqual(score_converter.__name__, 'identity_with_logit_scale')
+
+    inputs = tf.constant([1, 1], tf.float32)
+    outputs = score_converter(inputs)
+    with self.test_session() as sess:
+      converted_scores = sess.run(outputs)
+      expected_converted_scores = sess.run(inputs)
+    self.assertAllClose(converted_scores, expected_converted_scores)
+
+  def test_build_identity_score_converter_with_logit_scale(self):
+    post_processing_text_proto = """
+      score_converter: IDENTITY
+      logit_scale: 2.0
+    """
+    post_processing_config = post_processing_pb2.PostProcessing()
+    text_format.Merge(post_processing_text_proto, post_processing_config)
+    _, score_converter = post_processing_builder.build(post_processing_config)
+    self.assertEqual(score_converter.__name__, 'identity_with_logit_scale')
+
+    inputs = tf.constant([1, 1], tf.float32)
+    outputs = score_converter(inputs)
+    with self.test_session() as sess:
+      converted_scores = sess.run(outputs)
+      expected_converted_scores = sess.run(tf.constant([.5, .5], tf.float32))
+    self.assertAllClose(converted_scores, expected_converted_scores)

  def test_build_sigmoid_score_converter(self):
    post_processing_text_proto = """
@@ -57,7 +81,7 @@ class PostProcessingBuilderTest(tf.test.TestCase):
    post_processing_config = post_processing_pb2.PostProcessing()
    text_format.Merge(post_processing_text_proto, post_processing_config)
    _, score_converter = post_processing_builder.build(post_processing_config)
-    self.assertEqual(score_converter, tf.sigmoid)
+    self.assertEqual(score_converter.__name__, 'sigmoid_with_logit_scale')

  def test_build_softmax_score_converter(self):
    post_processing_text_proto = """
@@ -66,7 +90,17 @@ class PostProcessingBuilderTest(tf.test.TestCase):
    post_processing_config = post_processing_pb2.PostProcessing()
    text_format.Merge(post_processing_text_proto, post_processing_config)
    _, score_converter = post_processing_builder.build(post_processing_config)
-    self.assertEqual(score_converter, tf.nn.softmax)
+    self.assertEqual(score_converter.__name__, 'softmax_with_logit_scale')
+
+  def test_build_softmax_score_converter_with_temperature(self):
+    post_processing_text_proto = """
+      score_converter: SOFTMAX
+      logit_scale: 2.0
+    """
+    post_processing_config = post_processing_pb2.PostProcessing()
+    text_format.Merge(post_processing_text_proto, post_processing_config)
+    _, score_converter = post_processing_builder.build(post_processing_config)
+    self.assertEqual(score_converter.__name__, 'softmax_with_logit_scale')
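The temperature test above only checks the converter's name; as a numerical sanity check (illustrative, not part of the commit), dividing logits by logit_scale=2.0 softens the resulting distribution:

import tensorflow as tf

logits = tf.constant([[1.0, 3.0]])
# softmax([1.0, 3.0])       ~= [0.119, 0.881]
# softmax([1.0, 3.0] / 2.0) ~= [0.269, 0.731], i.e. less peaked.
scaled_probs = tf.nn.softmax(tf.divide(logits, 2.0))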
if __name__ == '__main__':
...
@@ -76,8 +76,6 @@ def multiclass_non_max_suppression(boxes,
    a BoxList holding M boxes with a rank-1 scores field representing
      corresponding scores for each box with scores sorted in decreasing order
      and a rank-1 classes field representing a class label for each box.
-      If masks, keypoints, keypoint_heatmaps is not None, the boxlist will
-      contain masks, keypoints, keypoint_heatmaps corresponding to boxes.

  Raises:
    ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have
@@ -174,6 +172,7 @@ def batch_multiclass_non_max_suppression(boxes,
                                         change_coordinate_frame=False,
                                         num_valid_boxes=None,
                                         masks=None,
+                                        additional_fields=None,
                                         scope=None,
                                         parallel_iterations=32):
  """Multi-class version of non maximum suppression that operates on a batch.
@@ -203,11 +202,13 @@ def batch_multiclass_non_max_suppression(boxes,
      is provided)
    num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape
      [batch_size] representing the number of valid boxes to be considered
      for each image in the batch. This parameter allows for ignoring zero
      paddings.
    masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width]
      float32 tensor containing box masks. `q` can be either number of classes
      or 1 depending on whether a separate mask is predicted per class.
+    additional_fields: (optional) If not None, a dictionary that maps keys to
+      tensors whose dimensions are [batch_size, num_anchors, ...].
    scope: tf scope name.
    parallel_iterations: (optional) number of batch items to process in
      parallel.
@@ -223,9 +224,13 @@ def batch_multiclass_non_max_suppression(boxes,
      [batch_size, max_detections, mask_height, mask_width] float32 tensor
      containing masks for each selected box. This is set to None if input
      `masks` is None.
+    'nmsed_additional_fields': (optional) a dictionary of
+      [batch_size, max_detections, ...] float32 tensors corresponding to the
+      tensors specified in the input `additional_fields`. This is not returned
+      if input `additional_fields` is None.
    'num_detections': A [batch_size] int32 tensor indicating the number of
      valid detections per batch item. Only the top num_detections[i] entries in
-      nms_boxes[i], nms_scores[i] and nms_class[i] are valid. the rest of the
+      nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
      entries are zero paddings.

  Raises:
@@ -239,6 +244,7 @@ def batch_multiclass_non_max_suppression(boxes,
                     'to the third dimension of scores')
  original_masks = masks
+  original_additional_fields = additional_fields
  with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
    boxes_shape = boxes.shape
    batch_size = boxes_shape[0].value
@@ -255,15 +261,61 @@ def batch_multiclass_non_max_suppression(boxes,
      num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors
    # If masks aren't provided, create dummy masks so we can only have one copy
-    # of single_image_nms_fn and discard the dummy masks after map_fn.
+    # of _single_image_nms_fn and discard the dummy masks after map_fn.
    if masks is None:
      masks_shape = tf.stack([batch_size, num_anchors, 1, 0, 0])
      masks = tf.zeros(masks_shape)
-    def single_image_nms_fn(args):
-      """Runs NMS on a single image and returns padded output."""
-      (per_image_boxes, per_image_scores, per_image_masks,
-       per_image_num_valid_boxes) = args
+    if additional_fields is None:
+      additional_fields = {}
+
+    def _single_image_nms_fn(args):
+      """Runs NMS on a single image and returns padded output.
+
+      Args:
+        args: A list of tensors consisting of the following:
+          per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
+            detections. If `q` is 1 then same boxes are used for all classes
+            otherwise, if `q` is equal to number of classes, class-specific
+            boxes are used.
+          per_image_scores - A [num_anchors, num_classes] float32 tensor
+            containing the scores for each of the `num_anchors` detections.
+          per_image_masks - A [num_anchors, q, mask_height, mask_width] float32
+            tensor containing box masks. `q` can be either number of classes
+            or 1 depending on whether a separate mask is predicted per class.
+          per_image_additional_fields - (optional) A variable number of float32
+            tensors each with size [num_anchors, ...].
+          per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of
+            shape [batch_size] representing the number of valid boxes to be
+            considered for each image in the batch. This parameter allows for
+            ignoring zero paddings.
+
+      Returns:
+        'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
+          non-max suppressed boxes.
+        'nmsed_scores': A [max_detections] float32 tensor containing the scores
+          for the boxes.
+        'nmsed_classes': A [max_detections] float32 tensor containing the class
+          for boxes.
+        'nmsed_masks': (optional) a [max_detections, mask_height, mask_width]
+          float32 tensor containing masks for each selected box. This is set to
+          None if input `masks` is None.
+        'nmsed_additional_fields': (optional) A variable number of float32
+          tensors each with size [max_detections, ...] corresponding to the
+          input `per_image_additional_fields`.
+        'num_detections': A [batch_size] int32 tensor indicating the number of
+          valid detections per batch item. Only the top num_detections[i]
+          entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The
+          rest of the entries are zero paddings.
+      """
+      per_image_boxes = args[0]
+      per_image_scores = args[1]
+      per_image_masks = args[2]
+      per_image_additional_fields = {
+          key: value
+          for key, value in zip(additional_fields, args[3:-1])
+      }
+      per_image_num_valid_boxes = args[-1]
      per_image_boxes = tf.reshape(
          tf.slice(per_image_boxes, 3 * [0],
                   tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4])
@@ -271,12 +323,21 @@ def batch_multiclass_non_max_suppression(boxes,
          tf.slice(per_image_scores, [0, 0],
                   tf.stack([per_image_num_valid_boxes, -1])),
          [-1, num_classes])
      per_image_masks = tf.reshape(
          tf.slice(per_image_masks, 4 * [0],
                   tf.stack([per_image_num_valid_boxes, -1, -1, -1])),
          [-1, q, per_image_masks.shape[2].value,
           per_image_masks.shape[3].value])
+      if per_image_additional_fields is not None:
+        for key, tensor in per_image_additional_fields.items():
+          additional_field_shape = tensor.get_shape()
+          additional_field_dim = len(additional_field_shape)
+          per_image_additional_fields[key] = tf.reshape(
+              tf.slice(per_image_additional_fields[key],
+                       additional_field_dim * [0],
+                       tf.stack([per_image_num_valid_boxes] +
+                                (additional_field_dim - 1) * [-1])),
+              [-1] + [dim.value for dim in additional_field_shape[1:]])
      nmsed_boxlist = multiclass_non_max_suppression(
          per_image_boxes,
          per_image_scores,
@@ -284,9 +345,10 @@ def batch_multiclass_non_max_suppression(boxes,
          iou_thresh,
          max_size_per_class,
          max_total_size,
-          masks=per_image_masks,
          clip_window=clip_window,
-          change_coordinate_frame=change_coordinate_frame)
+          change_coordinate_frame=change_coordinate_frame,
+          masks=per_image_masks,
+          additional_fields=per_image_additional_fields)
      padded_boxlist = box_list_ops.pad_or_clip_box_list(nmsed_boxlist,
                                                         max_total_size)
      num_detections = nmsed_boxlist.num_boxes()
@@ -294,19 +356,40 @@ def batch_multiclass_non_max_suppression(boxes,
      nmsed_scores = padded_boxlist.get_field(fields.BoxListFields.scores)
      nmsed_classes = padded_boxlist.get_field(fields.BoxListFields.classes)
      nmsed_masks = padded_boxlist.get_field(fields.BoxListFields.masks)
-      return [nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-              num_detections]
+      nmsed_additional_fields = [
+          padded_boxlist.get_field(key) for key in per_image_additional_fields
+      ]
+      return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] +
+              nmsed_additional_fields + [num_detections])

+    num_additional_fields = 0
+    if additional_fields is not None:
+      num_additional_fields = len(additional_fields)
+    num_nmsed_outputs = 4 + num_additional_fields
+
-    (batch_nmsed_boxes, batch_nmsed_scores,
-     batch_nmsed_classes, batch_nmsed_masks,
-     batch_num_detections) = tf.map_fn(
-         single_image_nms_fn,
-         elems=[boxes, scores, masks, num_valid_boxes],
-         dtype=[tf.float32, tf.float32, tf.float32, tf.float32, tf.int32],
-         parallel_iterations=parallel_iterations)
+    batch_outputs = tf.map_fn(
+        _single_image_nms_fn,
+        elems=([boxes, scores, masks] + list(additional_fields.values()) +
+               [num_valid_boxes]),
+        dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]),
+        parallel_iterations=parallel_iterations)
+
+    batch_nmsed_boxes = batch_outputs[0]
+    batch_nmsed_scores = batch_outputs[1]
+    batch_nmsed_classes = batch_outputs[2]
+    batch_nmsed_masks = batch_outputs[3]
+    batch_nmsed_additional_fields = {
+        key: value
+        for key, value in zip(additional_fields, batch_outputs[4:-1])
+    }
+    batch_num_detections = batch_outputs[-1]
    if original_masks is None:
      batch_nmsed_masks = None
+    if original_additional_fields is None:
+      batch_nmsed_additional_fields = None
    return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes,
-            batch_nmsed_masks, batch_num_detections)
+            batch_nmsed_masks, batch_nmsed_additional_fields,
+            batch_num_detections)
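A usage sketch of the new additional_fields plumbing (illustrative; the shapes and values are made up): every field must be shaped [batch_size, num_anchors, ...] with static trailing dimensions, and it comes back gathered to the surviving detections as [batch_size, max_detections, ...]:

import tensorflow as tf
from object_detection.core import post_processing

batch_size, num_anchors, q, num_classes = 2, 8, 1, 3
boxes = tf.random_uniform([batch_size, num_anchors, q, 4])
scores = tf.random_uniform([batch_size, num_anchors, num_classes])
# Per-anchor keypoints ride along through NMS, e.g. 4 keypoints of (y, x).
additional_fields = {
    'keypoints': tf.random_uniform([batch_size, num_anchors, 4, 2])
}

(nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
 nmsed_additional_fields, num_detections
) = post_processing.batch_multiclass_non_max_suppression(
    boxes, scores, 0.1, 0.5,
    max_size_per_class=5, max_total_size=5,
    additional_fields=additional_fields)
# nmsed_additional_fields['keypoints'] has shape [2, 5, 4, 2], aligned with
# nmsed_boxes; nmsed_masks is None because no masks were passed in.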
@@ -497,11 +497,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
    exp_nms_classes = [[0, 0, 1, 0]]

    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-     num_detections) = post_processing.batch_multiclass_non_max_suppression(
-        boxes, scores, score_thresh, iou_thresh,
-        max_size_per_class=max_output_size, max_total_size=max_output_size)
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes, scores, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size)

    self.assertIsNone(nmsed_masks)
+    self.assertIsNone(nmsed_additional_fields)

    with self.test_session() as sess:
      (nmsed_boxes, nmsed_scores, nmsed_classes,
@@ -544,11 +546,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
                                [1, 0, 0, 0]])

    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-     num_detections) = post_processing.batch_multiclass_non_max_suppression(
-        boxes, scores, score_thresh, iou_thresh,
-        max_size_per_class=max_output_size, max_total_size=max_output_size)
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes, scores, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size)

    self.assertIsNone(nmsed_masks)
+    self.assertIsNone(nmsed_additional_fields)

    # Check static shapes
    self.assertAllEqual(nmsed_boxes.shape.as_list(),
                        exp_nms_corners.shape)
@@ -616,11 +620,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
                               [[0, 0], [0, 0]]]])

    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-     num_detections) = post_processing.batch_multiclass_non_max_suppression(
-        boxes, scores, score_thresh, iou_thresh,
-        max_size_per_class=max_output_size, max_total_size=max_output_size,
-        masks=masks)
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes, scores, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size,
+        masks=masks)
+
+    self.assertIsNone(nmsed_additional_fields)

    # Check static shapes
    self.assertAllEqual(nmsed_boxes.shape.as_list(), exp_nms_corners.shape)
    self.assertAllEqual(nmsed_scores.shape.as_list(), exp_nms_scores.shape)
@@ -639,6 +645,91 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
      self.assertAllClose(num_detections, [2, 3])
      self.assertAllClose(nmsed_masks, exp_nms_masks)
+  def test_batch_multiclass_nms_with_additional_fields(self):
+    boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]],
+                          [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]],
+                          [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]],
+                          [[0, 10, 1, 11], [0, 10, 1, 11]]],
+                         [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]],
+                          [[0, 100, 1, 101], [0, 100, 1, 101]],
+                          [[0, 1000, 1, 1002], [0, 999, 2, 1004]],
+                          [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]],
+                        tf.float32)
+    scores = tf.constant([[[.9, 0.01], [.75, 0.05],
+                           [.6, 0.01], [.95, 0]],
+                          [[.5, 0.01], [.3, 0.01],
+                           [.01, .85], [.01, .5]]])
+    additional_fields = {
+        'keypoints': tf.constant(
+            [[[[6, 7], [8, 9]],
+              [[0, 1], [2, 3]],
+              [[0, 0], [0, 0]],
+              [[0, 0], [0, 0]]],
+             [[[13, 14], [15, 16]],
+              [[8, 9], [10, 11]],
+              [[10, 11], [12, 13]],
+              [[0, 0], [0, 0]]]],
+            tf.float32)
+    }
+    score_thresh = 0.1
+    iou_thresh = .5
+    max_output_size = 4
+
+    exp_nms_corners = np.array([[[0, 10, 1, 11],
+                                 [0, 0, 1, 1],
+                                 [0, 0, 0, 0],
+                                 [0, 0, 0, 0]],
+                                [[0, 999, 2, 1004],
+                                 [0, 10.1, 1, 11.1],
+                                 [0, 100, 1, 101],
+                                 [0, 0, 0, 0]]])
+    exp_nms_scores = np.array([[.95, .9, 0, 0],
+                               [.85, .5, .3, 0]])
+    exp_nms_classes = np.array([[0, 0, 0, 0],
+                                [1, 0, 0, 0]])
+    exp_nms_additional_fields = {
+        'keypoints': np.array([[[[0, 0], [0, 0]],
+                                [[6, 7], [8, 9]],
+                                [[0, 0], [0, 0]],
+                                [[0, 0], [0, 0]]],
+                               [[[10, 11], [12, 13]],
+                                [[13, 14], [15, 16]],
+                                [[8, 9], [10, 11]],
+                                [[0, 0], [0, 0]]]])
+    }
+
+    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes, scores, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size,
+        additional_fields=additional_fields)
+
+    self.assertIsNone(nmsed_masks)
+
+    # Check static shapes
+    self.assertAllEqual(nmsed_boxes.shape.as_list(), exp_nms_corners.shape)
+    self.assertAllEqual(nmsed_scores.shape.as_list(), exp_nms_scores.shape)
+    self.assertAllEqual(nmsed_classes.shape.as_list(), exp_nms_classes.shape)
+    self.assertEqual(len(nmsed_additional_fields),
+                     len(exp_nms_additional_fields))
+    for key in exp_nms_additional_fields:
+      self.assertAllEqual(nmsed_additional_fields[key].shape.as_list(),
+                          exp_nms_additional_fields[key].shape)
+    self.assertEqual(num_detections.shape.as_list(), [2])
+
+    with self.test_session() as sess:
+      (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_additional_fields,
+       num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes,
+                                   nmsed_additional_fields, num_detections])
+
+      self.assertAllClose(nmsed_boxes, exp_nms_corners)
+      self.assertAllClose(nmsed_scores, exp_nms_scores)
+      self.assertAllClose(nmsed_classes, exp_nms_classes)
+      for key in exp_nms_additional_fields:
+        self.assertAllClose(nmsed_additional_fields[key],
+                            exp_nms_additional_fields[key])
+      self.assertAllClose(num_detections, [2, 3])
  def test_batch_multiclass_nms_with_dynamic_batch_size(self):
    boxes_placeholder = tf.placeholder(tf.float32, shape=(None, None, 2, 4))
    scores_placeholder = tf.placeholder(tf.float32, shape=(None, None, 2))
@@ -690,11 +781,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
                               [[0, 0], [0, 0]]]])

    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-     num_detections) = post_processing.batch_multiclass_non_max_suppression(
-        boxes_placeholder, scores_placeholder, score_thresh, iou_thresh,
-        max_size_per_class=max_output_size, max_total_size=max_output_size,
-        masks=masks_placeholder)
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes_placeholder, scores_placeholder, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size,
+        masks=masks_placeholder)
+
+    self.assertIsNone(nmsed_additional_fields)

    # Check static shapes
    self.assertAllEqual(nmsed_boxes.shape.as_list(), [None, 4, 4])
    self.assertAllEqual(nmsed_scores.shape.as_list(), [None, 4])
@@ -765,10 +858,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
                     [[0, 0], [0, 0]]]]

    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-     num_detections) = post_processing.batch_multiclass_non_max_suppression(
-        boxes, scores, score_thresh, iou_thresh,
-        max_size_per_class=max_output_size, max_total_size=max_output_size,
-        num_valid_boxes=num_valid_boxes, masks=masks)
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes, scores, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size,
+        num_valid_boxes=num_valid_boxes, masks=masks)
+
+    self.assertIsNone(nmsed_additional_fields)

    with self.test_session() as sess:
      (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
@@ -780,6 +876,84 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
      self.assertAllClose(num_detections, [1, 1])
      self.assertAllClose(nmsed_masks, exp_nms_masks)
+  def test_batch_multiclass_nms_with_additional_fields_and_num_valid_boxes(
+      self):
+    boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]],
+                          [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]],
+                          [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]],
+                          [[0, 10, 1, 11], [0, 10, 1, 11]]],
+                         [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]],
+                          [[0, 100, 1, 101], [0, 100, 1, 101]],
+                          [[0, 1000, 1, 1002], [0, 999, 2, 1004]],
+                          [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]],
+                        tf.float32)
+    scores = tf.constant([[[.9, 0.01], [.75, 0.05],
+                           [.6, 0.01], [.95, 0]],
+                          [[.5, 0.01], [.3, 0.01],
+                           [.01, .85], [.01, .5]]])
+    additional_fields = {
+        'keypoints': tf.constant(
+            [[[[6, 7], [8, 9]],
+              [[0, 1], [2, 3]],
+              [[0, 0], [0, 0]],
+              [[0, 0], [0, 0]]],
+             [[[13, 14], [15, 16]],
+              [[8, 9], [10, 11]],
+              [[10, 11], [12, 13]],
+              [[0, 0], [0, 0]]]],
+            tf.float32)
+    }
+    num_valid_boxes = tf.constant([1, 1], tf.int32)
+    score_thresh = 0.1
+    iou_thresh = .5
+    max_output_size = 4
+
+    exp_nms_corners = [[[0, 0, 1, 1],
+                        [0, 0, 0, 0],
+                        [0, 0, 0, 0],
+                        [0, 0, 0, 0]],
+                       [[0, 10.1, 1, 11.1],
+                        [0, 0, 0, 0],
+                        [0, 0, 0, 0],
+                        [0, 0, 0, 0]]]
+    exp_nms_scores = [[.9, 0, 0, 0],
+                      [.5, 0, 0, 0]]
+    exp_nms_classes = [[0, 0, 0, 0],
+                       [0, 0, 0, 0]]
+    exp_nms_additional_fields = {
+        'keypoints': np.array([[[[6, 7], [8, 9]],
+                                [[0, 0], [0, 0]],
+                                [[0, 0], [0, 0]],
+                                [[0, 0], [0, 0]]],
+                               [[[13, 14], [15, 16]],
+                                [[0, 0], [0, 0]],
+                                [[0, 0], [0, 0]],
+                                [[0, 0], [0, 0]]]])
+    }
+
+    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
+     nmsed_additional_fields, num_detections
+    ) = post_processing.batch_multiclass_non_max_suppression(
+        boxes, scores, score_thresh, iou_thresh,
+        max_size_per_class=max_output_size, max_total_size=max_output_size,
+        num_valid_boxes=num_valid_boxes,
+        additional_fields=additional_fields)
+
+    self.assertIsNone(nmsed_masks)
+
+    with self.test_session() as sess:
+      (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_additional_fields,
+       num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes,
+                                   nmsed_additional_fields, num_detections])
+
+      self.assertAllClose(nmsed_boxes, exp_nms_corners)
+      self.assertAllClose(nmsed_scores, exp_nms_scores)
+      self.assertAllClose(nmsed_classes, exp_nms_classes)
+      for key in exp_nms_additional_fields:
+        self.assertAllClose(nmsed_additional_fields[key],
+                            exp_nms_additional_fields[key])
+      self.assertAllClose(num_detections, [1, 1])

if __name__ == '__main__':
  tf.test.main()
@@ -18,6 +18,7 @@ py_library(
        "//tensorflow_models/object_detection/core:model",
        "//tensorflow_models/object_detection/core:target_assigner",
        "//tensorflow_models/object_detection/utils:shape_utils",
+        "//tensorflow_models/object_detection/utils:visualization_utils",
    ],
)
...
@@ -63,7 +63,7 @@ postprocessing operations are always normalized boxes however, internally, we
sometimes convert to absolute --- e.g. for loss computation. In particular,
anchors and proposal_boxes are both represented as absolute coordinates.

-TODO: Support TPU implementations and sigmoid loss.
+TODO: Support TPU implementations.
"""
from abc import abstractmethod
from functools import partial
@@ -91,6 +91,7 @@ class FasterRCNNFeatureExtractor(object):
  def __init__(self,
               is_training,
               first_stage_features_stride,
+               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0):
    """Constructor.
@@ -99,11 +100,15 @@ class FasterRCNNFeatureExtractor(object):
      is_training: A boolean indicating whether the training version of the
        computation graph should be constructed.
      first_stage_features_stride: Output stride of extracted RPN feature map.
+      batch_norm_trainable: Whether to update batch norm parameters during
+        training or not. When training with a relatively large batch size
+        (e.g. 8), it could be desirable to enable batch norm update.
      reuse_weights: Whether to reuse variables. Default is None.
      weight_decay: float weight decay for feature extractor (default: 0.0).
    """
    self._is_training = is_training
    self._first_stage_features_stride = first_stage_features_stride
+    self._train_batch_norm = (batch_norm_trainable and is_training)
    self._reuse_weights = reuse_weights
    self._weight_decay = weight_decay
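A sketch (an assumption, not shown in this diff) of how a concrete feature extractor would typically consume the new flag: self._train_batch_norm, rather than self._is_training, drives the is_training argument of slim batch norm, so batch-norm statistics are only updated when batch_norm_trainable was requested and the model is training:

import tensorflow as tf
slim = tf.contrib.slim

def extract_features_sketch(inputs, train_batch_norm):
  # train_batch_norm plays the role of the extractor's self._train_batch_norm.
  with slim.arg_scope([slim.batch_norm], is_training=train_batch_norm):
    return slim.conv2d(inputs, 64, [3, 3], normalizer_fn=slim.batch_norm)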
@@ -214,7 +219,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
               second_stage_score_conversion_fn,
               second_stage_localization_loss_weight,
               second_stage_classification_loss_weight,
-               hard_example_miner,
+               second_stage_classification_loss,
+               second_stage_mask_prediction_loss_weight=1.0,
+               hard_example_miner=None,
               parallel_iterations=16):
    """FasterRCNNMetaArch Constructor.
@@ -225,10 +232,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
        include the background category, so if groundtruth labels take values
        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
        assigned classification targets can range from {0,... K}).
-      image_resizer_fn: A callable for image resizing. This callable always
-        takes a rank-3 image tensor (corresponding to a single image) and
-        returns a rank-3 image tensor, possibly with new spatial dimensions.
-        See builders/image_resizer_builder.py.
+      image_resizer_fn: A callable for image resizing. This callable
+        takes a rank-3 image tensor of shape [height, width, channels]
+        (corresponding to a single image) and returns a rank-3 image tensor,
+        possibly with new spatial dimensions. See
+        builders/image_resizer_builder.py.
      feature_extractor: A FasterRCNNFeatureExtractor object.
      first_stage_only: Whether to construct only the Region Proposal Network
        (RPN) part of the model.
@@ -295,19 +303,28 @@ class FasterRCNNMetaArch(model.DetectionModel):
      second_stage_score_conversion_fn: Callable elementwise nonlinearity
        (that takes tensors as inputs and returns tensors). This is usually
        used to convert logits to probabilities.
-      second_stage_localization_loss_weight: A float
-      second_stage_classification_loss_weight: A float
+      second_stage_localization_loss_weight: A float indicating the scale factor
+        for second stage localization loss.
+      second_stage_classification_loss_weight: A float indicating the scale
+        factor for second stage classification loss.
+      second_stage_classification_loss: Classification loss used by the second
+        stage classifier. Either losses.WeightedSigmoidClassificationLoss or
+        losses.WeightedSoftmaxClassificationLoss.
+      second_stage_mask_prediction_loss_weight: A float indicating the scale
+        factor for second stage mask prediction loss. This is applicable only if
+        second stage box predictor is configured to predict masks.
      hard_example_miner: A losses.HardExampleMiner object (can be None).
      parallel_iterations: (Optional) The number of iterations allowed to run
        in parallel for calls to tf.map_fn.

    Raises:
-      ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`
+      ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
+        training time.
      ValueError: If first_stage_anchor_generator is not of type
        grid_anchor_generator.GridAnchorGenerator.
    """
    super(FasterRCNNMetaArch, self).__init__(num_classes=num_classes)
-    if second_stage_batch_size > first_stage_max_proposals:
+    if is_training and second_stage_batch_size > first_stage_max_proposals:
      raise ValueError('second_stage_batch_size should be no greater than '
                       'first_stage_max_proposals.')
    if not isinstance(first_stage_anchor_generator,
@@ -375,10 +392,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
    self._second_stage_localization_loss = (
        losses.WeightedSmoothL1LocalizationLoss(anchorwise_output=True))
-    self._second_stage_classification_loss = (
-        losses.WeightedSoftmaxClassificationLoss(anchorwise_output=True))
+    self._second_stage_classification_loss = second_stage_classification_loss
+    self._second_stage_mask_loss = (
+        losses.WeightedSigmoidClassificationLoss(anchorwise_output=True))
    self._second_stage_loc_loss_weight = second_stage_localization_loss_weight
    self._second_stage_cls_loss_weight = second_stage_classification_loss_weight
+    self._second_stage_mask_loss_weight = (
+        second_stage_mask_prediction_loss_weight)
    self._hard_example_miner = hard_example_miner
    self._parallel_iterations = parallel_iterations
@@ -491,7 +511,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
          [total_num_proposals, num_classes, 4] representing predicted
          (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals
-        8) class_predictions_with_background: a 2-D tensor with shape
+        8) class_predictions_with_background: a 3-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
@@ -504,7 +524,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
          `self.max_num_proposals` for each image.
        10) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
-          decoded proposal bounding boxes (in absolute coordinates).
+          decoded proposal bounding boxes in absolute coordinates.
        11) mask_predictions: (optional) a 4-D tensor with shape
          [total_num_padded_proposals, num_classes, mask_height, mask_width]
          containing instance mask predictions.
@@ -553,10 +573,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
    """Predicts the output tensors from second stage of Faster R-CNN.

    Args:
-      rpn_box_encodings: 3-D float tensor of shape
+      rpn_box_encodings: 4-D float tensor of shape
        [batch_size, num_valid_anchors, self._box_coder.code_size] containing
        predicted boxes.
-      rpn_objectness_predictions_with_background: 3-D float tensor of shape
+      rpn_objectness_predictions_with_background: 2-D float tensor of shape
        [batch_size, num_valid_anchors, 2] containing class
        predictions (logits) for each of the anchors. Note that this
        tensor *includes* background class predictions (at class index 0).
@@ -573,7 +593,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
          [total_num_proposals, num_classes, 4] representing predicted
          (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals
-        2) class_predictions_with_background: a 2-D tensor with shape
+        2) class_predictions_with_background: a 3-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
@@ -586,8 +606,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
          `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
-          decoded proposal bounding boxes (in absolute coordinates).
-        5) mask_predictions: (optional) a 4-D tensor with shape
+          decoded proposal bounding boxes in absolute coordinates.
+        5) proposal_boxes_normalized: A float32 tensor of shape
+          [batch_size, self.max_num_proposals, 4] representing decoded proposal
+          bounding boxes in normalized coordinates. Can be used to override the
+          boxes proposed by the RPN, thus enabling one to extract features and
+          get box classification and prediction for externally selected areas
+          of the image.
+        6) box_classifier_features: a 4-D float32 tensor representing the
+          features for each proposal.
+        7) mask_predictions: (optional) a 4-D tensor with shape
          [total_num_padded_proposals, num_classes, mask_height, mask_width]
          containing instance mask predictions.
    """
@@ -622,7 +650,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
            class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
+        'box_classifier_features': box_classifier_features,
+        'proposal_boxes_normalized': proposal_boxes_normalized,
    }
+    if box_predictor.MASK_PREDICTIONS in box_predictions:
+      mask_predictions = tf.squeeze(box_predictions[
+          box_predictor.MASK_PREDICTIONS], axis=1)
+      prediction_dict['mask_predictions'] = mask_predictions
    return prediction_dict
  def _extract_rpn_feature_maps(self, preprocessed_inputs):
@@ -729,10 +764,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
        extent of the window to clip/prune to.

    Returns:
-      box_encodings: 3-D float tensor of shape
+      box_encodings: 4-D float tensor of shape
        [batch_size, num_valid_anchors, self._box_coder.code_size] containing
        predicted boxes, where num_valid_anchors <= num_anchors
-      objectness_predictions_with_background: 3-D float tensor of shape
+      objectness_predictions_with_background: 2-D float tensor of shape
        [batch_size, num_valid_anchors, 2] containing class
        predictions (logits) for each of the anchors, where
        num_valid_anchors <= num_anchors. Note that this
@@ -813,7 +848,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
      return {
          'detection_boxes': proposal_boxes,
          'detection_scores': proposal_scores,
-          'num_detections': num_proposals
+          'num_detections': tf.to_float(num_proposals)
      }
    with tf.name_scope('SecondStagePostprocessor'):
      mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
@@ -877,7 +912,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
    rpn_objectness_softmax_without_background = tf.nn.softmax(
        rpn_objectness_predictions_with_background_batch)[:, :, 1]
    clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
-    (proposal_boxes, proposal_scores, _, _,
+    (proposal_boxes, proposal_scores, _, _, _,
     num_proposals) = post_processing.batch_multiclass_non_max_suppression(
         tf.expand_dims(proposal_boxes, axis=2),
         tf.expand_dims(rpn_objectness_softmax_without_background,
@@ -891,7 +926,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
      proposal_boxes = tf.stop_gradient(proposal_boxes)
      if not self._hard_example_miner:
        (groundtruth_boxlists, groundtruth_classes_with_background_list,
-        ) = self._format_groundtruth_data(image_shape)
+         _) = self._format_groundtruth_data(image_shape)
        (proposal_boxes, proposal_scores,
         num_proposals) = self._unpad_proposals_and_sample_box_classifier_batch(
             proposal_boxes, proposal_scores, num_proposals,
@@ -998,6 +1033,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
    for target assignment, we:
    1) convert boxes to absolute coordinates,
    2) add a background class at class index 0
+    3) groundtruth instance masks, if available, are resized to match
+       image_shape.

    Args:
      image_shape: A 1-D int32 tensor of shape [4] representing the shape of the
@@ -1009,6 +1046,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
      groundtruth_classes_with_background_list: A list of 2-D one-hot
        (or k-hot) tensors of shape [num_boxes, num_classes+1] containing the
        class targets with the 0th index assumed to map to the background class.
+      groundtruth_masks_list: If present, a list of 3-D tf.float32 tensors of
+        shape [num_boxes, image_height, image_width] containing instance masks.
+        This is set to None if no masks exist in the provided groundtruth.
    """
    groundtruth_boxlists = [
        box_list_ops.to_absolute_coordinates(
@@ -1019,7 +1059,22 @@ class FasterRCNNMetaArch(model.DetectionModel):
        tf.pad(one_hot_encoding, [[0, 0], [1, 0]], mode='CONSTANT'))
        for one_hot_encoding in self.groundtruth_lists(
            fields.BoxListFields.classes)]
-    return groundtruth_boxlists, groundtruth_classes_with_background_list
+
+    groundtruth_masks_list = self._groundtruth_lists.get(
+        fields.BoxListFields.masks)
+    if groundtruth_masks_list is not None:
+      resized_masks_list = []
+      for mask in groundtruth_masks_list:
+        resized_4d_mask = tf.image.resize_images(
+            tf.expand_dims(mask, axis=3),
+            image_shape[1:3],
+            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
+            align_corners=True)
+        resized_masks_list.append(tf.squeeze(resized_4d_mask, axis=3))
+      groundtruth_masks_list = resized_masks_list
+
+    return (groundtruth_boxlists, groundtruth_classes_with_background_list,
+            groundtruth_masks_list)
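Shape walk-through of the resize above (illustrative numbers only): a groundtruth mask tensor of shape [num_boxes, mask_height, mask_width] is lifted to 4-D, resized to the padded image size, and squeezed back:

mask = tf.zeros([3, 28, 28])                        # [num_boxes, h, w]
resized = tf.image.resize_images(
    tf.expand_dims(mask, axis=3),                   # [3, 28, 28, 1]
    [640, 640],                                     # image_shape[1:3]
    method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,  # keeps masks binary
    align_corners=True)
mask_on_image = tf.squeeze(resized, axis=3)         # [3, 640, 640]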
  def _sample_box_classifier_minibatch(self,
                                       proposal_boxlist,
@@ -1100,29 +1155,26 @@ class FasterRCNNMetaArch(model.DetectionModel):
                               proposal_boxes,
                               num_proposals,
                               image_shape,
-                               mask_predictions=None,
-                               mask_threshold=0.5):
+                               mask_predictions=None):
    """Converts predictions from the second stage box classifier to detections.

    Args:
-      refined_box_encodings: a 3-D tensor with shape
+      refined_box_encodings: a 3-D float tensor with shape
        [total_num_padded_proposals, num_classes, 4] representing predicted
        (final) refined box encodings.
-      class_predictions_with_background: a 3-D tensor with shape
+      class_predictions_with_background: a 3-D float tensor with shape
        [total_num_padded_proposals, num_classes + 1] containing class
        predictions (logits) for each of the proposals. Note that this tensor
        *includes* background class predictions (at class index 0).
-      proposal_boxes: [batch_size, self.max_num_proposals, 4] representing
-        decoded proposal bounding boxes.
-      num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch]
-        representing the number of proposals predicted for each image in
-        the batch.
-      image_shape: a 1-D tensor representing the input image shape.
-      mask_predictions: (optional) a 4-D tensor with shape
+      proposal_boxes: a 3-D float tensor with shape
+        [batch_size, self.max_num_proposals, 4] representing decoded proposal
+        bounding boxes in absolute coordinates.
+      num_proposals: a 1-D int32 tensor of shape [batch] representing the number
+        of proposals predicted for each image in the batch.
+      image_shape: a 1-D int32 tensor representing the input image shape.
+      mask_predictions: (optional) a 4-D float tensor with shape
        [total_num_padded_proposals, num_classes, mask_height, mask_width]
-        containing instance mask predictions.
-      mask_threshold: a scalar threshold determining which mask values are
-        rounded to 0 or 1.
+        containing instance mask prediction logits.

    Returns:
      A dictionary containing:
@@ -1131,7 +1183,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
        `detection_classes`: [batch, max_detections]
        `num_detections`: [batch]
        `detection_masks`:
-          (optional) [batch, max_detections, mask_height, mask_width]
+          (optional) [batch, max_detections, mask_height, mask_width]. Note
+            that a pixel-wise sigmoid score converter is applied to the
+            detection masks.
    """
    refined_box_encodings_batch = tf.reshape(refined_box_encodings,
                                             [-1, self.max_num_proposals,
@@ -1156,10 +1210,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
    if mask_predictions is not None:
      mask_height = mask_predictions.shape[2].value
      mask_width = mask_predictions.shape[3].value
+      mask_predictions = tf.sigmoid(mask_predictions)
      mask_predictions_batch = tf.reshape(
          mask_predictions, [-1, self.max_num_proposals,
                             self.num_classes, mask_height, mask_width])
-    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
+    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, _,
     num_detections) = self._second_stage_nms_fn(
        refined_decoded_boxes_batch,
        class_predictions_batch,
@@ -1173,26 +1228,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
        'num_detections': tf.to_float(num_detections)}
    if nmsed_masks is not None:
      detections['detection_masks'] = nmsed_masks
-    if mask_predictions is not None:
-      detections['detection_masks'] = tf.to_float(
-          tf.greater_equal(detections['detection_masks'], mask_threshold))
    return detections
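With mask_threshold removed, detection_masks now carries per-pixel sigmoid scores in [0, 1] instead of hard {0, 1} values. A consumer that still wants binary masks can reapply the old thresholding step itself (sketch, using the previous default of 0.5):

binary_masks = tf.to_float(
    tf.greater_equal(detections['detection_masks'], 0.5))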
  def _batch_decode_boxes(self, box_encodings, anchor_boxes):
    """Decodes box encodings with respect to the anchor boxes.
    Args:
...@@ -1246,7 +1284,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
        corresponding loss values.
    """
    with tf.name_scope(scope, 'Loss', prediction_dict.values()):
      (groundtruth_boxlists, groundtruth_classes_with_background_list,
       groundtruth_masks_list
      ) = self._format_groundtruth_data(prediction_dict['image_shape'])
      loss_dict = self._loss_rpn(
          prediction_dict['rpn_box_encodings'],
...@@ -1262,7 +1301,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
              prediction_dict['proposal_boxes'],
              prediction_dict['num_proposals'],
              groundtruth_boxlists,
              groundtruth_classes_with_background_list,
              prediction_dict['image_shape'],
              prediction_dict.get('mask_predictions'),
              groundtruth_masks_list,
          ))
    return loss_dict
  def _loss_rpn(self,
...@@ -1278,10 +1321,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
    participate in the loss computation, and returns the RPN losses.

    Args:
      rpn_box_encodings: A 3-D float tensor of shape
        [batch_size, num_anchors, self._box_coder.code_size] containing
        predicted proposal box encodings.
      rpn_objectness_predictions_with_background: A 3-D float tensor of shape
        [batch_size, num_anchors, 2] containing objectness predictions
        (logits) for each of the anchors with 0 corresponding to background
        and 1 corresponding to object.
...@@ -1334,12 +1377,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
          tf.reduce_sum(localization_losses, axis=1) / normalizer)
      objectness_loss = tf.reduce_mean(
          tf.reduce_sum(objectness_losses, axis=1) / normalizer)

      loss_dict = {}
      with tf.name_scope('localization_loss'):
        loss_dict['first_stage_localization_loss'] = (
            self._first_stage_loc_loss_weight * localization_loss)
      with tf.name_scope('objectness_loss'):
        loss_dict['first_stage_objectness_loss'] = (
            self._first_stage_obj_loss_weight * objectness_loss)
    return loss_dict
  def _loss_box_classifier(self,
...@@ -1348,17 +1393,23 @@ class FasterRCNNMetaArch(model.DetectionModel):
                           proposal_boxes,
                           num_proposals,
                           groundtruth_boxlists,
                           groundtruth_classes_with_background_list,
                           image_shape,
                           prediction_masks=None,
                           groundtruth_masks_list=None):
    """Computes scalar box classifier loss tensors.

    Uses self._detector_target_assigner to obtain regression and classification
    targets for the second stage box classifier, optionally performs
    hard mining, and returns losses.  All losses are computed independently
    for each image and then averaged across the batch.
    Please note that for boxes and masks with multiple labels, the box
    regression and mask prediction losses are only computed for one label.

    This function assumes that the proposal boxes in the "padded" regions are
    actually zero (and thus should not be matched to).

    Args:
      refined_box_encodings: a 3-D tensor with shape
        [total_num_proposals, num_classes, box_coder.code_size] representing
...@@ -1377,11 +1428,23 @@ class FasterRCNNMetaArch(model.DetectionModel):
      groundtruth_classes_with_background_list: a list of 2-D one-hot
        (or k-hot) tensors of shape [num_boxes, num_classes + 1] containing the
        class targets with the 0th index assumed to map to the background class.
      image_shape: a 1-D tensor of shape [4] representing the image shape.
      prediction_masks: an optional 4-D tensor with shape [total_num_proposals,
        num_classes, mask_height, mask_width] containing the instance masks for
        each box.
      groundtruth_masks_list: an optional list of 3-D tensors of shape
        [num_boxes, image_height, image_width] containing the instance masks
        for each of the boxes.

    Returns:
      a dictionary mapping loss keys ('second_stage_localization_loss',
        'second_stage_classification_loss') to scalar tensors representing
        corresponding loss values.

    Raises:
      ValueError: if `predict_instance_masks` in
        second_stage_mask_rcnn_box_predictor is True and
        `groundtruth_masks_list` is not provided.
    """
    with tf.name_scope('BoxClassifierLoss'):
      paddings_indicator = self._padded_batched_proposals_indicator(
...@@ -1409,9 +1472,20 @@ class FasterRCNNMetaArch(model.DetectionModel):
          [batch_size * self.max_num_proposals, -1])
      refined_box_encodings_with_background = tf.pad(
          refined_box_encodings, [[0, 0], [1, 0], [0, 0]])
      # For anchors with multiple labels, picks refined_location_encodings
      # for just one class to avoid over-counting for regression loss and
      # (optionally) mask loss.
      one_hot_flat_cls_targets_with_background = tf.argmax(
          flat_cls_targets_with_background, axis=1)
      one_hot_flat_cls_targets_with_background = tf.one_hot(
          one_hot_flat_cls_targets_with_background,
          flat_cls_targets_with_background.get_shape()[1])
      refined_box_encodings_masked_by_class_targets = tf.boolean_mask(
          refined_box_encodings_with_background,
          tf.greater(one_hot_flat_cls_targets_with_background, 0))
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [batch_size, self.max_num_proposals, -1])
      reshaped_refined_box_encodings = tf.reshape(
          refined_box_encodings_masked_by_class_targets,
          [batch_size, -1, 4])
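
The argmax/one_hot pair above collapses a k-hot class target row to a single label, with ties broken toward the smallest class index because tf.argmax returns the first maximum; this is what restricts the regression (and, later, mask) loss to one class per proposal. A small numeric sketch of that selection:

# --- illustrative sketch of the k-hot -> one-hot selection ---
import numpy as np

k_hot = np.array([0., 1., 1.])  # proposal carrying class labels 1 and 2
picked = np.argmax(k_hot)       # argmax returns the first maximum: 1
one_hot = np.eye(3)[picked]     # [0., 1., 0.]; only class 1 is kept
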
...@@ -1433,12 +1507,82 @@ class FasterRCNNMetaArch(model.DetectionModel):
          ) = self._unpad_proposals_and_apply_hard_mining(
              proposal_boxlists, second_stage_loc_losses,
              second_stage_cls_losses, num_proposals)
      loss_dict = {}
      with tf.name_scope('localization_loss'):
        loss_dict['second_stage_localization_loss'] = (
            self._second_stage_loc_loss_weight * second_stage_loc_loss)

      with tf.name_scope('classification_loss'):
        loss_dict['second_stage_classification_loss'] = (
            self._second_stage_cls_loss_weight * second_stage_cls_loss)

      second_stage_mask_loss = None
      if prediction_masks is not None:
        if groundtruth_masks_list is None:
          raise ValueError('Groundtruth instance masks not provided. '
                           'Please configure input reader.')
        # Create a new target assigner that matches the proposals to
        # groundtruth and returns the mask targets.
        # TODO: Move `unmatched_cls_target` from constructor to assign
        # function. This will enable reuse of a single target assigner for
        # both class targets and mask targets.
        mask_target_assigner = target_assigner.create_target_assigner(
            'FasterRCNN', 'detection',
            unmatched_cls_target=tf.zeros(image_shape[1:3], dtype=tf.float32))
        (batch_mask_targets, _, _,
         batch_mask_target_weights, _) = target_assigner.batch_assign_targets(
             mask_target_assigner, proposal_boxlists,
             groundtruth_boxlists, groundtruth_masks_list)
        # Pad the prediction_masks with zeros for the background class to be
        # consistent with the class predictions.
        prediction_masks_with_background = tf.pad(
            prediction_masks, [[0, 0], [1, 0], [0, 0], [0, 0]])
        prediction_masks_masked_by_class_targets = tf.boolean_mask(
            prediction_masks_with_background,
            tf.greater(one_hot_flat_cls_targets_with_background, 0))
        mask_height = prediction_masks.shape[2].value
        mask_width = prediction_masks.shape[3].value
        reshaped_prediction_masks = tf.reshape(
            prediction_masks_masked_by_class_targets,
            [batch_size, -1, mask_height * mask_width])
        batch_mask_targets_shape = tf.shape(batch_mask_targets)
        flat_gt_masks = tf.reshape(batch_mask_targets,
                                   [-1, batch_mask_targets_shape[2],
                                    batch_mask_targets_shape[3]])
        # Use normalized proposals to crop mask targets from image masks.
        flat_normalized_proposals = box_list_ops.to_normalized_coordinates(
            box_list.BoxList(tf.reshape(proposal_boxes, [-1, 4])),
            image_shape[1], image_shape[2]).get()
        flat_cropped_gt_mask = tf.image.crop_and_resize(
            tf.expand_dims(flat_gt_masks, -1),
            flat_normalized_proposals,
            tf.range(flat_normalized_proposals.shape[0].value),
            [mask_height, mask_width])
        batch_cropped_gt_mask = tf.reshape(
            flat_cropped_gt_mask,
            [batch_size, -1, mask_height * mask_width])
        second_stage_mask_losses = self._second_stage_mask_loss(
            reshaped_prediction_masks,
            batch_cropped_gt_mask,
            weights=batch_mask_target_weights) / (
                mask_height * mask_width *
                tf.maximum(tf.reduce_sum(batch_mask_target_weights, axis=1,
                                         keep_dims=True),
                           tf.ones((batch_size, 1))))
        second_stage_mask_loss = tf.reduce_sum(
            tf.boolean_mask(second_stage_mask_losses, paddings_indicator))

      if second_stage_mask_loss is not None:
        with tf.name_scope('mask_loss'):
          loss_dict['second_stage_mask_loss'] = (
              self._second_stage_mask_loss_weight * second_stage_mask_loss)
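
The divisor in `second_stage_mask_losses` normalizes each image's summed pixel-wise loss by the mask area times the number of proposals with nonzero mask weights (floored at one), so the resulting value behaves like a mean per-pixel loss. A back-of-envelope sketch of that normalization, with assumed numbers:

# --- illustrative sketch of the mask-loss normalization ---
import numpy as np

mask_height, mask_width = 14, 14
num_weighted = 3  # proposals with nonzero mask target weights
per_pixel_losses = np.full((num_weighted, mask_height * mask_width), 0.25)

normalizer = mask_height * mask_width * max(num_weighted, 1)
image_mask_loss = per_pixel_losses.sum() / normalizer
assert image_mask_loss == 0.25  # recovers the mean per-pixel loss
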
      return loss_dict

  def _padded_batched_proposals_indicator(self,
...
...@@ -15,6 +15,7 @@
"""Tests for object_detection.meta_architectures.faster_rcnn_meta_arch."""

import numpy as np
import tensorflow as tf

from object_detection.meta_architectures import faster_rcnn_meta_arch_test_lib

...@@ -46,19 +47,19 @@ class FasterRCNNMetaArchTest(
    mask_height = 2
    mask_width = 2
    mask_predictions = 30. * tf.ones(
        [total_num_padded_proposals, model.num_classes,
         mask_height, mask_width], dtype=tf.float32)
    exp_detection_masks = np.array([[[[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]]],
                                    [[[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]],
                                     [[1, 1], [1, 1]],
                                     [[0, 0], [0, 0]]]])

    detections = model.postprocess({
        'refined_box_encodings': refined_box_encodings,
...@@ -79,6 +80,17 @@ class FasterRCNNMetaArchTest(
      self.assertAllClose(detections_out['detection_masks'],
                          exp_detection_masks)
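
The test raises the mask logits from .6 to 30. because `postprocess` now applies a sigmoid instead of a hard threshold; sigmoid(30) equals 1 to within the default assertAllClose tolerance, so the all-ones expectation still holds. A quick numeric check:

# --- illustrative check ---
import numpy as np

score = 1. / (1. + np.exp(-30.))  # ~= 1 - 9.4e-14
np.testing.assert_allclose(score, 1.0)
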
  def _get_box_classifier_features_shape(self,
                                         image_size,
                                         batch_size,
                                         max_num_proposals,
                                         initial_crop_size,
                                         maxpool_stride,
                                         num_features):
    return (batch_size * max_num_proposals,
            initial_crop_size/maxpool_stride,
            initial_crop_size/maxpool_stride,
            num_features)
if __name__ == '__main__':
  tf.test.main()

...@@ -113,7 +113,8 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
                   second_stage_batch_size,
                   first_stage_max_proposals=8,
                   num_classes=2,
                   hard_mining=False,
                   softmax_second_stage_classification_loss=True):

    def image_resizer_fn(image):
      return tf.identity(image)

...@@ -178,6 +179,12 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
    second_stage_score_conversion_fn = tf.identity
    second_stage_localization_loss_weight = 1.0
    second_stage_classification_loss_weight = 1.0
    if softmax_second_stage_classification_loss:
      second_stage_classification_loss = (
          losses.WeightedSoftmaxClassificationLoss(anchorwise_output=True))
    else:
      second_stage_classification_loss = (
          losses.WeightedSigmoidClassificationLoss(anchorwise_output=True))

    hard_example_miner = None
    if hard_mining:
...@@ -221,52 +228,68 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
            second_stage_localization_loss_weight,
        'second_stage_classification_loss_weight':
            second_stage_classification_loss_weight,
        'second_stage_classification_loss':
            second_stage_classification_loss,
        'hard_example_miner': hard_example_miner}
    return self._get_model(self._get_second_stage_box_predictor(
        num_classes=num_classes, is_training=is_training), **common_kwargs)
  def test_predict_gives_correct_shapes_in_inference_mode_first_stage_only(
      self):
    test_graph = tf.Graph()
    with test_graph.as_default():
      model = self._build_model(
          is_training=False, first_stage_only=True, second_stage_batch_size=2)
      batch_size = 2
      height = 10
      width = 12
      input_image_shape = (batch_size, height, width, 3)

      preprocessed_inputs = tf.placeholder(dtype=tf.float32,
                                           shape=(batch_size, None, None, 3))
      prediction_dict = model.predict(preprocessed_inputs)

      # In inference mode, anchors are clipped to the image window, but not
      # pruned. Since MockFasterRCNN.extract_proposal_features returns a
      # tensor with the same shape as its input, the expected number of
      # anchors is height * width * the number of anchors per location
      # (i.e. 3x3).
      expected_num_anchors = height * width * 3 * 3
      expected_output_keys = set([
          'rpn_box_predictor_features', 'rpn_features_to_crop', 'image_shape',
          'rpn_box_encodings', 'rpn_objectness_predictions_with_background',
          'anchors'])
      expected_output_shapes = {
          'rpn_box_predictor_features': (batch_size, height, width, 512),
          'rpn_features_to_crop': (batch_size, height, width, 3),
          'rpn_box_encodings': (batch_size, expected_num_anchors, 4),
          'rpn_objectness_predictions_with_background':
              (batch_size, expected_num_anchors, 2),
          'anchors': (expected_num_anchors, 4)
      }

      init_op = tf.global_variables_initializer()
      with self.test_session() as sess:
        sess.run(init_op)
        prediction_out = sess.run(prediction_dict,
                                  feed_dict={
                                      preprocessed_inputs:
                                          np.zeros(input_image_shape)
                                  })

        self.assertEqual(set(prediction_out.keys()), expected_output_keys)
        self.assertAllEqual(prediction_out['image_shape'], input_image_shape)
        for output_key, expected_shape in expected_output_shapes.items():
          self.assertAllEqual(prediction_out[output_key].shape, expected_shape)

        # Check that anchors are clipped to window.
        anchors = prediction_out['anchors']
        self.assertTrue(np.all(np.greater_equal(anchors, 0)))
        self.assertTrue(np.all(np.less_equal(anchors[:, 0], height)))
        self.assertTrue(np.all(np.less_equal(anchors[:, 1], width)))
        self.assertTrue(np.all(np.less_equal(anchors[:, 2], height)))
        self.assertTrue(np.all(np.less_equal(anchors[:, 3], width)))
  def test_predict_gives_valid_anchors_in_training_mode_first_stage_only(self):
    test_graph = tf.Graph()
...@@ -321,48 +344,73 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
        prediction_out['rpn_objectness_predictions_with_background'].shape,
        (batch_size, num_anchors_out, 2))
  def test_predict_correct_shapes_in_inference_mode_both_stages(
      self):
    batch_size = 2
    image_size = 10
    max_num_proposals = 8
    initial_crop_size = 3
    maxpool_stride = 1

    input_shapes = [(batch_size, image_size, image_size, 3),
                    (None, image_size, image_size, 3),
                    (batch_size, None, None, 3),
                    (None, None, None, 3)]
    expected_num_anchors = image_size * image_size * 3 * 3
    expected_shapes = {
        'rpn_box_predictor_features':
            (2, image_size, image_size, 512),
        'rpn_features_to_crop': (2, image_size, image_size, 3),
        'image_shape': (4,),
        'rpn_box_encodings': (2, expected_num_anchors, 4),
        'rpn_objectness_predictions_with_background':
            (2, expected_num_anchors, 2),
        'anchors': (expected_num_anchors, 4),
        'refined_box_encodings': (2 * max_num_proposals, 2, 4),
        'class_predictions_with_background': (2 * max_num_proposals, 2 + 1),
        'num_proposals': (2,),
        'proposal_boxes': (2, max_num_proposals, 4),
        'proposal_boxes_normalized': (2, max_num_proposals, 4),
        'box_classifier_features':
            self._get_box_classifier_features_shape(image_size,
                                                    batch_size,
                                                    max_num_proposals,
                                                    initial_crop_size,
                                                    maxpool_stride,
                                                    3)
    }

    for input_shape in input_shapes:
      test_graph = tf.Graph()
      with test_graph.as_default():
        model = self._build_model(
            is_training=False, first_stage_only=False,
            second_stage_batch_size=2)
        preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
        result_tensor_dict = model.predict(preprocessed_inputs)
        init_op = tf.global_variables_initializer()
      with self.test_session(graph=test_graph) as sess:
        sess.run(init_op)
        tensor_dict_out = sess.run(result_tensor_dict, feed_dict={
            preprocessed_inputs:
                np.zeros((batch_size, image_size, image_size, 3))})
      self.assertEqual(set(tensor_dict_out.keys()),
                       set(expected_shapes.keys()))
      for key in expected_shapes:
        self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
  def test_predict_gives_correct_shapes_in_train_mode_both_stages(self):
    test_graph = tf.Graph()
    with test_graph.as_default():
      model = self._build_model(
          is_training=True, first_stage_only=False, second_stage_batch_size=7)
      batch_size = 2
      image_size = 10
      max_num_proposals = 7
      initial_crop_size = 3
      maxpool_stride = 1

      image_shape = (batch_size, image_size, image_size, 3)
      preprocessed_inputs = tf.zeros(image_shape, dtype=tf.float32)
      groundtruth_boxes_list = [
...@@ -381,11 +429,20 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
              (2, image_size, image_size, 512),
          'rpn_features_to_crop': (2, image_size, image_size, 3),
          'image_shape': (4,),
          'refined_box_encodings': (2 * max_num_proposals, 2, 4),
          'class_predictions_with_background': (2 * max_num_proposals, 2 + 1),
          'num_proposals': (2,),
          'proposal_boxes': (2, max_num_proposals, 4),
          'proposal_boxes_normalized': (2, max_num_proposals, 4),
          'box_classifier_features':
              self._get_box_classifier_features_shape(image_size,
                                                      batch_size,
                                                      max_num_proposals,
                                                      initial_crop_size,
                                                      maxpool_stride,
                                                      3)
      }

      init_op = tf.global_variables_initializer()
      with self.test_session() as sess:
        sess.run(init_op)
...@@ -600,6 +657,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
      preprocessed_inputs = model.preprocess(image_placeholder)
      self.assertAllEqual(preprocessed_inputs.shape.as_list(), image_shape)

  # TODO: Split test into two - with and without masks.
  def test_loss_first_stage_only_mode(self):
    model = self._build_model(
        is_training=True, first_stage_only=True, second_stage_batch_size=6)
...@@ -650,6 +708,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
    self.assertTrue('second_stage_localization_loss' not in loss_dict_out)
    self.assertTrue('second_stage_classification_loss' not in loss_dict_out)

  # TODO: Split test into two - with and without masks.
  def test_loss_full(self):
    model = self._build_model(
        is_training=True, first_stage_only=False, second_stage_batch_size=6)
...@@ -702,12 +761,26 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
         [10, -10, -10],
         [-10, 10, -10]], dtype=tf.float32)

    mask_predictions_logits = 20 * tf.ones((batch_size *
                                            model.max_num_proposals,
                                            model.num_classes,
                                            14, 14),
                                           dtype=tf.float32)

    groundtruth_boxes_list = [
        tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32),
        tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)]
    groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
                                tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]

    # Set all elements of the groundtruth masks to 1.0, so that every proposal
    # crop of the groundtruth masks returns a mask that covers the entire
    # proposal. Thus, since the mask_predictions_logits are all large positive
    # values (20 here), the mask loss should be (close to) zero.
    groundtruth_masks_list = [tf.convert_to_tensor(np.ones((2, 32, 32)),
                                                   dtype=tf.float32),
                              tf.convert_to_tensor(np.ones((2, 32, 32)),
                                                   dtype=tf.float32)]

    prediction_dict = {
        'rpn_box_encodings': rpn_box_encodings,
        'rpn_objectness_predictions_with_background':
...@@ -717,10 +790,12 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background': class_predictions_with_background,
        'proposal_boxes': proposal_boxes,
        'num_proposals': num_proposals,
        'mask_predictions': mask_predictions_logits
    }
    model.provide_groundtruth(groundtruth_boxes_list,
                              groundtruth_classes_list,
                              groundtruth_masks_list)
    loss_dict = model.loss(prediction_dict)

    with self.test_session() as sess:
...@@ -729,6 +804,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
      self.assertAllClose(loss_dict_out['first_stage_objectness_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_localization_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_classification_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_mask_loss'], 0)
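
The zero expectation for the mask loss follows from the loss itself: with every cropped groundtruth pixel equal to 1, sigmoid cross-entropy at logit 20 is roughly 2e-9 per pixel, which vanishes under assertAllClose. A sketch using the same numerically stable form as tf.nn.sigmoid_cross_entropy_with_logits:

# --- illustrative sketch ---
import numpy as np

logit, target = 20., 1.
loss = max(logit, 0.) - logit * target + np.log1p(np.exp(-abs(logit)))
assert loss < 1e-8  # ~2.1e-9 per pixel
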
  def test_loss_full_zero_padded_proposals(self):
    model = self._build_model(
...@@ -775,10 +851,23 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
         [0, 0, 0],
         [0, 0, 0]], dtype=tf.float32)

    mask_predictions_logits = 20 * tf.ones((batch_size *
                                            model.max_num_proposals,
                                            model.num_classes,
                                            14, 14),
                                           dtype=tf.float32)

    groundtruth_boxes_list = [
        tf.constant([[0, 0, .5, .5]], dtype=tf.float32)]
    groundtruth_classes_list = [tf.constant([[1, 0]], dtype=tf.float32)]

    # Set all elements of the groundtruth mask to 1.0, so that every proposal
    # crop of the groundtruth mask returns a mask that covers the entire
    # proposal. Thus, since the mask_predictions_logits are all large positive
    # values (20 here), the mask loss should be (close to) zero.
    groundtruth_masks_list = [tf.convert_to_tensor(np.ones((1, 32, 32)),
                                                   dtype=tf.float32)]

    prediction_dict = {
        'rpn_box_encodings': rpn_box_encodings,
        'rpn_objectness_predictions_with_background':
...@@ -788,10 +877,12 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background': class_predictions_with_background,
        'proposal_boxes': proposal_boxes,
        'num_proposals': num_proposals,
        'mask_predictions': mask_predictions_logits
    }
    model.provide_groundtruth(groundtruth_boxes_list,
                              groundtruth_classes_list,
                              groundtruth_masks_list)
    loss_dict = model.loss(prediction_dict)

    with self.test_session() as sess:
...@@ -800,6 +891,102 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
      self.assertAllClose(loss_dict_out['first_stage_objectness_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_localization_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_classification_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_mask_loss'], 0)
  def test_loss_full_multiple_label_groundtruth(self):
    model = self._build_model(
        is_training=True, first_stage_only=False, second_stage_batch_size=6,
        softmax_second_stage_classification_loss=False)
    batch_size = 1
    anchors = tf.constant(
        [[0, 0, 16, 16],
         [0, 16, 16, 32],
         [16, 0, 32, 16],
         [16, 16, 32, 32]], dtype=tf.float32)
    rpn_box_encodings = tf.zeros(
        [batch_size,
         anchors.get_shape().as_list()[0],
         BOX_CODE_SIZE], dtype=tf.float32)
    # Use different numbers for the objectness category to break ties in
    # the order of boxes returned by NMS.
    rpn_objectness_predictions_with_background = tf.constant([
        [[-10, 13],
         [10, -10],
         [10, -11],
         [10, -12]],], dtype=tf.float32)
    image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)

    # box_classifier_batch_size is 6, but here we assume that the number of
    # actual proposals (not counting zero paddings) is fewer (3).
    num_proposals = tf.constant([3], dtype=tf.int32)
    proposal_boxes = tf.constant(
        [[[0, 0, 16, 16],
          [0, 16, 16, 32],
          [16, 0, 32, 16],
          [0, 0, 0, 0],  # begin paddings
          [0, 0, 0, 0],
          [0, 0, 0, 0]]], dtype=tf.float32)

    # second_stage_localization_loss should only be computed for predictions
    # that match groundtruth. For multiple label groundtruth boxes, the loss
    # should only be computed once for the label with the smaller index.
    refined_box_encodings = tf.constant(
        [[[0, 0, 0, 0], [1, 1, -1, -1]],
         [[1, 1, -1, -1], [1, 1, 1, 1]],
         [[1, 1, -1, -1], [1, 1, 1, 1]],
         [[1, 1, -1, -1], [1, 1, 1, 1]],
         [[1, 1, -1, -1], [1, 1, 1, 1]],
         [[1, 1, -1, -1], [1, 1, 1, 1]]], dtype=tf.float32)
    class_predictions_with_background = tf.constant(
        [[-100, 100, 100],
         [100, -100, -100],
         [100, -100, -100],
         [0, 0, 0],  # begin paddings
         [0, 0, 0],
         [0, 0, 0]], dtype=tf.float32)

    mask_predictions_logits = 20 * tf.ones((batch_size *
                                            model.max_num_proposals,
                                            model.num_classes,
                                            14, 14),
                                           dtype=tf.float32)

    groundtruth_boxes_list = [
        tf.constant([[0, 0, .5, .5]], dtype=tf.float32)]
    # Box contains two ground truth labels.
    groundtruth_classes_list = [tf.constant([[1, 1]], dtype=tf.float32)]

    # Set all elements of the groundtruth mask to 1.0, so that every proposal
    # crop of the groundtruth mask returns a mask that covers the entire
    # proposal. Thus, since the mask_predictions_logits are all large positive
    # values (20 here), the mask loss should be (close to) zero.
    groundtruth_masks_list = [tf.convert_to_tensor(np.ones((1, 32, 32)),
                                                   dtype=tf.float32)]

    prediction_dict = {
        'rpn_box_encodings': rpn_box_encodings,
        'rpn_objectness_predictions_with_background':
            rpn_objectness_predictions_with_background,
        'image_shape': image_shape,
        'anchors': anchors,
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background': class_predictions_with_background,
        'proposal_boxes': proposal_boxes,
        'num_proposals': num_proposals,
        'mask_predictions': mask_predictions_logits
    }
    model.provide_groundtruth(groundtruth_boxes_list,
                              groundtruth_classes_list,
                              groundtruth_masks_list)
    loss_dict = model.loss(prediction_dict)

    with self.test_session() as sess:
      loss_dict_out = sess.run(loss_dict)
      self.assertAllClose(loss_dict_out['first_stage_localization_loss'], 0)
      self.assertAllClose(loss_dict_out['first_stage_objectness_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_localization_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_classification_loss'], 0)
      self.assertAllClose(loss_dict_out['second_stage_mask_loss'], 0)
  def test_loss_full_zero_padded_proposals_nonzero_loss_with_two_images(self):
    model = self._build_model(
...@@ -828,7 +1015,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
    image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)

    # box_classifier_batch_size is 6, but here we assume that the number of
    # actual proposals (not counting zero paddings) is fewer.
    num_proposals = tf.constant([3, 2], dtype=tf.int32)
    proposal_boxes = tf.constant(
        [[[0, 0, 16, 16],
...@@ -839,9 +1026,9 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
          [0, 0, 0, 0]],
         [[0, 0, 16, 16],
          [0, 16, 16, 32],
          [0, 0, 0, 0],  # begin paddings
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]], dtype=tf.float32)

    refined_box_encodings = tf.zeros(
...
...@@ -73,6 +73,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
               second_stage_score_conversion_fn,
               second_stage_localization_loss_weight,
               second_stage_classification_loss_weight,
               second_stage_classification_loss,
               hard_example_miner,
               parallel_iterations=16):
    """RFCNMetaArch Constructor.
...@@ -149,6 +150,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
        used to convert logits to probabilities.
      second_stage_localization_loss_weight: A float
      second_stage_classification_loss_weight: A float
      second_stage_classification_loss: A losses.Loss object (e.g. a weighted
        softmax or weighted sigmoid classification loss) used by the second
        stage classifier.
      hard_example_miner: A losses.HardExampleMiner object (can be None).
      parallel_iterations: (Optional) The number of iterations allowed to run
        in parallel for calls to tf.map_fn.
...@@ -185,6 +188,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
        second_stage_score_conversion_fn,
        second_stage_localization_loss_weight,
        second_stage_classification_loss_weight,
        second_stage_classification_loss,
        1.0,  # second stage mask prediction loss weight isn't used in R-FCN.
        hard_example_miner,
        parallel_iterations)
...@@ -198,10 +203,10 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Predicts the output tensors from 2nd stage of FasterRCNN. """Predicts the output tensors from 2nd stage of FasterRCNN.
Args: Args:
rpn_box_encodings: 3-D float tensor of shape rpn_box_encodings: 4-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing [batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes. predicted boxes.
rpn_objectness_predictions_with_background: 3-D float tensor of shape rpn_objectness_predictions_with_background: 2-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class [batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors. Note that this predictions (logits) for each of the anchors. Note that this
tensor *includes* background class predictions (at class index 0). tensor *includes* background class predictions (at class index 0).
...@@ -225,13 +230,22 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
           Note that this tensor *includes* background class predictions
           (at class index 0).
        3) num_proposals: An int32 tensor of shape [batch_size] representing the
           number of proposals generated by the RPN. `num_proposals` allows us
           to keep track of which entries are to be treated as zero paddings and
           which are not since we always pad the number of proposals to be
           `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
           [batch_size, self.max_num_proposals, 4] representing
           decoded proposal bounding boxes (in absolute coordinates).
        5) proposal_boxes_normalized: A float32 tensor of shape
           [batch_size, self.max_num_proposals, 4] representing decoded proposal
           bounding boxes (in normalized coordinates). Can be used to override
           the boxes proposed by the RPN, thus enabling one to extract box
           classification and prediction for externally selected areas of the
           image.
        6) box_classifier_features: a 4-D float32 tensor, of shape
           [batch_size, feature_map_height, feature_map_width, depth],
           representing the box classifier features.
    """
    proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn(
        rpn_box_encodings, rpn_objectness_predictions_with_background,
...@@ -263,5 +277,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
            class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }
    return prediction_dict
...@@ -51,6 +51,15 @@ class RFCNMetaArchTest(
    return rfcn_meta_arch.RFCNMetaArch(
        second_stage_rfcn_box_predictor=box_predictor, **common_kwargs)

  def _get_box_classifier_features_shape(self,
                                         image_size,
                                         batch_size,
                                         max_num_proposals,
                                         initial_crop_size,
                                         maxpool_stride,
                                         num_features):
    return (batch_size, image_size, image_size, num_features)

if __name__ == '__main__':
  tf.test.main()
...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD Meta-architecture definition.

General tensorflow implementation of convolutional Multibox/SSD detection
...@@ -29,6 +28,7 @@ from object_detection.core import model
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.utils import shape_utils
from object_detection.utils import visualization_utils

slim = tf.contrib.slim

...@@ -37,13 +37,34 @@ class SSDFeatureExtractor(object):
"""SSD Feature Extractor definition.""" """SSD Feature Extractor definition."""
def __init__(self, def __init__(self,
is_training,
depth_multiplier, depth_multiplier,
min_depth, min_depth,
pad_to_multiple,
conv_hyperparams, conv_hyperparams,
batch_norm_trainable=True,
reuse_weights=None): reuse_weights=None):
"""Constructor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a small batch size
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
reuse_weights: whether to reuse variables. Default is None.
"""
self._is_training = is_training
self._depth_multiplier = depth_multiplier self._depth_multiplier = depth_multiplier
self._min_depth = min_depth self._min_depth = min_depth
self._pad_to_multiple = pad_to_multiple
self._conv_hyperparams = conv_hyperparams self._conv_hyperparams = conv_hyperparams
self._batch_norm_trainable = batch_norm_trainable
self._reuse_weights = reuse_weights self._reuse_weights = reuse_weights
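
`pad_to_multiple` rounds the input height and width up to the nearest multiple, which keeps spatial dimensions divisible by the feature extractor's total stride. A sketch of the intended arithmetic; the helper name is illustrative, not part of this class:

# --- illustrative sketch of pad_to_multiple arithmetic ---
def padded_dimension(dimension, pad_to_multiple):
  # Ceiling division, then scale back up to the multiple.
  return -(-dimension // pad_to_multiple) * pad_to_multiple

assert padded_dimension(300, 32) == 320
assert padded_dimension(320, 32) == 320
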
  @abstractmethod
...@@ -101,9 +122,9 @@ class SSDMetaArch(model.DetectionModel):
               add_summaries=True):
    """SSDMetaArch Constructor.

    TODO: group NMS parameters + score converter into a class and loss
    parameters into a class and write config protos for postprocessing
    and losses.

    Args:
      is_training: A boolean indicating whether the training version of the
...@@ -204,8 +225,8 @@ class SSDMetaArch(model.DetectionModel):
    if inputs.dtype is not tf.float32:
      raise ValueError('`preprocess` expects a tf.float32 tensor')
    with tf.name_scope('Preprocessor'):
      # TODO: revisit whether to always use batch size as the number of
      # parallel iterations vs allow for dynamic batching.
      resized_inputs = tf.map_fn(self._image_resizer_fn,
                                 elems=inputs,
                                 dtype=tf.float32)
...@@ -226,7 +247,7 @@ class SSDMetaArch(model.DetectionModel):
    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
          box_code_dimension] containing predicted boxes.
        2) class_predictions_with_background: 3-D float tensor of shape
          [batch_size, num_anchors, num_classes+1] containing class predictions
...@@ -234,19 +255,26 @@ class SSDMetaArch(model.DetectionModel):
          background class predictions (at class index 0).
        3) feature_maps: a list of tensors where the ith tensor has shape
          [batch, height_i, width_i, depth_i].
        4) anchors: 2-D float tensor of shape [num_anchors, 4] containing
          the generated anchors in normalized coordinates.
    """
    with tf.variable_scope(None, self._extract_features_scope,
                           [preprocessed_inputs]):
      feature_maps = self._feature_extractor.extract_features(
          preprocessed_inputs)
    feature_map_spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
    image_shape = tf.shape(preprocessed_inputs)
    self._anchors = self._anchor_generator.generate(
        feature_map_spatial_dims,
        im_height=image_shape[1],
        im_width=image_shape[2])
    (box_encodings, class_predictions_with_background
    ) = self._add_box_predictions_to_feature_maps(feature_maps)
    predictions_dict = {
        'box_encodings': box_encodings,
        'class_predictions_with_background': class_predictions_with_background,
        'feature_maps': feature_maps,
        'anchors': self._anchors.get()
    }
    return predictions_dict
...@@ -351,9 +379,11 @@ class SSDMetaArch(model.DetectionModel):
    Returns:
      detections: a dictionary containing the following fields
        detection_boxes: [batch, max_detections, 4]
        detection_scores: [batch, max_detections]
        detection_classes: [batch, max_detections]
        detection_keypoints: [batch, max_detections, num_keypoints, 2] (if
          encoded in the prediction_dict 'box_encodings')
        num_detections: [batch]

    Raises:
      ValueError: if prediction_dict does not contain `box_encodings` or
...@@ -365,7 +395,7 @@ class SSDMetaArch(model.DetectionModel):
    with tf.name_scope('Postprocessor'):
      box_encodings = prediction_dict['box_encodings']
      class_predictions = prediction_dict['class_predictions_with_background']
      detection_boxes, detection_keypoints = self._batch_decode(box_encodings)
      detection_boxes = tf.expand_dims(detection_boxes, axis=2)

      class_predictions_without_background = tf.slice(class_predictions,
...@@ -374,14 +404,25 @@ class SSDMetaArch(model.DetectionModel):
      detection_scores = self._score_conversion_fn(
          class_predictions_without_background)
      clip_window = tf.constant([0, 0, 1, 1], tf.float32)
      additional_fields = None
      if detection_keypoints is not None:
        additional_fields = {
            fields.BoxListFields.keypoints: detection_keypoints}
      (nmsed_boxes, nmsed_scores, nmsed_classes, _, nmsed_additional_fields,
       num_detections) = self._non_max_suppression_fn(
           detection_boxes,
           detection_scores,
           clip_window=clip_window,
           additional_fields=additional_fields)
      detection_dict = {'detection_boxes': nmsed_boxes,
                        'detection_scores': nmsed_scores,
                        'detection_classes': nmsed_classes,
                        'num_detections': tf.to_float(num_detections)}
      if (nmsed_additional_fields is not None and
          fields.BoxListFields.keypoints in nmsed_additional_fields):
        detection_dict['detection_keypoints'] = nmsed_additional_fields[
            fields.BoxListFields.keypoints]
      return detection_dict
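
Keypoints ride through non-max suppression via `additional_fields`, so `detection_keypoints` appears in the output dictionary only when keypoints were actually encoded upstream. A usage sketch, assuming a built SSD `model` and a `prediction_dict` from `predict`:

# --- illustrative usage sketch ---
detections = model.postprocess(prediction_dict)
boxes = detections['detection_boxes']              # always present
keypoints = detections.get('detection_keypoints')  # present only if encoded
if keypoints is not None:
  pass  # e.g. visualize keypoints alongside the boxes
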
  def loss(self, prediction_dict, scope=None):
    """Compute scalar loss tensors with respect to provided groundtruth.
...@@ -395,7 +436,7 @@ class SSDMetaArch(model.DetectionModel):
          box_code_dimension] containing predicted boxes.
        2) class_predictions_with_background: 3-D float tensor of shape
          [batch_size, num_anchors, num_classes+1] containing class predictions
          (logits) for each of the anchors. Note that this tensor *includes*
          background class predictions.
      scope: Optional scope name.
...@@ -405,10 +446,14 @@ class SSDMetaArch(model.DetectionModel):
        values.
    """
    with tf.name_scope(scope, 'Loss', prediction_dict.values()):
      keypoints = None
      if self.groundtruth_has_field(fields.BoxListFields.keypoints):
        keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints)
      (batch_cls_targets, batch_cls_weights, batch_reg_targets,
       batch_reg_weights, match_list) = self._assign_targets(
           self.groundtruth_lists(fields.BoxListFields.boxes),
           self.groundtruth_lists(fields.BoxListFields.classes),
           keypoints)
      if self._add_summaries:
        self._summarize_input(
            self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
@@ -417,35 +462,60 @@ class SSDMetaArch(model.DetectionModel):
       location_losses = self._localization_loss(
           prediction_dict['box_encodings'],
           batch_reg_targets,
+          ignore_nan_targets=True,
           weights=batch_reg_weights)
       cls_losses = self._classification_loss(
           prediction_dict['class_predictions_with_background'],
           batch_cls_targets,
           weights=batch_cls_weights)
 
-      # Optionally apply hard mining on top of loss values
-      localization_loss = tf.reduce_sum(location_losses)
-      classification_loss = tf.reduce_sum(cls_losses)
       if self._hard_example_miner:
         (localization_loss, classification_loss) = self._apply_hard_mining(
             location_losses, cls_losses, prediction_dict, match_list)
         if self._add_summaries:
           self._hard_example_miner.summarize()
+      else:
+        if self._add_summaries:
+          class_ids = tf.argmax(batch_cls_targets, axis=2)
+          flattened_class_ids = tf.reshape(class_ids, [-1])
+          flattened_classification_losses = tf.reshape(cls_losses, [-1])
+          self._summarize_anchor_classification_loss(
+              flattened_class_ids, flattened_classification_losses)
+        localization_loss = tf.reduce_sum(location_losses)
+        classification_loss = tf.reduce_sum(cls_losses)
 
       # Optionally normalize by number of positive matches
       normalizer = tf.constant(1.0, dtype=tf.float32)
       if self._normalize_loss_by_num_matches:
         normalizer = tf.maximum(tf.to_float(tf.reduce_sum(num_matches)), 1.0)
+
+      with tf.name_scope('localization_loss'):
+        localization_loss = ((self._localization_loss_weight / normalizer) *
+                             localization_loss)
+      with tf.name_scope('classification_loss'):
+        classification_loss = ((self._classification_loss_weight /
+                                normalizer) * classification_loss)
+
       loss_dict = {
-          'localization_loss': (self._localization_loss_weight / normalizer) *
-          localization_loss,
-          'classification_loss': (self._classification_loss_weight /
-                                  normalizer) * classification_loss
+          'localization_loss': localization_loss,
+          'classification_loss': classification_loss
       }
     return loss_dict
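The reordering above also makes the weighting explicit: each summed loss is scaled by its weight divided by the (clamped) count of matched anchors whenever `normalize_loss_by_num_matches` is set. A toy sketch of that step in plain Python, with made-up numbers:

def scale_loss(raw_loss, loss_weight, num_matches, normalize=True):
  # Mirrors the normalizer logic above: clamp to 1.0 so an image with no
  # matched anchors cannot cause a division by zero.
  normalizer = max(float(num_matches), 1.0) if normalize else 1.0
  return (loss_weight / normalizer) * raw_loss

print(scale_loss(raw_loss=12.0, loss_weight=1.0, num_matches=4))  # 3.0
print(scale_loss(raw_loss=12.0, loss_weight=1.0, num_matches=0))  # 12.0 (clamped)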
 
-  def _assign_targets(self, groundtruth_boxes_list, groundtruth_classes_list):
+  def _summarize_anchor_classification_loss(self, class_ids, cls_losses):
+    positive_indices = tf.where(tf.greater(class_ids, 0))
+    positive_anchor_cls_loss = tf.squeeze(
+        tf.gather(cls_losses, positive_indices), axis=1)
+    visualization_utils.add_cdf_image_summary(positive_anchor_cls_loss,
+                                              'PositiveAnchorLossCDF')
+    negative_indices = tf.where(tf.equal(class_ids, 0))
+    negative_anchor_cls_loss = tf.squeeze(
+        tf.gather(cls_losses, negative_indices), axis=1)
+    visualization_utils.add_cdf_image_summary(negative_anchor_cls_loss,
+                                              'NegativeAnchorLossCDF')
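The CDF summaries added here render a cumulative distribution of the per-anchor losses, split by positive versus background anchors; the statistic itself reduces to sorting the losses and pairing each value with its cumulative fraction. A rough NumPy sketch of that statistic (the image rendering is left to `visualization_utils`):

import numpy as np

def loss_cdf(per_anchor_losses):
  # x: loss value, y: fraction of anchors with loss <= x.
  values = np.sort(np.asarray(per_anchor_losses, dtype=np.float64))
  cum_fraction = np.arange(1, values.size + 1) / float(values.size)
  return values, cum_fraction

values, frac = loss_cdf([0.1, 0.5, 0.2, 2.0])
# values -> [0.1, 0.2, 0.5, 2.0]; frac -> [0.25, 0.5, 0.75, 1.0]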
+  def _assign_targets(self, groundtruth_boxes_list, groundtruth_classes_list,
+                      groundtruth_keypoints_list=None):
     """Assign groundtruth targets.
 
     Adds a background class to each one-hot encoding of groundtruth classes
@@ -460,6 +530,8 @@ class SSDMetaArch(model.DetectionModel):
       groundtruth_classes_list: a list of 2-D one-hot (or k-hot) tensors of
         shape [num_boxes, num_classes] containing the class targets with the 0th
         index assumed to map to the first non-background class.
+      groundtruth_keypoints_list: (optional) a list of 3-D tensors of shape
+        [num_boxes, num_keypoints, 2]
 
     Returns:
       batch_cls_targets: a tensor with shape [batch_size, num_anchors,
@@ -480,6 +552,10 @@ class SSDMetaArch(model.DetectionModel):
         tf.pad(one_hot_encoding, [[0, 0], [1, 0]], mode='CONSTANT')
         for one_hot_encoding in groundtruth_classes_list
     ]
+    if groundtruth_keypoints_list is not None:
+      for boxlist, keypoints in zip(
+          groundtruth_boxlists, groundtruth_keypoints_list):
+        boxlist.add_field(fields.BoxListFields.keypoints, keypoints)
     return target_assigner.batch_assign_targets(
         self._target_assigner, self.anchors, groundtruth_boxlists,
         groundtruth_classes_with_background_list)
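Two small things happen above that are easy to check in isolation: `tf.pad` with paddings `[[0, 0], [1, 0]]` prepends a zero background column to each one-hot row, and the keypoints are attached to the same BoxList that already carries the boxes, so the target assigner can emit keypoint regression targets alongside box targets. A standalone sketch of the padding step (TF 1.x style, toy tensor):

import tensorflow as tf

one_hot = tf.constant([[0., 1.], [1., 0.]])          # [num_boxes, num_classes]
with_background = tf.pad(one_hot, [[0, 0], [1, 0]])  # [num_boxes, num_classes+1]
# Row [0., 1.] becomes [0., 0., 1.]: index 0 now denotes the background class.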
@@ -544,12 +620,11 @@ class SSDMetaArch(model.DetectionModel):
       mined_cls_loss: a float scalar with sum of classification losses from
         selected hard examples.
     """
-    class_pred_shape = [-1, self.anchors.num_boxes_static(), self.num_classes]
-    class_predictions = tf.reshape(
-        tf.slice(prediction_dict['class_predictions_with_background'],
-                 [0, 0, 1], class_pred_shape), class_pred_shape)
+    class_predictions = tf.slice(
+        prediction_dict['class_predictions_with_background'],
+        [0, 0, 1], [-1, -1, -1])
 
-    decoded_boxes = self._batch_decode(prediction_dict['box_encodings'])
+    decoded_boxes, _ = self._batch_decode(prediction_dict['box_encodings'])
     decoded_box_tensors_list = tf.unstack(decoded_boxes)
     class_prediction_list = tf.unstack(class_predictions)
     decoded_boxlist_list = []
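The new `tf.slice` call is the whole background-stripping step: begin `[0, 0, 1]` with size `[-1, -1, -1]` takes everything except index 0 of the last axis, i.e. it is equivalent to `predictions[:, :, 1:]`, and it no longer needs the static anchor count that the old reshape required. A toy check:

import tensorflow as tf

preds = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])
no_background = tf.slice(preds, [0, 0, 1], [-1, -1, -1])  # shape [2, 3, 3]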
@@ -574,6 +649,9 @@ class SSDMetaArch(model.DetectionModel):
     Returns:
       decoded_boxes: A float32 tensor of shape
         [batch_size, num_anchors, 4] containing the decoded boxes.
+      decoded_keypoints: A float32 tensor of shape
+        [batch_size, num_anchors, num_keypoints, 2] containing the decoded
+        keypoints if present in the input `box_encodings`, None otherwise.
     """
     combined_shape = shape_utils.combined_static_and_dynamic_shape(
         box_encodings)
@@ -581,13 +659,21 @@ class SSDMetaArch(model.DetectionModel):
     tiled_anchor_boxes = tf.tile(
         tf.expand_dims(self.anchors.get(), 0), [batch_size, 1, 1])
     tiled_anchors_boxlist = box_list.BoxList(
-        tf.reshape(tiled_anchor_boxes, [-1, self._box_coder.code_size]))
+        tf.reshape(tiled_anchor_boxes, [-1, 4]))
     decoded_boxes = self._box_coder.decode(
         tf.reshape(box_encodings, [-1, self._box_coder.code_size]),
         tiled_anchors_boxlist)
-    return tf.reshape(decoded_boxes.get(),
-                      tf.stack([combined_shape[0], combined_shape[1],
-                                4]))
+    decoded_keypoints = None
+    if decoded_boxes.has_field(fields.BoxListFields.keypoints):
+      decoded_keypoints = decoded_boxes.get_field(
+          fields.BoxListFields.keypoints)
+      num_keypoints = decoded_keypoints.get_shape()[1]
+      decoded_keypoints = tf.reshape(
+          decoded_keypoints,
+          tf.stack([combined_shape[0], combined_shape[1], num_keypoints, 2]))
+    decoded_boxes = tf.reshape(decoded_boxes.get(), tf.stack(
+        [combined_shape[0], combined_shape[1], 4]))
+    return decoded_boxes, decoded_keypoints
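The decode path flattens everything into one `[batch * num_anchors, ...]` batch, decodes against the tiled anchors, then reshapes back; decoded keypoints follow the same pattern with two extra trailing dimensions. A shape-only NumPy sketch with made-up sizes:

import numpy as np

batch, num_anchors, num_keypoints = 2, 5, 4
flat_keypoints = np.random.rand(batch * num_anchors, num_keypoints, 2)
decoded_keypoints = flat_keypoints.reshape(
    batch, num_anchors, num_keypoints, 2)
assert decoded_keypoints.shape == (2, 5, 4, 2)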
 
   def restore_map(self, from_detection_checkpoint=True):
     """Returns a map of variables to load from a foreign checkpoint.
......
@@ -18,7 +18,6 @@ import functools
 import numpy as np
 import tensorflow as tf
 
-from tensorflow.python.training import saver as tf_saver
 from object_detection.core import anchor_generator
 from object_detection.core import box_list
 from object_detection.core import losses
@@ -34,7 +33,12 @@ class FakeSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
 
   def __init__(self):
     super(FakeSSDFeatureExtractor, self).__init__(
-        depth_multiplier=0, min_depth=0, conv_hyperparams=None)
+        is_training=True,
+        depth_multiplier=0,
+        min_depth=0,
+        pad_to_multiple=1,
+        batch_norm_trainable=True,
+        conv_hyperparams=None)
 
   def preprocess(self, resized_inputs):
     return tf.identity(resized_inputs)
 
@@ -55,7 +59,7 @@ class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
   def num_anchors_per_location(self):
     return [1]
 
-  def _generate(self, feature_map_shape_list):
+  def _generate(self, feature_map_shape_list, im_height, im_width):
     return box_list.BoxList(
         tf.constant([[0, 0, .5, .5],
                      [0, .5, .5, 1],
@@ -147,6 +151,7 @@ class SsdMetaArchTest(tf.test.TestCase):
     self.assertTrue('box_encodings' in prediction_dict)
     self.assertTrue('class_predictions_with_background' in prediction_dict)
     self.assertTrue('feature_maps' in prediction_dict)
+    self.assertTrue('anchors' in prediction_dict)
 
     init_op = tf.global_variables_initializer()
     with self.test_session(graph=tf_graph) as sess:
@@ -242,7 +247,7 @@ class SsdMetaArchTest(tf.test.TestCase):
   def test_restore_map_for_detection_ckpt(self):
     init_op = tf.global_variables_initializer()
-    saver = tf_saver.Saver()
+    saver = tf.train.Saver()
     save_path = self.get_temp_dir()
     with self.test_session() as sess:
       sess.run(init_op)
......
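For reference, a hedged sketch of the pattern this test exercises: `restore_map()` returns a `{checkpoint_name: variable}` dict that `tf.train.Saver` accepts directly (the test now uses the public `tf.train.Saver` instead of the internal `tf_saver` alias). `detection_model` and `checkpoint_path` here are placeholders, not names from the diff:

import tensorflow as tf

def restore_from_classification_ckpt(detection_model, checkpoint_path):
  # Illustrative only: wire restore_map() into a Saver and restore.
  var_map = detection_model.restore_map(from_detection_checkpoint=True)
  saver = tf.train.Saver(var_list=var_map)
  with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)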