Commit 27b4acd4 authored by Aman Gupta

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
@@ -14,8 +14,12 @@
# ==============================================================================
"""Tests for object_detection.meta_architectures.faster_rcnn_meta_arch."""
+import functools
+from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.anchor_generators import grid_anchor_generator
from object_detection.builders import box_predictor_builder
@@ -23,11 +27,14 @@ from object_detection.builders import hyperparams_builder
from object_detection.builders import post_processing_builder
from object_detection.core import balanced_positive_negative_sampler as sampler
from object_detection.core import losses
+from object_detection.core import post_processing
from object_detection.core import target_assigner
from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.protos import box_predictor_pb2
from object_detection.protos import hyperparams_pb2
from object_detection.protos import post_processing_pb2
+from object_detection.utils import ops
+from object_detection.utils import test_case
from object_detection.utils import test_utils
slim = tf.contrib.slim
@@ -60,7 +67,7 @@ class FakeFasterRCNNFeatureExtractor(
num_outputs=3, kernel_size=1, scope='layer2')
-class FasterRCNNMetaArchTestBase(tf.test.TestCase):
+class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
"""Base class to test Faster R-CNN and R-FCN meta architectures."""
def _build_arg_scope_with_hyperparams(self,
@@ -157,7 +164,8 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
masks_are_class_agnostic=False,
use_matmul_crop_and_resize=False,
clip_anchors_to_image=False,
-use_matmul_gather_in_matcher=False):
+use_matmul_gather_in_matcher=False,
+use_static_shapes=False):
def image_resizer_fn(image, masks=None):
"""Fake image resizer function."""
@@ -220,11 +228,18 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
first_stage_box_predictor_depth = 512
first_stage_minibatch_size = 3
first_stage_sampler = sampler.BalancedPositiveNegativeSampler(
-positive_fraction=0.5, is_static=False)
+positive_fraction=0.5, is_static=use_static_shapes)
first_stage_nms_score_threshold = -1.0
first_stage_nms_iou_threshold = 1.0
first_stage_max_proposals = first_stage_max_proposals
+first_stage_non_max_suppression_fn = functools.partial(
+post_processing.batch_multiclass_non_max_suppression,
+score_thresh=first_stage_nms_score_threshold,
+iou_thresh=first_stage_nms_iou_threshold,
+max_size_per_class=first_stage_max_proposals,
+max_total_size=first_stage_max_proposals,
+use_static_shapes=use_static_shapes)
first_stage_localization_loss_weight = 1.0
first_stage_objectness_loss_weight = 1.0
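Note (not part of this commit): once bound with functools.partial as above, the first-stage NMS callable only needs boxes, scores and an optional clip window at call time; everything else is already fixed. A minimal sketch, assuming the object_detection package at this revision is importable; the shapes and threshold values below are illustrative, and the six-element unpacking mirrors how ssd_meta_arch.postprocess (shown later in this diff) consumes the same callable:

import functools
import tensorflow as tf
from object_detection.core import post_processing

nms_fn = functools.partial(
    post_processing.batch_multiclass_non_max_suppression,
    score_thresh=-1.0, iou_thresh=1.0,
    max_size_per_class=8, max_total_size=8)
# Class-agnostic proposals: [batch, num_anchors, q=1, 4] boxes and
# [batch, num_anchors, num_classes=1] scores.
boxes = tf.zeros([2, 10, 1, 4], dtype=tf.float32)
scores = tf.random_uniform([2, 10, 1])
(nmsed_boxes, nmsed_scores, nmsed_classes, _, _,
 num_detections) = nms_fn(boxes, scores)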
@@ -246,7 +261,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
second_stage_non_max_suppression_fn, _ = post_processing_builder.build(
post_processing_config)
second_stage_sampler = sampler.BalancedPositiveNegativeSampler(
-positive_fraction=1.0, is_static=False)
+positive_fraction=1.0, is_static=use_static_shapes)
second_stage_score_conversion_fn = tf.identity
second_stage_localization_loss_weight = 1.0
@@ -268,6 +283,9 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
loc_loss_weight=second_stage_localization_loss_weight,
max_negatives_per_positive=None)
+crop_and_resize_fn = (
+ops.matmul_crop_and_resize
+if use_matmul_crop_and_resize else ops.native_crop_and_resize)
common_kwargs = {
'is_training': is_training,
'num_classes': num_classes,
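Note (not part of this commit): the boolean use_matmul_crop_and_resize flag now only selects which crop-and-resize callable is handed to the meta architecture, which then applies it to the RPN feature map and the normalized proposal boxes. A rough sketch of that contract; the call signature (batched boxes, positional crop size) is an assumption about the ops module at this revision and the crop size is illustrative:

import tensorflow as tf
from object_detection.utils import ops

features = tf.zeros([2, 32, 32, 3])  # [batch, height, width, depth]
boxes = tf.constant([[[0., 0., .5, .5]],
                     [[.25, .25, 1., 1.]]])  # [batch, num_boxes, 4], normalized
# One fixed-size crop per box; matmul_crop_and_resize is the fully
# matmul-based, statically shaped alternative with the same interface.
crops = ops.native_crop_and_resize(features, boxes, [14, 14])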
@@ -284,8 +302,8 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'first_stage_box_predictor_depth': first_stage_box_predictor_depth,
'first_stage_minibatch_size': first_stage_minibatch_size,
'first_stage_sampler': first_stage_sampler,
-'first_stage_nms_score_threshold': first_stage_nms_score_threshold,
-'first_stage_nms_iou_threshold': first_stage_nms_iou_threshold,
+'first_stage_non_max_suppression_fn':
+first_stage_non_max_suppression_fn,
'first_stage_max_proposals': first_stage_max_proposals,
'first_stage_localization_loss_weight':
first_stage_localization_loss_weight,
@@ -304,8 +322,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'second_stage_classification_loss':
second_stage_classification_loss,
'hard_example_miner': hard_example_miner,
-'use_matmul_crop_and_resize': use_matmul_crop_and_resize,
-'clip_anchors_to_image': clip_anchors_to_image
+'crop_and_resize_fn': crop_and_resize_fn,
+'clip_anchors_to_image': clip_anchors_to_image,
+'use_static_shapes': use_static_shapes,
+'resize_masks': True,
}
return self._get_model(
@@ -412,7 +432,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
anchors = prediction_out['anchors']
self.assertTrue(len(anchors.shape) == 2 and anchors.shape[1] == 4)
num_anchors_out = anchors.shape[0]
-self.assertTrue(num_anchors_out < num_anchors_strict_upper_bound)
+self.assertLess(num_anchors_out, num_anchors_strict_upper_bound)
self.assertTrue(np.all(np.greater_equal(anchors, 0)))
self.assertTrue(np.all(np.less_equal(anchors[:, 0], height)))
@@ -484,94 +504,97 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
for key in expected_shapes:
self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
-def _test_predict_gives_correct_shapes_in_train_mode_both_stages(
-self, use_matmul_crop_and_resize=False,
-clip_anchors_to_image=False):
-test_graph = tf.Graph()
-with test_graph.as_default():
+def test_predict_gives_correct_shapes_in_train_mode_both_stages(
+self,
+use_static_shapes=False):
+batch_size = 2
+image_size = 10
+max_num_proposals = 7
+initial_crop_size = 3
+maxpool_stride = 1
+def graph_fn(images, gt_boxes, gt_classes, gt_weights):
+"""Function to construct tf graph for the test."""
model = self._build_model(
is_training=True,
number_of_stages=2,
second_stage_batch_size=7,
predict_masks=False,
-use_matmul_crop_and_resize=use_matmul_crop_and_resize,
-clip_anchors_to_image=clip_anchors_to_image)
-batch_size = 2
-image_size = 10
-max_num_proposals = 7
-initial_crop_size = 3
-maxpool_stride = 1
-image_shape = (batch_size, image_size, image_size, 3)
-preprocessed_inputs = tf.zeros(image_shape, dtype=tf.float32)
-groundtruth_boxes_list = [
-tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32),
-tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)]
-groundtruth_classes_list = [
-tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
-tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
-groundtruth_weights_list = [
-tf.constant([1, 1], dtype=tf.float32),
-tf.constant([1, 1], dtype=tf.float32)]
-_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
+use_matmul_crop_and_resize=use_static_shapes,
+clip_anchors_to_image=use_static_shapes,
+use_static_shapes=use_static_shapes)
+preprocessed_inputs, true_image_shapes = model.preprocess(images)
model.provide_groundtruth(
-groundtruth_boxes_list,
-groundtruth_classes_list,
-groundtruth_weights_list=groundtruth_weights_list)
+groundtruth_boxes_list=tf.unstack(gt_boxes),
+groundtruth_classes_list=tf.unstack(gt_classes),
+groundtruth_weights_list=tf.unstack(gt_weights))
result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes)
-expected_shapes = {
-'rpn_box_predictor_features':
-(2, image_size, image_size, 512),
-'rpn_features_to_crop': (2, image_size, image_size, 3),
-'image_shape': (4,),
-'refined_box_encodings': (2 * max_num_proposals, 2, 4),
-'class_predictions_with_background': (2 * max_num_proposals, 2 + 1),
-'num_proposals': (2,),
-'proposal_boxes': (2, max_num_proposals, 4),
-'proposal_boxes_normalized': (2, max_num_proposals, 4),
-'box_classifier_features':
-self._get_box_classifier_features_shape(image_size,
-batch_size,
-max_num_proposals,
-initial_crop_size,
-maxpool_stride,
-3)
-}
-init_op = tf.global_variables_initializer()
-with self.test_session(graph=test_graph) as sess:
-sess.run(init_op)
-tensor_dict_out = sess.run(result_tensor_dict)
-self.assertEqual(set(tensor_dict_out.keys()),
-set(expected_shapes.keys()).union(set([
-'rpn_box_encodings',
-'rpn_objectness_predictions_with_background',
-'anchors'])))
-for key in expected_shapes:
-self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
-anchors_shape_out = tensor_dict_out['anchors'].shape
-self.assertEqual(2, len(anchors_shape_out))
-self.assertEqual(4, anchors_shape_out[1])
-num_anchors_out = anchors_shape_out[0]
-self.assertAllEqual(tensor_dict_out['rpn_box_encodings'].shape,
-(2, num_anchors_out, 4))
-self.assertAllEqual(
-tensor_dict_out['rpn_objectness_predictions_with_background'].shape,
-(2, num_anchors_out, 2))
-def test_predict_gives_correct_shapes_in_train_mode_both_stages(self):
-self._test_predict_gives_correct_shapes_in_train_mode_both_stages()
-def test_predict_gives_correct_shapes_in_train_mode_matmul_crop_resize(self):
-self._test_predict_gives_correct_shapes_in_train_mode_both_stages(
-use_matmul_crop_and_resize=True)
-def test_predict_gives_correct_shapes_in_train_mode_clip_anchors(self):
-self._test_predict_gives_correct_shapes_in_train_mode_both_stages(
-clip_anchors_to_image=True)
+return (result_tensor_dict['refined_box_encodings'],
+result_tensor_dict['class_predictions_with_background'],
+result_tensor_dict['proposal_boxes'],
+result_tensor_dict['proposal_boxes_normalized'],
+result_tensor_dict['anchors'],
+result_tensor_dict['rpn_box_encodings'],
+result_tensor_dict['rpn_objectness_predictions_with_background'],
+result_tensor_dict['rpn_features_to_crop'],
+result_tensor_dict['rpn_box_predictor_features'],
+)
+image_shape = (batch_size, image_size, image_size, 3)
+images = np.zeros(image_shape, dtype=np.float32)
+gt_boxes = np.stack([
+np.array([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=np.float32),
+np.array([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=np.float32)
+])
+gt_classes = np.stack([
+np.array([[1, 0], [0, 1]], dtype=np.float32),
+np.array([[1, 0], [1, 0]], dtype=np.float32)
+])
+gt_weights = np.stack([
+np.array([1, 1], dtype=np.float32),
+np.array([1, 1], dtype=np.float32)
+])
+if use_static_shapes:
+results = self.execute(graph_fn,
+[images, gt_boxes, gt_classes, gt_weights])
+else:
+results = self.execute_cpu(graph_fn,
+[images, gt_boxes, gt_classes, gt_weights])
+expected_shapes = {
+'rpn_box_predictor_features': (2, image_size, image_size, 512),
+'rpn_features_to_crop': (2, image_size, image_size, 3),
+'refined_box_encodings': (2 * max_num_proposals, 2, 4),
+'class_predictions_with_background': (2 * max_num_proposals, 2 + 1),
+'proposal_boxes': (2, max_num_proposals, 4),
+'rpn_box_encodings': (2, image_size * image_size * 9, 4),
+'proposal_boxes_normalized': (2, max_num_proposals, 4),
+'box_classifier_features':
+self._get_box_classifier_features_shape(
+image_size, batch_size, max_num_proposals, initial_crop_size,
+maxpool_stride, 3),
+'rpn_objectness_predictions_with_background':
+(2, image_size * image_size * 9, 2)
+}
+# TODO(rathodv): Possibly change utils/test_case.py to accept dictionaries
+# and return dictionaries so we don't have to rely on the order of tensors.
+self.assertAllEqual(results[0].shape,
+expected_shapes['refined_box_encodings'])
+self.assertAllEqual(results[1].shape,
+expected_shapes['class_predictions_with_background'])
+self.assertAllEqual(results[2].shape, expected_shapes['proposal_boxes'])
+self.assertAllEqual(results[3].shape,
+expected_shapes['proposal_boxes_normalized'])
+anchors_shape = results[4].shape
+self.assertAllEqual(results[5].shape,
+[batch_size, anchors_shape[0], 4])
+self.assertAllEqual(results[6].shape,
+[batch_size, anchors_shape[0], 2])
+self.assertAllEqual(results[7].shape,
+expected_shapes['rpn_features_to_crop'])
+self.assertAllEqual(results[8].shape,
+expected_shapes['rpn_box_predictor_features'])
def _test_postprocess_first_stage_only_inference_mode(
self, pad_to_max_dimension=None):
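Note (not part of this commit): the rewritten test follows the graph_fn pattern from object_detection.utils.test_case, where a function builds the graph from input tensors and execute_cpu (or execute, which targets accelerator/static-shape execution when available) feeds numpy arrays and returns numpy results. A minimal, self-contained sketch of that pattern, assuming utils/test_case.py at this revision exposes execute and execute_cpu with this signature:

import numpy as np
import tensorflow as tf
from object_detection.utils import test_case

class GraphFnPatternTest(test_case.TestCase):

  def test_add_and_multiply(self):
    def graph_fn(a, b):
      # Tensors returned here are fetched and handed back as numpy arrays.
      return tf.add(a, b), tf.multiply(a, b)
    a = np.array([1., 2.], dtype=np.float32)
    b = np.array([3., 4.], dtype=np.float32)
    sums, products = self.execute_cpu(graph_fn, [a, b])
    self.assertAllClose(sums, [4., 6.])
    self.assertAllClose(products, [3., 8.])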
@@ -848,10 +871,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
loss_dict_out = sess.run(loss_dict)
self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0)
self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0)
-self.assertTrue('Loss/BoxClassifierLoss/localization_loss'
-not in loss_dict_out)
-self.assertTrue('Loss/BoxClassifierLoss/classification_loss'
-not in loss_dict_out)
+self.assertNotIn('Loss/BoxClassifierLoss/localization_loss',
+loss_dict_out)
+self.assertNotIn('Loss/BoxClassifierLoss/classification_loss',
+loss_dict_out)
# TODO(rathodv): Split test into two - with and without masks.
def test_loss_full(self):
@@ -1157,22 +1180,58 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'Loss/BoxClassifierLoss/classification_loss'], 0)
self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0)
-def test_loss_full_zero_padded_proposals_nonzero_loss_with_two_images(self):
-model = self._build_model(
-is_training=True, number_of_stages=2, second_stage_batch_size=6)
+def test_loss_full_zero_padded_proposals_nonzero_loss_with_two_images(
+self, use_static_shapes=False, shared_boxes=False):
batch_size = 2
-anchors = tf.constant(
+first_stage_max_proposals = 8
+second_stage_batch_size = 6
+num_classes = 2
+def graph_fn(anchors, rpn_box_encodings,
+rpn_objectness_predictions_with_background, images,
+num_proposals, proposal_boxes, refined_box_encodings,
+class_predictions_with_background, groundtruth_boxes,
+groundtruth_classes):
+"""Function to construct tf graph for the test."""
+model = self._build_model(
+is_training=True, number_of_stages=2,
+second_stage_batch_size=second_stage_batch_size,
+first_stage_max_proposals=first_stage_max_proposals,
+num_classes=num_classes,
+use_matmul_crop_and_resize=use_static_shapes,
+clip_anchors_to_image=use_static_shapes,
+use_static_shapes=use_static_shapes)
+prediction_dict = {
+'rpn_box_encodings': rpn_box_encodings,
+'rpn_objectness_predictions_with_background':
+rpn_objectness_predictions_with_background,
+'image_shape': tf.shape(images),
+'anchors': anchors,
+'refined_box_encodings': refined_box_encodings,
+'class_predictions_with_background':
+class_predictions_with_background,
+'proposal_boxes': proposal_boxes,
+'num_proposals': num_proposals
+}
+_, true_image_shapes = model.preprocess(images)
+model.provide_groundtruth(tf.unstack(groundtruth_boxes),
+tf.unstack(groundtruth_classes))
+loss_dict = model.loss(prediction_dict, true_image_shapes)
+return (loss_dict['Loss/RPNLoss/localization_loss'],
+loss_dict['Loss/RPNLoss/objectness_loss'],
+loss_dict['Loss/BoxClassifierLoss/localization_loss'],
+loss_dict['Loss/BoxClassifierLoss/classification_loss'])
+anchors = np.array(
[[0, 0, 16, 16],
[0, 16, 16, 32],
[16, 0, 32, 16],
-[16, 16, 32, 32]], dtype=tf.float32)
-rpn_box_encodings = tf.zeros(
-[batch_size,
-anchors.get_shape().as_list()[0],
-BOX_CODE_SIZE], dtype=tf.float32)
+[16, 16, 32, 32]], dtype=np.float32)
+rpn_box_encodings = np.zeros(
+[batch_size, anchors.shape[1], BOX_CODE_SIZE], dtype=np.float32)
# use different numbers for the objectness category to break ties in
# order of boxes returned by NMS
-rpn_objectness_predictions_with_background = tf.constant(
+rpn_objectness_predictions_with_background = np.array(
[[[-10, 13],
[10, -10],
[10, -11],
@@ -1180,13 +1239,13 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[[-10, 13],
[10, -10],
[10, -11],
-[10, -12]]], dtype=tf.float32)
-image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
+[10, -12]]], dtype=np.float32)
+images = np.zeros([batch_size, 32, 32, 3], dtype=np.float32)
# box_classifier_batch_size is 6, but here we assume that the number of
# actual proposals (not counting zero paddings) is fewer.
-num_proposals = tf.constant([3, 2], dtype=tf.int32)
-proposal_boxes = tf.constant(
+num_proposals = np.array([3, 2], dtype=np.int32)
+proposal_boxes = np.array(
[[[0, 0, 16, 16],
[0, 16, 16, 32],
[16, 0, 32, 16],
@@ -1198,13 +1257,13 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[0, 0, 0, 0], # begin paddings
[0, 0, 0, 0],
[0, 0, 0, 0],
-[0, 0, 0, 0]]], dtype=tf.float32)
-refined_box_encodings = tf.zeros(
-(batch_size * model.max_num_proposals,
-model.num_classes,
-BOX_CODE_SIZE), dtype=tf.float32)
-class_predictions_with_background = tf.constant(
+[0, 0, 0, 0]]], dtype=np.float32)
+refined_box_encodings = np.zeros(
+(batch_size * second_stage_batch_size, 1
+if shared_boxes else num_classes, BOX_CODE_SIZE),
+dtype=np.float32)
+class_predictions_with_background = np.array(
[[-10, 10, -10], # first image
[10, -10, -10],
[10, -10, -10],
@@ -1216,7 +1275,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[0, 0, 0], # begin paddings
[0, 0, 0],
[0, 0, 0],
-[0, 0, 0],], dtype=tf.float32)
+[0, 0, 0],], dtype=np.float32)
# The first groundtruth box is 4/5 of the anchor size in both directions
# experiencing a loss of:
@@ -1225,38 +1284,29 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
# The second groundtruth box is identical to the prediction and thus
# experiences zero loss.
# Total average loss is (abs(5 * log(4/5)) - .5) / 3.
-groundtruth_boxes_list = [
-tf.constant([[0.05, 0.05, 0.45, 0.45]], dtype=tf.float32),
-tf.constant([[0.0, 0.0, 0.5, 0.5]], dtype=tf.float32)]
-groundtruth_classes_list = [tf.constant([[1, 0]], dtype=tf.float32),
-tf.constant([[0, 1]], dtype=tf.float32)]
-exp_loc_loss = (-5 * np.log(.8) - 0.5) / 3.0
-prediction_dict = {
-'rpn_box_encodings': rpn_box_encodings,
-'rpn_objectness_predictions_with_background':
-rpn_objectness_predictions_with_background,
-'image_shape': image_shape,
-'anchors': anchors,
-'refined_box_encodings': refined_box_encodings,
-'class_predictions_with_background': class_predictions_with_background,
-'proposal_boxes': proposal_boxes,
-'num_proposals': num_proposals
-}
-_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
-model.provide_groundtruth(groundtruth_boxes_list,
-groundtruth_classes_list)
-loss_dict = model.loss(prediction_dict, true_image_shapes)
-with self.test_session() as sess:
-loss_dict_out = sess.run(loss_dict)
-self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'],
-exp_loc_loss)
-self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0)
-self.assertAllClose(loss_dict_out[
-'Loss/BoxClassifierLoss/localization_loss'], exp_loc_loss)
-self.assertAllClose(loss_dict_out[
-'Loss/BoxClassifierLoss/classification_loss'], 0)
+groundtruth_boxes = np.stack([
+np.array([[0.05, 0.05, 0.45, 0.45]], dtype=np.float32),
+np.array([[0.0, 0.0, 0.5, 0.5]], dtype=np.float32)])
+groundtruth_classes = np.stack([np.array([[1, 0]], dtype=np.float32),
+np.array([[0, 1]], dtype=np.float32)])
+execute_fn = self.execute_cpu
+if use_static_shapes:
+execute_fn = self.execute
+results = execute_fn(graph_fn, [
+anchors, rpn_box_encodings, rpn_objectness_predictions_with_background,
+images, num_proposals, proposal_boxes, refined_box_encodings,
+class_predictions_with_background, groundtruth_boxes,
+groundtruth_classes
+])
+exp_loc_loss = (-5 * np.log(.8) - 0.5) / 3.0
+self.assertAllClose(results[0], exp_loc_loss, rtol=1e-4, atol=1e-4)
+self.assertAllClose(results[1], 0.0)
+self.assertAllClose(results[2], exp_loc_loss, rtol=1e-4, atol=1e-4)
+self.assertAllClose(results[3], 0.0)
def test_loss_with_hard_mining(self):
model = self._build_model(is_training=True,
@@ -1346,10 +1396,14 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/classification_loss'], 0)
-def test_loss_full_with_shared_boxes(self):
-model = self._build_model(
-is_training=True, number_of_stages=2, second_stage_batch_size=6)
+def test_loss_with_hard_mining_and_losses_mask(self):
+model = self._build_model(is_training=True,
+number_of_stages=2,
+second_stage_batch_size=None,
+first_stage_max_proposals=6,
+hard_mining=True)
batch_size = 2
+number_of_proposals = 3
anchors = tf.constant(
[[0, 0, 16, 16],
[0, 16, 16, 32],
@@ -1361,63 +1415,77 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
BOX_CODE_SIZE], dtype=tf.float32)
# use different numbers for the objectness category to break ties in
# order of boxes returned by NMS
-rpn_objectness_predictions_with_background = tf.constant([
-[[-10, 13],
-[10, -10],
+rpn_objectness_predictions_with_background = tf.constant(
+[[[-10, 13],
+[-10, 12],
[10, -11],
-[-10, 12]],
-[[10, -10],
-[-10, 13],
-[-10, 12],
-[10, -11]]], dtype=tf.float32)
+[10, -12]],
+[[-10, 13],
+[-10, 12],
+[10, -11],
+[10, -12]]], dtype=tf.float32)
image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
-num_proposals = tf.constant([6, 6], dtype=tf.int32)
+# box_classifier_batch_size is 6, but here we assume that the number of
+# actual proposals (not counting zero paddings) is fewer (3).
+num_proposals = tf.constant([number_of_proposals, number_of_proposals],
+dtype=tf.int32)
proposal_boxes = tf.constant(
-2 * [[[0, 0, 16, 16],
+[[[0, 0, 16, 16], # first image
[0, 16, 16, 32],
[16, 0, 32, 16],
-[16, 16, 32, 32],
-[0, 0, 16, 16],
-[0, 16, 16, 32]]], dtype=tf.float32)
+[0, 0, 0, 0], # begin paddings
+[0, 0, 0, 0],
+[0, 0, 0, 0]],
+[[0, 0, 16, 16], # second image
+[0, 16, 16, 32],
+[16, 0, 32, 16],
+[0, 0, 0, 0], # begin paddings
+[0, 0, 0, 0],
+[0, 0, 0, 0]]], dtype=tf.float32)
refined_box_encodings = tf.zeros(
(batch_size * model.max_num_proposals,
-1, # one box shared among all the classes
+model.num_classes,
BOX_CODE_SIZE), dtype=tf.float32)
class_predictions_with_background = tf.constant(
[[-10, 10, -10], # first image
-[10, -10, -10],
-[10, -10, -10],
[-10, -10, 10],
-[-10, 10, -10],
-[10, -10, -10],
-[10, -10, -10], # second image
-[-10, 10, -10],
-[-10, 10, -10],
[10, -10, -10],
+[0, 0, 0], # begin paddings
+[0, 0, 0],
+[0, 0, 0],
+[-10, 10, -10], # second image
+[-10, -10, 10],
[10, -10, -10],
-[-10, 10, -10]], dtype=tf.float32)
-mask_predictions_logits = 20 * tf.ones((batch_size *
-model.max_num_proposals,
-model.num_classes,
-14, 14),
-dtype=tf.float32)
+[0, 0, 0], # begin paddings
+[0, 0, 0],
+[0, 0, 0]], dtype=tf.float32)
+# The first groundtruth box is 4/5 of the anchor size in both directions
+# experiencing a loss of:
+# 2 * SmoothL1(5 * log(4/5)) / (num_proposals * batch_size)
+# = 2 * (abs(5 * log(4/5)) - .5) / 6
+# The second groundtruth box is 46/50 of the anchor size in both directions
+# experiencing a loss of:
+# 2 * SmoothL1(5 * log(46/50)) / (num_proposals * batch_size)
+# = 2 * (.5 * (5 * log(.92))^2) / 6.
+# Since the first groundtruth box experiences greater loss, and we have
+# set num_hard_examples=1 in the HardMiner, the final localization loss
+# corresponds to that of the first groundtruth box.
groundtruth_boxes_list = [
-tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32),
-tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)]
-groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
-tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
-# Set all elements of groundtruth mask to 1.0. In this case all proposal
-# crops of the groundtruth masks should return a mask that covers the entire
-# proposal. Thus, if mask_predictions_logits element values are all greater
-# than 20, the loss should be zero.
-groundtruth_masks_list = [tf.convert_to_tensor(np.ones((2, 32, 32)),
-dtype=tf.float32),
-tf.convert_to_tensor(np.ones((2, 32, 32)),
-dtype=tf.float32)]
+tf.constant([[0.05, 0.05, 0.45, 0.45],
+[0.02, 0.52, 0.48, 0.98]], dtype=tf.float32),
+tf.constant([[0.05, 0.05, 0.45, 0.45],
+[0.02, 0.52, 0.48, 0.98]], dtype=tf.float32)]
+groundtruth_classes_list = [
+tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
+tf.constant([[1, 0], [0, 1]], dtype=tf.float32)]
+is_annotated_list = [tf.constant(True, dtype=tf.bool),
+tf.constant(False, dtype=tf.bool)]
+exp_loc_loss = (2 * (-5 * np.log(.8) - 0.5) /
+(number_of_proposals * batch_size))
prediction_dict = {
'rpn_box_encodings': rpn_box_encodings,
'rpn_objectness_predictions_with_background':
@@ -1427,24 +1495,20 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'proposal_boxes': proposal_boxes,
-'num_proposals': num_proposals,
-'mask_predictions': mask_predictions_logits
+'num_proposals': num_proposals
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
-groundtruth_masks_list)
+is_annotated_list=is_annotated_list)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
-self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0)
-self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0)
self.assertAllClose(loss_dict_out[
-'Loss/BoxClassifierLoss/localization_loss'], 0)
+'Loss/BoxClassifierLoss/localization_loss'], exp_loc_loss)
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/classification_loss'], 0)
-self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0)
def test_restore_map_for_classification_ckpt(self):
# Define mock tensorflow classification graph and save variables.
......
@@ -62,11 +62,11 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
-first_stage_nms_score_threshold,
-first_stage_nms_iou_threshold,
+first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
+crop_and_resize_fn,
second_stage_target_assigner,
second_stage_rfcn_box_predictor,
second_stage_batch_size,
@@ -79,8 +79,9 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
hard_example_miner,
parallel_iterations=16,
add_summaries=True,
-use_matmul_crop_and_resize=False,
-clip_anchors_to_image=False):
+clip_anchors_to_image=False,
+use_static_shapes=False,
+resize_masks=False):
"""RFCNMetaArch Constructor.
Args:
@@ -123,18 +124,22 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
only called "batch_size" due to terminology from the Faster R-CNN paper.
first_stage_sampler: The sampler for the boxes used to calculate the RPN
loss after the first stage.
-first_stage_nms_score_threshold: Score threshold for non max suppression
-for the Region Proposal Network (RPN). This value is expected to be in
-[0, 1] as it is applied directly after a softmax transformation. The
-recommended value for Faster R-CNN is 0.
-first_stage_nms_iou_threshold: The Intersection Over Union (IOU) threshold
-for performing Non-Max Suppression (NMS) on the boxes predicted by the
-Region Proposal Network (RPN).
+first_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
+callable that takes `boxes`, `scores` and optional `clip_window` (with
+all other inputs already set) and returns a dictionary containing
+tensors with keys: `detection_boxes`, `detection_scores`,
+`detection_classes`, `num_detections`. This is used to perform non max
+suppression on the boxes predicted by the Region Proposal Network
+(RPN).
+See `post_processing.batch_multiclass_non_max_suppression` for the type
+and shape of these tensors.
first_stage_max_proposals: Maximum number of boxes to retain after
performing Non-Max Suppression (NMS) on the boxes predicted by the
Region Proposal Network (RPN).
first_stage_localization_loss_weight: A float
first_stage_objectness_loss_weight: A float
+crop_and_resize_fn: A differentiable resampler to use for cropping RPN
+proposal features.
second_stage_target_assigner: Target assigner to use for second stage of
R-FCN. If the model is configured with multiple prediction heads, this
target assigner is used to generate targets for all heads (with the
@@ -168,12 +173,13 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
in parallel for calls to tf.map_fn.
add_summaries: boolean (default: True) controlling whether summary ops
should be added to tensorflow graph.
-use_matmul_crop_and_resize: Force the use of matrix multiplication based
-crop and resize instead of standard tf.image.crop_and_resize while
-computing second stage input feature maps.
clip_anchors_to_image: The anchors generated are clipped to the
window size without filtering the nonoverlapping anchors. This generates
a static number of anchors. This argument is unused.
+use_static_shapes: If True, uses implementation of ops with static shape
+guarantees.
+resize_masks: Indicates whether the masks present in the groundtruth
+should be resized in the model with `image_resizer_fn`.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`
@@ -196,11 +202,11 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
-first_stage_nms_score_threshold,
-first_stage_nms_iou_threshold,
+first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
+crop_and_resize_fn,
None, # initial_crop_size is not used in R-FCN
None, # maxpool_kernel_size is not used in R-FCN
None, # maxpool_stride is not used in R-FCN
@@ -215,7 +221,11 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
second_stage_classification_loss,
1.0, # second stage mask prediction loss weight isn't used in R-FCN.
hard_example_miner,
-parallel_iterations)
+parallel_iterations,
+add_summaries,
+clip_anchors_to_image,
+use_static_shapes,
+resize_masks)
self._rfcn_box_predictor = second_stage_rfcn_box_predictor
......
@@ -125,12 +125,13 @@ class SSDKerasFeatureExtractor(tf.keras.Model):
depth_multiplier,
min_depth,
pad_to_multiple,
-conv_hyperparams_config,
+conv_hyperparams,
freeze_batchnorm,
inplace_batchnorm_update,
use_explicit_padding=False,
use_depthwise=False,
-override_base_feature_extractor_hyperparams=False):
+override_base_feature_extractor_hyperparams=False,
+name=None):
"""Constructor.
Args:
@@ -139,9 +140,9 @@ class SSDKerasFeatureExtractor(tf.keras.Model):
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
-conv_hyperparams_config: A hyperparams.proto object containing
-convolution hyperparameters for the layers added on top of the
-base feature extractor.
+conv_hyperparams: `hyperparams_builder.KerasLayerHyperparams` object
+containing convolution hyperparameters for the layers added on top of
+the base feature extractor.
freeze_batchnorm: Whether to freeze batch norm parameters during
training or not. When training with a small batch size (e.g. 1), it is
desirable to freeze batch norm update and use pretrained batch norm
@@ -156,14 +157,16 @@ class SSDKerasFeatureExtractor(tf.keras.Model):
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_config`.
+name: A string name scope to assign to the model. If 'None', Keras
+will auto-generate one from the class name.
"""
-super(SSDKerasFeatureExtractor, self).__init__()
+super(SSDKerasFeatureExtractor, self).__init__(name=name)
self._is_training = is_training
self._depth_multiplier = depth_multiplier
self._min_depth = min_depth
self._pad_to_multiple = pad_to_multiple
-self._conv_hyperparams_config = conv_hyperparams_config
+self._conv_hyperparams = conv_hyperparams
self._freeze_batchnorm = freeze_batchnorm
self._inplace_batchnorm_update = inplace_batchnorm_update
self._use_explicit_padding = use_explicit_padding
@@ -225,10 +228,7 @@ class SSDMetaArch(model.DetectionModel):
box_predictor,
box_coder,
feature_extractor,
-matcher,
-region_similarity_calculator,
encode_background_as_zeros,
-negative_class_weight,
image_resizer_fn,
non_max_suppression_fn,
score_conversion_fn,
@@ -238,14 +238,14 @@ class SSDMetaArch(model.DetectionModel):
localization_loss_weight,
normalize_loss_by_num_matches,
hard_example_miner,
+target_assigner_instance,
add_summaries=True,
normalize_loc_loss_by_codesize=False,
freeze_batchnorm=False,
inplace_batchnorm_update=False,
add_background_class=True,
random_example_sampler=None,
-expected_classification_loss_under_sampling=None,
-target_assigner_instance=None):
+expected_classification_loss_under_sampling=None):
"""SSDMetaArch Constructor.
TODO(rathodv,jonathanhuang): group NMS parameters + score converter into
@@ -259,13 +259,9 @@ class SSDMetaArch(model.DetectionModel):
box_predictor: a box_predictor.BoxPredictor object.
box_coder: a box_coder.BoxCoder object.
feature_extractor: a SSDFeatureExtractor object.
-matcher: a matcher.Matcher object.
-region_similarity_calculator: a
-region_similarity_calculator.RegionSimilarityCalculator object.
encode_background_as_zeros: boolean determining whether background
targets are to be encoded as an all zeros vector or a one-hot
vector (where background is the 0th class).
-negative_class_weight: Weight for confidence loss of negative anchors.
image_resizer_fn: a callable for image resizing. This callable always
takes a rank-3 image tensor (corresponding to a single image) and
returns a rank-3 image tensor, possibly with new spatial dimensions and
@@ -288,6 +284,7 @@ class SSDMetaArch(model.DetectionModel):
localization_loss_weight: float
normalize_loss_by_num_matches: boolean
hard_example_miner: a losses.HardExampleMiner object (can be None)
+target_assigner_instance: target_assigner.TargetAssigner instance to use.
add_summaries: boolean (default: True) controlling whether summary ops
should be added to tensorflow graph.
normalize_loc_loss_by_codesize: whether to normalize localization loss
@@ -312,7 +309,6 @@ class SSDMetaArch(model.DetectionModel):
the random sampled examples.
expected_classification_loss_under_sampling: If not None, use
to calculate classification loss by background/foreground weighting.
-target_assigner_instance: target_assigner.TargetAssigner instance to use.
"""
super(SSDMetaArch, self).__init__(num_classes=box_predictor.num_classes)
self._is_training = is_training
@@ -324,8 +320,6 @@ class SSDMetaArch(model.DetectionModel):
self._box_coder = box_coder
self._feature_extractor = feature_extractor
-self._matcher = matcher
-self._region_similarity_calculator = region_similarity_calculator
self._add_background_class = add_background_class
# Needed for fine-tuning from classification checkpoints whose
@@ -347,14 +341,7 @@ class SSDMetaArch(model.DetectionModel):
self._unmatched_class_label = tf.constant((self.num_classes + 1) * [0],
tf.float32)
-if target_assigner_instance:
-self._target_assigner = target_assigner_instance
-else:
-self._target_assigner = target_assigner.TargetAssigner(
-self._region_similarity_calculator,
-self._matcher,
-self._box_coder,
-negative_class_weight=negative_class_weight)
+self._target_assigner = target_assigner_instance
self._classification_loss = classification_loss
self._localization_loss = localization_loss
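Note (not part of this commit): callers that previously handed matcher, region_similarity_calculator and negative_class_weight to SSDMetaArch now have to build the target assigner themselves and pass it in, mirroring the fallback construction removed above. A minimal sketch, assuming the core, matchers and box_coders modules at this revision; the argmax matcher thresholds are illustrative:

from object_detection.box_coders import faster_rcnn_box_coder
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import target_assigner
from object_detection.matchers import argmax_matcher

matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5,
                                       unmatched_threshold=0.5)
box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
target_assigner_instance = target_assigner.TargetAssigner(
    sim_calc.IouSimilarity(), matcher, box_coder,
    negative_class_weight=1.0)
# target_assigner_instance is then passed to the SSDMetaArch constructor.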
@@ -523,28 +510,25 @@ class SSDMetaArch(model.DetectionModel):
im_height=image_shape[1],
im_width=image_shape[2]))
if self._box_predictor.is_keras_model:
-prediction_dict = self._box_predictor(feature_maps)
+predictor_results_dict = self._box_predictor(feature_maps)
else:
with slim.arg_scope([slim.batch_norm],
is_training=(self._is_training and
not self._freeze_batchnorm),
updates_collections=batchnorm_updates_collections):
-prediction_dict = self._box_predictor.predict(
+predictor_results_dict = self._box_predictor.predict(
feature_maps, self._anchor_generator.num_anchors_per_location())
-box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1)
-if box_encodings.shape.ndims == 4 and box_encodings.shape[2] == 1:
-box_encodings = tf.squeeze(box_encodings, axis=2)
-class_predictions_with_background = tf.concat(
-prediction_dict['class_predictions_with_background'], axis=1)
predictions_dict = {
'preprocessed_inputs': preprocessed_inputs,
-'box_encodings': box_encodings,
-'class_predictions_with_background':
-class_predictions_with_background,
'feature_maps': feature_maps,
'anchors': self._anchors.get()
}
+for prediction_key, prediction_list in iter(predictor_results_dict.items()):
+prediction = tf.concat(prediction_list, axis=1)
+if (prediction_key == 'box_encodings' and prediction.shape.ndims == 4 and
+prediction.shape[2] == 1):
+prediction = tf.squeeze(prediction, axis=2)
+predictions_dict[prediction_key] = prediction
self._batched_prediction_tensor_names = [x for x in predictions_dict
if x != 'anchors']
return predictions_dict
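Note (not part of this commit): the loop above replaces the hard-coded handling of box_encodings and class_predictions_with_background with a generic concatenation over every per-feature-map list the box predictor returns, which is what lets extra heads such as mask_predictions flow through unchanged. A toy, self-contained illustration of that reshaping; the dictionary below stands in for a box predictor's output:

import tensorflow as tf

# Two feature maps with 4 and 2 anchors respectively; box encodings carry an
# extra singleton class dimension that gets squeezed away.
predictor_results_dict = {
    'box_encodings': [tf.zeros([8, 4, 1, 4]), tf.zeros([8, 2, 1, 4])],
    'class_predictions_with_background': [tf.zeros([8, 4, 3]),
                                          tf.zeros([8, 2, 3])],
}
predictions_dict = {}
for prediction_key, prediction_list in predictor_results_dict.items():
  prediction = tf.concat(prediction_list, axis=1)
  if (prediction_key == 'box_encodings' and prediction.shape.ndims == 4 and
      prediction.shape[2] == 1):
    prediction = tf.squeeze(prediction, axis=2)
  predictions_dict[prediction_key] = prediction
# predictions_dict['box_encodings'] now has shape [8, 6, 4] and
# predictions_dict['class_predictions_with_background'] has shape [8, 6, 3].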
@@ -587,6 +571,10 @@ class SSDMetaArch(model.DetectionModel):
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
+4) mask_predictions: (optional) a 5-D float tensor of shape
+[batch_size, num_anchors, q, mask_height, mask_width]. `q` can be
+either number of classes or 1 depending on whether a separate mask is
+predicted per class.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
@@ -599,6 +587,8 @@ class SSDMetaArch(model.DetectionModel):
detection_classes: [batch, max_detections]
detection_keypoints: [batch, max_detections, num_keypoints, 2] (if
encoded in the prediction_dict 'box_encodings')
+detection_masks: [batch_size, max_detections, mask_height, mask_width]
+(optional)
num_detections: [batch]
Raises:
ValueError: if prediction_dict does not contain `box_encodings` or
@@ -627,13 +617,14 @@ class SSDMetaArch(model.DetectionModel):
if detection_keypoints is not None:
additional_fields = {
fields.BoxListFields.keypoints: detection_keypoints}
-(nmsed_boxes, nmsed_scores, nmsed_classes, _, nmsed_additional_fields,
-num_detections) = self._non_max_suppression_fn(
+(nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
+nmsed_additional_fields, num_detections) = self._non_max_suppression_fn(
detection_boxes,
detection_scores,
-clip_window=self._compute_clip_window(
-preprocessed_images, true_image_shapes),
-additional_fields=additional_fields)
+clip_window=self._compute_clip_window(preprocessed_images,
+true_image_shapes),
+additional_fields=additional_fields,
+masks=prediction_dict.get('mask_predictions'))
detection_dict = {
fields.DetectionResultFields.detection_boxes: nmsed_boxes,
fields.DetectionResultFields.detection_scores: nmsed_scores,
@@ -645,6 +636,9 @@ class SSDMetaArch(model.DetectionModel):
fields.BoxListFields.keypoints in nmsed_additional_fields):
detection_dict[fields.DetectionResultFields.detection_keypoints] = (
nmsed_additional_fields[fields.BoxListFields.keypoints])
+if nmsed_masks is not None:
+detection_dict[
+fields.DetectionResultFields.detection_masks] = nmsed_masks
return detection_dict
def loss(self, prediction_dict, true_image_shapes, scope=None):
@@ -701,16 +695,22 @@ class SSDMetaArch(model.DetectionModel):
batch_cls_weights = tf.multiply(batch_sampled_indicator,
batch_cls_weights)
+losses_mask = None
+if self.groundtruth_has_field(fields.InputDataFields.is_annotated):
+losses_mask = tf.stack(self.groundtruth_lists(
+fields.InputDataFields.is_annotated))
location_losses = self._localization_loss(
prediction_dict['box_encodings'],
batch_reg_targets,
ignore_nan_targets=True,
-weights=batch_reg_weights)
+weights=batch_reg_weights,
+losses_mask=losses_mask)
cls_losses = self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
-weights=batch_cls_weights)
+weights=batch_cls_weights,
+losses_mask=losses_mask)
if self._expected_classification_loss_under_sampling:
if cls_losses.get_shape().ndims == 3:
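Note (not part of this commit): the losses_mask above is driven by a per-image is_annotated flag supplied as groundtruth, so images marked as not annotated contribute nothing to the localization and classification losses. A minimal sketch of providing that flag, assuming an already-built DetectionModel bound to the name `model` and the standard_fields module at this revision:

import tensorflow as tf
from object_detection.core import standard_fields as fields

groundtruth_boxes_list = [
    tf.constant([[0., 0., .5, .5]], dtype=tf.float32),  # annotated image
    tf.zeros([0, 4], dtype=tf.float32)]                 # unannotated image
groundtruth_classes_list = [
    tf.constant([[1., 0.]], dtype=tf.float32),
    tf.zeros([0, 2], dtype=tf.float32)]
is_annotated_list = [tf.constant(True), tf.constant(False)]
model.provide_groundtruth(groundtruth_boxes_list,
                          groundtruth_classes_list,
                          is_annotated_list=is_annotated_list)
# model.loss(...) then masks out the per-image losses of the second image.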
@@ -734,12 +734,6 @@ class SSDMetaArch(model.DetectionModel):
self._hard_example_miner.summarize()
else:
cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2)
-if self._add_summaries:
-class_ids = tf.argmax(batch_cls_targets, axis=2)
-flattened_class_ids = tf.reshape(class_ids, [-1])
-flattened_classification_losses = tf.reshape(cls_losses, [-1])
-self._summarize_anchor_classification_loss(
-flattened_class_ids, flattened_classification_losses)
localization_loss = tf.reduce_sum(location_losses)
classification_loss = tf.reduce_sum(cls_losses)
......
@@ -14,105 +14,26 @@
# ==============================================================================
"""Tests for object_detection.meta_architectures.ssd_meta_arch."""
-import functools
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
-from object_detection.core import anchor_generator
-from object_detection.core import balanced_positive_negative_sampler as sampler
-from object_detection.core import box_list
-from object_detection.core import losses
-from object_detection.core import post_processing
-from object_detection.core import region_similarity_calculator as sim_calc
-from object_detection.core import target_assigner
from object_detection.meta_architectures import ssd_meta_arch
-from object_detection.utils import ops
-from object_detection.utils import test_case
+from object_detection.meta_architectures import ssd_meta_arch_test_lib
from object_detection.utils import test_utils
slim = tf.contrib.slim
keras = tf.keras.layers
-class FakeSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
-def __init__(self):
-super(FakeSSDFeatureExtractor, self).__init__(
-is_training=True,
-depth_multiplier=0,
-min_depth=0,
-pad_to_multiple=1,
-conv_hyperparams_fn=None)
-def preprocess(self, resized_inputs):
-return tf.identity(resized_inputs)
-def extract_features(self, preprocessed_inputs):
-with tf.variable_scope('mock_model'):
-features = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
-kernel_size=1, scope='layer1')
-return [features]
-class FakeSSDKerasFeatureExtractor(ssd_meta_arch.SSDKerasFeatureExtractor):
-def __init__(self):
-with tf.name_scope('mock_model'):
-super(FakeSSDKerasFeatureExtractor, self).__init__(
-is_training=True,
-depth_multiplier=0,
-min_depth=0,
-pad_to_multiple=1,
-conv_hyperparams_config=None,
-freeze_batchnorm=False,
-inplace_batchnorm_update=False,
-)
-self._conv = keras.Conv2D(filters=32, kernel_size=1, name='layer1')
-def preprocess(self, resized_inputs):
-return tf.identity(resized_inputs)
-def _extract_features(self, preprocessed_inputs, **kwargs):
-with tf.name_scope('mock_model'):
-return [self._conv(preprocessed_inputs)]
-class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
-"""Sets up a simple 2x2 anchor grid on the unit square."""
-def name_scope(self):
-return 'MockAnchorGenerator'
-def num_anchors_per_location(self):
-return [1]
-def _generate(self, feature_map_shape_list, im_height, im_width):
-return [box_list.BoxList(
-tf.constant([[0, 0, .5, .5],
-[0, .5, .5, 1],
-[.5, 0, 1, .5],
-[1., 1., 1.5, 1.5] # Anchor that is outside clip_window.
-], tf.float32))]
-def num_anchors(self):
-return 4
-def _get_value_for_matching_key(dictionary, suffix):
-for key in dictionary.keys():
-if key.endswith(suffix):
-return dictionary[key]
-raise ValueError('key not found {}'.format(suffix))
@parameterized.parameters(
{'use_keras': False},
{'use_keras': True},
)
-class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
+class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
+parameterized.TestCase):
def _create_model(self,
apply_hard_mining=True,
...@@ -123,96 +44,25 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -123,96 +44,25 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
use_expected_classification_loss_under_sampling=False, use_expected_classification_loss_under_sampling=False,
minimum_negative_sampling=1, minimum_negative_sampling=1,
desired_negative_sampling_ratio=3, desired_negative_sampling_ratio=3,
use_keras=False): use_keras=False,
is_training = False predict_mask=False,
num_classes = 1 use_static_shapes=False,
mock_anchor_generator = MockAnchorGenerator2x2() nms_max_size_per_class=5):
if use_keras: return super(SsdMetaArchTest, self)._create_model(
mock_box_predictor = test_utils.MockKerasBoxPredictor( model_fn=ssd_meta_arch.SSDMetaArch,
is_training, num_classes) apply_hard_mining=apply_hard_mining,
else:
mock_box_predictor = test_utils.MockBoxPredictor(
is_training, num_classes)
mock_box_coder = test_utils.MockBoxCoder()
if use_keras:
fake_feature_extractor = FakeSSDKerasFeatureExtractor()
else:
fake_feature_extractor = FakeSSDFeatureExtractor()
mock_matcher = test_utils.MockMatcher()
region_similarity_calculator = sim_calc.IouSimilarity()
encode_background_as_zeros = False
def image_resizer_fn(image):
return [tf.identity(image), tf.shape(image)]
classification_loss = losses.WeightedSigmoidClassificationLoss()
localization_loss = losses.WeightedSmoothL1LocalizationLoss()
non_max_suppression_fn = functools.partial(
post_processing.batch_multiclass_non_max_suppression,
score_thresh=-20.0,
iou_thresh=1.0,
max_size_per_class=5,
max_total_size=5)
classification_loss_weight = 1.0
localization_loss_weight = 1.0
negative_class_weight = 1.0
normalize_loss_by_num_matches = False
hard_example_miner = None
if apply_hard_mining:
# This hard example miner is expected to be a no-op.
hard_example_miner = losses.HardExampleMiner(
num_hard_examples=None,
iou_threshold=1.0)
random_example_sampler = None
if random_example_sampling:
random_example_sampler = sampler.BalancedPositiveNegativeSampler(
positive_fraction=0.5)
target_assigner_instance = target_assigner.TargetAssigner(
region_similarity_calculator,
mock_matcher,
mock_box_coder,
negative_class_weight=negative_class_weight,
weight_regression_loss_by_score=weight_regression_loss_by_score)
expected_classification_loss_under_sampling = None
if use_expected_classification_loss_under_sampling:
expected_classification_loss_under_sampling = functools.partial(
ops.expected_classification_loss_under_sampling,
minimum_negative_sampling=minimum_negative_sampling,
desired_negative_sampling_ratio=desired_negative_sampling_ratio)
code_size = 4
model = ssd_meta_arch.SSDMetaArch(
is_training,
mock_anchor_generator,
mock_box_predictor,
mock_box_coder,
fake_feature_extractor,
mock_matcher,
region_similarity_calculator,
encode_background_as_zeros,
negative_class_weight,
image_resizer_fn,
non_max_suppression_fn,
tf.identity,
classification_loss,
localization_loss,
classification_loss_weight,
localization_loss_weight,
normalize_loss_by_num_matches,
hard_example_miner,
target_assigner_instance=target_assigner_instance,
add_summaries=False,
normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize, normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize,
freeze_batchnorm=False,
inplace_batchnorm_update=False,
add_background_class=add_background_class, add_background_class=add_background_class,
random_example_sampler=random_example_sampler, random_example_sampling=random_example_sampling,
expected_classification_loss_under_sampling= weight_regression_loss_by_score=weight_regression_loss_by_score,
expected_classification_loss_under_sampling) use_expected_classification_loss_under_sampling=
return model, num_classes, mock_anchor_generator.num_anchors(), code_size use_expected_classification_loss_under_sampling,
minimum_negative_sampling=minimum_negative_sampling,
desired_negative_sampling_ratio=desired_negative_sampling_ratio,
use_keras=use_keras,
predict_mask=predict_mask,
use_static_shapes=use_static_shapes,
nms_max_size_per_class=nms_max_size_per_class)
def test_preprocess_preserves_shapes_with_dynamic_input_image( def test_preprocess_preserves_shapes_with_dynamic_input_image(
self, use_keras): self, use_keras):
...@@ -360,6 +210,7 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -360,6 +210,7 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(detections_out['num_detections'], self.assertAllClose(detections_out['num_detections'],
expected_num_detections) expected_num_detections)
def test_loss_results_are_correct(self, use_keras): def test_loss_results_are_correct(self, use_keras):
with tf.Graph().as_default(): with tf.Graph().as_default():
...@@ -374,9 +225,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -374,9 +225,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
prediction_dict = model.predict(preprocessed_tensor, prediction_dict = model.predict(preprocessed_tensor,
true_image_shapes=None) true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None) loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return ( return (self._get_value_for_matching_key(loss_dict,
_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'), 'Loss/localization_loss'),
_get_value_for_matching_key(loss_dict, 'Loss/classification_loss')) self._get_value_for_matching_key(loss_dict,
'Loss/classification_loss'))
batch_size = 2 batch_size = 2
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
...@@ -413,7 +265,8 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -413,7 +265,8 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
prediction_dict = model.predict(preprocessed_tensor, prediction_dict = model.predict(preprocessed_tensor,
true_image_shapes=None) true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None) loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return (_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'),) return (self._get_value_for_matching_key(loss_dict,
'Loss/localization_loss'),)
batch_size = 2 batch_size = 2
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
...@@ -443,9 +296,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -443,9 +296,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
prediction_dict = model.predict(preprocessed_tensor, prediction_dict = model.predict(preprocessed_tensor,
true_image_shapes=None) true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None) loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return ( return (self._get_value_for_matching_key(loss_dict,
_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'), 'Loss/localization_loss'),
_get_value_for_matching_key(loss_dict, 'Loss/classification_loss')) self._get_value_for_matching_key(loss_dict,
'Loss/classification_loss'))
batch_size = 2 batch_size = 2
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
...@@ -591,6 +445,55 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -591,6 +445,55 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(localization_loss, expected_localization_loss) self.assertAllClose(localization_loss, expected_localization_loss)
self.assertAllClose(classification_loss, expected_classification_loss) self.assertAllClose(classification_loss, expected_classification_loss)
def test_loss_results_are_correct_with_losses_mask(self, use_keras):
with tf.Graph().as_default():
_, num_classes, num_anchors, _ = self._create_model(use_keras=use_keras)
def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_boxes3, groundtruth_classes1, groundtruth_classes2,
groundtruth_classes3):
groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2,
groundtruth_boxes3]
groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2,
groundtruth_classes3]
is_annotated_list = [tf.constant(True), tf.constant(True),
tf.constant(False)]
model, _, _, _ = self._create_model(apply_hard_mining=False)
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
is_annotated_list=is_annotated_list)
prediction_dict = model.predict(preprocessed_tensor,
true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return (self._get_value_for_matching_key(loss_dict,
'Loss/localization_loss'),
self._get_value_for_matching_key(loss_dict,
'Loss/classification_loss'))
batch_size = 3
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_boxes3 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_classes1 = np.array([[1]], dtype=np.float32)
groundtruth_classes2 = np.array([[1]], dtype=np.float32)
groundtruth_classes3 = np.array([[1]], dtype=np.float32)
expected_localization_loss = 0.0
# Note that we are subtracting 1 from batch_size, since the final image is
# not annotated.
expected_classification_loss = ((batch_size - 1) * num_anchors
* (num_classes+1) * np.log(2.0))
(localization_loss,
classification_loss) = self.execute(graph_fn, [preprocessed_input,
groundtruth_boxes1,
groundtruth_boxes2,
groundtruth_boxes3,
groundtruth_classes1,
groundtruth_classes2,
groundtruth_classes3])
self.assertAllClose(localization_loss, expected_localization_loss)
self.assertAllClose(classification_loss, expected_classification_loss)
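For the mock setup used here (MockAnchorGenerator2x2 yields 4 anchors, num_classes is 1, and the third image is excluded via is_annotated_list), the expected value works out as below; the per-anchor, per-class contribution of log(2) is what a sigmoid loss gives at zero logits, which is assumed to match the mock predictor:

import numpy as np

batch_size, num_anchors, num_classes = 3, 4, 1
expected = (batch_size - 1) * num_anchors * (num_classes + 1) * np.log(2.0)
# 2 * 4 * 2 * log(2) ~= 11.09; the unannotated image contributes nothing.
print(expected)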
def test_restore_map_for_detection_ckpt(self, use_keras): def test_restore_map_for_detection_ckpt(self, use_keras):
model, _, _, _ = self._create_model(use_keras=use_keras) model, _, _, _ = self._create_model(use_keras=use_keras)
model.predict(tf.constant(np.array([[[[0, 0], [1, 1]], [[1, 0], [0, 1]]]], model.predict(tf.constant(np.array([[[[0, 0], [1, 1]], [[1, 0], [0, 1]]]],
...@@ -678,10 +581,8 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -678,10 +581,8 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
use_keras): use_keras):
with tf.Graph().as_default(): with tf.Graph().as_default():
_, num_classes, num_anchors, _ = self._create_model( _, num_classes, _, _ = self._create_model(
random_example_sampling=True, random_example_sampling=True, use_keras=use_keras)
use_keras=use_keras)
print num_classes, num_anchors
def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2, def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2): groundtruth_classes1, groundtruth_classes2):
...@@ -694,9 +595,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -694,9 +595,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
prediction_dict = model.predict( prediction_dict = model.predict(
preprocessed_tensor, true_image_shapes=None) preprocessed_tensor, true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None) loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return (_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'), return (self._get_value_for_matching_key(loss_dict,
_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'),
'Loss/classification_loss')) self._get_value_for_matching_key(loss_dict,
'Loss/classification_loss'))
batch_size = 2 batch_size = 2
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for SSD models meta architecture tests."""
import functools
import tensorflow as tf
from object_detection.core import anchor_generator
from object_detection.core import balanced_positive_negative_sampler as sampler
from object_detection.core import box_list
from object_detection.core import losses
from object_detection.core import post_processing
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import target_assigner
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.utils import ops
from object_detection.utils import test_case
from object_detection.utils import test_utils
slim = tf.contrib.slim
keras = tf.keras.layers
class FakeSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""Fake ssd feature extracture for ssd meta arch tests."""
def __init__(self):
super(FakeSSDFeatureExtractor, self).__init__(
is_training=True,
depth_multiplier=0,
min_depth=0,
pad_to_multiple=1,
conv_hyperparams_fn=None)
def preprocess(self, resized_inputs):
return tf.identity(resized_inputs)
def extract_features(self, preprocessed_inputs):
with tf.variable_scope('mock_model'):
features = slim.conv2d(
inputs=preprocessed_inputs,
num_outputs=32,
kernel_size=1,
scope='layer1')
return [features]
class FakeSSDKerasFeatureExtractor(ssd_meta_arch.SSDKerasFeatureExtractor):
"""Fake keras based ssd feature extracture for ssd meta arch tests."""
def __init__(self):
with tf.name_scope('mock_model'):
super(FakeSSDKerasFeatureExtractor, self).__init__(
is_training=True,
depth_multiplier=0,
min_depth=0,
pad_to_multiple=1,
conv_hyperparams=None,
freeze_batchnorm=False,
inplace_batchnorm_update=False,
)
self._conv = keras.Conv2D(filters=32, kernel_size=1, name='layer1')
def preprocess(self, resized_inputs):
return tf.identity(resized_inputs)
def _extract_features(self, preprocessed_inputs, **kwargs):
with tf.name_scope('mock_model'):
return [self._conv(preprocessed_inputs)]
class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
"""A simple 2x2 anchor grid on the unit square used for test only."""
def name_scope(self):
return 'MockAnchorGenerator'
def num_anchors_per_location(self):
return [1]
def _generate(self, feature_map_shape_list, im_height, im_width):
return [
box_list.BoxList(
tf.constant(
[
[0, 0, .5, .5],
[0, .5, .5, 1],
[.5, 0, 1, .5],
[1., 1., 1.5, 1.5] # Anchor that is outside clip_window.
],
tf.float32))
]
def num_anchors(self):
return 4
class SSDMetaArchTestBase(test_case.TestCase):
"""Base class to test SSD based meta architectures."""
def _create_model(self,
model_fn=ssd_meta_arch.SSDMetaArch,
apply_hard_mining=True,
normalize_loc_loss_by_codesize=False,
add_background_class=True,
random_example_sampling=False,
weight_regression_loss_by_score=False,
use_expected_classification_loss_under_sampling=False,
minimum_negative_sampling=1,
desired_negative_sampling_ratio=3,
use_keras=False,
predict_mask=False,
use_static_shapes=False,
nms_max_size_per_class=5):
is_training = False
num_classes = 1
mock_anchor_generator = MockAnchorGenerator2x2()
if use_keras:
mock_box_predictor = test_utils.MockKerasBoxPredictor(
is_training, num_classes, predict_mask=predict_mask)
else:
mock_box_predictor = test_utils.MockBoxPredictor(
is_training, num_classes, predict_mask=predict_mask)
mock_box_coder = test_utils.MockBoxCoder()
if use_keras:
fake_feature_extractor = FakeSSDKerasFeatureExtractor()
else:
fake_feature_extractor = FakeSSDFeatureExtractor()
mock_matcher = test_utils.MockMatcher()
region_similarity_calculator = sim_calc.IouSimilarity()
encode_background_as_zeros = False
def image_resizer_fn(image):
return [tf.identity(image), tf.shape(image)]
classification_loss = losses.WeightedSigmoidClassificationLoss()
localization_loss = losses.WeightedSmoothL1LocalizationLoss()
non_max_suppression_fn = functools.partial(
post_processing.batch_multiclass_non_max_suppression,
score_thresh=-20.0,
iou_thresh=1.0,
max_size_per_class=nms_max_size_per_class,
max_total_size=nms_max_size_per_class,
use_static_shapes=use_static_shapes)
classification_loss_weight = 1.0
localization_loss_weight = 1.0
negative_class_weight = 1.0
normalize_loss_by_num_matches = False
hard_example_miner = None
if apply_hard_mining:
# This hard example miner is expected to be a no-op.
hard_example_miner = losses.HardExampleMiner(
num_hard_examples=None, iou_threshold=1.0)
random_example_sampler = None
if random_example_sampling:
random_example_sampler = sampler.BalancedPositiveNegativeSampler(
positive_fraction=0.5)
target_assigner_instance = target_assigner.TargetAssigner(
region_similarity_calculator,
mock_matcher,
mock_box_coder,
negative_class_weight=negative_class_weight,
weight_regression_loss_by_score=weight_regression_loss_by_score)
expected_classification_loss_under_sampling = None
if use_expected_classification_loss_under_sampling:
expected_classification_loss_under_sampling = functools.partial(
ops.expected_classification_loss_under_sampling,
minimum_negative_sampling=minimum_negative_sampling,
desired_negative_sampling_ratio=desired_negative_sampling_ratio)
code_size = 4
model = model_fn(
is_training=is_training,
anchor_generator=mock_anchor_generator,
box_predictor=mock_box_predictor,
box_coder=mock_box_coder,
feature_extractor=fake_feature_extractor,
encode_background_as_zeros=encode_background_as_zeros,
image_resizer_fn=image_resizer_fn,
non_max_suppression_fn=non_max_suppression_fn,
score_conversion_fn=tf.identity,
classification_loss=classification_loss,
localization_loss=localization_loss,
classification_loss_weight=classification_loss_weight,
localization_loss_weight=localization_loss_weight,
normalize_loss_by_num_matches=normalize_loss_by_num_matches,
hard_example_miner=hard_example_miner,
target_assigner_instance=target_assigner_instance,
add_summaries=False,
normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize,
freeze_batchnorm=False,
inplace_batchnorm_update=False,
add_background_class=add_background_class,
random_example_sampler=random_example_sampler,
expected_classification_loss_under_sampling=
expected_classification_loss_under_sampling)
return model, num_classes, mock_anchor_generator.num_anchors(), code_size
def _get_value_for_matching_key(self, dictionary, suffix):
for key in dictionary.keys():
if key.endswith(suffix):
return dictionary[key]
raise ValueError('key not found {}'.format(suffix))
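A hypothetical sketch of how a concrete test subclasses this base, mirroring SsdMetaArchTest above; the class and test names are illustrative only:

class _ExampleSSDMetaArchTest(SSDMetaArchTestBase):

  def test_mock_model_shapes(self):
    _, num_classes, num_anchors, code_size = self._create_model(use_keras=False)
    self.assertEqual(num_classes, 1)
    self.assertEqual(num_anchors, 4)  # MockAnchorGenerator2x2 defines 4 anchors.
    self.assertEqual(code_size, 4)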
if __name__ == '__main__':
tf.test.main()
...@@ -18,6 +18,7 @@ import tensorflow as tf ...@@ -18,6 +18,7 @@ import tensorflow as tf
from object_detection.core import standard_fields from object_detection.core import standard_fields
from object_detection.metrics import coco_tools from object_detection.metrics import coco_tools
from object_detection.utils import json_utils
from object_detection.utils import object_detection_evaluation from object_detection.utils import object_detection_evaluation
...@@ -148,6 +149,19 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -148,6 +149,19 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_classes])) detection_classes]))
self._image_ids[image_id] = True self._image_ids[image_id] = True
def dump_detections_to_json_file(self, json_output_path):
"""Saves the detections into json_output_path in the format used by MS COCO.
Args:
json_output_path: String containing the output file's path. It can also be
None, in which case nothing will be written to the output file.
"""
if json_output_path:
with tf.gfile.GFile(json_output_path, 'w') as fid:
tf.logging.info('Dumping detections to output json file.')
json_utils.Dump(
obj=self._detection_boxes_list, fid=fid, float_digits=4, indent=2)
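A hypothetical usage sketch of the new dump hook; the output path is a placeholder, and the evaluator is assumed to have already accumulated groundtruth and detections through the add_single_*_image_info calls:

from object_detection.metrics import coco_evaluation

categories = [{'id': 1, 'name': 'person'}, {'id': 2, 'name': 'dog'}, {'id': 3, 'name': 'cat'}]
evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
# ... add_single_ground_truth_image_info / add_single_detected_image_info calls ...
evaluator.dump_detections_to_json_file('/tmp/detections.json')  # no-op when the path is None or empty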
def evaluate(self): def evaluate(self):
"""Evaluates the detection boxes and returns a dictionary of coco metrics. """Evaluates the detection boxes and returns a dictionary of coco metrics.
...@@ -245,10 +259,11 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -245,10 +259,11 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_boxes_batched, detection_scores_batched, detection_boxes_batched, detection_scores_batched,
detection_classes_batched, num_det_boxes_per_image): detection_classes_batched, num_det_boxes_per_image):
self.add_single_ground_truth_image_info( self.add_single_ground_truth_image_info(
image_id, image_id, {
{'groundtruth_boxes': gt_box[:num_gt_box], 'groundtruth_boxes': gt_box[:num_gt_box],
'groundtruth_classes': gt_class[:num_gt_box], 'groundtruth_classes': gt_class[:num_gt_box],
'groundtruth_is_crowd': gt_is_crowd[:num_gt_box]}) 'groundtruth_is_crowd': gt_is_crowd[:num_gt_box]
})
self.add_single_detected_image_info( self.add_single_detected_image_info(
image_id, image_id,
{'detection_boxes': det_box[:num_det_box], {'detection_boxes': det_box[:num_det_box],
...@@ -268,8 +283,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -268,8 +283,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_classes = eval_dict[detection_fields.detection_classes] detection_classes = eval_dict[detection_fields.detection_classes]
num_gt_boxes_per_image = eval_dict.get( num_gt_boxes_per_image = eval_dict.get(
'num_groundtruth_boxes_per_image', None) 'num_groundtruth_boxes_per_image', None)
num_det_boxes_per_image = eval_dict.get( num_det_boxes_per_image = eval_dict.get('num_det_boxes_per_image', None)
'num_groundtruth_boxes_per_image', None)
if groundtruth_is_crowd is None: if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
...@@ -491,6 +505,19 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -491,6 +505,19 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_classes])) detection_classes]))
self._image_ids_with_detections.update([image_id]) self._image_ids_with_detections.update([image_id])
def dump_detections_to_json_file(self, json_output_path):
"""Saves the detections into json_output_path in the format used by MS COCO.
Args:
json_output_path: String containing the output file's path. It can also be
None, in which case nothing will be written to the output file.
"""
if json_output_path:
tf.logging.info('Dumping detections to output json file.')
with tf.gfile.GFile(json_output_path, 'w') as fid:
json_utils.Dump(
obj=self._detection_masks_list, fid=fid, float_digits=4, indent=2)
def evaluate(self): def evaluate(self):
"""Evaluates the detection masks and returns a dictionary of coco metrics. """Evaluates the detection masks and returns a dictionary of coco metrics.
......
...@@ -24,14 +24,25 @@ from object_detection.core import standard_fields ...@@ -24,14 +24,25 @@ from object_detection.core import standard_fields
from object_detection.metrics import coco_evaluation from object_detection.metrics import coco_evaluation
def _get_categories_list():
return [{
'id': 1,
'name': 'person'
}, {
'id': 2,
'name': 'dog'
}, {
'id': 3,
'name': 'cat'
}]
class CocoDetectionEvaluationTest(tf.test.TestCase): class CocoDetectionEvaluationTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetections(self): def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
"""Tests that mAP is calculated correctly on GT and Detections.""" """Tests that mAP is calculated correctly on GT and Detections."""
category_list = [{'id': 0, 'name': 'person'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 1, 'name': 'cat'}, _get_categories_list())
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
coco_evaluator.add_single_ground_truth_image_info( coco_evaluator.add_single_ground_truth_image_info(
image_id='image1', image_id='image1',
groundtruth_dict={ groundtruth_dict={
...@@ -88,17 +99,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase): ...@@ -88,17 +99,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetectionsSkipCrowd(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsSkipCrowd(self):
"""Tests computing mAP with is_crowd GT boxes skipped.""" """Tests computing mAP with is_crowd GT boxes skipped."""
category_list = [{ coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
'id': 0, _get_categories_list())
'name': 'person'
}, {
'id': 1,
'name': 'cat'
}, {
'id': 2,
'name': 'dog'
}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
coco_evaluator.add_single_ground_truth_image_info( coco_evaluator.add_single_ground_truth_image_info(
image_id='image1', image_id='image1',
groundtruth_dict={ groundtruth_dict={
...@@ -124,17 +126,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase): ...@@ -124,17 +126,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetectionsEmptyCrowd(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsEmptyCrowd(self):
"""Tests computing mAP with empty is_crowd array passed in.""" """Tests computing mAP with empty is_crowd array passed in."""
category_list = [{ coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
'id': 0, _get_categories_list())
'name': 'person'
}, {
'id': 1,
'name': 'cat'
}, {
'id': 2,
'name': 'dog'
}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
coco_evaluator.add_single_ground_truth_image_info( coco_evaluator.add_single_ground_truth_image_info(
image_id='image1', image_id='image1',
groundtruth_dict={ groundtruth_dict={
...@@ -160,11 +153,9 @@ class CocoDetectionEvaluationTest(tf.test.TestCase): ...@@ -160,11 +153,9 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
def testRejectionOnDuplicateGroundtruth(self): def testRejectionOnDuplicateGroundtruth(self):
"""Tests that groundtruth cannot be added more than once for an image.""" """Tests that groundtruth cannot be added more than once for an image."""
categories = [{'id': 1, 'name': 'cat'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 2, 'name': 'dog'}, _get_categories_list())
{'id': 3, 'name': 'elephant'}]
# Add groundtruth # Add groundtruth
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
image_key1 = 'img1' image_key1 = 'img1'
groundtruth_boxes1 = np.array([[0, 0, 1, 1], [0, 0, 2, 2], [0, 0, 3, 3]], groundtruth_boxes1 = np.array([[0, 0, 1, 1], [0, 0, 2, 2], [0, 0, 3, 3]],
dtype=float) dtype=float)
...@@ -189,11 +180,9 @@ class CocoDetectionEvaluationTest(tf.test.TestCase): ...@@ -189,11 +180,9 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
def testRejectionOnDuplicateDetections(self): def testRejectionOnDuplicateDetections(self):
"""Tests that detections cannot be added more than once for an image.""" """Tests that detections cannot be added more than once for an image."""
categories = [{'id': 1, 'name': 'cat'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 2, 'name': 'dog'}, _get_categories_list())
{'id': 3, 'name': 'elephant'}]
# Add groundtruth # Add groundtruth
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
coco_evaluator.add_single_ground_truth_image_info( coco_evaluator.add_single_ground_truth_image_info(
image_id='image1', image_id='image1',
groundtruth_dict={ groundtruth_dict={
...@@ -227,10 +216,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase): ...@@ -227,10 +216,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
def testExceptionRaisedWithMissingGroundtruth(self): def testExceptionRaisedWithMissingGroundtruth(self):
"""Tests that exception is raised for detection with missing groundtruth.""" """Tests that exception is raised for detection with missing groundtruth."""
categories = [{'id': 1, 'name': 'cat'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 2, 'name': 'dog'}, _get_categories_list())
{'id': 3, 'name': 'elephant'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
coco_evaluator.add_single_detected_image_info( coco_evaluator.add_single_detected_image_info(
image_id='image1', image_id='image1',
...@@ -247,10 +234,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase): ...@@ -247,10 +234,8 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
class CocoEvaluationPyFuncTest(tf.test.TestCase): class CocoEvaluationPyFuncTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetections(self): def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
category_list = [{'id': 0, 'name': 'person'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 1, 'name': 'cat'}, _get_categories_list())
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
image_id = tf.placeholder(tf.string, shape=()) image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4)) groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None)) groundtruth_classes = tf.placeholder(tf.float32, shape=(None))
...@@ -310,31 +295,22 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -310,31 +295,22 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list) self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list) self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids) self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self):
category_list = [{ coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
'id': 0, _get_categories_list())
'name': 'person'
}, {
'id': 1,
'name': 'cat'
}, {
'id': 2,
'name': 'dog'
}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
image_id = tf.placeholder(tf.string, shape=()) image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4)) groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None)) groundtruth_classes = tf.placeholder(tf.float32, shape=(None))
...@@ -415,24 +391,22 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -415,24 +391,22 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.83333331)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list) self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list) self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids) self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self):
category_list = [{'id': 0, 'name': 'person'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 1, 'name': 'cat'}, _get_categories_list())
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
batch_size = 3 batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size)) image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4)) groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
...@@ -479,24 +453,22 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -479,24 +453,22 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list) self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list) self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids) self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPaddedBatches(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsPaddedBatches(self):
category_list = [{'id': 0, 'name': 'person'}, coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
{'id': 1, 'name': 'cat'}, _get_categories_list())
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
batch_size = 3 batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size)) image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4)) groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
...@@ -525,27 +497,40 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -525,27 +497,40 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP'] _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess: with self.test_session() as sess:
sess.run(update_op, sess.run(
feed_dict={ update_op,
image_id: ['image1', 'image2', 'image3'], feed_dict={
groundtruth_boxes: np.array([[[100., 100., 200., 200.], image_id: ['image1', 'image2', 'image3'],
[-1, -1, -1, -1]], groundtruth_boxes:
[[50., 50., 100., 100.], np.array([[[100., 100., 200., 200.], [-1, -1, -1, -1]],
[-1, -1, -1, -1]], [[50., 50., 100., 100.], [-1, -1, -1, -1]],
[[25., 25., 50., 50.], [[25., 25., 50., 50.], [10., 10., 15., 15.]]]),
[10., 10., 15., 15.]]]), groundtruth_classes:
groundtruth_classes: np.array([[1, -1], [3, -1], [2, 2]]), np.array([[1, -1], [3, -1], [2, 2]]),
num_gt_boxes_per_image: np.array([1, 1, 2]), num_gt_boxes_per_image:
detection_boxes: np.array([[[100., 100., 200., 200.], np.array([1, 1, 2]),
[0., 0., 0., 0.]], detection_boxes:
[[50., 50., 100., 100.], np.array([[[100., 100., 200., 200.],
[0., 0., 0., 0.]], [0., 0., 0., 0.],
[[25., 25., 50., 50.], [0., 0., 0., 0.]],
[10., 10., 15., 15.]]]), [[50., 50., 100., 100.],
detection_scores: np.array([[.8, 0.], [.7, 0.], [.95, .9]]), [0., 0., 0., 0.],
detection_classes: np.array([[1, -1], [3, -1], [2, 2]]), [0., 0., 0., 0.]],
num_det_boxes_per_image: np.array([1, 1, 2]), [[25., 25., 50., 50.],
}) [10., 10., 15., 15.],
[10., 10., 15., 15.]]]),
detection_scores:
np.array([[.8, 0., 0.], [.7, 0., 0.], [.95, .9, 0.9]]),
detection_classes:
np.array([[1, -1, -1], [3, -1, -1], [2, 2, 2]]),
num_det_boxes_per_image:
np.array([1, 1, 3]),
})
# Check the number of bounding boxes added.
self.assertEqual(len(coco_evaluator._groundtruth_list), 4)
self.assertEqual(len(coco_evaluator._detection_boxes_list), 5)
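# The counts follow from the padded feed above: num_gt_boxes_per_image = [1, 1, 2]
# trims the padded groundtruth rows to 1 + 1 + 2 = 4 boxes, while
# num_det_boxes_per_image = [1, 1, 3] trims the padded detections to 1 + 1 + 3 = 5.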
metrics = {} metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems(): for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op metrics[key] = value_op
...@@ -555,14 +540,14 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -555,14 +540,14 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.83333331)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0) 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list) self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list) self.assertFalse(coco_evaluator._detection_boxes_list)
...@@ -572,10 +557,7 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -572,10 +557,7 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
class CocoMaskEvaluationTest(tf.test.TestCase): class CocoMaskEvaluationTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetections(self): def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
category_list = [{'id': 0, 'name': 'person'}, coco_evaluator = coco_evaluation.CocoMaskEvaluator(_get_categories_list())
{'id': 1, 'name': 'cat'},
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoMaskEvaluator(category_list)
coco_evaluator.add_single_ground_truth_image_info( coco_evaluator.add_single_ground_truth_image_info(
image_id='image1', image_id='image1',
groundtruth_dict={ groundtruth_dict={
...@@ -657,10 +639,7 @@ class CocoMaskEvaluationTest(tf.test.TestCase): ...@@ -657,10 +639,7 @@ class CocoMaskEvaluationTest(tf.test.TestCase):
class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetections(self): def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
category_list = [{'id': 0, 'name': 'person'}, coco_evaluator = coco_evaluation.CocoMaskEvaluator(_get_categories_list())
{'id': 1, 'name': 'cat'},
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoMaskEvaluator(category_list)
image_id = tf.placeholder(tf.string, shape=()) image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4)) groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None)) groundtruth_classes = tf.placeholder(tf.float32, shape=(None))
...@@ -756,5 +735,6 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): ...@@ -756,5 +735,6 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._image_id_to_mask_shape_map) self.assertFalse(coco_evaluator._image_id_to_mask_shape_map)
self.assertFalse(coco_evaluator._detection_masks_list) self.assertFalse(coco_evaluator._detection_masks_list)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -91,10 +91,8 @@ def read_data_and_evaluate(input_config, eval_config): ...@@ -91,10 +91,8 @@ def read_data_and_evaluate(input_config, eval_config):
if input_config.WhichOneof('input_reader') == 'tf_record_input_reader': if input_config.WhichOneof('input_reader') == 'tf_record_input_reader':
input_paths = input_config.tf_record_input_reader.input_path input_paths = input_config.tf_record_input_reader.input_path
label_map = label_map_util.load_labelmap(input_config.label_map_path) categories = label_map_util.create_categories_from_labelmap(
max_num_classes = max([item.id for item in label_map.item]) input_config.label_map_path)
categories = label_map_util.convert_label_map_to_categories(
label_map, max_num_classes)
object_detection_evaluators = evaluator.get_evaluators( object_detection_evaluators = evaluator.get_evaluators(
eval_config, categories) eval_config, categories)
......
...@@ -18,6 +18,7 @@ from __future__ import absolute_import ...@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import copy
import functools import functools
import os import os
...@@ -43,9 +44,12 @@ MODEL_BUILD_UTIL_MAP = { ...@@ -43,9 +44,12 @@ MODEL_BUILD_UTIL_MAP = {
config_util.create_pipeline_proto_from_configs, config_util.create_pipeline_proto_from_configs,
'merge_external_params_with_configs': 'merge_external_params_with_configs':
config_util.merge_external_params_with_configs, config_util.merge_external_params_with_configs,
'create_train_input_fn': inputs.create_train_input_fn, 'create_train_input_fn':
'create_eval_input_fn': inputs.create_eval_input_fn, inputs.create_train_input_fn,
'create_predict_input_fn': inputs.create_predict_input_fn, 'create_eval_input_fn':
inputs.create_eval_input_fn,
'create_predict_input_fn':
inputs.create_predict_input_fn,
} }
...@@ -126,8 +130,9 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True): ...@@ -126,8 +130,9 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
ValueError: If unpad_tensors is True and `tensor_dict` does not contain ValueError: If unpad_tensors is True and `tensor_dict` does not contain
`num_groundtruth_boxes` tensor. `num_groundtruth_boxes` tensor.
""" """
unbatched_tensor_dict = {key: tf.unstack(tensor) unbatched_tensor_dict = {
for key, tensor in tensor_dict.items()} key: tf.unstack(tensor) for key, tensor in tensor_dict.items()
}
if unpad_groundtruth_tensors: if unpad_groundtruth_tensors:
if (fields.InputDataFields.num_groundtruth_boxes not in if (fields.InputDataFields.num_groundtruth_boxes not in
unbatched_tensor_dict): unbatched_tensor_dict):
...@@ -206,8 +211,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -206,8 +211,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
# Make sure to set the Keras learning phase. True during training, # Make sure to set the Keras learning phase. True during training,
# False for inference. # False for inference.
tf.keras.backend.set_learning_phase(is_training) tf.keras.backend.set_learning_phase(is_training)
detection_model = detection_model_fn(is_training=is_training, detection_model = detection_model_fn(
add_summaries=(not use_tpu)) is_training=is_training, add_summaries=(not use_tpu))
scaffold_fn = None scaffold_fn = None
if mode == tf.estimator.ModeKeys.TRAIN: if mode == tf.estimator.ModeKeys.TRAIN:
...@@ -237,6 +242,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -237,6 +242,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
gt_weights_list = None gt_weights_list = None
if fields.InputDataFields.groundtruth_weights in labels: if fields.InputDataFields.groundtruth_weights in labels:
gt_weights_list = labels[fields.InputDataFields.groundtruth_weights] gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
gt_is_crowd_list = None
if fields.InputDataFields.groundtruth_is_crowd in labels: if fields.InputDataFields.groundtruth_is_crowd in labels:
gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd] gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
detection_model.provide_groundtruth( detection_model.provide_groundtruth(
...@@ -248,8 +254,18 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -248,8 +254,18 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
groundtruth_is_crowd_list=gt_is_crowd_list) groundtruth_is_crowd_list=gt_is_crowd_list)
preprocessed_images = features[fields.InputDataFields.image] preprocessed_images = features[fields.InputDataFields.image]
prediction_dict = detection_model.predict( if use_tpu and train_config.use_bfloat16:
preprocessed_images, features[fields.InputDataFields.true_image_shape]) with tf.contrib.tpu.bfloat16_scope():
prediction_dict = detection_model.predict(
preprocessed_images,
features[fields.InputDataFields.true_image_shape])
for k, v in prediction_dict.items():
if v.dtype == tf.bfloat16:
prediction_dict[k] = tf.cast(v, tf.float32)
else:
prediction_dict = detection_model.predict(
preprocessed_images,
features[fields.InputDataFields.true_image_shape])
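The hunk above runs prediction under a TPU bfloat16 scope when train_config.use_bfloat16 is set and casts any bfloat16 outputs back to float32. A minimal sketch of the same pattern, assuming a predict_fn that returns a dict of tensors (function and argument names are illustrative):

import tensorflow as tf

def predict_maybe_bfloat16(predict_fn, images, use_bfloat16):
  """Runs predict_fn, optionally under a bfloat16 scope, returning float32 tensors."""
  if not use_bfloat16:
    return predict_fn(images)
  with tf.contrib.tpu.bfloat16_scope():
    prediction_dict = predict_fn(images)
  # Cast bfloat16 outputs back so downstream losses and metrics stay in float32.
  return {k: tf.cast(v, tf.float32) if v.dtype == tf.bfloat16 else v
          for k, v in prediction_dict.items()}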
if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
detections = detection_model.postprocess( detections = detection_model.postprocess(
prediction_dict, features[fields.InputDataFields.true_image_shape]) prediction_dict, features[fields.InputDataFields.true_image_shape])
...@@ -270,13 +286,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -270,13 +286,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
train_config.load_all_detection_checkpoint_vars)) train_config.load_all_detection_checkpoint_vars))
available_var_map = ( available_var_map = (
variables_helper.get_variables_available_in_checkpoint( variables_helper.get_variables_available_in_checkpoint(
asg_map, train_config.fine_tune_checkpoint, asg_map,
train_config.fine_tune_checkpoint,
include_global_step=False)) include_global_step=False))
if use_tpu: if use_tpu:
def tpu_scaffold(): def tpu_scaffold():
tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
available_var_map) available_var_map)
return tf.train.Scaffold() return tf.train.Scaffold()
scaffold_fn = tpu_scaffold scaffold_fn = tpu_scaffold
else: else:
tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
...@@ -290,8 +309,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -290,8 +309,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
regularization_losses = tf.get_collection( regularization_losses = tf.get_collection(
tf.GraphKeys.REGULARIZATION_LOSSES) tf.GraphKeys.REGULARIZATION_LOSSES)
if regularization_losses: if regularization_losses:
regularization_loss = tf.add_n(regularization_losses, regularization_loss = tf.add_n(
name='regularization_loss') regularization_losses, name='regularization_loss')
losses.append(regularization_loss) losses.append(regularization_loss)
losses_dict['Loss/regularization_loss'] = regularization_loss losses_dict['Loss/regularization_loss'] = regularization_loss
total_loss = tf.add_n(losses, name='total_loss') total_loss = tf.add_n(losses, name='total_loss')
...@@ -353,14 +372,19 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -353,14 +372,19 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
eval_metric_ops = None eval_metric_ops = None
scaffold = None scaffold = None
if mode == tf.estimator.ModeKeys.EVAL: if mode == tf.estimator.ModeKeys.EVAL:
class_agnostic = (fields.DetectionResultFields.detection_classes class_agnostic = (
not in detections) fields.DetectionResultFields.detection_classes not in detections)
groundtruth = _prepare_groundtruth_for_eval( groundtruth = _prepare_groundtruth_for_eval(detection_model,
detection_model, class_agnostic) class_agnostic)
use_original_images = fields.InputDataFields.original_image in features use_original_images = fields.InputDataFields.original_image in features
eval_images = ( if use_original_images:
features[fields.InputDataFields.original_image] if use_original_images eval_images = tf.cast(tf.image.resize_bilinear(
else features[fields.InputDataFields.image]) features[fields.InputDataFields.original_image][0:1],
features[fields.InputDataFields.original_image_spatial_shape][0]),
tf.uint8)
else:
eval_images = features[fields.InputDataFields.image]
eval_dict = eval_util.result_dict_for_single_example( eval_dict = eval_util.result_dict_for_single_example(
eval_images[0:1], eval_images[0:1],
features[inputs.HASH_KEY][0], features[inputs.HASH_KEY][0],
...@@ -374,28 +398,26 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -374,28 +398,26 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
else: else:
category_index = label_map_util.create_category_index_from_labelmap( category_index = label_map_util.create_category_index_from_labelmap(
eval_input_config.label_map_path) eval_input_config.label_map_path)
img_summary = None vis_metric_ops = None
if not use_tpu and use_original_images: if not use_tpu and use_original_images:
detection_and_groundtruth = ( eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
vis_utils.draw_side_by_side_evaluation_image( category_index,
eval_dict, category_index, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, max_examples_to_draw=eval_config.num_visualizations,
min_score_thresh=eval_config.min_score_threshold, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
use_normalized_coordinates=False)) min_score_thresh=eval_config.min_score_threshold,
img_summary = tf.summary.image('Detections_Left_Groundtruth_Right', use_normalized_coordinates=False)
detection_and_groundtruth) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
eval_dict)
# Eval metrics on a single example. # Eval metrics on a single example.
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_config, eval_config, category_index.values(), eval_dict)
category_index.values(),
eval_dict)
for loss_key, loss_tensor in iter(losses_dict.items()): for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars: for var in optimizer_summary_vars:
eval_metric_ops[var.op.name] = (var, tf.no_op()) eval_metric_ops[var.op.name] = (var, tf.no_op())
if img_summary is not None: if vis_metric_ops is not None:
eval_metric_ops['Detections_Left_Groundtruth_Right'] = ( eval_metric_ops.update(vis_metric_ops)
img_summary, tf.no_op())
eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}
if eval_config.use_moving_averages: if eval_config.use_moving_averages:
...@@ -435,12 +457,14 @@ def create_estimator_and_inputs(run_config, ...@@ -435,12 +457,14 @@ def create_estimator_and_inputs(run_config,
hparams, hparams,
pipeline_config_path, pipeline_config_path,
train_steps=None, train_steps=None,
eval_steps=None, sample_1_of_n_eval_examples=1,
sample_1_of_n_eval_on_train_examples=1,
model_fn_creator=create_model_fn, model_fn_creator=create_model_fn,
use_tpu_estimator=False, use_tpu_estimator=False,
use_tpu=False, use_tpu=False,
num_shards=1, num_shards=1,
params=None, params=None,
override_eval_num_epochs=True,
**kwargs): **kwargs):
"""Creates `Estimator`, input functions, and steps. """Creates `Estimator`, input functions, and steps.
...@@ -450,8 +474,11 @@ def create_estimator_and_inputs(run_config, ...@@ -450,8 +474,11 @@ def create_estimator_and_inputs(run_config,
pipeline_config_path: A path to a pipeline config file. pipeline_config_path: A path to a pipeline config file.
train_steps: Number of training steps. If None, the number of training steps train_steps: Number of training steps. If None, the number of training steps
is set from the `TrainConfig` proto. is set from the `TrainConfig` proto.
eval_steps: Number of evaluation steps per evaluation cycle. If None, the sample_1_of_n_eval_examples: Integer representing how often an eval example
number of evaluation steps is set from the `EvalConfig` proto. should be sampled. If 1, will sample all examples.
sample_1_of_n_eval_on_train_examples: Similar to
`sample_1_of_n_eval_examples`, except controls the sampling of training
data for evaluation.
model_fn_creator: A function that creates a `model_fn` for `Estimator`. model_fn_creator: A function that creates a `model_fn` for `Estimator`.
Follows the signature: Follows the signature:
...@@ -470,19 +497,20 @@ def create_estimator_and_inputs(run_config, ...@@ -470,19 +497,20 @@ def create_estimator_and_inputs(run_config,
is True. is True.
params: Parameter dictionary passed from the estimator. Only used if params: Parameter dictionary passed from the estimator. Only used if
`use_tpu_estimator` is True. `use_tpu_estimator` is True.
override_eval_num_epochs: Whether to overwrite the number of epochs to
1 for eval_input.
**kwargs: Additional keyword arguments for configuration override. **kwargs: Additional keyword arguments for configuration override.
Returns: Returns:
A dictionary with the following fields: A dictionary with the following fields:
'estimator': An `Estimator` or `TPUEstimator`. 'estimator': An `Estimator` or `TPUEstimator`.
'train_input_fn': A training input function. 'train_input_fn': A training input function.
'eval_input_fn': An evaluation input function. 'eval_input_fns': A list of all evaluation input functions.
'eval_input_names': A list of names for each evaluation input.
'eval_on_train_input_fn': An evaluation-on-train input function. 'eval_on_train_input_fn': An evaluation-on-train input function.
'predict_input_fn': A prediction input function. 'predict_input_fn': A prediction input function.
'train_steps': Number of training steps. Either directly from input or from 'train_steps': Number of training steps. Either directly from input or from
configuration. configuration.
'eval_steps': Number of evaluation steps. Either directly from input or from
configuration.
""" """
get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
'get_configs_from_pipeline_file'] 'get_configs_from_pipeline_file']
...@@ -495,27 +523,36 @@ def create_estimator_and_inputs(run_config, ...@@ -495,27 +523,36 @@ def create_estimator_and_inputs(run_config,
create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']
configs = get_configs_from_pipeline_file(pipeline_config_path) configs = get_configs_from_pipeline_file(pipeline_config_path)
kwargs.update({
'train_steps': train_steps,
'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples
})
if override_eval_num_epochs:
kwargs.update({'eval_num_epochs': 1})
tf.logging.warning(
'Forced number of epochs for all eval validations to be 1.')
configs = merge_external_params_with_configs( configs = merge_external_params_with_configs(
configs, configs, hparams, kwargs_dict=kwargs)
hparams,
train_steps=train_steps,
eval_steps=eval_steps,
retain_original_images_in_eval=False if use_tpu else True,
**kwargs)
model_config = configs['model'] model_config = configs['model']
train_config = configs['train_config'] train_config = configs['train_config']
train_input_config = configs['train_input_config'] train_input_config = configs['train_input_config']
eval_config = configs['eval_config'] eval_config = configs['eval_config']
eval_input_config = configs['eval_input_config'] eval_input_configs = configs['eval_input_configs']
eval_on_train_input_config = copy.deepcopy(train_input_config)
eval_on_train_input_config.sample_1_of_n_examples = (
sample_1_of_n_eval_on_train_examples)
if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
tf.logging.warning('Expected number of evaluation epochs is 1, but '
'instead encountered `eval_on_train_input_config'
'.num_epochs` = '
'{}. Overwriting `num_epochs` to 1.'.format(
eval_on_train_input_config.num_epochs))
eval_on_train_input_config.num_epochs = 1
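# Illustrative note (not part of the original diff): forcing `num_epochs` to 1
# matters because the eval passes below run with `steps=None`, i.e. until the
# eval input function raises tf.errors.OutOfRangeError. A repeating or
# multi-epoch eval-on-train dataset would never raise it, so a single
# evaluation call would not terminate.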
# update train_steps from config but only when non-zero value is provided # update train_steps from config but only when non-zero value is provided
if train_steps is None and train_config.num_steps != 0: if train_steps is None and train_config.num_steps != 0:
train_steps = train_config.num_steps train_steps = train_config.num_steps
# update eval_steps from config but only when non-zero value is provided
if eval_steps is None and eval_config.num_examples != 0:
eval_steps = eval_config.num_examples
detection_model_fn = functools.partial( detection_model_fn = functools.partial(
model_builder.build, model_config=model_config) model_builder.build, model_config=model_config)
...@@ -524,18 +561,25 @@ def create_estimator_and_inputs(run_config, ...@@ -524,18 +561,25 @@ def create_estimator_and_inputs(run_config,
train_config=train_config, train_config=train_config,
train_input_config=train_input_config, train_input_config=train_input_config,
model_config=model_config) model_config=model_config)
eval_input_fn = create_eval_input_fn( eval_input_fns = [
eval_config=eval_config, create_eval_input_fn(
eval_input_config=eval_input_config, eval_config=eval_config,
model_config=model_config) eval_input_config=eval_input_config,
model_config=model_config) for eval_input_config in eval_input_configs
]
eval_input_names = [
eval_input_config.name for eval_input_config in eval_input_configs
]
eval_on_train_input_fn = create_eval_input_fn( eval_on_train_input_fn = create_eval_input_fn(
eval_config=eval_config, eval_config=eval_config,
eval_input_config=train_input_config, eval_input_config=eval_on_train_input_config,
model_config=model_config) model_config=model_config)
predict_input_fn = create_predict_input_fn( predict_input_fn = create_predict_input_fn(
model_config=model_config, predict_input_config=eval_input_config) model_config=model_config, predict_input_config=eval_input_configs[0])
tf.logging.info('create_estimator_and_inputs: use_tpu %s', use_tpu) export_to_tpu = hparams.get('export_to_tpu', False)
tf.logging.info('create_estimator_and_inputs: use_tpu %s, export_to_tpu %s',
use_tpu, export_to_tpu)
model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu) model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu)
if use_tpu_estimator: if use_tpu_estimator:
estimator = tf.contrib.tpu.TPUEstimator( estimator = tf.contrib.tpu.TPUEstimator(
...@@ -552,89 +596,85 @@ def create_estimator_and_inputs(run_config, ...@@ -552,89 +596,85 @@ def create_estimator_and_inputs(run_config,
# Write the as-run pipeline config to disk. # Write the as-run pipeline config to disk.
if run_config.is_chief: if run_config.is_chief:
pipeline_config_final = create_pipeline_proto_from_configs( pipeline_config_final = create_pipeline_proto_from_configs(configs)
configs)
config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir)
return dict( return dict(
estimator=estimator, estimator=estimator,
train_input_fn=train_input_fn, train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn, eval_input_fns=eval_input_fns,
eval_input_names=eval_input_names,
eval_on_train_input_fn=eval_on_train_input_fn, eval_on_train_input_fn=eval_on_train_input_fn,
predict_input_fn=predict_input_fn, predict_input_fn=predict_input_fn,
train_steps=train_steps, train_steps=train_steps)
eval_steps=eval_steps)
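# A minimal usage sketch (not part of the original diff). The run_config,
# hparams and pipeline path below are placeholders; the dict keys read out are
# the ones documented in the docstring above.
train_and_eval_dict = create_estimator_and_inputs(
    run_config=tf.estimator.RunConfig(model_dir='/tmp/model_dir'),  # placeholder
    hparams=hparams,  # e.g. model_hparams.create_hparams(None)
    pipeline_config_path='/tmp/pipeline.config',  # placeholder
    train_steps=None,
    sample_1_of_n_eval_examples=1)
estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fns = train_and_eval_dict['eval_input_fns']  # one per eval input config
eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
predict_input_fn = train_and_eval_dict['predict_input_fn']
train_steps = train_and_eval_dict['train_steps']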
def create_train_and_eval_specs(train_input_fn, def create_train_and_eval_specs(train_input_fn,
eval_input_fn, eval_input_fns,
eval_on_train_input_fn, eval_on_train_input_fn,
predict_input_fn, predict_input_fn,
train_steps, train_steps,
eval_steps,
eval_on_train_data=False, eval_on_train_data=False,
eval_on_train_steps=None,
final_exporter_name='Servo', final_exporter_name='Servo',
eval_spec_name='eval'): eval_spec_names=None):
"""Creates a `TrainSpec` and `EvalSpec`s. """Creates a `TrainSpec` and `EvalSpec`s.
Args: Args:
train_input_fn: Function that produces features and labels on train data. train_input_fn: Function that produces features and labels on train data.
eval_input_fn: Function that produces features and labels on eval data. eval_input_fns: A list of functions that produce features and labels on eval
data.
eval_on_train_input_fn: Function that produces features and labels for eval_on_train_input_fn: Function that produces features and labels for
evaluation on train data. evaluation on train data.
predict_input_fn: Function that produces features for inference. predict_input_fn: Function that produces features for inference.
train_steps: Number of training steps. train_steps: Number of training steps.
eval_steps: Number of eval steps.
eval_on_train_data: Whether to evaluate model on training data. Default is eval_on_train_data: Whether to evaluate model on training data. Default is
False. False.
eval_on_train_steps: Number of eval steps for training data. If not given,
uses eval_steps.
final_exporter_name: String name given to `FinalExporter`. final_exporter_name: String name given to `FinalExporter`.
eval_spec_name: String name given to main `EvalSpec`. eval_spec_names: A list of string names for each `EvalSpec`.
Returns: Returns:
Tuple of `TrainSpec` and list of `EvalSpecs`. The first `EvalSpec` is for Tuple of `TrainSpec` and list of `EvalSpecs`. If `eval_on_train_data` is
evaluation data. If `eval_on_train_data` is True, the second `EvalSpec` in True, the last `EvalSpec` in the list will correspond to training data. The
the list will correspond to training data. rest of the EvalSpecs in the list correspond to evaluation data.
""" """
exporter = tf.estimator.FinalExporter(
name=final_exporter_name, serving_input_receiver_fn=predict_input_fn)
train_spec = tf.estimator.TrainSpec( train_spec = tf.estimator.TrainSpec(
input_fn=train_input_fn, max_steps=train_steps) input_fn=train_input_fn, max_steps=train_steps)
eval_specs = [ if eval_spec_names is None:
tf.estimator.EvalSpec( eval_spec_names = range(len(eval_input_fns))
name=eval_spec_name,
input_fn=eval_input_fn, eval_specs = []
steps=eval_steps, for eval_spec_name, eval_input_fn in zip(eval_spec_names, eval_input_fns):
exporters=exporter) exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name)
] exporter = tf.estimator.FinalExporter(
name=exporter_name, serving_input_receiver_fn=predict_input_fn)
eval_specs.append(
tf.estimator.EvalSpec(
name=eval_spec_name,
input_fn=eval_input_fn,
steps=None,
exporters=exporter))
if eval_on_train_data: if eval_on_train_data:
eval_specs.append( eval_specs.append(
tf.estimator.EvalSpec( tf.estimator.EvalSpec(
name='eval_on_train', input_fn=eval_on_train_input_fn, name='eval_on_train', input_fn=eval_on_train_input_fn, steps=None))
steps=eval_on_train_steps or eval_steps))
return train_spec, eval_specs return train_spec, eval_specs
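# A minimal usage sketch (not part of the original diff), matching the flow in
# model_main.py further below: two hypothetical eval inputs yield two named
# EvalSpecs, plus an optional eval-on-train spec.
train_spec, eval_specs = create_train_and_eval_specs(
    train_input_fn,
    eval_input_fns,  # e.g. [holdout_input_fn, extra_dataset_input_fn]
    eval_on_train_input_fn,
    predict_input_fn,
    train_steps,
    eval_on_train_data=True,
    final_exporter_name='Servo',
    eval_spec_names=['holdout', 'extra'])
# Currently only a single EvalSpec can be passed to train_and_evaluate.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])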
def continuous_eval(estimator, model_dir, input_fn, eval_steps, train_steps, def continuous_eval(estimator, model_dir, input_fn, train_steps, name):
name):
"""Perform continuous evaluation on checkpoints written to a model directory. """Perform continuous evaluation on checkpoints written to a model directory.
Args: Args:
estimator: Estimator object to use for evaluation. estimator: Estimator object to use for evaluation.
model_dir: Model directory to read checkpoints for continuous evaluation. model_dir: Model directory to read checkpoints for continuous evaluation.
input_fn: Input function to use for evaluation. input_fn: Input function to use for evaluation.
eval_steps: Number of steps to run during each evaluation.
train_steps: Number of training steps. This is used to infer the last train_steps: Number of training steps. This is used to infer the last
checkpoint and stop evaluation loop. checkpoint and stop evaluation loop.
name: Namescope for eval summary. name: Namescope for eval summary.
""" """
def terminate_eval(): def terminate_eval():
tf.logging.info('Terminating eval after 180 seconds of no checkpoints') tf.logging.info('Terminating eval after 180 seconds of no checkpoints')
return True return True
...@@ -646,10 +686,7 @@ def continuous_eval(estimator, model_dir, input_fn, eval_steps, train_steps, ...@@ -646,10 +686,7 @@ def continuous_eval(estimator, model_dir, input_fn, eval_steps, train_steps,
tf.logging.info('Starting Evaluation.') tf.logging.info('Starting Evaluation.')
try: try:
eval_results = estimator.evaluate( eval_results = estimator.evaluate(
input_fn=input_fn, input_fn=input_fn, steps=None, checkpoint_path=ckpt, name=name)
steps=eval_steps,
checkpoint_path=ckpt,
name=name)
tf.logging.info('Eval results: %s' % eval_results) tf.logging.info('Eval results: %s' % eval_results)
# Terminate eval job when final checkpoint is reached # Terminate eval job when final checkpoint is reached
...@@ -713,10 +750,9 @@ def populate_experiment(run_config, ...@@ -713,10 +750,9 @@ def populate_experiment(run_config,
**kwargs) **kwargs)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fn = train_and_eval_dict['eval_input_fn'] eval_input_fns = train_and_eval_dict['eval_input_fns']
predict_input_fn = train_and_eval_dict['predict_input_fn'] predict_input_fn = train_and_eval_dict['predict_input_fn']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
export_strategies = [ export_strategies = [
tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy( tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
...@@ -726,8 +762,9 @@ def populate_experiment(run_config, ...@@ -726,8 +762,9 @@ def populate_experiment(run_config,
return tf.contrib.learn.Experiment( return tf.contrib.learn.Experiment(
estimator=estimator, estimator=estimator,
train_input_fn=train_input_fn, train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn, eval_input_fn=eval_input_fns[0],
train_steps=train_steps, train_steps=train_steps,
eval_steps=eval_steps, eval_steps=None,
export_strategies=export_strategies, export_strategies=export_strategies,
eval_delay_secs=120,) eval_delay_secs=120,
)
...@@ -64,11 +64,13 @@ def _get_configs_for_model(model_name): ...@@ -64,11 +64,13 @@ def _get_configs_for_model(model_name):
data_path = _get_data_path() data_path = _get_data_path()
label_map_path = _get_labelmap_path() label_map_path = _get_labelmap_path()
configs = config_util.get_configs_from_pipeline_file(filename) configs = config_util.get_configs_from_pipeline_file(filename)
override_dict = {
'train_input_path': data_path,
'eval_input_path': data_path,
'label_map_path': label_map_path
}
configs = config_util.merge_external_params_with_configs( configs = config_util.merge_external_params_with_configs(
configs, configs, kwargs_dict=override_dict)
train_input_path=data_path,
eval_input_path=data_path,
label_map_path=label_map_path)
return configs return configs
...@@ -145,6 +147,9 @@ class ModelLibTest(tf.test.TestCase): ...@@ -145,6 +147,9 @@ class ModelLibTest(tf.test.TestCase):
self.assertEqual(batch_size, detection_scores.shape.as_list()[0]) self.assertEqual(batch_size, detection_scores.shape.as_list()[0])
self.assertEqual(tf.float32, detection_scores.dtype) self.assertEqual(tf.float32, detection_scores.dtype)
self.assertEqual(tf.float32, num_detections.dtype) self.assertEqual(tf.float32, num_detections.dtype)
if mode == 'eval':
self.assertIn('Detections_Left_Groundtruth_Right/0',
estimator_spec.eval_metric_ops)
if model_mode == tf.estimator.ModeKeys.TRAIN: if model_mode == tf.estimator.ModeKeys.TRAIN:
self.assertIsNotNone(estimator_spec.train_op) self.assertIsNotNone(estimator_spec.train_op)
return estimator_spec return estimator_spec
...@@ -225,21 +230,17 @@ class ModelLibTest(tf.test.TestCase): ...@@ -225,21 +230,17 @@ class ModelLibTest(tf.test.TestCase):
hparams_overrides='load_pretrained=false') hparams_overrides='load_pretrained=false')
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
train_steps = 20 train_steps = 20
eval_steps = 10
train_and_eval_dict = model_lib.create_estimator_and_inputs( train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config, run_config,
hparams, hparams,
pipeline_config_path, pipeline_config_path,
train_steps=train_steps, train_steps=train_steps)
eval_steps=eval_steps)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
self.assertIsInstance(estimator, tf.estimator.Estimator) self.assertIsInstance(estimator, tf.estimator.Estimator)
self.assertEqual(20, train_steps) self.assertEqual(20, train_steps)
self.assertEqual(10, eval_steps)
self.assertIn('train_input_fn', train_and_eval_dict) self.assertIn('train_input_fn', train_and_eval_dict)
self.assertIn('eval_input_fn', train_and_eval_dict) self.assertIn('eval_input_fns', train_and_eval_dict)
self.assertIn('eval_on_train_input_fn', train_and_eval_dict) self.assertIn('eval_on_train_input_fn', train_and_eval_dict)
def test_create_estimator_with_default_train_eval_steps(self): def test_create_estimator_with_default_train_eval_steps(self):
...@@ -250,16 +251,13 @@ class ModelLibTest(tf.test.TestCase): ...@@ -250,16 +251,13 @@ class ModelLibTest(tf.test.TestCase):
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
configs = config_util.get_configs_from_pipeline_file(pipeline_config_path) configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
config_train_steps = configs['train_config'].num_steps config_train_steps = configs['train_config'].num_steps
config_eval_steps = configs['eval_config'].num_examples
train_and_eval_dict = model_lib.create_estimator_and_inputs( train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config, hparams, pipeline_config_path) run_config, hparams, pipeline_config_path)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
self.assertIsInstance(estimator, tf.estimator.Estimator) self.assertIsInstance(estimator, tf.estimator.Estimator)
self.assertEqual(config_train_steps, train_steps) self.assertEqual(config_train_steps, train_steps)
self.assertEqual(config_eval_steps, eval_steps)
def test_create_tpu_estimator_and_inputs(self): def test_create_tpu_estimator_and_inputs(self):
"""Tests that number of train/eval defaults to config values.""" """Tests that number of train/eval defaults to config values."""
...@@ -269,21 +267,17 @@ class ModelLibTest(tf.test.TestCase): ...@@ -269,21 +267,17 @@ class ModelLibTest(tf.test.TestCase):
hparams_overrides='load_pretrained=false') hparams_overrides='load_pretrained=false')
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
train_steps = 20 train_steps = 20
eval_steps = 10
train_and_eval_dict = model_lib.create_estimator_and_inputs( train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config, run_config,
hparams, hparams,
pipeline_config_path, pipeline_config_path,
train_steps=train_steps, train_steps=train_steps,
eval_steps=eval_steps,
use_tpu_estimator=True) use_tpu_estimator=True)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
self.assertIsInstance(estimator, tpu_estimator.TPUEstimator) self.assertIsInstance(estimator, tpu_estimator.TPUEstimator)
self.assertEqual(20, train_steps) self.assertEqual(20, train_steps)
self.assertEqual(10, eval_steps)
def test_create_train_and_eval_specs(self): def test_create_train_and_eval_specs(self):
"""Tests that `TrainSpec` and `EvalSpec` is created correctly.""" """Tests that `TrainSpec` and `EvalSpec` is created correctly."""
...@@ -292,38 +286,32 @@ class ModelLibTest(tf.test.TestCase): ...@@ -292,38 +286,32 @@ class ModelLibTest(tf.test.TestCase):
hparams_overrides='load_pretrained=false') hparams_overrides='load_pretrained=false')
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
train_steps = 20 train_steps = 20
eval_steps = 10
eval_on_train_steps = 15
train_and_eval_dict = model_lib.create_estimator_and_inputs( train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config, run_config,
hparams, hparams,
pipeline_config_path, pipeline_config_path,
train_steps=train_steps, train_steps=train_steps)
eval_steps=eval_steps)
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fn = train_and_eval_dict['eval_input_fn'] eval_input_fns = train_and_eval_dict['eval_input_fns']
eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
predict_input_fn = train_and_eval_dict['predict_input_fn'] predict_input_fn = train_and_eval_dict['predict_input_fn']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
train_spec, eval_specs = model_lib.create_train_and_eval_specs( train_spec, eval_specs = model_lib.create_train_and_eval_specs(
train_input_fn, train_input_fn,
eval_input_fn, eval_input_fns,
eval_on_train_input_fn, eval_on_train_input_fn,
predict_input_fn, predict_input_fn,
train_steps, train_steps,
eval_steps,
eval_on_train_data=True, eval_on_train_data=True,
eval_on_train_steps=eval_on_train_steps,
final_exporter_name='exporter', final_exporter_name='exporter',
eval_spec_name='holdout') eval_spec_names=['holdout'])
self.assertEqual(train_steps, train_spec.max_steps) self.assertEqual(train_steps, train_spec.max_steps)
self.assertEqual(2, len(eval_specs)) self.assertEqual(2, len(eval_specs))
self.assertEqual(eval_steps, eval_specs[0].steps) self.assertEqual(None, eval_specs[0].steps)
self.assertEqual('holdout', eval_specs[0].name) self.assertEqual('holdout', eval_specs[0].name)
self.assertEqual('exporter', eval_specs[0].exporters[0].name) self.assertEqual('exporter_holdout', eval_specs[0].exporters[0].name)
self.assertEqual(eval_on_train_steps, eval_specs[1].steps) self.assertEqual(None, eval_specs[1].steps)
self.assertEqual('eval_on_train', eval_specs[1].name) self.assertEqual('eval_on_train', eval_specs[1].name)
def test_experiment(self): def test_experiment(self):
...@@ -339,7 +327,7 @@ class ModelLibTest(tf.test.TestCase): ...@@ -339,7 +327,7 @@ class ModelLibTest(tf.test.TestCase):
train_steps=10, train_steps=10,
eval_steps=20) eval_steps=20)
self.assertEqual(10, experiment.train_steps) self.assertEqual(10, experiment.train_steps)
self.assertEqual(20, experiment.eval_steps) self.assertEqual(None, experiment.eval_steps)
class UnbatchTensorsTest(tf.test.TestCase): class UnbatchTensorsTest(tf.test.TestCase):
......
...@@ -31,7 +31,16 @@ flags.DEFINE_string( ...@@ -31,7 +31,16 @@ flags.DEFINE_string(
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config ' flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
'file.') 'file.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.') flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_integer('num_eval_steps', None, 'Number of train steps.') flags.DEFINE_boolean('eval_training_data', False,
'If training data should be evaluated for this job. Note '
'that one can only use this in eval-only mode, and '
'`checkpoint_dir` must be supplied.')
flags.DEFINE_integer('sample_1_of_n_eval_examples', 1, 'Will sample one of '
'every n eval input examples, where n is provided.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
'one of every n train input examples for evaluation, '
'where n is provided. This is only used if '
'`eval_training_data` is True.')
flags.DEFINE_string( flags.DEFINE_string(
'hparams_overrides', None, 'Hyperparameter overrides, ' 'hparams_overrides', None, 'Hyperparameter overrides, '
'represented as a string containing comma-separated ' 'represented as a string containing comma-separated '
...@@ -44,8 +53,6 @@ flags.DEFINE_boolean( ...@@ -44,8 +53,6 @@ flags.DEFINE_boolean(
'run_once', False, 'If running in eval-only mode, whether to run just ' 'run_once', False, 'If running in eval-only mode, whether to run just '
'one round of eval vs running continuously (default).' 'one round of eval vs running continuously (default).'
) )
flags.DEFINE_boolean('eval_training_data', False,
'If training data should be evaluated for this job.')
FLAGS = flags.FLAGS FLAGS = flags.FLAGS
...@@ -59,14 +66,15 @@ def main(unused_argv): ...@@ -59,14 +66,15 @@ def main(unused_argv):
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path, pipeline_config_path=FLAGS.pipeline_config_path,
train_steps=FLAGS.num_train_steps, train_steps=FLAGS.num_train_steps,
eval_steps=FLAGS.num_eval_steps) sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
sample_1_of_n_eval_on_train_examples=(
FLAGS.sample_1_of_n_eval_on_train_examples))
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fn = train_and_eval_dict['eval_input_fn'] eval_input_fns = train_and_eval_dict['eval_input_fns']
eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
predict_input_fn = train_and_eval_dict['predict_input_fn'] predict_input_fn = train_and_eval_dict['predict_input_fn']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
if FLAGS.checkpoint_dir: if FLAGS.checkpoint_dir:
if FLAGS.eval_training_data: if FLAGS.eval_training_data:
...@@ -74,23 +82,23 @@ def main(unused_argv): ...@@ -74,23 +82,23 @@ def main(unused_argv):
input_fn = eval_on_train_input_fn input_fn = eval_on_train_input_fn
else: else:
name = 'validation_data' name = 'validation_data'
input_fn = eval_input_fn # The first eval input will be evaluated.
input_fn = eval_input_fns[0]
if FLAGS.run_once: if FLAGS.run_once:
estimator.evaluate(input_fn, estimator.evaluate(input_fn,
eval_steps, steps=None,
checkpoint_path=tf.train.latest_checkpoint( checkpoint_path=tf.train.latest_checkpoint(
FLAGS.checkpoint_dir)) FLAGS.checkpoint_dir))
else: else:
model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn,
eval_steps, train_steps, name) train_steps, name)
else: else:
train_spec, eval_specs = model_lib.create_train_and_eval_specs( train_spec, eval_specs = model_lib.create_train_and_eval_specs(
train_input_fn, train_input_fn,
eval_input_fn, eval_input_fns,
eval_on_train_input_fn, eval_on_train_input_fn,
predict_input_fn, predict_input_fn,
train_steps, train_steps,
eval_steps,
eval_on_train_data=False) eval_on_train_data=False)
# Currently only a single Eval Spec is allowed. # Currently only a single Eval Spec is allowed.
......
...@@ -62,15 +62,20 @@ flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If ' ...@@ -62,15 +62,20 @@ flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If '
flags.DEFINE_string( flags.DEFINE_string(
'hparams_overrides', None, 'Comma-separated list of ' 'hparams_overrides', None, 'Comma-separated list of '
'hyperparameters to override defaults.') 'hyperparameters to override defaults.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_boolean('eval_training_data', False, flags.DEFINE_boolean('eval_training_data', False,
'If training data should be evaluated for this job.') 'If training data should be evaluated for this job.')
flags.DEFINE_integer('sample_1_of_n_eval_examples', 1, 'Will sample one of '
'every n eval input examples, where n is provided.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
'one of every n train input examples for evaluation, '
'where n is provided. This is only used if '
'`eval_training_data` is True.')
flags.DEFINE_string( flags.DEFINE_string(
'model_dir', None, 'Path to output model directory ' 'model_dir', None, 'Path to output model directory '
'where event and checkpoint files will be written.') 'where event and checkpoint files will be written.')
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config ' flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
'file.') 'file.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_integer('num_eval_steps', None, 'Number of train steps.')
FLAGS = tf.flags.FLAGS FLAGS = tf.flags.FLAGS
...@@ -103,17 +108,18 @@ def main(unused_argv): ...@@ -103,17 +108,18 @@ def main(unused_argv):
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path, pipeline_config_path=FLAGS.pipeline_config_path,
train_steps=FLAGS.num_train_steps, train_steps=FLAGS.num_train_steps,
eval_steps=FLAGS.num_eval_steps, sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
sample_1_of_n_eval_on_train_examples=(
FLAGS.sample_1_of_n_eval_on_train_examples),
use_tpu_estimator=True, use_tpu_estimator=True,
use_tpu=FLAGS.use_tpu, use_tpu=FLAGS.use_tpu,
num_shards=FLAGS.num_shards, num_shards=FLAGS.num_shards,
**kwargs) **kwargs)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fn = train_and_eval_dict['eval_input_fn'] eval_input_fns = train_and_eval_dict['eval_input_fns']
eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
train_steps = train_and_eval_dict['train_steps'] train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
if FLAGS.mode == 'train': if FLAGS.mode == 'train':
estimator.train(input_fn=train_input_fn, max_steps=train_steps) estimator.train(input_fn=train_input_fn, max_steps=train_steps)
...@@ -125,9 +131,10 @@ def main(unused_argv): ...@@ -125,9 +131,10 @@ def main(unused_argv):
input_fn = eval_on_train_input_fn input_fn = eval_on_train_input_fn
else: else:
name = 'validation_data' name = 'validation_data'
input_fn = eval_input_fn # Currently only a single eval input is allowed.
model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, eval_steps, input_fn = eval_input_fns[0]
train_steps, name) model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, train_steps,
name)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -24,6 +24,7 @@ Feature map generators build on the base feature extractors and produce a list ...@@ -24,6 +24,7 @@ Feature map generators build on the base feature extractors and produce a list
of final feature maps. of final feature maps.
""" """
import collections import collections
import functools
import tensorflow as tf import tensorflow as tf
from object_detection.utils import ops from object_detection.utils import ops
slim = tf.contrib.slim slim = tf.contrib.slim
...@@ -45,6 +46,222 @@ def get_depth_fn(depth_multiplier, min_depth): ...@@ -45,6 +46,222 @@ def get_depth_fn(depth_multiplier, min_depth):
return multiply_depth return multiply_depth
class KerasMultiResolutionFeatureMaps(tf.keras.Model):
"""Generates multi resolution feature maps from input image features.
A Keras model that generates multi-scale feature maps for detection as in the
SSD papers by Liu et al: https://arxiv.org/pdf/1512.02325v2.pdf, See Sec 2.1.
More specifically, when called on inputs it performs the following two tasks:
1) If a layer name is provided in the configuration, returns that layer as a
feature map.
2) If a layer name is left as an empty string, constructs a new feature map
based on the spatial shape and depth configuration. Note that the current
implementation only supports generating new layers using convolution of
stride 2 resulting in a spatial resolution reduction by a factor of 2.
By default convolution kernel size is set to 3, and it can be customized
by the caller.
An example of the configuration for Inception V3:
{
'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
'layer_depth': [-1, -1, -1, 512, 256, 128]
}
When this feature generator object is called on input image_features:
Args:
image_features: A dictionary of handles to activation tensors from the
base feature extractor.
Returns:
feature_maps: an OrderedDict mapping keys (feature map names) to
tensors where each tensor has shape [batch, height_i, width_i, depth_i].
"""
def __init__(self,
feature_map_layout,
depth_multiplier,
min_depth,
insert_1x1_conv,
is_training,
conv_hyperparams,
freeze_batchnorm,
name=None):
"""Constructor.
Args:
feature_map_layout: Dictionary of specifications for the feature map
layouts in the following format (Inception V2/V3 respectively):
{
'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''],
'layer_depth': [-1, -1, -1, 512, 256, 128]
}
or
{
'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
'layer_depth': [-1, -1, -1, 512, 256, 128]
}
If 'from_layer' is specified, the specified feature map is directly used
as a box predictor layer, and the layer_depth is directly inferred from
the feature map (instead of using the provided 'layer_depth' parameter).
In this case, our convention is to set 'layer_depth' to -1 for clarity.
Otherwise, if 'from_layer' is an empty string, then the box predictor
layer will be built from the previous layer using convolution
operations. Note that the current implementation only supports
generating new layers using convolutions of stride 2 (resulting in a
spatial resolution reduction by a factor of 2), and will be extended to
a more flexible design. Convolution kernel size is set to 3 by default,
and can be customized by the 'conv_kernel_size' parameter (similarly,
'conv_kernel_size' should be set to -1 if 'from_layer' is specified).
The created convolution operation will be a normal 2D convolution by
default, and a depthwise convolution followed by 1x1 convolution if
'use_depthwise' is set to True.
depth_multiplier: Depth multiplier for convolutional layers.
min_depth: Minimum depth for convolutional layers.
insert_1x1_conv: A boolean indicating whether an additional 1x1
convolution should be inserted before shrinking the feature map.
is_training: Indicates whether the feature generator is in training mode.
conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
containing hyperparameters for convolution ops.
freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
training or not. When training with a small batch size (e.g. 1), it is
desirable to freeze batch norm update and use pretrained batch norm
params.
name: A string name scope to assign to the model. If 'None', Keras
will auto-generate one from the class name.
"""
super(KerasMultiResolutionFeatureMaps, self).__init__(name=name)
self.feature_map_layout = feature_map_layout
self.convolutions = []
depth_fn = get_depth_fn(depth_multiplier, min_depth)
base_from_layer = ''
use_explicit_padding = False
if 'use_explicit_padding' in feature_map_layout:
use_explicit_padding = feature_map_layout['use_explicit_padding']
use_depthwise = False
if 'use_depthwise' in feature_map_layout:
use_depthwise = feature_map_layout['use_depthwise']
for index, from_layer in enumerate(feature_map_layout['from_layer']):
net = []
self.convolutions.append(net)
layer_depth = feature_map_layout['layer_depth'][index]
conv_kernel_size = 3
if 'conv_kernel_size' in feature_map_layout:
conv_kernel_size = feature_map_layout['conv_kernel_size'][index]
if from_layer:
base_from_layer = from_layer
else:
if insert_1x1_conv:
layer_name = '{}_1_Conv2d_{}_1x1_{}'.format(
base_from_layer, index, depth_fn(layer_depth / 2))
net.append(tf.keras.layers.Conv2D(depth_fn(layer_depth / 2),
[1, 1],
padding='SAME',
strides=1,
name=layer_name + '_conv',
**conv_hyperparams.params()))
net.append(
conv_hyperparams.build_batch_norm(
training=(is_training and not freeze_batchnorm),
name=layer_name + '_batchnorm'))
net.append(
conv_hyperparams.build_activation_layer(
name=layer_name))
layer_name = '{}_2_Conv2d_{}_{}x{}_s2_{}'.format(
base_from_layer, index, conv_kernel_size, conv_kernel_size,
depth_fn(layer_depth))
stride = 2
padding = 'SAME'
if use_explicit_padding:
padding = 'VALID'
# We define this function here while capturing the value of
# conv_kernel_size, to avoid holding a reference to the loop variable
# conv_kernel_size inside of a lambda function
def fixed_padding(features, kernel_size=conv_kernel_size):
return ops.fixed_padding(features, kernel_size)
net.append(tf.keras.layers.Lambda(fixed_padding))
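# Illustrative aside (not from the original diff): binding the kernel size
# through a default argument avoids Python's late binding of closures. A
# `lambda f: ops.fixed_padding(f, conv_kernel_size)` would read
# conv_kernel_size only when the Lambda layer is called, so every layer would
# see the value from the final loop iteration; the default-argument form
# freezes the current value at definition time.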
# TODO(rathodv): Add some utilities to simplify the creation of
# Depthwise & non-depthwise convolutions w/ normalization & activations
if use_depthwise:
net.append(tf.keras.layers.DepthwiseConv2D(
[conv_kernel_size, conv_kernel_size],
depth_multiplier=1,
padding=padding,
strides=stride,
name=layer_name + '_depthwise_conv',
**conv_hyperparams.params()))
net.append(
conv_hyperparams.build_batch_norm(
training=(is_training and not freeze_batchnorm),
name=layer_name + '_depthwise_batchnorm'))
net.append(
conv_hyperparams.build_activation_layer(
name=layer_name + '_depthwise'))
net.append(tf.keras.layers.Conv2D(depth_fn(layer_depth), [1, 1],
padding='SAME',
strides=1,
name=layer_name + '_conv',
**conv_hyperparams.params()))
net.append(
conv_hyperparams.build_batch_norm(
training=(is_training and not freeze_batchnorm),
name=layer_name + '_batchnorm'))
net.append(
conv_hyperparams.build_activation_layer(
name=layer_name))
else:
net.append(tf.keras.layers.Conv2D(
depth_fn(layer_depth),
[conv_kernel_size, conv_kernel_size],
padding=padding,
strides=stride,
name=layer_name + '_conv',
**conv_hyperparams.params()))
net.append(
conv_hyperparams.build_batch_norm(
training=(is_training and not freeze_batchnorm),
name=layer_name + '_batchnorm'))
net.append(
conv_hyperparams.build_activation_layer(
name=layer_name))
def call(self, image_features):
"""Generate the multi-resolution feature maps.
Executed when calling the `.__call__` method on input.
Args:
image_features: A dictionary of handles to activation tensors from the
base feature extractor.
Returns:
feature_maps: an OrderedDict mapping keys (feature map names) to
tensors where each tensor has shape [batch, height_i, width_i, depth_i].
"""
feature_maps = []
feature_map_keys = []
for index, from_layer in enumerate(self.feature_map_layout['from_layer']):
if from_layer:
feature_map = image_features[from_layer]
feature_map_keys.append(from_layer)
else:
feature_map = feature_maps[-1]
for layer in self.convolutions[index]:
feature_map = layer(feature_map)
layer_name = self.convolutions[index][-1].name
feature_map_keys.append(layer_name)
feature_maps.append(feature_map)
return collections.OrderedDict(
[(x, y) for (x, y) in zip(feature_map_keys, feature_maps)])
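# A minimal usage sketch (not part of the original diff); `conv_hyperparams`
# is assumed to be a hyperparams_builder.KerasLayerHyperparams instance and
# `image_features` a dict of named activation tensors, as in the tests below.
feature_map_generator = KerasMultiResolutionFeatureMaps(
    feature_map_layout={
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128],
    },
    depth_multiplier=1,
    min_depth=32,
    insert_1x1_conv=True,
    is_training=True,
    conv_hyperparams=conv_hyperparams,
    freeze_batchnorm=False,
    name='FeatureMaps')
feature_maps = feature_map_generator(image_features)  # OrderedDict of feature maps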
def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
min_depth, insert_1x1_conv, image_features): min_depth, insert_1x1_conv, image_features):
"""Generates multi resolution feature maps from input image features. """Generates multi resolution feature maps from input image features.
...@@ -77,7 +294,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -77,7 +294,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
} }
or or
{ {
'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', '', ''], 'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
'layer_depth': [-1, -1, -1, 512, 256, 128] 'layer_depth': [-1, -1, -1, 512, 256, 128]
} }
If 'from_layer' is specified, the specified feature map is directly used If 'from_layer' is specified, the specified feature map is directly used
...@@ -179,7 +396,10 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -179,7 +396,10 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
[(x, y) for (x, y) in zip(feature_map_keys, feature_maps)]) [(x, y) for (x, y) in zip(feature_map_keys, feature_maps)])
def fpn_top_down_feature_maps(image_features, depth, scope=None): def fpn_top_down_feature_maps(image_features,
depth,
use_depthwise=False,
scope=None):
"""Generates `top-down` feature maps for Feature Pyramid Networks. """Generates `top-down` feature maps for Feature Pyramid Networks.
See https://arxiv.org/abs/1612.03144 for details. See https://arxiv.org/abs/1612.03144 for details.
...@@ -189,6 +409,7 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None): ...@@ -189,6 +409,7 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None):
Spatial resolutions of successive tensors must reduce exactly by a factor Spatial resolutions of successive tensors must reduce exactly by a factor
of 2. of 2.
depth: depth of output feature maps. depth: depth of output feature maps.
use_depthwise: use depthwise separable conv instead of regular conv.
scope: A scope name to wrap this op under. scope: A scope name to wrap this op under.
Returns: Returns:
...@@ -200,7 +421,7 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None): ...@@ -200,7 +421,7 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None):
output_feature_maps_list = [] output_feature_maps_list = []
output_feature_map_keys = [] output_feature_map_keys = []
with slim.arg_scope( with slim.arg_scope(
[slim.conv2d], padding='SAME', stride=1): [slim.conv2d, slim.separable_conv2d], padding='SAME', stride=1):
top_down = slim.conv2d( top_down = slim.conv2d(
image_features[-1][1], image_features[-1][1],
depth, [1, 1], activation_fn=None, normalizer_fn=None, depth, [1, 1], activation_fn=None, normalizer_fn=None,
...@@ -216,7 +437,11 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None): ...@@ -216,7 +437,11 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None):
activation_fn=None, normalizer_fn=None, activation_fn=None, normalizer_fn=None,
scope='projection_%d' % (level + 1)) scope='projection_%d' % (level + 1))
top_down += residual top_down += residual
output_feature_maps_list.append(slim.conv2d( if use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
output_feature_maps_list.append(conv_op(
top_down, top_down,
depth, [3, 3], depth, [3, 3],
scope='smoothing_%d' % (level + 1))) scope='smoothing_%d' % (level + 1)))
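# Illustrative usage (not from the original diff): with use_depthwise=True the
# 3x3 smoothing convolutions become depthwise-separable convolutions, e.g.
#   fpn_top_down_feature_maps(
#       image_features=[('block2', c2), ('block3', c3)],  # hypothetical tensors
#       depth=128, use_depthwise=True)
# which is exercised by test_get_expected_feature_map_shapes_with_depthwise
# further below.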
...@@ -226,7 +451,7 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None): ...@@ -226,7 +451,7 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None):
def pooling_pyramid_feature_maps(base_feature_map_depth, num_layers, def pooling_pyramid_feature_maps(base_feature_map_depth, num_layers,
image_features): image_features, replace_pool_with_conv=False):
"""Generates pooling pyramid feature maps. """Generates pooling pyramid feature maps.
The pooling pyramid feature maps is motivated by The pooling pyramid feature maps is motivated by
...@@ -250,6 +475,8 @@ def pooling_pyramid_feature_maps(base_feature_map_depth, num_layers, ...@@ -250,6 +475,8 @@ def pooling_pyramid_feature_maps(base_feature_map_depth, num_layers,
from the base feature. from the base feature.
image_features: A dictionary of handles to activation tensors from the image_features: A dictionary of handles to activation tensors from the
feature extractor. feature extractor.
replace_pool_with_conv: Whether or not to replace pooling operations with
convolutions in the PPN. Default is False.
Returns: Returns:
feature_maps: an OrderedDict mapping keys (feature map names) to feature_maps: an OrderedDict mapping keys (feature map names) to
...@@ -279,12 +506,22 @@ def pooling_pyramid_feature_maps(base_feature_map_depth, num_layers, ...@@ -279,12 +506,22 @@ def pooling_pyramid_feature_maps(base_feature_map_depth, num_layers,
feature_map_keys.append(feature_map_key) feature_map_keys.append(feature_map_key)
feature_maps.append(image_features) feature_maps.append(image_features)
feature_map = image_features feature_map = image_features
with slim.arg_scope([slim.max_pool2d], padding='SAME', stride=2): if replace_pool_with_conv:
for i in range(num_layers - 1): with slim.arg_scope([slim.conv2d], padding='SAME', stride=2):
feature_map_key = 'MaxPool2d_%d_2x2' % i for i in range(num_layers - 1):
feature_map = slim.max_pool2d( feature_map_key = 'Conv2d_{}_3x3_s2_{}'.format(i,
feature_map, [2, 2], padding='SAME', scope=feature_map_key) base_feature_map_depth)
feature_map_keys.append(feature_map_key) feature_map = slim.conv2d(
feature_maps.append(feature_map) feature_map, base_feature_map_depth, [3, 3], scope=feature_map_key)
feature_map_keys.append(feature_map_key)
feature_maps.append(feature_map)
else:
with slim.arg_scope([slim.max_pool2d], padding='SAME', stride=2):
for i in range(num_layers - 1):
feature_map_key = 'MaxPool2d_%d_2x2' % i
feature_map = slim.max_pool2d(
feature_map, [2, 2], padding='SAME', scope=feature_map_key)
feature_map_keys.append(feature_map_key)
feature_maps.append(feature_map)
return collections.OrderedDict( return collections.OrderedDict(
[(x, y) for (x, y) in zip(feature_map_keys, feature_maps)]) [(x, y) for (x, y) in zip(feature_map_keys, feature_maps)])
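# A minimal usage sketch (not part of the original diff). The generator is
# assumed here to take a dictionary with a single base feature handle; the
# key, tensor and depths are hypothetical.
ppn_feature_maps = pooling_pyramid_feature_maps(
    base_feature_map_depth=1024,
    num_layers=6,
    image_features={'image_features': base_feature_map},
    replace_pool_with_conv=True)  # stride-2 3x3 convs instead of max pooling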
...@@ -15,9 +15,15 @@ ...@@ -15,9 +15,15 @@
"""Tests for feature map generators.""" """Tests for feature map generators."""
from absl.testing import parameterized
import tensorflow as tf import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.models import feature_map_generators from object_detection.models import feature_map_generators
from object_detection.protos import hyperparams_pb2
INCEPTION_V2_LAYOUT = { INCEPTION_V2_LAYOUT = {
'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''], 'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''],
...@@ -40,21 +46,60 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = { ...@@ -40,21 +46,60 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = {
} }
# TODO(rathodv): add tests with different anchor strides. @parameterized.parameters(
{'use_keras': False},
{'use_keras': True},
)
class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
def test_get_expected_feature_map_shapes_with_inception_v2(self): def _build_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
conv_hyperparams_text_proto = """
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def _build_feature_map_generator(self, feature_map_layout, use_keras):
if use_keras:
return feature_map_generators.KerasMultiResolutionFeatureMaps(
feature_map_layout=feature_map_layout,
depth_multiplier=1,
min_depth=32,
insert_1x1_conv=True,
freeze_batchnorm=False,
is_training=True,
conv_hyperparams=self._build_conv_hyperparams(),
name='FeatureMaps'
)
else:
def feature_map_generator(image_features):
return feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=1,
min_depth=32,
insert_1x1_conv=True,
image_features=image_features)
return feature_map_generator
def test_get_expected_feature_map_shapes_with_inception_v2(self, use_keras):
image_features = { image_features = {
'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32), 'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32),
'Mixed_4c': tf.random_uniform([4, 14, 14, 576], dtype=tf.float32), 'Mixed_4c': tf.random_uniform([4, 14, 14, 576], dtype=tf.float32),
'Mixed_5c': tf.random_uniform([4, 7, 7, 1024], dtype=tf.float32) 'Mixed_5c': tf.random_uniform([4, 7, 7, 1024], dtype=tf.float32)
} }
feature_maps = feature_map_generators.multi_resolution_feature_maps( feature_map_generator = self._build_feature_map_generator(
feature_map_layout=INCEPTION_V2_LAYOUT, feature_map_layout=INCEPTION_V2_LAYOUT,
depth_multiplier=1, use_keras=use_keras
min_depth=32, )
insert_1x1_conv=True, feature_maps = feature_map_generator(image_features)
image_features=image_features)
expected_feature_map_shapes = { expected_feature_map_shapes = {
'Mixed_3c': (4, 28, 28, 256), 'Mixed_3c': (4, 28, 28, 256),
...@@ -70,21 +115,53 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -70,21 +115,53 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
out_feature_maps = sess.run(feature_maps) out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = dict( out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items()) (key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
def test_get_expected_feature_map_shapes_use_explicit_padding(
self, use_keras):
image_features = {
'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32),
'Mixed_4c': tf.random_uniform([4, 14, 14, 576], dtype=tf.float32),
'Mixed_5c': tf.random_uniform([4, 7, 7, 1024], dtype=tf.float32)
}
layout_copy = INCEPTION_V2_LAYOUT.copy()
layout_copy['use_explicit_padding'] = True
feature_map_generator = self._build_feature_map_generator(
feature_map_layout=layout_copy,
use_keras=use_keras
)
feature_maps = feature_map_generator(image_features)
expected_feature_map_shapes = {
'Mixed_3c': (4, 28, 28, 256),
'Mixed_4c': (4, 14, 14, 576),
'Mixed_5c': (4, 7, 7, 1024),
'Mixed_5c_2_Conv2d_3_3x3_s2_512': (4, 4, 4, 512),
'Mixed_5c_2_Conv2d_4_3x3_s2_256': (4, 2, 2, 256),
'Mixed_5c_2_Conv2d_5_3x3_s2_256': (4, 1, 1, 256)}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_get_expected_feature_map_shapes_with_inception_v3(self): def test_get_expected_feature_map_shapes_with_inception_v3(self, use_keras):
image_features = { image_features = {
'Mixed_5d': tf.random_uniform([4, 35, 35, 256], dtype=tf.float32), 'Mixed_5d': tf.random_uniform([4, 35, 35, 256], dtype=tf.float32),
'Mixed_6e': tf.random_uniform([4, 17, 17, 576], dtype=tf.float32), 'Mixed_6e': tf.random_uniform([4, 17, 17, 576], dtype=tf.float32),
'Mixed_7c': tf.random_uniform([4, 8, 8, 1024], dtype=tf.float32) 'Mixed_7c': tf.random_uniform([4, 8, 8, 1024], dtype=tf.float32)
} }
feature_maps = feature_map_generators.multi_resolution_feature_maps( feature_map_generator = self._build_feature_map_generator(
feature_map_layout=INCEPTION_V3_LAYOUT, feature_map_layout=INCEPTION_V3_LAYOUT,
depth_multiplier=1, use_keras=use_keras
min_depth=32, )
insert_1x1_conv=True, feature_maps = feature_map_generator(image_features)
image_features=image_features)
expected_feature_map_shapes = { expected_feature_map_shapes = {
'Mixed_5d': (4, 35, 35, 256), 'Mixed_5d': (4, 35, 35, 256),
...@@ -100,10 +177,10 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -100,10 +177,10 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
out_feature_maps = sess.run(feature_maps) out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = dict( out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items()) (key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_get_expected_feature_map_shapes_with_embedded_ssd_mobilenet_v1( def test_get_expected_feature_map_shapes_with_embedded_ssd_mobilenet_v1(
self): self, use_keras):
image_features = { image_features = {
'Conv2d_11_pointwise': tf.random_uniform([4, 16, 16, 512], 'Conv2d_11_pointwise': tf.random_uniform([4, 16, 16, 512],
dtype=tf.float32), dtype=tf.float32),
...@@ -111,12 +188,11 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -111,12 +188,11 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
dtype=tf.float32), dtype=tf.float32),
} }
feature_maps = feature_map_generators.multi_resolution_feature_maps( feature_map_generator = self._build_feature_map_generator(
feature_map_layout=EMBEDDED_SSD_MOBILENET_V1_LAYOUT, feature_map_layout=EMBEDDED_SSD_MOBILENET_V1_LAYOUT,
depth_multiplier=1, use_keras=use_keras
min_depth=32, )
insert_1x1_conv=True, feature_maps = feature_map_generator(image_features)
image_features=image_features)
expected_feature_map_shapes = { expected_feature_map_shapes = {
'Conv2d_11_pointwise': (4, 16, 16, 512), 'Conv2d_11_pointwise': (4, 16, 16, 512),
...@@ -131,7 +207,62 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -131,7 +207,62 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
out_feature_maps = sess.run(feature_maps) out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = dict( out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items()) (key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_get_expected_variable_names_with_inception_v2(self, use_keras):
image_features = {
'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32),
'Mixed_4c': tf.random_uniform([4, 14, 14, 576], dtype=tf.float32),
'Mixed_5c': tf.random_uniform([4, 7, 7, 1024], dtype=tf.float32)
}
feature_map_generator = self._build_feature_map_generator(
feature_map_layout=INCEPTION_V2_LAYOUT,
use_keras=use_keras
)
feature_maps = feature_map_generator(image_features)
expected_slim_variables = set([
'Mixed_5c_1_Conv2d_3_1x1_256/weights',
'Mixed_5c_1_Conv2d_3_1x1_256/biases',
'Mixed_5c_2_Conv2d_3_3x3_s2_512/weights',
'Mixed_5c_2_Conv2d_3_3x3_s2_512/biases',
'Mixed_5c_1_Conv2d_4_1x1_128/weights',
'Mixed_5c_1_Conv2d_4_1x1_128/biases',
'Mixed_5c_2_Conv2d_4_3x3_s2_256/weights',
'Mixed_5c_2_Conv2d_4_3x3_s2_256/biases',
'Mixed_5c_1_Conv2d_5_1x1_128/weights',
'Mixed_5c_1_Conv2d_5_1x1_128/biases',
'Mixed_5c_2_Conv2d_5_3x3_s2_256/weights',
'Mixed_5c_2_Conv2d_5_3x3_s2_256/biases',
])
expected_keras_variables = set([
'FeatureMaps/Mixed_5c_1_Conv2d_3_1x1_256_conv/kernel',
'FeatureMaps/Mixed_5c_1_Conv2d_3_1x1_256_conv/bias',
'FeatureMaps/Mixed_5c_2_Conv2d_3_3x3_s2_512_conv/kernel',
'FeatureMaps/Mixed_5c_2_Conv2d_3_3x3_s2_512_conv/bias',
'FeatureMaps/Mixed_5c_1_Conv2d_4_1x1_128_conv/kernel',
'FeatureMaps/Mixed_5c_1_Conv2d_4_1x1_128_conv/bias',
'FeatureMaps/Mixed_5c_2_Conv2d_4_3x3_s2_256_conv/kernel',
'FeatureMaps/Mixed_5c_2_Conv2d_4_3x3_s2_256_conv/bias',
'FeatureMaps/Mixed_5c_1_Conv2d_5_1x1_128_conv/kernel',
'FeatureMaps/Mixed_5c_1_Conv2d_5_1x1_128_conv/bias',
'FeatureMaps/Mixed_5c_2_Conv2d_5_3x3_s2_256_conv/kernel',
'FeatureMaps/Mixed_5c_2_Conv2d_5_3x3_s2_256_conv/bias',
])
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
sess.run(feature_maps)
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
if use_keras:
self.assertSetEqual(expected_keras_variables, actual_variable_set)
else:
self.assertSetEqual(expected_slim_variables, actual_variable_set)
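# The two expected sets above differ only in naming convention: the slim
# generator appears to create '<layer>/weights' and '<layer>/biases' variables
# directly, while the Keras generator nests its layers under a 'FeatureMaps'
# name scope and uses the Keras 'kernel'/'bias' variable names.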
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
class FPNFeatureMapGeneratorTest(tf.test.TestCase): class FPNFeatureMapGeneratorTest(tf.test.TestCase):
...@@ -161,6 +292,31 @@ class FPNFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -161,6 +292,31 @@ class FPNFeatureMapGeneratorTest(tf.test.TestCase):
for key, value in out_feature_maps.items()} for key, value in out_feature_maps.items()}
self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
def test_get_expected_feature_map_shapes_with_depthwise(self):
image_features = [
('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
]
feature_maps = feature_map_generators.fpn_top_down_feature_maps(
image_features=image_features, depth=128, use_depthwise=True)
expected_feature_map_shapes = {
'top_down_block2': (4, 8, 8, 128),
'top_down_block3': (4, 4, 4, 128),
'top_down_block4': (4, 2, 2, 128),
'top_down_block5': (4, 1, 1, 128)
}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = {key: value.shape
for key, value in out_feature_maps.items()}
self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
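# A rough reading of the shapes above: fpn_top_down_feature_maps presumably
# projects every input block to `depth` channels (128 here) and merges them
# top-down, so each 'top_down_<block>' output keeps the spatial size of its
# input block while the channel count becomes `depth`; use_depthwise only
# swaps the convolutions for depthwise-separable ones.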
class GetDepthFunctionTest(tf.test.TestCase): class GetDepthFunctionTest(tf.test.TestCase):
...@@ -175,5 +331,94 @@ class GetDepthFunctionTest(tf.test.TestCase): ...@@ -175,5 +331,94 @@ class GetDepthFunctionTest(tf.test.TestCase):
self.assertEqual(depth_fn(64), 32) self.assertEqual(depth_fn(64), 32)
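# A plausible sketch of what get_depth_fn returns (not shown in this hunk):
# something along the lines of
#   depth_fn = lambda d: max(int(d * depth_multiplier), min_depth)
# e.g. depth_multiplier=0.5 with min_depth=32 gives depth_fn(64) == 32, which
# matches the assertion above and the depth_fn(...) calls in the FPN extractor
# changes further down.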
@parameterized.parameters(
{'replace_pool_with_conv': False},
{'replace_pool_with_conv': True},
)
class PoolingPyramidFeatureMapGeneratorTest(tf.test.TestCase):
def test_get_expected_feature_map_shapes(self, replace_pool_with_conv):
image_features = {
'image_features': tf.random_uniform([4, 19, 19, 1024])
}
feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
base_feature_map_depth=1024,
num_layers=6,
image_features=image_features,
replace_pool_with_conv=replace_pool_with_conv)
expected_pool_feature_map_shapes = {
'Base_Conv2d_1x1_1024': (4, 19, 19, 1024),
'MaxPool2d_0_2x2': (4, 10, 10, 1024),
'MaxPool2d_1_2x2': (4, 5, 5, 1024),
'MaxPool2d_2_2x2': (4, 3, 3, 1024),
'MaxPool2d_3_2x2': (4, 2, 2, 1024),
'MaxPool2d_4_2x2': (4, 1, 1, 1024),
}
expected_conv_feature_map_shapes = {
'Base_Conv2d_1x1_1024': (4, 19, 19, 1024),
'Conv2d_0_3x3_s2_1024': (4, 10, 10, 1024),
'Conv2d_1_3x3_s2_1024': (4, 5, 5, 1024),
'Conv2d_2_3x3_s2_1024': (4, 3, 3, 1024),
'Conv2d_3_3x3_s2_1024': (4, 2, 2, 1024),
'Conv2d_4_3x3_s2_1024': (4, 1, 1, 1024),
}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = {key: value.shape
for key, value in out_feature_maps.items()}
if replace_pool_with_conv:
self.assertDictEqual(expected_conv_feature_map_shapes,
out_feature_map_shapes)
else:
self.assertDictEqual(expected_pool_feature_map_shapes,
out_feature_map_shapes)
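# Spatial-size check for the expected shapes above: every pooling or stride-2
# convolution stage halves height/width with SAME padding (ceiling division),
# so the 19x19 base map shrinks as 19 -> 10 -> 5 -> 3 -> 2 -> 1 across the
# five generated layers, regardless of replace_pool_with_conv.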
def test_get_expected_variable_names(self, replace_pool_with_conv):
image_features = {
'image_features': tf.random_uniform([4, 19, 19, 1024])
}
feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
base_feature_map_depth=1024,
num_layers=6,
image_features=image_features,
replace_pool_with_conv=replace_pool_with_conv)
expected_pool_variables = set([
'Base_Conv2d_1x1_1024/weights',
'Base_Conv2d_1x1_1024/biases',
])
expected_conv_variables = set([
'Base_Conv2d_1x1_1024/weights',
'Base_Conv2d_1x1_1024/biases',
'Conv2d_0_3x3_s2_1024/weights',
'Conv2d_0_3x3_s2_1024/biases',
'Conv2d_1_3x3_s2_1024/weights',
'Conv2d_1_3x3_s2_1024/biases',
'Conv2d_2_3x3_s2_1024/weights',
'Conv2d_2_3x3_s2_1024/biases',
'Conv2d_3_3x3_s2_1024/weights',
'Conv2d_3_3x3_s2_1024/biases',
'Conv2d_4_3x3_s2_1024/weights',
'Conv2d_4_3x3_s2_1024/biases',
])
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
sess.run(feature_maps)
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
if replace_pool_with_conv:
self.assertSetEqual(expected_conv_variables, actual_variable_set)
else:
self.assertSetEqual(expected_pool_variables, actual_variable_set)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A wrapper around the MobileNet v2 models for Keras, for object detection."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from object_detection.core import freezable_batch_norm
from object_detection.utils import ops
# pylint: disable=invalid-name
# This method copied from the slim mobilenet base network code (same license)
def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
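# Worked example of the rounding rule above: _make_divisible(32 * 0.35, 8)
# first rounds 11.2 to the nearest multiple of 8 that is at least 8, giving 8;
# since 8 < 0.9 * 11.2 it is bumped up by one divisor, so the call returns 16.
# Channel counts therefore stay divisible by 8 without shrinking by much more
# than 10%.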
class _LayersOverride(object):
"""Alternative Keras layers interface for the Keras MobileNetV2."""
def __init__(self,
batchnorm_training,
default_batchnorm_momentum=0.999,
conv_hyperparams=None,
use_explicit_padding=False,
alpha=1.0,
min_depth=None):
"""Alternative tf.keras.layers interface, for use by the Keras MobileNetV2.
It is used by the Keras applications kwargs injection API to
modify the Mobilenet v2 Keras application with changes required by
the Object Detection API.
These injected interfaces make the following changes to the network:
- Applies the Object Detection hyperparameter configuration
- Supports FreezableBatchNorms
- Adds support for a min number of filters for each layer
- Makes the `alpha` parameter affect the final convolution block even if it
is less than 1.0
- Adds support for explicit padding of convolutions
Args:
batchnorm_training: Bool. Assigned to Batch norm layer `training` param
when constructing `freezable_batch_norm.FreezableBatchNorm` layers.
default_batchnorm_momentum: Float. When 'conv_hyperparams' is None,
batch norm layers will be constructed using this value as the momentum.
conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
containing hyperparameters for convolution ops. Optionally set to `None`
to use default mobilenet_v2 layer builders.
use_explicit_padding: If True, use 'valid' padding for convolutions,
but explicitly pre-pads inputs so that the output dimensions are the
same as if 'same' padding were used. Off by default.
alpha: The width multiplier referenced in the MobileNetV2 paper. It
modifies the number of filters in each convolutional layer.
min_depth: Minimum number of filters in the convolutional layers.
"""
self._alpha = alpha
self._batchnorm_training = batchnorm_training
self._default_batchnorm_momentum = default_batchnorm_momentum
self._conv_hyperparams = conv_hyperparams
self._use_explicit_padding = use_explicit_padding
self._min_depth = min_depth
def _FixedPaddingLayer(self, kernel_size):
return tf.keras.layers.Lambda(lambda x: ops.fixed_padding(x, kernel_size))
def Conv2D(self, filters, **kwargs):
"""Builds a Conv2D layer according to the current Object Detection config.
Overrides the Keras MobileNetV2 application's convolutions with ones that
follow the spec specified by the Object Detection hyperparameters.
Args:
filters: The number of filters to use for the convolution.
**kwargs: Keyword args specified by the Keras application for
constructing the convolution.
Returns:
A one-arg callable that will either directly apply a Keras Conv2D layer to
the input argument, or that will first pad the input then apply a Conv2D
layer.
"""
# Make sure 'alpha' is always applied to the last convolution block's size
# (This overrides the Keras application's functionality)
if kwargs.get('name') == 'Conv_1' and self._alpha < 1.0:
filters = _make_divisible(1280 * self._alpha, 8)
# Apply the minimum depth to the convolution layers
if (self._min_depth and (filters < self._min_depth)
and not kwargs.get('name').endswith('expand')):
filters = self._min_depth
if self._conv_hyperparams:
kwargs = self._conv_hyperparams.params(**kwargs)
kwargs['padding'] = 'same'
kernel_size = kwargs.get('kernel_size')
if self._use_explicit_padding and kernel_size > 1:
kwargs['padding'] = 'valid'
def padded_conv(features):
padded_features = self._FixedPaddingLayer(kernel_size)(features)
return tf.keras.layers.Conv2D(filters, **kwargs)(padded_features)
return padded_conv
else:
return tf.keras.layers.Conv2D(filters, **kwargs)
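# ops.fixed_padding (wrapped by _FixedPaddingLayer above) is assumed to pad
# height and width by a total of kernel_size - 1, split between the two sides,
# so that the subsequent 'valid' convolution yields the same output size that
# 'same' padding would have produced.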
def DepthwiseConv2D(self, **kwargs):
"""Builds a DepthwiseConv2D according to the Object Detection config.
Overrides the Keras MobileNetV2 application's convolutions with ones that
follow the spec specified by the Object Detection hyperparameters.
Args:
**kwargs: Keyword args specified by the Keras application for
constructing the convolution.
Returns:
A one-arg callable that will either directly apply a Keras DepthwiseConv2D
layer to the input argument, or that will first pad the input then apply
the depthwise convolution.
"""
if self._conv_hyperparams:
kwargs = self._conv_hyperparams.params(**kwargs)
kwargs['padding'] = 'same'
kernel_size = kwargs.get('kernel_size')
if self._use_explicit_padding and kernel_size > 1:
kwargs['padding'] = 'valid'
def padded_depthwise_conv(features):
padded_features = self._FixedPaddingLayer(kernel_size)(features)
return tf.keras.layers.DepthwiseConv2D(**kwargs)(padded_features)
return padded_depthwise_conv
else:
return tf.keras.layers.DepthwiseConv2D(**kwargs)
def BatchNormalization(self, **kwargs):
"""Builds a normalization layer.
Overrides the Keras application batch norm with the norm specified by the
Object Detection configuration.
Args:
**kwargs: Only the name is used, all other params ignored.
Required for matching `layers.BatchNormalization` calls in the Keras
application.
Returns:
A normalization layer specified by the Object Detection hyperparameter
configurations.
"""
name = kwargs.get('name')
if self._conv_hyperparams:
return self._conv_hyperparams.build_batch_norm(
training=self._batchnorm_training,
name=name)
else:
return freezable_batch_norm.FreezableBatchNorm(
training=self._batchnorm_training,
epsilon=1e-3,
momentum=self._default_batchnorm_momentum,
name=name)
def Input(self, shape):
"""Builds an Input layer.
Overrides the Keras application Input layer with one that uses a
tf.placeholder_with_default instead of a tf.placeholder. This is necessary
to ensure the application works when run on a TPU.
Args:
shape: The shape for the input layer to use. (Does not include a dimension
for the batch size).
Returns:
An input layer for the specified shape that internally uses a
placeholder_with_default.
"""
default_size = 224
default_batch_size = 1
shape = list(shape)
default_shape = [default_size if dim is None else dim for dim in shape]
input_tensor = tf.constant(0.0, shape=[default_batch_size] + default_shape)
placeholder_with_default = tf.placeholder_with_default(
input=input_tensor, shape=[None] + shape)
return tf.keras.layers.Input(tensor=placeholder_with_default)
# pylint: disable=unused-argument
def ReLU(self, *args, **kwargs):
"""Builds an activation layer.
Overrides the Keras application ReLU with the activation specified by the
Object Detection configuration.
Args:
*args: Ignored, required to match the `tf.keras.ReLU` interface
**kwargs: Only the name is used,
required to match `tf.keras.ReLU` interface
Returns:
An activation layer specified by the Object Detection hyperparameter
configurations.
"""
name = kwargs.get('name')
if self._conv_hyperparams:
return self._conv_hyperparams.build_activation_layer(name=name)
else:
return tf.keras.layers.Lambda(tf.nn.relu6, name=name)
# pylint: enable=unused-argument
# pylint: disable=unused-argument
def ZeroPadding2D(self, **kwargs):
"""Replaces explicit padding in the Keras application with a no-op.
Args:
**kwargs: Ignored, required to match the Keras applications usage.
Returns:
A no-op identity lambda.
"""
return lambda x: x
# pylint: enable=unused-argument
# Forward all non-overridden methods to the keras layers
def __getattr__(self, item):
return getattr(tf.keras.layers, item)
def mobilenet_v2(batchnorm_training,
default_batchnorm_momentum=0.9997,
conv_hyperparams=None,
use_explicit_padding=False,
alpha=1.0,
min_depth=None,
**kwargs):
"""Instantiates the MobileNetV2 architecture, modified for object detection.
This wraps the MobileNetV2 tensorflow Keras application, but uses the
Keras application's kwargs-based monkey-patching API to override the Keras
architecture with the following changes:
- Changes the default batchnorm momentum to 0.9997
- Applies the Object Detection hyperparameter configuration
- Supports FreezableBatchNorms
- Adds support for a min number of filters for each layer
- Makes the `alpha` parameter affect the final convolution block even if it
is less than 1.0
- Adds support for explicit padding of convolutions
- Makes the Input layer use a tf.placeholder_with_default instead of a
tf.placeholder, to work on TPUs.
Args:
batchnorm_training: Bool. Assigned to Batch norm layer `training` param
when constructing `freezable_batch_norm.FreezableBatchNorm` layers.
default_batchnorm_momentum: Float. When 'conv_hyperparams' is None,
batch norm layers will be constructed using this value as the momentum.
conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
containing hyperparameters for convolution ops. Optionally set to `None`
to use default mobilenet_v2 layer builders.
use_explicit_padding: If True, use 'valid' padding for convolutions,
but explicitly pre-pads inputs so that the output dimensions are the
same as if 'same' padding were used. Off by default.
alpha: The width multiplier referenced in the MobileNetV2 paper. It
modifies the number of filters in each convolutional layer.
min_depth: Minimum number of filters in the convolutional layers.
**kwargs: Keyword arguments forwarded directly to the
`tf.keras.applications.MobilenetV2` method that constructs the Keras
model.
Returns:
A Keras model instance.
"""
layers_override = _LayersOverride(
batchnorm_training,
default_batchnorm_momentum=default_batchnorm_momentum,
conv_hyperparams=conv_hyperparams,
use_explicit_padding=use_explicit_padding,
min_depth=min_depth,
alpha=alpha)
return tf.keras.applications.MobileNetV2(alpha=alpha,
layers=layers_override,
**kwargs)
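# A minimal usage sketch, assuming the standard Keras application kwargs
# (weights, include_top) are forwarded as documented above; it mirrors what the
# tests below do:
#
#   model = mobilenet_v2(batchnorm_training=False, alpha=1.0,
#                        weights=None, include_top=False)
#   feature = model.get_layer(name='block_13_expand_relu').output
#   backbone = tf.keras.Model(inputs=model.inputs, outputs=[feature])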
# pylint: enable=invalid-name
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for mobilenet_v2."""
import itertools
import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.models.keras_applications import mobilenet_v2
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
_layers_to_check = [
'Conv1_relu',
'block_1_expand_relu', 'block_1_depthwise_relu', 'block_1_project_BN',
'block_2_expand_relu', 'block_2_depthwise_relu', 'block_2_project_BN',
'block_3_expand_relu', 'block_3_depthwise_relu', 'block_3_project_BN',
'block_4_expand_relu', 'block_4_depthwise_relu', 'block_4_project_BN',
'block_5_expand_relu', 'block_5_depthwise_relu', 'block_5_project_BN',
'block_6_expand_relu', 'block_6_depthwise_relu', 'block_6_project_BN',
'block_7_expand_relu', 'block_7_depthwise_relu', 'block_7_project_BN',
'block_8_expand_relu', 'block_8_depthwise_relu', 'block_8_project_BN',
'block_9_expand_relu', 'block_9_depthwise_relu', 'block_9_project_BN',
'block_10_expand_relu', 'block_10_depthwise_relu', 'block_10_project_BN',
'block_11_expand_relu', 'block_11_depthwise_relu', 'block_11_project_BN',
'block_12_expand_relu', 'block_12_depthwise_relu', 'block_12_project_BN',
'block_13_expand_relu', 'block_13_depthwise_relu', 'block_13_project_BN',
'block_14_expand_relu', 'block_14_depthwise_relu', 'block_14_project_BN',
'block_15_expand_relu', 'block_15_depthwise_relu', 'block_15_project_BN',
'block_16_expand_relu', 'block_16_depthwise_relu', 'block_16_project_BN',
'out_relu']
class MobilenetV2Test(test_case.TestCase):
def _build_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
conv_hyperparams_text_proto = """
activation: RELU_6
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
batch_norm {
train: true,
scale: false,
center: true,
decay: 0.2,
epsilon: 0.1,
}
"""
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def _create_application_with_layer_outputs(
self, layer_names, batchnorm_training,
conv_hyperparams=None,
use_explicit_padding=False,
alpha=1.0,
min_depth=None):
"""Constructs Keras mobilenetv2 that extracts intermediate layer outputs."""
if not layer_names:
layer_names = _layers_to_check
full_model = mobilenet_v2.mobilenet_v2(
batchnorm_training=batchnorm_training,
conv_hyperparams=conv_hyperparams,
weights=None,
use_explicit_padding=use_explicit_padding,
alpha=alpha,
min_depth=min_depth,
include_top=False)
layer_outputs = [full_model.get_layer(name=layer).output
for layer in layer_names]
return tf.keras.Model(
inputs=full_model.inputs,
outputs=layer_outputs)
def _check_returns_correct_shape(
self, batch_size, image_height, image_width, depth_multiplier,
expected_feature_map_shapes, use_explicit_padding=False, min_depth=None,
layer_names=None):
def graph_fn(image_tensor):
model = self._create_application_with_layer_outputs(
layer_names=layer_names,
batchnorm_training=False, use_explicit_padding=use_explicit_padding,
min_depth=min_depth,
alpha=depth_multiplier)
return model(image_tensor)
image_tensor = np.random.rand(batch_size, image_height, image_width,
3).astype(np.float32)
feature_maps = self.execute(graph_fn, [image_tensor])
for feature_map, expected_shape in itertools.izip(
feature_maps, expected_feature_map_shapes):
self.assertAllEqual(feature_map.shape, expected_shape)
def _check_returns_correct_shapes_with_dynamic_inputs(
self, batch_size, image_height, image_width, depth_multiplier,
expected_feature_map_shapes, use_explicit_padding=False,
layer_names=None):
def graph_fn(image_height, image_width):
image_tensor = tf.random_uniform([batch_size, image_height, image_width,
3], dtype=tf.float32)
model = self._create_application_with_layer_outputs(
layer_names=layer_names,
batchnorm_training=False, use_explicit_padding=use_explicit_padding,
alpha=depth_multiplier)
return model(image_tensor)
feature_maps = self.execute_cpu(graph_fn, [
np.array(image_height, dtype=np.int32),
np.array(image_width, dtype=np.int32)
])
for feature_map, expected_shape in itertools.izip(
feature_maps, expected_feature_map_shapes):
self.assertAllEqual(feature_map.shape, expected_shape)
def _get_variables(self, depth_multiplier, layer_names=None):
g = tf.Graph()
with g.as_default():
preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
model = self._create_application_with_layer_outputs(
layer_names=layer_names,
batchnorm_training=False, use_explicit_padding=False,
alpha=depth_multiplier)
model(preprocessed_inputs)
return g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
def test_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
expected_feature_map_shape = [(2, 64, 64, 32),
(2, 64, 64, 96),
(2, 32, 32, 96),
(2, 32, 32, 24),
(2, 32, 32, 144),
(2, 32, 32, 144),
(2, 32, 32, 24),
(2, 32, 32, 144),
(2, 16, 16, 144),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 16, 16, 192),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 16, 16, 192),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 8, 8, 192),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 8, 8, 576),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 8, 8, 576),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 4, 4, 576),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 320),
(2, 4, 4, 1280)]
self._check_returns_correct_shape(
2, image_height, image_width, depth_multiplier,
expected_feature_map_shape)
def test_returns_correct_shapes_128_explicit_padding(
self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
expected_feature_map_shape = [(2, 64, 64, 32),
(2, 64, 64, 96),
(2, 32, 32, 96),
(2, 32, 32, 24),
(2, 32, 32, 144),
(2, 32, 32, 144),
(2, 32, 32, 24),
(2, 32, 32, 144),
(2, 16, 16, 144),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 16, 16, 192),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 16, 16, 192),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 8, 8, 192),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 8, 8, 576),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 8, 8, 576),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 4, 4, 576),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 320),
(2, 4, 4, 1280)]
self._check_returns_correct_shape(
2, image_height, image_width, depth_multiplier,
expected_feature_map_shape, use_explicit_padding=True)
def test_returns_correct_shapes_with_dynamic_inputs(
self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
expected_feature_map_shape = [(2, 64, 64, 32),
(2, 64, 64, 96),
(2, 32, 32, 96),
(2, 32, 32, 24),
(2, 32, 32, 144),
(2, 32, 32, 144),
(2, 32, 32, 24),
(2, 32, 32, 144),
(2, 16, 16, 144),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 16, 16, 192),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 16, 16, 192),
(2, 16, 16, 32),
(2, 16, 16, 192),
(2, 8, 8, 192),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 64),
(2, 8, 8, 384),
(2, 8, 8, 384),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 8, 8, 576),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 8, 8, 576),
(2, 8, 8, 96),
(2, 8, 8, 576),
(2, 4, 4, 576),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 160),
(2, 4, 4, 960),
(2, 4, 4, 960),
(2, 4, 4, 320),
(2, 4, 4, 1280)]
self._check_returns_correct_shapes_with_dynamic_inputs(
2, image_height, image_width, depth_multiplier,
expected_feature_map_shape)
def test_returns_correct_shapes_299(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
expected_feature_map_shape = [(2, 150, 150, 32),
(2, 150, 150, 96),
(2, 75, 75, 96),
(2, 75, 75, 24),
(2, 75, 75, 144),
(2, 75, 75, 144),
(2, 75, 75, 24),
(2, 75, 75, 144),
(2, 38, 38, 144),
(2, 38, 38, 32),
(2, 38, 38, 192),
(2, 38, 38, 192),
(2, 38, 38, 32),
(2, 38, 38, 192),
(2, 38, 38, 192),
(2, 38, 38, 32),
(2, 38, 38, 192),
(2, 19, 19, 192),
(2, 19, 19, 64),
(2, 19, 19, 384),
(2, 19, 19, 384),
(2, 19, 19, 64),
(2, 19, 19, 384),
(2, 19, 19, 384),
(2, 19, 19, 64),
(2, 19, 19, 384),
(2, 19, 19, 384),
(2, 19, 19, 64),
(2, 19, 19, 384),
(2, 19, 19, 384),
(2, 19, 19, 96),
(2, 19, 19, 576),
(2, 19, 19, 576),
(2, 19, 19, 96),
(2, 19, 19, 576),
(2, 19, 19, 576),
(2, 19, 19, 96),
(2, 19, 19, 576),
(2, 10, 10, 576),
(2, 10, 10, 160),
(2, 10, 10, 960),
(2, 10, 10, 960),
(2, 10, 10, 160),
(2, 10, 10, 960),
(2, 10, 10, 960),
(2, 10, 10, 160),
(2, 10, 10, 960),
(2, 10, 10, 960),
(2, 10, 10, 320),
(2, 10, 10, 1280)]
self._check_returns_correct_shape(
2, image_height, image_width, depth_multiplier,
expected_feature_map_shape)
def test_returns_correct_shapes_enforcing_min_depth(
self):
image_height = 299
image_width = 299
depth_multiplier = 0.5**12
expected_feature_map_shape = [(2, 150, 150, 32),
(2, 150, 150, 192),
(2, 75, 75, 192),
(2, 75, 75, 32),
(2, 75, 75, 192),
(2, 75, 75, 192),
(2, 75, 75, 32),
(2, 75, 75, 192),
(2, 38, 38, 192),
(2, 38, 38, 32),
(2, 38, 38, 192),
(2, 38, 38, 192),
(2, 38, 38, 32),
(2, 38, 38, 192),
(2, 38, 38, 192),
(2, 38, 38, 32),
(2, 38, 38, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 19, 19, 192),
(2, 19, 19, 32),
(2, 19, 19, 192),
(2, 10, 10, 192),
(2, 10, 10, 32),
(2, 10, 10, 192),
(2, 10, 10, 192),
(2, 10, 10, 32),
(2, 10, 10, 192),
(2, 10, 10, 192),
(2, 10, 10, 32),
(2, 10, 10, 192),
(2, 10, 10, 192),
(2, 10, 10, 32),
(2, 10, 10, 32)]
self._check_returns_correct_shape(
2, image_height, image_width, depth_multiplier,
expected_feature_map_shape, min_depth=32)
def test_hyperparam_override(self):
hyperparams = self._build_conv_hyperparams()
model = mobilenet_v2.mobilenet_v2(
batchnorm_training=True,
conv_hyperparams=hyperparams,
weights=None,
use_explicit_padding=False,
alpha=1.0,
min_depth=32,
include_top=False)
hyperparams.params()
bn_layer = model.get_layer(name='block_5_project_BN')
self.assertAllClose(bn_layer.momentum, 0.2)
self.assertAllClose(bn_layer.epsilon, 0.1)
def test_variable_count(self):
depth_multiplier = 1
variables = self._get_variables(depth_multiplier)
self.assertEqual(len(variables), 260)
if __name__ == '__main__':
tf.test.main()
...@@ -21,18 +21,40 @@ import itertools ...@@ -21,18 +21,40 @@ import itertools
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case from object_detection.utils import test_case
class SsdFeatureExtractorTestBase(test_case.TestCase): class SsdFeatureExtractorTestBase(test_case.TestCase):
def _build_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
conv_hyperparams_text_proto = """
activation: RELU_6
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
batch_norm {
scale: false
}
"""
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def conv_hyperparams_fn(self): def conv_hyperparams_fn(self):
with tf.contrib.slim.arg_scope([]) as sc: with tf.contrib.slim.arg_scope([]) as sc:
return sc return sc
@abstractmethod @abstractmethod
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
use_explicit_padding=False): use_explicit_padding=False, use_keras=False):
"""Constructs a new feature extractor. """Constructs a new feature extractor.
Args: Args:
...@@ -42,20 +64,42 @@ class SsdFeatureExtractorTestBase(test_case.TestCase): ...@@ -42,20 +64,42 @@ class SsdFeatureExtractorTestBase(test_case.TestCase):
use_explicit_padding: use 'VALID' padding for convolutions, but prepad use_explicit_padding: use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding inputs so that the output dimensions are the same as if 'SAME' padding
were used. were used.
use_keras: if True builds a keras-based feature extractor, if False builds
a slim-based one.
Returns: Returns:
an ssd_meta_arch.SSDFeatureExtractor object. an ssd_meta_arch.SSDFeatureExtractor or an
ssd_meta_arch.SSDKerasFeatureExtractor object.
""" """
pass pass
def check_extract_features_returns_correct_shape( def _extract_features(self, image_tensor, depth_multiplier, pad_to_multiple,
self, batch_size, image_height, image_width, depth_multiplier, use_explicit_padding=False, use_keras=False):
pad_to_multiple, expected_feature_map_shapes, use_explicit_padding=False): try:
def graph_fn(image_tensor): feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple,
use_explicit_padding,
use_keras=use_keras)
# If the unit test does not support a use_keras arg, it raises an error:
except TypeError:
feature_extractor = self._create_feature_extractor(depth_multiplier, feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple, pad_to_multiple,
use_explicit_padding) use_explicit_padding)
if use_keras:
feature_maps = feature_extractor(image_tensor)
else:
feature_maps = feature_extractor.extract_features(image_tensor) feature_maps = feature_extractor.extract_features(image_tensor)
return feature_maps return feature_maps
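# The try/except TypeError above is a compatibility shim: subclasses whose
# _create_feature_extractor has not yet grown a use_keras argument fall back to
# the old three-argument signature, and the extractor is then invoked either as
# a Keras layer (feature_extractor(image_tensor)) or via the slim-style
# extract_features() call.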
def check_extract_features_returns_correct_shape(
self, batch_size, image_height, image_width, depth_multiplier,
pad_to_multiple, expected_feature_map_shapes, use_explicit_padding=False,
use_keras=False):
def graph_fn(image_tensor):
return self._extract_features(image_tensor,
depth_multiplier,
pad_to_multiple,
use_explicit_padding,
use_keras=use_keras)
image_tensor = np.random.rand(batch_size, image_height, image_width, image_tensor = np.random.rand(batch_size, image_height, image_width,
3).astype(np.float32) 3).astype(np.float32)
...@@ -66,15 +110,16 @@ class SsdFeatureExtractorTestBase(test_case.TestCase): ...@@ -66,15 +110,16 @@ class SsdFeatureExtractorTestBase(test_case.TestCase):
def check_extract_features_returns_correct_shapes_with_dynamic_inputs( def check_extract_features_returns_correct_shapes_with_dynamic_inputs(
self, batch_size, image_height, image_width, depth_multiplier, self, batch_size, image_height, image_width, depth_multiplier,
pad_to_multiple, expected_feature_map_shapes, use_explicit_padding=False): pad_to_multiple, expected_feature_map_shapes, use_explicit_padding=False,
use_keras=False):
def graph_fn(image_height, image_width): def graph_fn(image_height, image_width):
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple,
use_explicit_padding)
image_tensor = tf.random_uniform([batch_size, image_height, image_width, image_tensor = tf.random_uniform([batch_size, image_height, image_width,
3], dtype=tf.float32) 3], dtype=tf.float32)
feature_maps = feature_extractor.extract_features(image_tensor) return self._extract_features(image_tensor,
return feature_maps depth_multiplier,
pad_to_multiple,
use_explicit_padding,
use_keras=use_keras)
feature_maps = self.execute_cpu(graph_fn, [ feature_maps = self.execute_cpu(graph_fn, [
np.array(image_height, dtype=np.int32), np.array(image_height, dtype=np.int32),
...@@ -85,11 +130,13 @@ class SsdFeatureExtractorTestBase(test_case.TestCase): ...@@ -85,11 +130,13 @@ class SsdFeatureExtractorTestBase(test_case.TestCase):
self.assertAllEqual(feature_map.shape, expected_shape) self.assertAllEqual(feature_map.shape, expected_shape)
def check_extract_features_raises_error_with_invalid_image_size( def check_extract_features_raises_error_with_invalid_image_size(
self, image_height, image_width, depth_multiplier, pad_to_multiple): self, image_height, image_width, depth_multiplier, pad_to_multiple,
feature_extractor = self._create_feature_extractor(depth_multiplier, use_keras=False):
pad_to_multiple)
preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
feature_maps = feature_extractor.extract_features(preprocessed_inputs) feature_maps = self._extract_features(preprocessed_inputs,
depth_multiplier,
pad_to_multiple,
use_keras=use_keras)
test_preprocessed_image = np.random.rand(4, image_height, image_width, 3) test_preprocessed_image = np.random.rand(4, image_height, image_width, 3)
with self.test_session() as sess: with self.test_session() as sess:
sess.run(tf.global_variables_initializer()) sess.run(tf.global_variables_initializer())
...@@ -98,13 +145,19 @@ class SsdFeatureExtractorTestBase(test_case.TestCase): ...@@ -98,13 +145,19 @@ class SsdFeatureExtractorTestBase(test_case.TestCase):
feed_dict={preprocessed_inputs: test_preprocessed_image}) feed_dict={preprocessed_inputs: test_preprocessed_image})
def check_feature_extractor_variables_under_scope( def check_feature_extractor_variables_under_scope(
self, depth_multiplier, pad_to_multiple, scope_name): self, depth_multiplier, pad_to_multiple, scope_name, use_keras=False):
variables = self.get_feature_extractor_variables(
depth_multiplier, pad_to_multiple, use_keras)
for variable in variables:
self.assertTrue(variable.name.startswith(scope_name))
def get_feature_extractor_variables(
self, depth_multiplier, pad_to_multiple, use_keras=False):
g = tf.Graph() g = tf.Graph()
with g.as_default(): with g.as_default():
feature_extractor = self._create_feature_extractor(
depth_multiplier, pad_to_multiple)
preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
feature_extractor.extract_features(preprocessed_inputs) self._extract_features(preprocessed_inputs,
variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) depth_multiplier,
for variable in variables: pad_to_multiple,
self.assertTrue(variable.name.startswith(scope_name)) use_keras=use_keras)
return g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
"""SSD MobilenetV1 FPN Feature Extractor.""" """SSD MobilenetV1 FPN Feature Extractor."""
import copy
import functools
import tensorflow as tf import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch from object_detection.meta_architectures import ssd_meta_arch
...@@ -27,6 +29,15 @@ from nets import mobilenet_v1 ...@@ -27,6 +29,15 @@ from nets import mobilenet_v1
slim = tf.contrib.slim slim = tf.contrib.slim
# A modified config of mobilenet v1 that makes it more detection friendly.
def _create_modified_mobilenet_config():
conv_defs = copy.copy(mobilenet_v1.MOBILENETV1_CONV_DEFS)
conv_defs[-2] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512)
conv_defs[-1] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256)
return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
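# The stock MOBILENETV1_CONV_DEFS are assumed to end in two depthwise-separable
# blocks of depth 1024; the override above keeps their strides (2, then 1) but
# narrows them to 512 and 256 channels, which is cheaper for the FPN top-down
# path that consumes these endpoints when use_depthwise is enabled.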
class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""SSD Feature Extractor using MobilenetV1 FPN features.""" """SSD Feature Extractor using MobilenetV1 FPN features."""
...@@ -38,6 +49,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -38,6 +49,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
conv_hyperparams_fn, conv_hyperparams_fn,
fpn_min_level=3, fpn_min_level=3,
fpn_max_level=7, fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None, reuse_weights=None,
use_explicit_padding=False, use_explicit_padding=False,
use_depthwise=False, use_depthwise=False,
...@@ -63,6 +75,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -63,6 +75,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
maps in the backbone network, additional feature maps are created by maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of fpn applying stride 2 convolutions until we get the desired number of fpn
levels. levels.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: whether to reuse variables. Default is None. reuse_weights: whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. features. Default is False.
...@@ -84,6 +97,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -84,6 +97,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
override_base_feature_extractor_hyperparams) override_base_feature_extractor_hyperparams)
self._fpn_min_level = fpn_min_level self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth
def preprocess(self, resized_inputs): def preprocess(self, resized_inputs):
"""SSD preprocessing. """SSD preprocessing.
...@@ -127,6 +141,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -127,6 +141,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
final_endpoint='Conv2d_13_pointwise', final_endpoint='Conv2d_13_pointwise',
min_depth=self._min_depth, min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier, depth_multiplier=self._depth_multiplier,
conv_defs=_CONV_DEFS if self._use_depthwise else None,
use_explicit_padding=self._use_explicit_padding, use_explicit_padding=self._use_explicit_padding,
scope=scope) scope=scope)
...@@ -143,7 +158,8 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -143,7 +158,8 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
feature_block_list.append(feature_blocks[level - 2]) feature_block_list.append(feature_blocks[level - 2])
fpn_features = feature_map_generators.fpn_top_down_feature_maps( fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list], [(key, image_features[key]) for key in feature_block_list],
depth=depth_fn(256)) depth=depth_fn(self._additional_layer_depth),
use_depthwise=self._use_depthwise)
feature_maps = [] feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1): for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(fpn_features['top_down_{}'.format( feature_maps.append(fpn_features['top_down_{}'.format(
...@@ -152,9 +168,14 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -152,9 +168,14 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
feature_blocks[base_fpn_max_level - 2])] feature_blocks[base_fpn_max_level - 2])]
# Construct coarse features # Construct coarse features
for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1): for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
last_feature_map = slim.conv2d( if self._use_depthwise:
conv_op = functools.partial(
slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
last_feature_map = conv_op(
last_feature_map, last_feature_map,
num_outputs=depth_fn(256), num_outputs=depth_fn(self._additional_layer_depth),
kernel_size=[3, 3], kernel_size=[3, 3],
stride=2, stride=2,
padding='SAME', padding='SAME',
......
...@@ -14,20 +14,27 @@ ...@@ -14,20 +14,27 @@
# ============================================================================== # ==============================================================================
"""Tests for ssd_mobilenet_v2_feature_extractor.""" """Tests for ssd_mobilenet_v2_feature_extractor."""
from absl.testing import parameterized
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_mobilenet_v2_feature_extractor from object_detection.models import ssd_mobilenet_v2_feature_extractor
from object_detection.models import ssd_mobilenet_v2_keras_feature_extractor
slim = tf.contrib.slim slim = tf.contrib.slim
@parameterized.parameters(
{'use_keras': False},
{'use_keras': True},
)
class SsdMobilenetV2FeatureExtractorTest( class SsdMobilenetV2FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase): ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
use_explicit_padding=False): use_explicit_padding=False, use_keras=False):
"""Constructs a new feature extractor. """Constructs a new feature extractor.
Args: Args:
...@@ -37,19 +44,47 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -37,19 +44,47 @@ class SsdMobilenetV2FeatureExtractorTest(
use_explicit_padding: use 'VALID' padding for convolutions, but prepad use_explicit_padding: use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding inputs so that the output dimensions are the same as if 'SAME' padding
were used. were used.
use_keras: if True builds a keras-based feature extractor, if False builds
a slim-based one.
Returns: Returns:
an ssd_meta_arch.SSDFeatureExtractor object. an ssd_meta_arch.SSDFeatureExtractor object.
""" """
min_depth = 32 min_depth = 32
return ssd_mobilenet_v2_feature_extractor.SSDMobileNetV2FeatureExtractor( if use_keras:
False, return (ssd_mobilenet_v2_keras_feature_extractor.
depth_multiplier, SSDMobileNetV2KerasFeatureExtractor(
min_depth, is_training=False,
pad_to_multiple, depth_multiplier=depth_multiplier,
self.conv_hyperparams_fn, min_depth=min_depth,
use_explicit_padding=use_explicit_padding) pad_to_multiple=pad_to_multiple,
conv_hyperparams=self._build_conv_hyperparams(),
def test_extract_features_returns_correct_shapes_128(self): freeze_batchnorm=False,
inplace_batchnorm_update=False,
use_explicit_padding=use_explicit_padding,
name='MobilenetV2'))
else:
return ssd_mobilenet_v2_feature_extractor.SSDMobileNetV2FeatureExtractor(
False,
depth_multiplier,
min_depth,
pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding)
def test_extract_features_returns_correct_shapes_128(self, use_keras):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 8, 8, 576), (2, 4, 4, 1280),
(2, 2, 2, 512), (2, 1, 1, 256),
(2, 1, 1, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape, use_keras=use_keras)
def test_extract_features_returns_correct_shapes_128_explicit_padding(
self, use_keras):
image_height = 128 image_height = 128
image_width = 128 image_width = 128
depth_multiplier = 1.0 depth_multiplier = 1.0
...@@ -59,9 +94,11 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -59,9 +94,11 @@ class SsdMobilenetV2FeatureExtractorTest(
(2, 1, 1, 256), (2, 1, 1, 128)] (2, 1, 1, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape( self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple, 2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape) expected_feature_map_shape, use_explicit_padding=True,
use_keras=use_keras)
def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self): def test_extract_features_returns_correct_shapes_with_dynamic_inputs(
self, use_keras):
image_height = 128 image_height = 128
image_width = 128 image_width = 128
depth_multiplier = 1.0 depth_multiplier = 1.0
...@@ -71,9 +108,9 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -71,9 +108,9 @@ class SsdMobilenetV2FeatureExtractorTest(
(2, 1, 1, 256), (2, 1, 1, 128)] (2, 1, 1, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
2, image_height, image_width, depth_multiplier, pad_to_multiple, 2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape) expected_feature_map_shape, use_keras=use_keras)
def test_extract_features_returns_correct_shapes_299(self): def test_extract_features_returns_correct_shapes_299(self, use_keras):
image_height = 299 image_height = 299
image_width = 299 image_width = 299
depth_multiplier = 1.0 depth_multiplier = 1.0
...@@ -83,9 +120,10 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -83,9 +120,10 @@ class SsdMobilenetV2FeatureExtractorTest(
(2, 2, 2, 256), (2, 1, 1, 128)] (2, 2, 2, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape( self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple, 2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape) expected_feature_map_shape, use_keras=use_keras)
def test_extract_features_returns_correct_shapes_enforcing_min_depth(self): def test_extract_features_returns_correct_shapes_enforcing_min_depth(
self, use_keras):
image_height = 299 image_height = 299
image_width = 299 image_width = 299
depth_multiplier = 0.5**12 depth_multiplier = 0.5**12
...@@ -95,9 +133,10 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -95,9 +133,10 @@ class SsdMobilenetV2FeatureExtractorTest(
(2, 2, 2, 32), (2, 1, 1, 32)] (2, 2, 2, 32), (2, 1, 1, 32)]
self.check_extract_features_returns_correct_shape( self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple, 2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape) expected_feature_map_shape, use_keras=use_keras)
def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self): def test_extract_features_returns_correct_shapes_with_pad_to_multiple(
self, use_keras):
image_height = 299 image_height = 299
image_width = 299 image_width = 299
depth_multiplier = 1.0 depth_multiplier = 1.0
...@@ -107,35 +146,45 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -107,35 +146,45 @@ class SsdMobilenetV2FeatureExtractorTest(
(2, 2, 2, 256), (2, 1, 1, 128)] (2, 2, 2, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape( self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple, 2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape) expected_feature_map_shape, use_keras=use_keras)
def test_extract_features_raises_error_with_invalid_image_size(self): def test_extract_features_raises_error_with_invalid_image_size(
self, use_keras):
image_height = 32 image_height = 32
image_width = 32 image_width = 32
depth_multiplier = 1.0 depth_multiplier = 1.0
pad_to_multiple = 1 pad_to_multiple = 1
self.check_extract_features_raises_error_with_invalid_image_size( self.check_extract_features_raises_error_with_invalid_image_size(
image_height, image_width, depth_multiplier, pad_to_multiple) image_height, image_width, depth_multiplier, pad_to_multiple,
use_keras=use_keras)
def test_preprocess_returns_correct_value_range(self): def test_preprocess_returns_correct_value_range(self, use_keras):
image_height = 128 image_height = 128
image_width = 128 image_width = 128
depth_multiplier = 1 depth_multiplier = 1
pad_to_multiple = 1 pad_to_multiple = 1
test_image = np.random.rand(4, image_height, image_width, 3) test_image = np.random.rand(4, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier, feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple) pad_to_multiple,
use_keras=use_keras)
preprocessed_image = feature_extractor.preprocess(test_image) preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0))) self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self): def test_variables_only_created_in_scope(self, use_keras):
depth_multiplier = 1 depth_multiplier = 1
pad_to_multiple = 1 pad_to_multiple = 1
scope_name = 'MobilenetV2' scope_name = 'MobilenetV2'
self.check_feature_extractor_variables_under_scope( self.check_feature_extractor_variables_under_scope(
depth_multiplier, pad_to_multiple, scope_name) depth_multiplier, pad_to_multiple, scope_name, use_keras=use_keras)
def test_variable_count(self, use_keras):
depth_multiplier = 1
pad_to_multiple = 1
variables = self.get_feature_extractor_variables(
depth_multiplier, pad_to_multiple, use_keras=use_keras)
self.assertEqual(len(variables), 292)
def test_has_fused_batchnorm(self): def test_has_fused_batchnorm(self, use_keras):
image_height = 40 image_height = 40
image_width = 40 image_width = 40
depth_multiplier = 1 depth_multiplier = 1
...@@ -143,9 +192,13 @@ class SsdMobilenetV2FeatureExtractorTest( ...@@ -143,9 +192,13 @@ class SsdMobilenetV2FeatureExtractorTest(
image_placeholder = tf.placeholder(tf.float32, image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3]) [1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier, feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple) pad_to_multiple,
use_keras=use_keras)
preprocessed_image = feature_extractor.preprocess(image_placeholder) preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image) if use_keras:
_ = feature_extractor(preprocessed_image)
else:
_ = feature_extractor.extract_features(preprocessed_image)
self.assertTrue(any(op.type == 'FusedBatchNorm' self.assertTrue(any(op.type == 'FusedBatchNorm'
for op in tf.get_default_graph().get_operations())) for op in tf.get_default_graph().get_operations()))
......