Commit e00e0e13 authored by dreamdragon

Merge remote-tracking branch 'upstream/master'

parents b915db4e 402b561b
@@ -124,6 +124,8 @@ def transform_input_data(tensor_dict,
   if fields.InputDataFields.groundtruth_instance_masks in tensor_dict:
     masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
     _, resized_masks, _ = image_resizer_fn(image, masks)
+    if use_bfloat16:
+      resized_masks = tf.cast(resized_masks, tf.bfloat16)
     tensor_dict[fields.InputDataFields.
                 groundtruth_instance_masks] = resized_masks
@@ -161,6 +163,9 @@ def transform_input_data(tensor_dict,
     tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes
     tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
         merged_confidences)
+  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
+    tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
+        tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]
   return tensor_dict
@@ -282,12 +287,9 @@ def augment_input_data(tensor_dict, data_augmentation_options):
                                 in tensor_dict)
   include_keypoints = (fields.InputDataFields.groundtruth_keypoints
                        in tensor_dict)
-  include_label_scores = (fields.InputDataFields.groundtruth_confidences in
-                          tensor_dict)
   tensor_dict = preprocessor.preprocess(
       tensor_dict, data_augmentation_options,
       func_arg_map=preprocessor.get_default_func_arg_map(
-          include_label_scores=include_label_scores,
           include_instance_masks=include_instance_masks,
           include_keypoints=include_keypoints))
   tensor_dict[fields.InputDataFields.image] = tf.squeeze(
...
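The transform_input_data hunks above are bookkeeping changes: resized instance masks may be cast to bfloat16, and num_groundtruth_boxes is re-derived from the boxes tensor itself rather than carried over from before merging. A minimal sketch of that recomputation, with placeholder tensors standing in for the real tensor_dict entries (names here are illustrative only):

import tensorflow as tf

# Placeholders standing in for entries of the pipeline's tensor_dict.
groundtruth_boxes = tf.placeholder(tf.float32, shape=[None, 4])
resized_masks = tf.placeholder(tf.float32, shape=[None, 28, 28])
use_bfloat16 = True

# The box count is read from the tensor's leading dimension at graph time,
# so it stays correct after boxes are merged or filtered upstream.
num_groundtruth_boxes = tf.shape(groundtruth_boxes)[0]
if use_bfloat16:
  resized_masks = tf.cast(resized_masks, tf.bfloat16)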
@@ -630,6 +630,9 @@ class DataTransformationFnTest(test_case.TestCase):
     self.assertAllClose(
         transformed_inputs[fields.InputDataFields.groundtruth_confidences],
         [[1, 0, 1]])
+    self.assertAllClose(
+        transformed_inputs[fields.InputDataFields.num_groundtruth_boxes],
+        1)

   def test_returns_resized_masks(self):
     tensor_dict = {
...
@@ -160,6 +160,17 @@ class FakeDetectionModel(model.DetectionModel):
     }
     return loss_dict

+  def regularization_losses(self):
+    """Returns a list of regularization losses for this model.
+
+    Returns a list of regularization losses for this model that the estimator
+    needs to use during training/optimization.
+
+    Returns:
+      A list of regularization loss tensors.
+    """
+    pass
+
   def restore_map(self, fine_tune_checkpoint_type='detection'):
     """Returns a map of variables to load from a foreign checkpoint.
@@ -174,6 +185,18 @@ class FakeDetectionModel(model.DetectionModel):
     """
     return {var.op.name: var for var in tf.global_variables()}

+  def updates(self):
+    """Returns a list of update operators for this model.
+
+    Returns a list of update operators for this model that must be executed at
+    each training step. The estimator's train op needs to have a control
+    dependency on these updates.
+
+    Returns:
+      A list of update operators.
+    """
+    pass
+

 class TrainerTest(tf.test.TestCase):
...
@@ -662,7 +662,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
             anchors_boxlist, clip_window)
       else:
         anchors_boxlist = box_list_ops.clip_to_window(
-            anchors_boxlist, clip_window)
+            anchors_boxlist, clip_window,
+            filter_nonoverlapping=not self._use_static_shapes)
       self._anchors = anchors_boxlist

       prediction_dict = {
@@ -917,12 +918,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
         _, num_classes, mask_height, mask_width = (
             detection_masks.get_shape().as_list())
         _, max_detection = detection_classes.get_shape().as_list()
+        prediction_dict['mask_predictions'] = tf.reshape(
+            detection_masks, [-1, num_classes, mask_height, mask_width])
         if num_classes > 1:
           detection_masks = self._gather_instance_masks(
               detection_masks, detection_classes)

         prediction_dict[fields.DetectionResultFields.detection_masks] = (
-            tf.reshape(detection_masks,
+            tf.reshape(tf.sigmoid(detection_masks),
                        [batch_size, max_detection, mask_height, mask_width]))

     return prediction_dict
@@ -1159,9 +1162,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
       }

     # TODO(jrru): Remove mask_predictions from _post_process_box_classifier.
-    with tf.name_scope('SecondStagePostprocessor'):
-      if (self._number_of_stages == 2 or
-          (self._number_of_stages == 3 and self._is_training)):
+    if (self._number_of_stages == 2 or
+        (self._number_of_stages == 3 and self._is_training)):
+      with tf.name_scope('SecondStagePostprocessor'):
         mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
         detections_dict = self._postprocess_box_classifier(
             prediction_dict['refined_box_encodings'],
@@ -1170,18 +1173,53 @@ class FasterRCNNMetaArch(model.DetectionModel):
             prediction_dict['num_proposals'],
             true_image_shapes,
             mask_predictions=mask_predictions)
-        return detections_dict
+      if 'rpn_features_to_crop' in prediction_dict and self._initial_crop_size:
+        self._add_detection_features_output_node(
+            detections_dict[fields.DetectionResultFields.detection_boxes],
+            prediction_dict['rpn_features_to_crop'])
+      return detections_dict

     if self._number_of_stages == 3:
       # Post processing is already performed in 3rd stage. We need to transfer
       # postprocessed tensors from `prediction_dict` to `detections_dict`.
-      detections_dict = {}
-      for key in prediction_dict:
-        if key == fields.DetectionResultFields.detection_masks:
-          detections_dict[key] = tf.sigmoid(prediction_dict[key])
-        elif 'detection' in key:
-          detections_dict[key] = prediction_dict[key]
-      return detections_dict
+      return prediction_dict
+
+  def _add_detection_features_output_node(self, detection_boxes,
+                                          rpn_features_to_crop):
+    """Add the detection features to the output node.
+
+    The detection features are from cropping rpn_features with boxes.
+    Each bounding box has one feature vector of length depth, which comes from
+    mean_pooling of the cropped rpn_features.
+
+    Args:
+      detection_boxes: a 3-D float32 tensor of shape
+        [batch_size, max_detection, 4] which represents the bounding boxes.
+      rpn_features_to_crop: A 4-D float32 tensor with shape
+        [batch, height, width, depth] representing image features to crop using
+        the proposals boxes.
+    """
+    with tf.name_scope('SecondStageDetectionFeaturesExtract'):
+      flattened_detected_feature_maps = (
+          self._compute_second_stage_input_feature_maps(
+              rpn_features_to_crop, detection_boxes))
+      detection_features_unpooled = (
+          self._feature_extractor.extract_box_classifier_features(
+              flattened_detected_feature_maps,
+              scope=self.second_stage_feature_extractor_scope))
+
+      batch_size = tf.shape(detection_boxes)[0]
+      max_detection = tf.shape(detection_boxes)[1]
+      detection_features_pool = tf.reduce_mean(
+          detection_features_unpooled, axis=[1, 2])
+      detection_features = tf.reshape(
+          detection_features_pool,
+          [batch_size, max_detection, tf.shape(detection_features_pool)[-1]])
+
+      detection_features = tf.identity(
+          detection_features, 'detection_features')

   def _postprocess_rpn(self,
                        rpn_box_encodings_batch,
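The new _add_detection_features_output_node documented above crops the RPN feature map with the final detection boxes, runs the box-classifier feature extractor on the crops, and mean-pools each crop into one depth-length vector per box. A rough, self-contained sketch of the same crop-and-mean-pool idea, using tf.image.crop_and_resize as a stand-in for the model's internal crop op (the function name and crop size below are illustrative, not from this code):

import tensorflow as tf

def pooled_box_features(features, boxes, crop_size=7):
  """Mean-pools a cropped feature patch for every box.

  features: [batch, height, width, depth] float32 feature map.
  boxes: [batch, max_detection, 4] normalized box coordinates.
  """
  batch_size = tf.shape(boxes)[0]
  max_detection = tf.shape(boxes)[1]
  depth = features.get_shape().as_list()[-1]
  flat_boxes = tf.reshape(boxes, [-1, 4])
  # Each flattened box keeps a pointer to the image it came from.
  box_indices = tf.reshape(
      tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, max_detection]),
      [-1])
  crops = tf.image.crop_and_resize(features, flat_boxes, box_indices,
                                   [crop_size, crop_size])
  pooled = tf.reduce_mean(crops, axis=[1, 2])  # one vector per box
  return tf.reshape(pooled, [batch_size, max_detection, depth])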
@@ -1454,6 +1492,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
     # to cls_weights. This could happen as boxes within certain IOU ranges
     # are ignored. If triggered, the selected boxes will still be ignored
     # during loss computation.
+    cls_weights = tf.reduce_mean(cls_weights, axis=-1)
     positive_indicator = tf.greater(tf.argmax(cls_targets, axis=1), 0)
     valid_indicator = tf.logical_and(
         tf.range(proposal_boxlist.num_boxes()) < num_valid_proposals,
@@ -1566,6 +1605,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
       mask_predictions_batch = tf.reshape(
           mask_predictions, [-1, self.max_num_proposals,
                              self.num_classes, mask_height, mask_width])
+
     (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, _,
      num_detections) = self._second_stage_nms_fn(
          refined_decoded_boxes_batch,
@@ -1713,6 +1753,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
         gt_box_batch=groundtruth_boxlists,
         gt_class_targets_batch=(len(groundtruth_boxlists) * [None]),
         gt_weights_batch=groundtruth_weights_list)
+    batch_cls_weights = tf.reduce_mean(batch_cls_weights, axis=2)
     batch_cls_targets = tf.squeeze(batch_cls_targets, axis=2)

     def _minibatch_subsample_fn(inputs):
@@ -1743,7 +1784,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
           losses_mask=losses_mask)
       objectness_losses = self._first_stage_objectness_loss(
           rpn_objectness_predictions_with_background,
-          batch_one_hot_targets, weights=batch_sampled_indices,
+          batch_one_hot_targets,
+          weights=tf.expand_dims(batch_sampled_indices, axis=-1),
           losses_mask=losses_mask)
       localization_loss = tf.reduce_mean(
           tf.reduce_sum(localization_losses, axis=1) / normalizer)
@@ -1960,25 +2002,28 @@ class FasterRCNNMetaArch(model.DetectionModel):
             tf.expand_dims(flat_gt_masks, -1),
             tf.expand_dims(flat_normalized_proposals, axis=1),
             [mask_height, mask_width])
+        # Without stopping gradients into cropped groundtruth masks the
+        # performance with 100-padded groundtruth masks when batch size > 1 is
+        # about 4% worse.
+        # TODO(rathodv): Investigate this since we don't expect any variables
+        # upstream of flat_cropped_gt_mask.
+        flat_cropped_gt_mask = tf.stop_gradient(flat_cropped_gt_mask)
         batch_cropped_gt_mask = tf.reshape(
             flat_cropped_gt_mask,
             [batch_size, -1, mask_height * mask_width])

-        second_stage_mask_losses = ops.reduce_sum_trailing_dimensions(
-            self._second_stage_mask_loss(
-                reshaped_prediction_masks,
-                batch_cropped_gt_mask,
-                weights=batch_mask_target_weights,
-                losses_mask=losses_mask),
-            ndims=2) / (
-                mask_height * mask_width * tf.maximum(
-                    tf.reduce_sum(
-                        batch_mask_target_weights, axis=1, keep_dims=True
-                    ), tf.ones((batch_size, 1))))
-        second_stage_mask_loss = tf.reduce_sum(
-            tf.where(paddings_indicator, second_stage_mask_losses,
-                     tf.zeros_like(second_stage_mask_losses)))
+        mask_losses_weights = (
+            batch_mask_target_weights * tf.to_float(paddings_indicator))
+        mask_losses = self._second_stage_mask_loss(
+            reshaped_prediction_masks,
+            batch_cropped_gt_mask,
+            weights=tf.expand_dims(mask_losses_weights, axis=-1),
+            losses_mask=losses_mask)
+        total_mask_loss = tf.reduce_sum(mask_losses)
+        normalizer = tf.maximum(
+            tf.reduce_sum(mask_losses_weights * mask_height * mask_width), 1.0)
+        second_stage_mask_loss = total_mask_loss / normalizer

       if second_stage_mask_loss is not None:
         mask_loss = tf.multiply(self._second_stage_mask_loss_weight,
@@ -2073,6 +2118,17 @@ class FasterRCNNMetaArch(model.DetectionModel):
         cls_losses=tf.expand_dims(single_image_cls_loss, 0),
         decoded_boxlist_list=[proposal_boxlist])

+  def regularization_losses(self):
+    """Returns a list of regularization losses for this model.
+
+    Returns a list of regularization losses for this model that the estimator
+    needs to use during training/optimization.
+
+    Returns:
+      A list of regularization loss tensors.
+    """
+    return tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+
   def restore_map(self,
                   fine_tune_checkpoint_type='detection',
                   load_all_detection_checkpoint_vars=False):
@@ -2117,3 +2173,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
     feature_extractor_variables = tf.contrib.framework.filter_variables(
         variables_to_restore, include_patterns=include_patterns)
     return {var.op.name: var for var in feature_extractor_variables}
+
+  def updates(self):
+    """Returns a list of update operators for this model.
+
+    Returns a list of update operators for this model that must be executed at
+    each training step. The estimator's train op needs to have a control
+    dependency on these updates.
+
+    Returns:
+      A list of update operators.
+    """
+    return tf.get_collection(tf.GraphKeys.UPDATE_OPS)
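The regularization_losses() and updates() methods added above (and stubbed out on the fake model earlier in this commit) give the estimator-based training loop explicit hooks: regularization terms to fold into the total loss, and update ops (for example batch-norm moving averages) that the train op must depend on. A hedged sketch of how a caller could wire them up; model, total_loss and optimizer are assumed to exist, and this is not the repository's actual training code:

import tensorflow as tf

reg_losses = model.regularization_losses()
if reg_losses:
  total_loss = tf.add_n([total_loss] + list(reg_losses))

# Per the docstring, the train op takes a control dependency on the updates.
with tf.control_dependencies(model.updates()):
  train_op = optimizer.minimize(total_loss)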
@@ -189,7 +189,7 @@ class FasterRCNNMetaArchTest(
         set(expected_shapes.keys()).union(
             set([
                 'detection_boxes', 'detection_scores', 'detection_classes',
-                'detection_masks', 'num_detections'
+                'detection_masks', 'num_detections', 'mask_predictions',
             ])))
     for key in expected_shapes:
       self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
@@ -199,6 +199,9 @@ class FasterRCNNMetaArchTest(
     self.assertAllEqual(tensor_dict_out['detection_classes'].shape, [2, 5])
     self.assertAllEqual(tensor_dict_out['detection_scores'].shape, [2, 5])
     self.assertAllEqual(tensor_dict_out['num_detections'].shape, [2])
+    num_classes = 1 if masks_are_class_agnostic else 2
+    self.assertAllEqual(tensor_dict_out['mask_predictions'].shape,
+                        [10, num_classes, 14, 14])

   @parameterized.parameters(
       {'masks_are_class_agnostic': False},
...
@@ -250,6 +250,7 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
           iou_threshold: 1.0
           max_detections_per_class: 5
           max_total_detections: 5
+          use_static_shapes: """ +'{}'.format(use_static_shapes) + """
         }
     """
     post_processing_config = post_processing_pb2.PostProcessing()
@@ -336,61 +337,71 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
             masks_are_class_agnostic=masks_are_class_agnostic), **common_kwargs)

   def test_predict_gives_correct_shapes_in_inference_mode_first_stage_only(
-      self):
-    test_graph = tf.Graph()
-    with test_graph.as_default():
-      model = self._build_model(
-          is_training=False, number_of_stages=1, second_stage_batch_size=2)
-      batch_size = 2
-      height = 10
-      width = 12
-      input_image_shape = (batch_size, height, width, 3)
-      _, true_image_shapes = model.preprocess(tf.zeros(input_image_shape))
-      preprocessed_inputs = tf.placeholder(
-          dtype=tf.float32, shape=(batch_size, None, None, 3))
+      self, use_static_shapes=False):
+    batch_size = 2
+    height = 10
+    width = 12
+    input_image_shape = (batch_size, height, width, 3)
+
+    def graph_fn(images):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False,
+          number_of_stages=1,
+          second_stage_batch_size=2,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes)
+      preprocessed_inputs, true_image_shapes = model.preprocess(images)
       prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      return (prediction_dict['rpn_box_predictor_features'],
+              prediction_dict['rpn_features_to_crop'],
+              prediction_dict['image_shape'],
+              prediction_dict['rpn_box_encodings'],
+              prediction_dict['rpn_objectness_predictions_with_background'],
+              prediction_dict['anchors'])
+
+    images = np.zeros(input_image_shape, dtype=np.float32)
+
+    # In inference mode, anchors are clipped to the image window, but not
+    # pruned. Since MockFasterRCNN.extract_proposal_features returns a
+    # tensor with the same shape as its input, the expected number of anchors
+    # is height * width * the number of anchors per location (i.e. 3x3).
+    expected_num_anchors = height * width * 3 * 3
+    expected_output_shapes = {
+        'rpn_box_predictor_features': (batch_size, height, width, 512),
+        'rpn_features_to_crop': (batch_size, height, width, 3),
+        'rpn_box_encodings': (batch_size, expected_num_anchors, 4),
+        'rpn_objectness_predictions_with_background':
+        (batch_size, expected_num_anchors, 2),
+        'anchors': (expected_num_anchors, 4)
+    }

-      # In inference mode, anchors are clipped to the image window, but not
-      # pruned. Since MockFasterRCNN.extract_proposal_features returns a
-      # tensor with the same shape as its input, the expected number of anchors
-      # is height * width * the number of anchors per location (i.e. 3x3).
-      expected_num_anchors = height * width * 3 * 3
-      expected_output_keys = set([
-          'rpn_box_predictor_features', 'rpn_features_to_crop', 'image_shape',
-          'rpn_box_encodings', 'rpn_objectness_predictions_with_background',
-          'anchors'])
-      expected_output_shapes = {
-          'rpn_box_predictor_features': (batch_size, height, width, 512),
-          'rpn_features_to_crop': (batch_size, height, width, 3),
-          'rpn_box_encodings': (batch_size, expected_num_anchors, 4),
-          'rpn_objectness_predictions_with_background':
-          (batch_size, expected_num_anchors, 2),
-          'anchors': (expected_num_anchors, 4)
-      }
-      init_op = tf.global_variables_initializer()
-      with self.test_session(graph=test_graph) as sess:
-        sess.run(init_op)
-        prediction_out = sess.run(prediction_dict,
-                                  feed_dict={
-                                      preprocessed_inputs:
-                                      np.zeros(input_image_shape)
-                                  })
-        self.assertEqual(set(prediction_out.keys()), expected_output_keys)
-        self.assertAllEqual(prediction_out['image_shape'], input_image_shape)
-        for output_key, expected_shape in expected_output_shapes.items():
-          self.assertAllEqual(prediction_out[output_key].shape, expected_shape)
+    if use_static_shapes:
+      results = self.execute(graph_fn, [images])
+    else:
+      results = self.execute_cpu(graph_fn, [images])
+
+    self.assertAllEqual(results[0].shape,
+                        expected_output_shapes['rpn_box_predictor_features'])
+    self.assertAllEqual(results[1].shape,
+                        expected_output_shapes['rpn_features_to_crop'])
+    self.assertAllEqual(results[2],
+                        input_image_shape)
+    self.assertAllEqual(results[3].shape,
+                        expected_output_shapes['rpn_box_encodings'])
+    self.assertAllEqual(
+        results[4].shape,
+        expected_output_shapes['rpn_objectness_predictions_with_background'])
+    self.assertAllEqual(results[5].shape,
+                        expected_output_shapes['anchors'])

     # Check that anchors are clipped to window.
-    anchors = prediction_out['anchors']
+    anchors = results[5]
     self.assertTrue(np.all(np.greater_equal(anchors, 0)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 0], height)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 1], width)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 2], height)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 3], width)))

   def test_predict_gives_valid_anchors_in_training_mode_first_stage_only(self):
     test_graph = tf.Graph()
@@ -446,7 +457,38 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         prediction_out['rpn_objectness_predictions_with_background'].shape,
         (batch_size, num_anchors_out, 2))

-  def test_predict_correct_shapes_in_inference_mode_two_stages(self):
+  def test_predict_correct_shapes_in_inference_mode_two_stages(
+      self, use_static_shapes=False):
+
+    def compare_results(results, expected_output_shapes):
+      """Checks if the shape of the predictions are as expected."""
+      self.assertAllEqual(results[0].shape,
+                          expected_output_shapes['rpn_box_predictor_features'])
+      self.assertAllEqual(results[1].shape,
+                          expected_output_shapes['rpn_features_to_crop'])
+      self.assertAllEqual(results[2].shape,
+                          expected_output_shapes['image_shape'])
+      self.assertAllEqual(results[3].shape,
+                          expected_output_shapes['rpn_box_encodings'])
+      self.assertAllEqual(
+          results[4].shape,
+          expected_output_shapes['rpn_objectness_predictions_with_background'])
+      self.assertAllEqual(results[5].shape,
+                          expected_output_shapes['anchors'])
+      self.assertAllEqual(results[6].shape,
+                          expected_output_shapes['refined_box_encodings'])
+      self.assertAllEqual(
+          results[7].shape,
+          expected_output_shapes['class_predictions_with_background'])
+      self.assertAllEqual(results[8].shape,
+                          expected_output_shapes['num_proposals'])
+      self.assertAllEqual(results[9].shape,
+                          expected_output_shapes['proposal_boxes'])
+      self.assertAllEqual(results[10].shape,
+                          expected_output_shapes['proposal_boxes_normalized'])
+      self.assertAllEqual(results[11].shape,
+                          expected_output_shapes['box_classifier_features'])
+
     batch_size = 2
     image_size = 10
     max_num_proposals = 8
@@ -457,6 +499,32 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
                     (None, image_size, image_size, 3),
                     (batch_size, None, None, 3),
                     (None, None, None, 3)]
+
+    def graph_fn_tpu(images):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False,
+          number_of_stages=2,
+          second_stage_batch_size=2,
+          predict_masks=False,
+          use_matmul_crop_and_resize=use_static_shapes,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes)
+      preprocessed_inputs, true_image_shapes = model.preprocess(images)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      return (prediction_dict['rpn_box_predictor_features'],
+              prediction_dict['rpn_features_to_crop'],
+              prediction_dict['image_shape'],
+              prediction_dict['rpn_box_encodings'],
+              prediction_dict['rpn_objectness_predictions_with_background'],
+              prediction_dict['anchors'],
+              prediction_dict['refined_box_encodings'],
+              prediction_dict['class_predictions_with_background'],
+              prediction_dict['num_proposals'],
+              prediction_dict['proposal_boxes'],
+              prediction_dict['proposal_boxes_normalized'],
+              prediction_dict['box_classifier_features'])
+
     expected_num_anchors = image_size * image_size * 3 * 3
     expected_shapes = {
         'rpn_box_predictor_features':
@@ -481,28 +549,34 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
             3)
     }

-    for input_shape in input_shapes:
-      test_graph = tf.Graph()
-      with test_graph.as_default():
-        model = self._build_model(
-            is_training=False,
-            number_of_stages=2,
-            second_stage_batch_size=2,
-            predict_masks=False)
-        preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
-        _, true_image_shapes = model.preprocess(preprocessed_inputs)
-        result_tensor_dict = model.predict(
-            preprocessed_inputs, true_image_shapes)
-        init_op = tf.global_variables_initializer()
-      with self.test_session(graph=test_graph) as sess:
-        sess.run(init_op)
-        tensor_dict_out = sess.run(result_tensor_dict, feed_dict={
-            preprocessed_inputs:
-            np.zeros((batch_size, image_size, image_size, 3))})
-        self.assertEqual(set(tensor_dict_out.keys()),
-                         set(expected_shapes.keys()))
-        for key in expected_shapes:
-          self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
+    if use_static_shapes:
+      input_shape = (batch_size, image_size, image_size, 3)
+      images = np.zeros(input_shape, dtype=np.float32)
+      results = self.execute(graph_fn_tpu, [images])
+      compare_results(results, expected_shapes)
+    else:
+      for input_shape in input_shapes:
+        test_graph = tf.Graph()
+        with test_graph.as_default():
+          model = self._build_model(
+              is_training=False,
+              number_of_stages=2,
+              second_stage_batch_size=2,
+              predict_masks=False)
+          preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
+          _, true_image_shapes = model.preprocess(preprocessed_inputs)
+          result_tensor_dict = model.predict(
+              preprocessed_inputs, true_image_shapes)
+          init_op = tf.global_variables_initializer()
+        with self.test_session(graph=test_graph) as sess:
+          sess.run(init_op)
+          tensor_dict_out = sess.run(result_tensor_dict, feed_dict={
+              preprocessed_inputs:
+              np.zeros((batch_size, image_size, image_size, 3))})
+          self.assertEqual(set(tensor_dict_out.keys()),
+                           set(expected_shapes.keys()))
+          for key in expected_shapes:
+            self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])

   def test_predict_gives_correct_shapes_in_train_mode_both_stages(
       self,
@@ -596,23 +670,46 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
     self.assertAllEqual(results[8].shape,
                         expected_shapes['rpn_box_predictor_features'])

-  def _test_postprocess_first_stage_only_inference_mode(
-      self, pad_to_max_dimension=None):
-    model = self._build_model(
-        is_training=False, number_of_stages=1, second_stage_batch_size=6,
-        pad_to_max_dimension=pad_to_max_dimension)
+  def test_postprocess_first_stage_only_inference_mode(
+      self, use_static_shapes=False, pad_to_max_dimension=None):
     batch_size = 2
-    anchors = tf.constant(
+    first_stage_max_proposals = 4 if use_static_shapes else 8
+
+    def graph_fn(images,
+                 rpn_box_encodings,
+                 rpn_objectness_predictions_with_background,
+                 rpn_features_to_crop,
+                 anchors):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False, number_of_stages=1, second_stage_batch_size=6,
+          use_matmul_crop_and_resize=use_static_shapes,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes,
+          use_matmul_gather_in_matcher=use_static_shapes,
+          first_stage_max_proposals=first_stage_max_proposals,
+          pad_to_max_dimension=pad_to_max_dimension)
+      _, true_image_shapes = model.preprocess(images)
+      proposals = model.postprocess({
+          'rpn_box_encodings': rpn_box_encodings,
+          'rpn_objectness_predictions_with_background':
+          rpn_objectness_predictions_with_background,
+          'rpn_features_to_crop': rpn_features_to_crop,
+          'anchors': anchors}, true_image_shapes)
+      return (proposals['num_detections'],
+              proposals['detection_boxes'],
+              proposals['detection_scores'])
+
+    anchors = np.array(
         [[0, 0, 16, 16],
          [0, 16, 16, 32],
          [16, 0, 32, 16],
-         [16, 16, 32, 32]], dtype=tf.float32)
-    rpn_box_encodings = tf.zeros(
-        [batch_size, anchors.get_shape().as_list()[0],
-         BOX_CODE_SIZE], dtype=tf.float32)
+         [16, 16, 32, 32]], dtype=np.float32)
+    rpn_box_encodings = np.zeros(
+        (batch_size, anchors.shape[0], BOX_CODE_SIZE), dtype=np.float32)
     # use different numbers for the objectness category to break ties in
     # order of boxes returned by NMS
-    rpn_objectness_predictions_with_background = tf.constant([
+    rpn_objectness_predictions_with_background = np.array([
         [[-10, 13],
          [10, -10],
          [10, -11],
@@ -620,16 +717,22 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         [[10, -10],
          [-10, 13],
          [-10, 12],
-         [10, -11]]], dtype=tf.float32)
-    rpn_features_to_crop = tf.ones((batch_size, 8, 8, 10), dtype=tf.float32)
-    image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
-    _, true_image_shapes = model.preprocess(tf.zeros(image_shape))
-    proposals = model.postprocess({
-        'rpn_box_encodings': rpn_box_encodings,
-        'rpn_objectness_predictions_with_background':
-        rpn_objectness_predictions_with_background,
-        'rpn_features_to_crop': rpn_features_to_crop,
-        'anchors': anchors}, true_image_shapes)
+         [10, -11]]], dtype=np.float32)
+    rpn_features_to_crop = np.ones((batch_size, 8, 8, 10), dtype=np.float32)
+    image_shape = (batch_size, 32, 32, 3)
+    images = np.zeros(image_shape, dtype=np.float32)
+
+    if use_static_shapes:
+      results = self.execute(graph_fn,
+                             [images, rpn_box_encodings,
+                              rpn_objectness_predictions_with_background,
+                              rpn_features_to_crop, anchors])
+    else:
+      results = self.execute_cpu(graph_fn,
+                                 [images, rpn_box_encodings,
+                                  rpn_objectness_predictions_with_background,
+                                  rpn_features_to_crop, anchors])
     expected_proposal_boxes = [
         [[0, 0, .5, .5], [.5, .5, 1, 1], [0, .5, .5, 1], [.5, 0, 1.0, .5]]
         + 4 * [4 * [0]],
@@ -639,24 +742,12 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         [1, 1, 0, 0, 0, 0, 0, 0]]
     expected_num_proposals = [4, 4]

-    expected_output_keys = set(['detection_boxes', 'detection_scores',
-                                'num_detections'])
-    self.assertEqual(set(proposals.keys()), expected_output_keys)
-    with self.test_session() as sess:
-      proposals_out = sess.run(proposals)
-      self.assertAllClose(proposals_out['detection_boxes'],
-                          expected_proposal_boxes)
-      self.assertAllClose(proposals_out['detection_scores'],
-                          expected_proposal_scores)
-      self.assertAllEqual(proposals_out['num_detections'],
-                          expected_num_proposals)
-
-  def test_postprocess_first_stage_only_inference_mode(self):
-    self._test_postprocess_first_stage_only_inference_mode()
-
-  def test_postprocess_first_stage_only_inference_mode_padded_image(self):
-    self._test_postprocess_first_stage_only_inference_mode(
-        pad_to_max_dimension=56)
+    self.assertAllClose(results[0], expected_num_proposals)
+    for indx, num_proposals in enumerate(expected_num_proposals):
+      self.assertAllClose(results[1][indx][0:num_proposals],
+                          expected_proposal_boxes[indx][0:num_proposals])
+      self.assertAllClose(results[2][indx][0:num_proposals],
+                          expected_proposal_scores[indx][0:num_proposals])

   def _test_postprocess_first_stage_only_train_mode(self,
                                                     pad_to_max_dimension=None):
@@ -733,83 +824,80 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
   def test_postprocess_first_stage_only_train_mode_padded_image(self):
     self._test_postprocess_first_stage_only_train_mode(pad_to_max_dimension=56)

-  def _test_postprocess_second_stage_only_inference_mode(
-      self, pad_to_max_dimension=None):
-    num_proposals_shapes = [(2), (None,)]
-    refined_box_encodings_shapes = [(16, 2, 4), (None, 2, 4)]
-    class_predictions_with_background_shapes = [(16, 3), (None, 3)]
-    proposal_boxes_shapes = [(2, 8, 4), (None, 8, 4)]
+  def test_postprocess_second_stage_only_inference_mode(
+      self, use_static_shapes=False, pad_to_max_dimension=None):
     batch_size = 2
+    num_classes = 2
     image_shape = np.array((2, 36, 48, 3), dtype=np.int32)
-    for (num_proposals_shape, refined_box_encoding_shape,
-         class_predictions_with_background_shape,
-         proposal_boxes_shape) in zip(num_proposals_shapes,
-                                      refined_box_encodings_shapes,
-                                      class_predictions_with_background_shapes,
-                                      proposal_boxes_shapes):
-      tf_graph = tf.Graph()
-      with tf_graph.as_default():
-        model = self._build_model(
-            is_training=False, number_of_stages=2,
-            second_stage_batch_size=6,
-            pad_to_max_dimension=pad_to_max_dimension)
-        _, true_image_shapes = model.preprocess(tf.zeros(image_shape))
-        total_num_padded_proposals = batch_size * model.max_num_proposals
-        proposal_boxes = np.array(
-            [[[1, 1, 2, 3],
-              [0, 0, 1, 1],
-              [.5, .5, .6, .6],
-              4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
-             [[2, 3, 6, 8],
-              [1, 2, 5, 3],
-              4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]])
-        num_proposals = np.array([3, 2], dtype=np.int32)
-        refined_box_encodings = np.zeros(
-            [total_num_padded_proposals, model.num_classes, 4])
-        class_predictions_with_background = np.ones(
-            [total_num_padded_proposals, model.num_classes+1])
-
-        num_proposals_placeholder = tf.placeholder(tf.int32,
-                                                   shape=num_proposals_shape)
-        refined_box_encodings_placeholder = tf.placeholder(
-            tf.float32, shape=refined_box_encoding_shape)
-        class_predictions_with_background_placeholder = tf.placeholder(
-            tf.float32, shape=class_predictions_with_background_shape)
-        proposal_boxes_placeholder = tf.placeholder(
-            tf.float32, shape=proposal_boxes_shape)
-        image_shape_placeholder = tf.placeholder(tf.int32, shape=(4))
-
-        detections = model.postprocess({
-            'refined_box_encodings': refined_box_encodings_placeholder,
-            'class_predictions_with_background':
-            class_predictions_with_background_placeholder,
-            'num_proposals': num_proposals_placeholder,
-            'proposal_boxes': proposal_boxes_placeholder,
-        }, true_image_shapes)
-      with self.test_session(graph=tf_graph) as sess:
-        detections_out = sess.run(
-            detections,
-            feed_dict={
-                refined_box_encodings_placeholder: refined_box_encodings,
-                class_predictions_with_background_placeholder:
-                class_predictions_with_background,
-                num_proposals_placeholder: num_proposals,
-                proposal_boxes_placeholder: proposal_boxes,
-                image_shape_placeholder: image_shape
-            })
-        self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4])
-        self.assertAllClose(detections_out['detection_scores'],
-                            [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
-        self.assertAllClose(detections_out['detection_classes'],
-                            [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]])
-        self.assertAllClose(detections_out['num_detections'], [5, 4])
-
-  def test_postprocess_second_stage_only_inference_mode(self):
-    self._test_postprocess_second_stage_only_inference_mode()
-
-  def test_postprocess_second_stage_only_inference_mode_padded_image(self):
-    self._test_postprocess_second_stage_only_inference_mode(
-        pad_to_max_dimension=56)
+    first_stage_max_proposals = 8
+    total_num_padded_proposals = batch_size * first_stage_max_proposals
+
+    def graph_fn(images,
+                 refined_box_encodings,
+                 class_predictions_with_background,
+                 num_proposals,
+                 proposal_boxes):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False, number_of_stages=2,
+          second_stage_batch_size=6,
+          use_matmul_crop_and_resize=use_static_shapes,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes,
+          use_matmul_gather_in_matcher=use_static_shapes,
+          pad_to_max_dimension=pad_to_max_dimension)
+      _, true_image_shapes = model.preprocess(images)
+      detections = model.postprocess({
+          'refined_box_encodings': refined_box_encodings,
+          'class_predictions_with_background':
+          class_predictions_with_background,
+          'num_proposals': num_proposals,
+          'proposal_boxes': proposal_boxes,
+      }, true_image_shapes)
+      return (detections['num_detections'],
+              detections['detection_boxes'],
+              detections['detection_scores'],
+              detections['detection_classes'])
+
+    proposal_boxes = np.array(
+        [[[1, 1, 2, 3],
+          [0, 0, 1, 1],
+          [.5, .5, .6, .6],
+          4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
+         [[2, 3, 6, 8],
+          [1, 2, 5, 3],
+          4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]], dtype=np.float32)
+    num_proposals = np.array([3, 2], dtype=np.int32)
+    refined_box_encodings = np.zeros(
+        [total_num_padded_proposals, num_classes, 4], dtype=np.float32)
+    class_predictions_with_background = np.ones(
+        [total_num_padded_proposals, num_classes+1], dtype=np.float32)
+    images = np.zeros(image_shape, dtype=np.float32)
+
+    if use_static_shapes:
+      results = self.execute(graph_fn,
+                             [images, refined_box_encodings,
+                              class_predictions_with_background,
+                              num_proposals, proposal_boxes])
+    else:
+      results = self.execute_cpu(graph_fn,
+                                 [images, refined_box_encodings,
+                                  class_predictions_with_background,
+                                  num_proposals, proposal_boxes])
+    expected_num_detections = [5, 4]
+    expected_detection_classes = [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]]
+    expected_detection_scores = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]
+
+    self.assertAllClose(results[0], expected_num_detections)
+
+    for indx, num_proposals in enumerate(expected_num_detections):
+      self.assertAllClose(results[2][indx][0:num_proposals],
+                          expected_detection_scores[indx][0:num_proposals])
+      self.assertAllClose(results[3][indx][0:num_proposals],
+                          expected_detection_classes[indx][0:num_proposals])
+
+    if not use_static_shapes:
+      self.assertAllEqual(results[1].shape, [2, 5, 4])

   def test_preprocess_preserves_input_shapes(self):
     image_shapes = [(3, None, None, 3),
...
@@ -19,7 +19,6 @@ models.
 """
 from abc import abstractmethod

-import re
 import tensorflow as tf

 from object_detection.core import box_list
@@ -116,6 +115,25 @@ class SSDFeatureExtractor(object):
     """
     raise NotImplementedError

+  def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
+    """Returns a map of variables to load from a foreign checkpoint.
+
+    Args:
+      feature_extractor_scope: A scope name for the feature extractor.
+
+    Returns:
+      A dict mapping variable names (to load from a checkpoint) to variables in
+      the model graph.
+    """
+    variables_to_restore = {}
+    for variable in tf.global_variables():
+      var_name = variable.op.name
+      if var_name.startswith(feature_extractor_scope + '/'):
+        var_name = var_name.replace(feature_extractor_scope + '/', '')
+        variables_to_restore[var_name] = variable
+    return variables_to_restore
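The helper above (added identically to the Keras feature extractor in the next hunk) builds a name-to-variable map with the feature extractor scope stripped, so variables line up with the names in a plain classification checkpoint. A hedged usage sketch; the feature_extractor instance, scope name and checkpoint path are assumed:

import tensorflow as tf

var_map = feature_extractor.restore_from_classification_checkpoint_fn(
    'FeatureExtractor')
saver = tf.train.Saver(var_list=var_map)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  saver.restore(sess, '/path/to/classification/model.ckpt')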

 class SSDKerasFeatureExtractor(tf.keras.Model):
   """SSD Feature Extractor definition."""
@@ -218,6 +236,25 @@ class SSDKerasFeatureExtractor(tf.keras.Model):
   def call(self, inputs, **kwargs):
     return self._extract_features(inputs)

+  def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
+    """Returns a map of variables to load from a foreign checkpoint.
+
+    Args:
+      feature_extractor_scope: A scope name for the feature extractor.
+
+    Returns:
+      A dict mapping variable names (to load from a checkpoint) to variables in
+      the model graph.
+    """
+    variables_to_restore = {}
+    for variable in tf.global_variables():
+      var_name = variable.op.name
+      if var_name.startswith(feature_extractor_scope + '/'):
+        var_name = var_name.replace(feature_extractor_scope + '/', '')
+        variables_to_restore[var_name] = variable
+    return variables_to_restore
+

 class SSDMetaArch(model.DetectionModel):
   """SSD Meta-architecture definition."""
@@ -333,13 +370,15 @@ class SSDMetaArch(model.DetectionModel):
       # Slim feature extractors get an explicit naming scope
       self._extract_features_scope = 'FeatureExtractor'

-    # TODO(jonathanhuang): handle agnostic mode
-    # weights
-    self._unmatched_class_label = tf.constant([1] + self.num_classes * [0],
-                                              tf.float32)
-    if encode_background_as_zeros:
+    if self._add_background_class and encode_background_as_zeros:
       self._unmatched_class_label = tf.constant((self.num_classes + 1) * [0],
                                                 tf.float32)
+    elif self._add_background_class:
+      self._unmatched_class_label = tf.constant([1] + self.num_classes * [0],
+                                                tf.float32)
+    else:
+      self._unmatched_class_label = tf.constant(self.num_classes * [0],
+                                                tf.float32)

     self._target_assigner = target_assigner_instance
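For reference, the three branches above select the following unmatched-class labels; a small illustration with a hypothetical num_classes of 3:

import tensorflow as tf

num_classes = 3  # hypothetical value, for illustration only
# encode_background_as_zeros: background is the all-zeros row.
bg_as_zeros = tf.constant((num_classes + 1) * [0], tf.float32)   # [0, 0, 0, 0]
# explicit background class: one-hot on the background column.
explicit_bg = tf.constant([1] + num_classes * [0], tf.float32)   # [1, 0, 0, 0]
# no background class at all.
no_bg = tf.constant(num_classes * [0], tf.float32)               # [0, 0, 0]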
@@ -606,14 +645,22 @@ class SSDMetaArch(model.DetectionModel):
       detection_boxes = tf.identity(detection_boxes, 'raw_box_locations')
       detection_boxes = tf.expand_dims(detection_boxes, axis=2)

-      detection_scores_with_background = self._score_conversion_fn(
-          class_predictions)
-      detection_scores_with_background = tf.identity(
-          detection_scores_with_background, 'raw_box_scores')
-      detection_scores = tf.slice(detection_scores_with_background, [0, 0, 1],
-                                  [-1, -1, -1])
+      detection_scores = self._score_conversion_fn(class_predictions)
+      detection_scores = tf.identity(detection_scores, 'raw_box_scores')
+      if self._add_background_class:
+        detection_scores = tf.slice(detection_scores, [0, 0, 1], [-1, -1, -1])

       additional_fields = None

+      batch_size = (
+          shape_utils.combined_static_and_dynamic_shape(preprocessed_images)[0])
+
+      if 'feature_maps' in prediction_dict:
+        feature_map_list = []
+        for feature_map in prediction_dict['feature_maps']:
+          feature_map_list.append(tf.reshape(feature_map, [batch_size, -1]))
+        box_features = tf.concat(feature_map_list, 1)
+        box_features = tf.identity(box_features, 'raw_box_features')
+
       if detection_keypoints is not None:
         additional_fields = {
             fields.BoxListFields.keypoints: detection_keypoints}
@@ -683,17 +730,20 @@ class SSDMetaArch(model.DetectionModel):
           self.groundtruth_lists(fields.BoxListFields.boxes), match_list)

       if self._random_example_sampler:
+        batch_cls_per_anchor_weights = tf.reduce_mean(
+            batch_cls_weights, axis=-1)
         batch_sampled_indicator = tf.to_float(
             shape_utils.static_or_dynamic_map_fn(
                 self._minibatch_subsample_fn,
-                [batch_cls_targets, batch_cls_weights],
+                [batch_cls_targets, batch_cls_per_anchor_weights],
                 dtype=tf.bool,
                 parallel_iterations=self._parallel_iterations,
                 back_prop=True))
         batch_reg_weights = tf.multiply(batch_sampled_indicator,
                                         batch_reg_weights)
-        batch_cls_weights = tf.multiply(batch_sampled_indicator,
-                                        batch_cls_weights)
+        batch_cls_weights = tf.multiply(
+            tf.expand_dims(batch_sampled_indicator, -1),
+            batch_cls_weights)

       losses_mask = None
       if self.groundtruth_has_field(fields.InputDataFields.is_annotated):
@@ -713,16 +763,32 @@ class SSDMetaArch(model.DetectionModel):
             losses_mask=losses_mask)

       if self._expected_classification_loss_under_sampling:
+        # Need to compute losses for assigned targets against the
+        # unmatched_class_label as well as their assigned targets.
+        # simplest thing (but wasteful) is just to calculate all losses
+        # twice
+        batch_size, num_anchors, num_classes = batch_cls_targets.get_shape()
+        unmatched_targets = tf.ones([batch_size, num_anchors, 1
+                                    ]) * self._unmatched_class_label
+
+        unmatched_cls_losses = self._classification_loss(
+            prediction_dict['class_predictions_with_background'],
+            unmatched_targets,
+            weights=batch_cls_weights,
+            losses_mask=losses_mask)
+
         if cls_losses.get_shape().ndims == 3:
           batch_size, num_anchors, num_classes = cls_losses.get_shape()
           cls_losses = tf.reshape(cls_losses, [batch_size, -1])
+          unmatched_cls_losses = tf.reshape(unmatched_cls_losses,
+                                            [batch_size, -1])
           batch_cls_targets = tf.reshape(
               batch_cls_targets, [batch_size, num_anchors * num_classes, -1])
           batch_cls_targets = tf.concat(
               [1 - batch_cls_targets, batch_cls_targets], axis=-1)

         cls_losses = self._expected_classification_loss_under_sampling(
-            batch_cls_targets, cls_losses)
+            batch_cls_targets, cls_losses, unmatched_cls_losses)

       classification_loss = tf.reduce_sum(cls_losses)
       localization_loss = tf.reduce_sum(location_losses)
@@ -971,6 +1037,26 @@ class SSDMetaArch(model.DetectionModel):
             [combined_shape[0], combined_shape[1], 4]))
     return decoded_boxes, decoded_keypoints

+  def regularization_losses(self):
+    """Returns a list of regularization losses for this model.
+
+    Returns a list of regularization losses for this model that the estimator
+    needs to use during training/optimization.
+
+    Returns:
+      A list of regularization loss tensors.
+    """
+    losses = []
+    slim_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+    # Copy the slim losses to avoid modifying the collection
+    if slim_losses:
+      losses.extend(slim_losses)
+    if self._box_predictor.is_keras_model:
+      losses.extend(self._box_predictor.losses)
+    if self._feature_extractor.is_keras_model:
+      losses.extend(self._feature_extractor.losses)
+    return losses
+
   def restore_map(self,
                   fine_tune_checkpoint_type='detection',
                   load_all_detection_checkpoint_vars=False):
@@ -997,18 +1083,44 @@ class SSDMetaArch(model.DetectionModel):
     if fine_tune_checkpoint_type not in ['detection', 'classification']:
       raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
           fine_tune_checkpoint_type))
-    variables_to_restore = {}
-    for variable in tf.global_variables():
-      var_name = variable.op.name
-      if (fine_tune_checkpoint_type == 'detection' and
-          load_all_detection_checkpoint_vars):
-        variables_to_restore[var_name] = variable
-      else:
-        if var_name.startswith(self._extract_features_scope):
-          if fine_tune_checkpoint_type == 'classification':
-            var_name = (
-                re.split('^' + self._extract_features_scope + '/',
-                         var_name)[-1])
-          variables_to_restore[var_name] = variable
-    return variables_to_restore
+
+    if fine_tune_checkpoint_type == 'classification':
+      return self._feature_extractor.restore_from_classification_checkpoint_fn(
+          self._extract_features_scope)
+
+    if fine_tune_checkpoint_type == 'detection':
+      variables_to_restore = {}
+      for variable in tf.global_variables():
+        var_name = variable.op.name
+        if load_all_detection_checkpoint_vars:
+          variables_to_restore[var_name] = variable
+        else:
+          if var_name.startswith(self._extract_features_scope):
+            variables_to_restore[var_name] = variable
+      return variables_to_restore
+
+  def updates(self):
+    """Returns a list of update operators for this model.
+
+    Returns a list of update operators for this model that must be executed at
+    each training step. The estimator's train op needs to have a control
+    dependency on these updates.
+
+    Returns:
+      A list of update operators.
+    """
+    update_ops = []
+    slim_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+    # Copy the slim ops to avoid modifying the collection
+    if slim_update_ops:
+      update_ops.extend(slim_update_ops)
+    if self._box_predictor.is_keras_model:
+      update_ops.extend(self._box_predictor.get_updates_for(None))
+      update_ops.extend(self._box_predictor.get_updates_for(
+          self._box_predictor.inputs))
+    if self._feature_extractor.is_keras_model:
+      update_ops.extend(self._feature_extractor.get_updates_for(None))
+      update_ops.extend(self._feature_extractor.get_updates_for(
+          self._feature_extractor.inputs))
+    return update_ops
@@ -42,7 +42,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
       random_example_sampling=False,
       weight_regression_loss_by_score=False,
       use_expected_classification_loss_under_sampling=False,
-      minimum_negative_sampling=1,
+      min_num_negative_samples=1,
       desired_negative_sampling_ratio=3,
       use_keras=False,
       predict_mask=False,
@@ -57,7 +57,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
         weight_regression_loss_by_score=weight_regression_loss_by_score,
         use_expected_classification_loss_under_sampling=
         use_expected_classification_loss_under_sampling,
-        minimum_negative_sampling=minimum_negative_sampling,
+        min_num_negative_samples=min_num_negative_samples,
         desired_negative_sampling_ratio=desired_negative_sampling_ratio,
         use_keras=use_keras,
         predict_mask=predict_mask,
@@ -344,11 +344,11 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
     preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
     groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
     groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
-    groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32)
-    groundtruth_classes2 = np.array([[0, 1]], dtype=np.float32)
+    groundtruth_classes1 = np.array([[1]], dtype=np.float32)
+    groundtruth_classes2 = np.array([[1]], dtype=np.float32)
     expected_localization_loss = 0.0
     expected_classification_loss = (
-        batch_size * num_anchors * (num_classes + 1) * np.log(2.0))
+        batch_size * num_anchors * num_classes * np.log(2.0))
     (localization_loss, classification_loss) = self.execute(
         graph_fn, [
             preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
@@ -371,7 +371,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
         apply_hard_mining=False,
         add_background_class=True,
         use_expected_classification_loss_under_sampling=True,
-        minimum_negative_sampling=1,
+        min_num_negative_samples=1,
         desired_negative_sampling_ratio=desired_negative_sampling_ratio)
     model.provide_groundtruth(groundtruth_boxes_list,
                               groundtruth_classes_list)
@@ -391,8 +391,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
     expected_localization_loss = 0.0
     expected_classification_loss = (
-        batch_size * (desired_negative_sampling_ratio * num_anchors +
-                      num_classes * num_anchors) * np.log(2.0))
+        batch_size * (num_anchors + num_classes * num_anchors) * np.log(2.0))
     (localization_loss, classification_loss) = self.execute(
         graph_fn, [
             preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
...@@ -432,11 +431,11 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase, ...@@ -432,11 +431,11 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
groundtruth_boxes1 = np.array([[0, 0, 1, 1]], dtype=np.float32) groundtruth_boxes1 = np.array([[0, 0, 1, 1]], dtype=np.float32)
groundtruth_boxes2 = np.array([[0, 0, 1, 1]], dtype=np.float32) groundtruth_boxes2 = np.array([[0, 0, 1, 1]], dtype=np.float32)
groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32) groundtruth_classes1 = np.array([[1]], dtype=np.float32)
groundtruth_classes2 = np.array([[1, 0]], dtype=np.float32) groundtruth_classes2 = np.array([[0]], dtype=np.float32)
expected_localization_loss = 0.25 expected_localization_loss = 0.25
expected_classification_loss = ( expected_classification_loss = (
batch_size * num_anchors * (num_classes + 1) * np.log(2.0)) batch_size * num_anchors * num_classes * np.log(2.0))
(localization_loss, classification_loss) = self.execute( (localization_loss, classification_loss) = self.execute(
graph_fn, [ graph_fn, [
preprocessed_input, groundtruth_boxes1, groundtruth_boxes2, preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
......
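Note on the expected-loss constants in the hunks above: they assume the mock box predictor emits all-zero logits, so every per-anchor, per-class sigmoid cross-entropy term equals log 2. A minimal numeric sanity check of that arithmetic follows; the 2/4/1 values are illustrative placeholders, not the test's actual configuration.

import numpy as np

batch_size, num_anchors, num_classes = 2, 4, 1  # illustrative only
per_term = -np.log(1.0 / (1.0 + np.exp(0.0)))   # sigmoid cross-entropy at logit 0 == log(2)
expected_classification_loss = batch_size * num_anchors * num_classes * per_term
assert np.isclose(expected_classification_loss,
                  batch_size * num_anchors * num_classes * np.log(2.0))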
...@@ -119,7 +119,7 @@ class SSDMetaArchTestBase(test_case.TestCase): ...@@ -119,7 +119,7 @@ class SSDMetaArchTestBase(test_case.TestCase):
random_example_sampling=False, random_example_sampling=False,
weight_regression_loss_by_score=False, weight_regression_loss_by_score=False,
use_expected_classification_loss_under_sampling=False, use_expected_classification_loss_under_sampling=False,
minimum_negative_sampling=1, min_num_negative_samples=1,
desired_negative_sampling_ratio=3, desired_negative_sampling_ratio=3,
use_keras=False, use_keras=False,
predict_mask=False, predict_mask=False,
...@@ -130,10 +130,12 @@ class SSDMetaArchTestBase(test_case.TestCase): ...@@ -130,10 +130,12 @@ class SSDMetaArchTestBase(test_case.TestCase):
mock_anchor_generator = MockAnchorGenerator2x2() mock_anchor_generator = MockAnchorGenerator2x2()
if use_keras: if use_keras:
mock_box_predictor = test_utils.MockKerasBoxPredictor( mock_box_predictor = test_utils.MockKerasBoxPredictor(
is_training, num_classes, predict_mask=predict_mask) is_training, num_classes, add_background_class=add_background_class,
predict_mask=predict_mask)
else: else:
mock_box_predictor = test_utils.MockBoxPredictor( mock_box_predictor = test_utils.MockBoxPredictor(
is_training, num_classes, predict_mask=predict_mask) is_training, num_classes, add_background_class=add_background_class,
predict_mask=predict_mask)
mock_box_coder = test_utils.MockBoxCoder() mock_box_coder = test_utils.MockBoxCoder()
if use_keras: if use_keras:
fake_feature_extractor = FakeSSDKerasFeatureExtractor() fake_feature_extractor = FakeSSDKerasFeatureExtractor()
...@@ -182,7 +184,7 @@ class SSDMetaArchTestBase(test_case.TestCase): ...@@ -182,7 +184,7 @@ class SSDMetaArchTestBase(test_case.TestCase):
if use_expected_classification_loss_under_sampling: if use_expected_classification_loss_under_sampling:
expected_classification_loss_under_sampling = functools.partial( expected_classification_loss_under_sampling = functools.partial(
ops.expected_classification_loss_under_sampling, ops.expected_classification_loss_under_sampling,
minimum_negative_sampling=minimum_negative_sampling, min_num_negative_samples=min_num_negative_samples,
desired_negative_sampling_ratio=desired_negative_sampling_ratio) desired_negative_sampling_ratio=desired_negative_sampling_ratio)
code_size = 4 code_size = 4
......
...@@ -248,27 +248,30 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -248,27 +248,30 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_boxes_batched, detection_boxes_batched,
detection_scores_batched, detection_scores_batched,
detection_classes_batched, detection_classes_batched,
num_det_boxes_per_image): num_det_boxes_per_image,
is_annotated_batched):
"""Update operation for adding batch of images to Coco evaluator.""" """Update operation for adding batch of images to Coco evaluator."""
for (image_id, gt_box, gt_class, gt_is_crowd, num_gt_box, det_box, for (image_id, gt_box, gt_class, gt_is_crowd, num_gt_box, det_box,
det_score, det_class, num_det_box) in zip( det_score, det_class, num_det_box, is_annotated) in zip(
image_id_batched, groundtruth_boxes_batched, image_id_batched, groundtruth_boxes_batched,
groundtruth_classes_batched, groundtruth_is_crowd_batched, groundtruth_classes_batched, groundtruth_is_crowd_batched,
num_gt_boxes_per_image, num_gt_boxes_per_image,
detection_boxes_batched, detection_scores_batched, detection_boxes_batched, detection_scores_batched,
detection_classes_batched, num_det_boxes_per_image): detection_classes_batched, num_det_boxes_per_image,
self.add_single_ground_truth_image_info( is_annotated_batched):
image_id, { if is_annotated:
'groundtruth_boxes': gt_box[:num_gt_box], self.add_single_ground_truth_image_info(
'groundtruth_classes': gt_class[:num_gt_box], image_id, {
'groundtruth_is_crowd': gt_is_crowd[:num_gt_box] 'groundtruth_boxes': gt_box[:num_gt_box],
}) 'groundtruth_classes': gt_class[:num_gt_box],
self.add_single_detected_image_info( 'groundtruth_is_crowd': gt_is_crowd[:num_gt_box]
image_id, })
{'detection_boxes': det_box[:num_det_box], self.add_single_detected_image_info(
'detection_scores': det_score[:num_det_box], image_id,
'detection_classes': det_class[:num_det_box]}) {'detection_boxes': det_box[:num_det_box],
'detection_scores': det_score[:num_det_box],
'detection_classes': det_class[:num_det_box]})
# Unpack items from the evaluation dictionary. # Unpack items from the evaluation dictionary.
input_data_fields = standard_fields.InputDataFields input_data_fields = standard_fields.InputDataFields
...@@ -284,6 +287,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -284,6 +287,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_gt_boxes_per_image = eval_dict.get( num_gt_boxes_per_image = eval_dict.get(
'num_groundtruth_boxes_per_image', None) 'num_groundtruth_boxes_per_image', None)
num_det_boxes_per_image = eval_dict.get('num_det_boxes_per_image', None) num_det_boxes_per_image = eval_dict.get('num_det_boxes_per_image', None)
is_annotated = eval_dict.get('is_annotated', None)
if groundtruth_is_crowd is None: if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
...@@ -306,6 +310,11 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -306,6 +310,11 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_det_boxes_per_image = tf.shape(detection_boxes)[1:2] num_det_boxes_per_image = tf.shape(detection_boxes)[1:2]
else: else:
num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0) num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0)
if is_annotated is None:
is_annotated = tf.constant([True])
else:
is_annotated = tf.expand_dims(is_annotated, 0)
else: else:
if num_gt_boxes_per_image is None: if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.tile( num_gt_boxes_per_image = tf.tile(
...@@ -315,6 +324,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -315,6 +324,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_det_boxes_per_image = tf.tile( num_det_boxes_per_image = tf.tile(
tf.shape(detection_boxes)[1:2], tf.shape(detection_boxes)[1:2],
multiples=tf.shape(detection_boxes)[0:1]) multiples=tf.shape(detection_boxes)[0:1])
if is_annotated is None:
is_annotated = tf.ones_like(image_id, dtype=tf.bool)
update_op = tf.py_func(update_op, [image_id, update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes, groundtruth_boxes,
...@@ -324,7 +335,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -324,7 +335,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_boxes, detection_boxes,
detection_scores, detection_scores,
detection_classes, detection_classes,
num_det_boxes_per_image], []) num_det_boxes_per_image,
is_annotated], [])
metric_names = ['DetectionBoxes_Precision/mAP', metric_names = ['DetectionBoxes_Precision/mAP',
'DetectionBoxes_Precision/mAP@.50IOU', 'DetectionBoxes_Precision/mAP@.50IOU',
'DetectionBoxes_Precision/mAP@.75IOU', 'DetectionBoxes_Precision/mAP@.75IOU',
...@@ -581,8 +593,11 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -581,8 +593,11 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
Args: Args:
eval_dict: A dictionary that holds tensors for evaluating object detection eval_dict: A dictionary that holds tensors for evaluating object detection
performance. This dictionary may be produced from performance. For single-image evaluation, this dictionary may be
eval_util.result_dict_for_single_example(). produced from eval_util.result_dict_for_single_example(). For multi-image
evaluation, `eval_dict` should contain the fields
'num_groundtruth_boxes_per_image' and 'num_det_boxes_per_image' to
properly unpad the tensors from the batch.
Returns: Returns:
a dictionary of metric names to tuple of value_op and update_op that can a dictionary of metric names to tuple of value_op and update_op that can
...@@ -590,27 +605,41 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -590,27 +605,41 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
update ops must be run together and similarly all value ops must be run update ops must be run together and similarly all value ops must be run
together to guarantee correct behaviour. together to guarantee correct behaviour.
""" """
def update_op(
image_id, def update_op(image_id_batched, groundtruth_boxes_batched,
groundtruth_boxes, groundtruth_classes_batched,
groundtruth_classes, groundtruth_instance_masks_batched,
groundtruth_instance_masks, groundtruth_is_crowd_batched, num_gt_boxes_per_image,
groundtruth_is_crowd, detection_scores_batched, detection_classes_batched,
detection_scores, detection_masks_batched, num_det_boxes_per_image):
detection_classes,
detection_masks):
"""Update op for metrics.""" """Update op for metrics."""
self.add_single_ground_truth_image_info(
image_id, for (image_id, groundtruth_boxes, groundtruth_classes,
{'groundtruth_boxes': groundtruth_boxes, groundtruth_instance_masks, groundtruth_is_crowd, num_gt_box,
'groundtruth_classes': groundtruth_classes, detection_scores, detection_classes,
'groundtruth_instance_masks': groundtruth_instance_masks, detection_masks, num_det_box) in zip(
'groundtruth_is_crowd': groundtruth_is_crowd}) image_id_batched, groundtruth_boxes_batched,
self.add_single_detected_image_info( groundtruth_classes_batched, groundtruth_instance_masks_batched,
image_id, groundtruth_is_crowd_batched, num_gt_boxes_per_image,
{'detection_scores': detection_scores, detection_scores_batched, detection_classes_batched,
'detection_classes': detection_classes, detection_masks_batched, num_det_boxes_per_image):
'detection_masks': detection_masks}) self.add_single_ground_truth_image_info(
image_id, {
'groundtruth_boxes':
groundtruth_boxes[:num_gt_box],
'groundtruth_classes':
groundtruth_classes[:num_gt_box],
'groundtruth_instance_masks':
groundtruth_instance_masks[:num_gt_box],
'groundtruth_is_crowd':
groundtruth_is_crowd[:num_gt_box]
})
self.add_single_detected_image_info(
image_id, {
'detection_scores': detection_scores[:num_det_box],
'detection_classes': detection_classes[:num_det_box],
'detection_masks': detection_masks[:num_det_box]
})
# Unpack items from the evaluation dictionary. # Unpack items from the evaluation dictionary.
input_data_fields = standard_fields.InputDataFields input_data_fields = standard_fields.InputDataFields
...@@ -622,20 +651,54 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -622,20 +651,54 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
input_data_fields.groundtruth_instance_masks] input_data_fields.groundtruth_instance_masks]
groundtruth_is_crowd = eval_dict.get( groundtruth_is_crowd = eval_dict.get(
input_data_fields.groundtruth_is_crowd, None) input_data_fields.groundtruth_is_crowd, None)
num_gt_boxes_per_image = eval_dict.get(
input_data_fields.num_groundtruth_boxes, None)
detection_scores = eval_dict[detection_fields.detection_scores] detection_scores = eval_dict[detection_fields.detection_scores]
detection_classes = eval_dict[detection_fields.detection_classes] detection_classes = eval_dict[detection_fields.detection_classes]
detection_masks = eval_dict[detection_fields.detection_masks] detection_masks = eval_dict[detection_fields.detection_masks]
num_det_boxes_per_image = eval_dict.get(detection_fields.num_detections,
None)
if groundtruth_is_crowd is None: if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes, if not image_id.shape.as_list():
groundtruth_classes, # Apply a batch dimension to all tensors.
groundtruth_instance_masks, image_id = tf.expand_dims(image_id, 0)
groundtruth_is_crowd, groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0)
detection_scores, groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
detection_classes, groundtruth_instance_masks = tf.expand_dims(groundtruth_instance_masks, 0)
detection_masks], []) groundtruth_is_crowd = tf.expand_dims(groundtruth_is_crowd, 0)
detection_scores = tf.expand_dims(detection_scores, 0)
detection_classes = tf.expand_dims(detection_classes, 0)
detection_masks = tf.expand_dims(detection_masks, 0)
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.shape(groundtruth_boxes)[1:2]
else:
num_gt_boxes_per_image = tf.expand_dims(num_gt_boxes_per_image, 0)
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.shape(detection_scores)[1:2]
else:
num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0)
else:
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.tile(
tf.shape(groundtruth_boxes)[1:2],
multiples=tf.shape(groundtruth_boxes)[0:1])
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.tile(
tf.shape(detection_scores)[1:2],
multiples=tf.shape(detection_scores)[0:1])
update_op = tf.py_func(update_op, [
image_id, groundtruth_boxes, groundtruth_classes,
groundtruth_instance_masks, groundtruth_is_crowd,
num_gt_boxes_per_image, detection_scores, detection_classes,
detection_masks, num_det_boxes_per_image
], [])
metric_names = ['DetectionMasks_Precision/mAP', metric_names = ['DetectionMasks_Precision/mAP',
'DetectionMasks_Precision/mAP@.50IOU', 'DetectionMasks_Precision/mAP@.50IOU',
'DetectionMasks_Precision/mAP@.75IOU', 'DetectionMasks_Precision/mAP@.75IOU',
......
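The batched update ops above rely on a simple unpadding convention: per-image tensors arrive padded to a common length, and num_groundtruth_boxes_per_image / num_det_boxes_per_image record the true counts. A small sketch of that slicing, using illustrative numpy arrays rather than the evaluator's tensors:

import numpy as np

# 2 images, padded to 3 ground-truth boxes each.
groundtruth_boxes = np.zeros((2, 3, 4), dtype=np.float32)
groundtruth_boxes[0, :2] = [[0., 0., 1., 1.], [0., 0., .5, .5]]
groundtruth_boxes[1, :1] = [[0., 0., .25, .25]]
num_gt_boxes_per_image = np.array([2, 1])

for boxes, num_gt in zip(groundtruth_boxes, num_gt_boxes_per_image):
  unpadded = boxes[:num_gt]   # only the real boxes are passed on to
  print(unpadded.shape)       # add_single_ground_truth_image_info
# prints (2, 4) then (1, 4)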
...@@ -308,6 +308,99 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -308,6 +308,99 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_boxes_list) self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids) self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsIsAnnotated(self):
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
_get_categories_list())
image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None))
is_annotated = tf.placeholder(tf.bool, shape=())
detection_boxes = tf.placeholder(tf.float32, shape=(None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(None))
detection_classes = tf.placeholder(tf.float32, shape=(None))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
'is_annotated': is_annotated,
detection_fields.detection_boxes: detection_boxes,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(update_op,
feed_dict={
image_id: 'image1',
groundtruth_boxes: np.array([[100., 100., 200., 200.]]),
groundtruth_classes: np.array([1]),
is_annotated: True,
detection_boxes: np.array([[100., 100., 200., 200.]]),
detection_scores: np.array([.8]),
detection_classes: np.array([1])
})
sess.run(update_op,
feed_dict={
image_id: 'image2',
groundtruth_boxes: np.array([[50., 50., 100., 100.]]),
groundtruth_classes: np.array([3]),
is_annotated: True,
detection_boxes: np.array([[50., 50., 100., 100.]]),
detection_scores: np.array([.7]),
detection_classes: np.array([3])
})
sess.run(update_op,
feed_dict={
image_id: 'image3',
groundtruth_boxes: np.array([[25., 25., 50., 50.]]),
groundtruth_classes: np.array([2]),
is_annotated: True,
detection_boxes: np.array([[25., 25., 50., 50.]]),
detection_scores: np.array([.9]),
detection_classes: np.array([2])
})
sess.run(update_op,
feed_dict={
image_id: 'image4',
groundtruth_boxes: np.zeros((0, 4)),
groundtruth_classes: np.zeros((0)),
is_annotated: False, # Note that this image isn't annotated.
detection_boxes: np.array([[25., 25., 50., 50.],
[25., 25., 70., 50.],
[25., 25., 80., 50.],
[25., 25., 90., 50.]]),
detection_scores: np.array([0.6, 0.7, 0.8, 0.9]),
detection_classes: np.array([1, 2, 2, 3])
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self):
coco_evaluator = coco_evaluation.CocoDetectionEvaluator( coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
_get_categories_list()) _get_categories_list())
...@@ -665,22 +758,40 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): ...@@ -665,22 +758,40 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
_, update_op = eval_metric_ops['DetectionMasks_Precision/mAP'] _, update_op = eval_metric_ops['DetectionMasks_Precision/mAP']
with self.test_session() as sess: with self.test_session() as sess:
sess.run(update_op, sess.run(
feed_dict={ update_op,
image_id: 'image1', feed_dict={
groundtruth_boxes: np.array([[100., 100., 200., 200.]]), image_id:
groundtruth_classes: np.array([1]), 'image1',
groundtruth_masks: np.pad(np.ones([1, 100, 100], groundtruth_boxes:
dtype=np.uint8), np.array([[100., 100., 200., 200.], [50., 50., 100., 100.]]),
((0, 0), (10, 10), (10, 10)), groundtruth_classes:
mode='constant'), np.array([1, 2]),
detection_scores: np.array([.8]), groundtruth_masks:
detection_classes: np.array([1]), np.stack([
detection_masks: np.pad(np.ones([1, 100, 100], np.pad(
dtype=np.uint8), np.ones([100, 100], dtype=np.uint8), ((10, 10),
((0, 0), (10, 10), (10, 10)), (10, 10)),
mode='constant') mode='constant'),
}) np.pad(
np.ones([50, 50], dtype=np.uint8), ((0, 70), (0, 70)),
mode='constant')
]),
detection_scores:
np.array([.9, .8]),
detection_classes:
np.array([2, 1]),
detection_masks:
np.stack([
np.pad(
np.ones([50, 50], dtype=np.uint8), ((0, 70), (0, 70)),
mode='constant'),
np.pad(
np.ones([100, 100], dtype=np.uint8), ((10, 10),
(10, 10)),
mode='constant'),
])
})
sess.run(update_op, sess.run(update_op,
feed_dict={ feed_dict={
image_id: 'image2', image_id: 'image2',
...@@ -735,6 +846,106 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): ...@@ -735,6 +846,106 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._image_id_to_mask_shape_map) self.assertFalse(coco_evaluator._image_id_to_mask_shape_map)
self.assertFalse(coco_evaluator._detection_masks_list) self.assertFalse(coco_evaluator._detection_masks_list)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self):
coco_evaluator = coco_evaluation.CocoMaskEvaluator(_get_categories_list())
batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
groundtruth_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionMasks_Precision/mAP']
with self.test_session() as sess:
sess.run(
update_op,
feed_dict={
image_id: ['image1', 'image2', 'image3'],
groundtruth_boxes:
np.array([[[100., 100., 200., 200.]],
[[50., 50., 100., 100.]],
[[25., 25., 50., 50.]]]),
groundtruth_classes:
np.array([[1], [1], [1]]),
groundtruth_masks:
np.stack([
np.pad(
np.ones([1, 100, 100], dtype=np.uint8),
((0, 0), (0, 0), (0, 0)),
mode='constant'),
np.pad(
np.ones([1, 50, 50], dtype=np.uint8),
((0, 0), (25, 25), (25, 25)),
mode='constant'),
np.pad(
np.ones([1, 25, 25], dtype=np.uint8),
((0, 0), (37, 38), (37, 38)),
mode='constant')
],
axis=0),
detection_scores:
np.array([[.8], [.8], [.8]]),
detection_classes:
np.array([[1], [1], [1]]),
detection_masks:
np.stack([
np.pad(
np.ones([1, 100, 100], dtype=np.uint8),
((0, 0), (0, 0), (0, 0)),
mode='constant'),
np.pad(
np.ones([1, 50, 50], dtype=np.uint8),
((0, 0), (25, 25), (25, 25)),
mode='constant'),
np.pad(
np.ones([1, 25, 25], dtype=np.uint8),
((0, 0), (37, 38), (37, 38)),
mode='constant')
],
axis=0)
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._image_ids_with_detections)
self.assertFalse(coco_evaluator._image_id_to_mask_shape_map)
self.assertFalse(coco_evaluator._detection_masks_list)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -25,6 +25,7 @@ import os ...@@ -25,6 +25,7 @@ import os
import tensorflow as tf import tensorflow as tf
from object_detection import eval_util from object_detection import eval_util
from object_detection import exporter as exporter_lib
from object_detection import inputs from object_detection import inputs
from object_detection.builders import graph_rewriter_builder from object_detection.builders import graph_rewriter_builder
from object_detection.builders import model_builder from object_detection.builders import model_builder
...@@ -306,8 +307,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -306,8 +307,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
prediction_dict, features[fields.InputDataFields.true_image_shape]) prediction_dict, features[fields.InputDataFields.true_image_shape])
losses = [loss_tensor for loss_tensor in losses_dict.values()] losses = [loss_tensor for loss_tensor in losses_dict.values()]
if train_config.add_regularization_loss: if train_config.add_regularization_loss:
regularization_losses = tf.get_collection( regularization_losses = detection_model.regularization_losses()
tf.GraphKeys.REGULARIZATION_LOSSES)
if regularization_losses: if regularization_losses:
regularization_loss = tf.add_n( regularization_loss = tf.add_n(
regularization_losses, name='regularization_loss') regularization_losses, name='regularization_loss')
...@@ -353,20 +353,24 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -353,20 +353,24 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
for var in optimizer_summary_vars: for var in optimizer_summary_vars:
tf.summary.scalar(var.op.name, var) tf.summary.scalar(var.op.name, var)
summaries = [] if use_tpu else None summaries = [] if use_tpu else None
if train_config.summarize_gradients:
summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
train_op = tf.contrib.layers.optimize_loss( train_op = tf.contrib.layers.optimize_loss(
loss=total_loss, loss=total_loss,
global_step=global_step, global_step=global_step,
learning_rate=None, learning_rate=None,
clip_gradients=clip_gradients_value, clip_gradients=clip_gradients_value,
optimizer=training_optimizer, optimizer=training_optimizer,
update_ops=detection_model.updates(),
variables=trainable_variables, variables=trainable_variables,
summaries=summaries, summaries=summaries,
name='') # Preventing scope prefix on all variables. name='') # Preventing scope prefix on all variables.
if mode == tf.estimator.ModeKeys.PREDICT: if mode == tf.estimator.ModeKeys.PREDICT:
exported_output = exporter_lib.add_output_tensor_nodes(detections)
export_outputs = { export_outputs = {
tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
tf.estimator.export.PredictOutput(detections) tf.estimator.export.PredictOutput(exported_output)
} }
eval_metric_ops = None eval_metric_ops = None
...@@ -456,6 +460,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -456,6 +460,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
def create_estimator_and_inputs(run_config, def create_estimator_and_inputs(run_config,
hparams, hparams,
pipeline_config_path, pipeline_config_path,
config_override=None,
train_steps=None, train_steps=None,
sample_1_of_n_eval_examples=1, sample_1_of_n_eval_examples=1,
sample_1_of_n_eval_on_train_examples=1, sample_1_of_n_eval_on_train_examples=1,
...@@ -465,6 +470,7 @@ def create_estimator_and_inputs(run_config, ...@@ -465,6 +470,7 @@ def create_estimator_and_inputs(run_config,
num_shards=1, num_shards=1,
params=None, params=None,
override_eval_num_epochs=True, override_eval_num_epochs=True,
save_final_config=False,
**kwargs): **kwargs):
"""Creates `Estimator`, input functions, and steps. """Creates `Estimator`, input functions, and steps.
...@@ -472,6 +478,8 @@ def create_estimator_and_inputs(run_config, ...@@ -472,6 +478,8 @@ def create_estimator_and_inputs(run_config,
run_config: A `RunConfig`. run_config: A `RunConfig`.
hparams: A `HParams`. hparams: A `HParams`.
pipeline_config_path: A path to a pipeline config file. pipeline_config_path: A path to a pipeline config file.
config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
override the config from `pipeline_config_path`.
train_steps: Number of training steps. If None, the number of training steps train_steps: Number of training steps. If None, the number of training steps
is set from the `TrainConfig` proto. is set from the `TrainConfig` proto.
sample_1_of_n_eval_examples: Integer representing how often an eval example sample_1_of_n_eval_examples: Integer representing how often an eval example
...@@ -499,6 +507,8 @@ def create_estimator_and_inputs(run_config, ...@@ -499,6 +507,8 @@ def create_estimator_and_inputs(run_config,
`use_tpu_estimator` is True. `use_tpu_estimator` is True.
override_eval_num_epochs: Whether to overwrite the number of epochs to override_eval_num_epochs: Whether to overwrite the number of epochs to
1 for eval_input. 1 for eval_input.
save_final_config: Whether to save final config (obtained after applying
overrides) to `estimator.model_dir`.
**kwargs: Additional keyword arguments for configuration override. **kwargs: Additional keyword arguments for configuration override.
Returns: Returns:
...@@ -522,7 +532,8 @@ def create_estimator_and_inputs(run_config, ...@@ -522,7 +532,8 @@ def create_estimator_and_inputs(run_config,
create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']
configs = get_configs_from_pipeline_file(pipeline_config_path) configs = get_configs_from_pipeline_file(pipeline_config_path,
config_override=config_override)
kwargs.update({ kwargs.update({
'train_steps': train_steps, 'train_steps': train_steps,
'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples 'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples
...@@ -595,7 +606,7 @@ def create_estimator_and_inputs(run_config, ...@@ -595,7 +606,7 @@ def create_estimator_and_inputs(run_config,
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
# Write the as-run pipeline config to disk. # Write the as-run pipeline config to disk.
if run_config.is_chief: if run_config.is_chief and save_final_config:
pipeline_config_final = create_pipeline_proto_from_configs(configs) pipeline_config_final = create_pipeline_proto_from_configs(configs)
config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir)
...@@ -641,11 +652,17 @@ def create_train_and_eval_specs(train_input_fn, ...@@ -641,11 +652,17 @@ def create_train_and_eval_specs(train_input_fn,
input_fn=train_input_fn, max_steps=train_steps) input_fn=train_input_fn, max_steps=train_steps)
if eval_spec_names is None: if eval_spec_names is None:
eval_spec_names = [ str(i) for i in range(len(eval_input_fns)) ] eval_spec_names = [str(i) for i in range(len(eval_input_fns))]
eval_specs = [] eval_specs = []
for eval_spec_name, eval_input_fn in zip(eval_spec_names, eval_input_fns): for index, (eval_spec_name, eval_input_fn) in enumerate(
exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name) zip(eval_spec_names, eval_input_fns)):
# Uses final_exporter_name as exporter_name for the first eval spec for
# backward compatibility.
if index == 0:
exporter_name = final_exporter_name
else:
exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name)
exporter = tf.estimator.FinalExporter( exporter = tf.estimator.FinalExporter(
name=exporter_name, serving_input_receiver_fn=predict_input_fn) name=exporter_name, serving_input_receiver_fn=predict_input_fn)
eval_specs.append( eval_specs.append(
...@@ -747,6 +764,7 @@ def populate_experiment(run_config, ...@@ -747,6 +764,7 @@ def populate_experiment(run_config,
train_steps=train_steps, train_steps=train_steps,
eval_steps=eval_steps, eval_steps=eval_steps,
model_fn_creator=model_fn_creator, model_fn_creator=model_fn_creator,
save_final_config=True,
**kwargs) **kwargs)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
......
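A hedged usage sketch of the new config_override and save_final_config arguments; the RunConfig, HParams helper, and file paths below are placeholders rather than values taken from this change.

import tensorflow as tf
from object_detection import model_hparams
from object_detection import model_lib

# Illustrative only: tweak the eval config via a text proto and only write
# the as-run pipeline config when explicitly requested.
config_override = """
eval_config {
  num_examples: 100
}
"""
train_and_eval_dict = model_lib.create_estimator_and_inputs(
    run_config=tf.estimator.RunConfig(model_dir='/tmp/model_dir'),
    hparams=model_hparams.create_hparams(None),
    pipeline_config_path='/path/to/pipeline.config',
    config_override=config_override,
    save_final_config=True)
estimator = train_and_eval_dict['estimator']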
...@@ -310,7 +310,7 @@ class ModelLibTest(tf.test.TestCase): ...@@ -310,7 +310,7 @@ class ModelLibTest(tf.test.TestCase):
self.assertEqual(2, len(eval_specs)) self.assertEqual(2, len(eval_specs))
self.assertEqual(None, eval_specs[0].steps) self.assertEqual(None, eval_specs[0].steps)
self.assertEqual('holdout', eval_specs[0].name) self.assertEqual('holdout', eval_specs[0].name)
self.assertEqual('exporter_holdout', eval_specs[0].exporters[0].name) self.assertEqual('exporter', eval_specs[0].exporters[0].name)
self.assertEqual(None, eval_specs[1].steps) self.assertEqual(None, eval_specs[1].steps)
self.assertEqual('eval_on_train', eval_specs[1].name) self.assertEqual('eval_on_train', eval_specs[1].name)
......
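A quick illustration of the exporter-naming rule the test above now checks: the first eval spec keeps the bare final_exporter_name for backward compatibility, later specs get a suffixed name.

final_exporter_name = 'exporter'
eval_spec_names = ['holdout', 'eval_on_train']

exporter_names = []
for index, eval_spec_name in enumerate(eval_spec_names):
  if index == 0:
    exporter_names.append(final_exporter_name)
  else:
    exporter_names.append('{}_{}'.format(final_exporter_name, eval_spec_name))
print(exporter_names)  # ['exporter', 'exporter_eval_on_train']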
...@@ -114,6 +114,7 @@ def main(unused_argv): ...@@ -114,6 +114,7 @@ def main(unused_argv):
use_tpu_estimator=True, use_tpu_estimator=True,
use_tpu=FLAGS.use_tpu, use_tpu=FLAGS.use_tpu,
num_shards=FLAGS.num_shards, num_shards=FLAGS.num_shards,
save_final_config=FLAGS.mode == 'train',
**kwargs) **kwargs)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
......
...@@ -72,6 +72,8 @@ class FasterRCNNResnetV1FeatureExtractor( ...@@ -72,6 +72,8 @@ class FasterRCNNResnetV1FeatureExtractor(
VGG style channel mean subtraction as described here: VGG style channel mean subtraction as described here:
https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md
Note that if the number of channels is not equal to 3, the mean subtraction
will be skipped and the original resized_inputs will be returned.
Args: Args:
resized_inputs: A [batch, height_in, width_in, channels] float32 tensor resized_inputs: A [batch, height_in, width_in, channels] float32 tensor
...@@ -82,8 +84,11 @@ class FasterRCNNResnetV1FeatureExtractor( ...@@ -82,8 +84,11 @@ class FasterRCNNResnetV1FeatureExtractor(
tensor representing a batch of images. tensor representing a batch of images.
""" """
channel_means = [123.68, 116.779, 103.939] if resized_inputs.shape.as_list()[3] == 3:
return resized_inputs - [[channel_means]] channel_means = [123.68, 116.779, 103.939]
return resized_inputs - [[channel_means]]
else:
return resized_inputs
def _extract_proposal_features(self, preprocessed_inputs, scope): def _extract_proposal_features(self, preprocessed_inputs, scope):
"""Extracts first stage RPN features. """Extracts first stage RPN features.
......
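A small standalone sketch of the behaviour the updated docstring describes: VGG-style channel-mean subtraction is applied only for 3-channel inputs and is otherwise skipped. The free function below is illustrative, not the extractor's own method.

import tensorflow as tf

def vgg_style_preprocess(resized_inputs):
  # Subtract the per-channel ImageNet means only when there are 3 channels.
  if resized_inputs.shape.as_list()[3] == 3:
    channel_means = [123.68, 116.779, 103.939]
    return resized_inputs - [[channel_means]]
  return resized_inputs

rgb = tf.zeros([1, 224, 224, 3])
gray = tf.zeros([1, 224, 224, 1])
print(vgg_style_preprocess(rgb).shape)   # (1, 224, 224, 3), means subtracted
print(vgg_style_preprocess(gray).shape)  # (1, 224, 224, 1), returned unchanged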
...@@ -146,7 +146,6 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model): ...@@ -146,7 +146,6 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model):
use_depthwise = feature_map_layout['use_depthwise'] use_depthwise = feature_map_layout['use_depthwise']
for index, from_layer in enumerate(feature_map_layout['from_layer']): for index, from_layer in enumerate(feature_map_layout['from_layer']):
net = [] net = []
self.convolutions.append(net)
layer_depth = feature_map_layout['layer_depth'][index] layer_depth = feature_map_layout['layer_depth'][index]
conv_kernel_size = 3 conv_kernel_size = 3
if 'conv_kernel_size' in feature_map_layout: if 'conv_kernel_size' in feature_map_layout:
...@@ -231,6 +230,10 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model): ...@@ -231,6 +230,10 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model):
conv_hyperparams.build_activation_layer( conv_hyperparams.build_activation_layer(
name=layer_name)) name=layer_name))
# Until certain bugs are fixed in checkpointable lists,
# this net must be appended only once it's been filled with layers
self.convolutions.append(net)
def call(self, image_features): def call(self, image_features):
"""Generate the multi-resolution feature maps. """Generate the multi-resolution feature maps.
...@@ -263,7 +266,8 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model): ...@@ -263,7 +266,8 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model):
def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
min_depth, insert_1x1_conv, image_features): min_depth, insert_1x1_conv, image_features,
pool_residual=False):
"""Generates multi resolution feature maps from input image features. """Generates multi resolution feature maps from input image features.
Generates multi-scale feature maps for detection as in the SSD papers by Generates multi-scale feature maps for detection as in the SSD papers by
...@@ -317,6 +321,13 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -317,6 +321,13 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
should be inserted before shrinking the feature map. should be inserted before shrinking the feature map.
image_features: A dictionary of handles to activation tensors from the image_features: A dictionary of handles to activation tensors from the
base feature extractor. base feature extractor.
pool_residual: Whether to add an average pooling layer followed by a
residual connection between subsequent feature maps when the channel
depths match. For example, with option 'layer_depth': [-1, 512, 256, 256],
a pooling and residual layer is added between the third and fourth feature
map. This option is better used with Weight Shared Convolution Box
Predictor when all feature maps have the same channel depth to encourage
more consistent features across multi-scale feature maps.
Returns: Returns:
feature_maps: an OrderedDict mapping keys (feature map names) to feature_maps: an OrderedDict mapping keys (feature map names) to
...@@ -350,6 +361,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -350,6 +361,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
feature_map_keys.append(from_layer) feature_map_keys.append(from_layer)
else: else:
pre_layer = feature_maps[-1] pre_layer = feature_maps[-1]
pre_layer_depth = pre_layer.get_shape().as_list()[3]
intermediate_layer = pre_layer intermediate_layer = pre_layer
if insert_1x1_conv: if insert_1x1_conv:
layer_name = '{}_1_Conv2d_{}_1x1_{}'.format( layer_name = '{}_1_Conv2d_{}_1x1_{}'.format(
...@@ -383,6 +395,12 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -383,6 +395,12 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
padding='SAME', padding='SAME',
stride=1, stride=1,
scope=layer_name) scope=layer_name)
if pool_residual and pre_layer_depth == depth_fn(layer_depth):
feature_map += slim.avg_pool2d(
pre_layer, [3, 3],
padding='SAME',
stride=2,
scope=layer_name + '_pool')
else: else:
feature_map = slim.conv2d( feature_map = slim.conv2d(
intermediate_layer, intermediate_layer,
...@@ -399,6 +417,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -399,6 +417,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
def fpn_top_down_feature_maps(image_features, def fpn_top_down_feature_maps(image_features,
depth, depth,
use_depthwise=False, use_depthwise=False,
use_explicit_padding=False,
scope=None): scope=None):
"""Generates `top-down` feature maps for Feature Pyramid Networks. """Generates `top-down` feature maps for Feature Pyramid Networks.
...@@ -409,7 +428,9 @@ def fpn_top_down_feature_maps(image_features, ...@@ -409,7 +428,9 @@ def fpn_top_down_feature_maps(image_features,
Spatial resolutions of successive tensors must reduce exactly by a factor Spatial resolutions of successive tensors must reduce exactly by a factor
of 2. of 2.
depth: depth of output feature maps. depth: depth of output feature maps.
use_depthwise: use depthwise separable conv instead of regular conv. use_depthwise: whether to use depthwise separable conv instead of regular
conv.
use_explicit_padding: whether to use explicit padding.
scope: A scope name to wrap this op under. scope: A scope name to wrap this op under.
Returns: Returns:
...@@ -420,8 +441,10 @@ def fpn_top_down_feature_maps(image_features, ...@@ -420,8 +441,10 @@ def fpn_top_down_feature_maps(image_features,
num_levels = len(image_features) num_levels = len(image_features)
output_feature_maps_list = [] output_feature_maps_list = []
output_feature_map_keys = [] output_feature_map_keys = []
padding = 'VALID' if use_explicit_padding else 'SAME'
kernel_size = 3
with slim.arg_scope( with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d], padding='SAME', stride=1): [slim.conv2d, slim.separable_conv2d], padding=padding, stride=1):
top_down = slim.conv2d( top_down = slim.conv2d(
image_features[-1][1], image_features[-1][1],
depth, [1, 1], activation_fn=None, normalizer_fn=None, depth, [1, 1], activation_fn=None, normalizer_fn=None,
...@@ -436,14 +459,20 @@ def fpn_top_down_feature_maps(image_features, ...@@ -436,14 +459,20 @@ def fpn_top_down_feature_maps(image_features,
image_features[level][1], depth, [1, 1], image_features[level][1], depth, [1, 1],
activation_fn=None, normalizer_fn=None, activation_fn=None, normalizer_fn=None,
scope='projection_%d' % (level + 1)) scope='projection_%d' % (level + 1))
if use_explicit_padding:
# slice top_down to the same shape as residual
residual_shape = tf.shape(residual)
top_down = top_down[:, :residual_shape[1], :residual_shape[2], :]
top_down += residual top_down += residual
if use_depthwise: if use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1) conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else: else:
conv_op = slim.conv2d conv_op = slim.conv2d
if use_explicit_padding:
top_down = ops.fixed_padding(top_down, kernel_size)
output_feature_maps_list.append(conv_op( output_feature_maps_list.append(conv_op(
top_down, top_down,
depth, [3, 3], depth, [kernel_size, kernel_size],
scope='smoothing_%d' % (level + 1))) scope='smoothing_%d' % (level + 1)))
output_feature_map_keys.append('top_down_%s' % image_features[level][0]) output_feature_map_keys.append('top_down_%s' % image_features[level][0])
return collections.OrderedDict(reversed( return collections.OrderedDict(reversed(
......
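A minimal sketch of the pool_residual option documented above: when the previous feature map has the same channel depth as the new one, a stride-2 3x3 average pool of it is added as a residual. Plain tf.nn ops stand in for the slim call, purely for illustration.

import tensorflow as tf

pre_layer = tf.random_uniform([4, 8, 8, 256])     # previous scale
feature_map = tf.random_uniform([4, 4, 4, 256])   # stride-2 conv output

# Channel depths match, so add an average-pooled residual from the previous
# scale to encourage consistent features across the pyramid.
pooled = tf.nn.avg_pool(
    pre_layer, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
feature_map += pooled
print(feature_map.shape)  # (4, 4, 4, 256)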
...@@ -45,6 +45,11 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = { ...@@ -45,6 +45,11 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = {
'conv_kernel_size': [-1, -1, 3, 3, 2], 'conv_kernel_size': [-1, -1, 3, 3, 2],
} }
SSD_MOBILENET_V1_WEIGHT_SHARED_LAYOUT = {
'from_layer': ['Conv2d_13_pointwise', '', '', ''],
'layer_depth': [-1, 256, 256, 256],
}
@parameterized.parameters( @parameterized.parameters(
{'use_keras': False}, {'use_keras': False},
...@@ -67,7 +72,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -67,7 +72,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams) text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams) return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def _build_feature_map_generator(self, feature_map_layout, use_keras): def _build_feature_map_generator(self, feature_map_layout, use_keras,
pool_residual=False):
if use_keras: if use_keras:
return feature_map_generators.KerasMultiResolutionFeatureMaps( return feature_map_generators.KerasMultiResolutionFeatureMaps(
feature_map_layout=feature_map_layout, feature_map_layout=feature_map_layout,
...@@ -86,7 +92,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -86,7 +92,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
depth_multiplier=1, depth_multiplier=1,
min_depth=32, min_depth=32,
insert_1x1_conv=True, insert_1x1_conv=True,
image_features=image_features) image_features=image_features,
pool_residual=pool_residual)
return feature_map_generator return feature_map_generator
def test_get_expected_feature_map_shapes_with_inception_v2(self, use_keras): def test_get_expected_feature_map_shapes_with_inception_v2(self, use_keras):
...@@ -209,6 +216,34 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -209,6 +216,34 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
(key, value.shape) for key, value in out_feature_maps.items()) (key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes) self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_feature_map_shapes_with_pool_residual_ssd_mobilenet_v1(
self, use_keras):
image_features = {
'Conv2d_13_pointwise': tf.random_uniform([4, 8, 8, 1024],
dtype=tf.float32),
}
feature_map_generator = self._build_feature_map_generator(
feature_map_layout=SSD_MOBILENET_V1_WEIGHT_SHARED_LAYOUT,
use_keras=use_keras,
pool_residual=True
)
feature_maps = feature_map_generator(image_features)
expected_feature_map_shapes = {
'Conv2d_13_pointwise': (4, 8, 8, 1024),
'Conv2d_13_pointwise_2_Conv2d_1_3x3_s2_256': (4, 4, 4, 256),
'Conv2d_13_pointwise_2_Conv2d_2_3x3_s2_256': (4, 2, 2, 256),
'Conv2d_13_pointwise_2_Conv2d_3_3x3_s2_256': (4, 1, 1, 256)}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_get_expected_variable_names_with_inception_v2(self, use_keras): def test_get_expected_variable_names_with_inception_v2(self, use_keras):
image_features = { image_features = {
'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32), 'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32),
......
...@@ -82,6 +82,8 @@ class _LayersOverride(object): ...@@ -82,6 +82,8 @@ class _LayersOverride(object):
self._conv_hyperparams = conv_hyperparams self._conv_hyperparams = conv_hyperparams
self._use_explicit_padding = use_explicit_padding self._use_explicit_padding = use_explicit_padding
self._min_depth = min_depth self._min_depth = min_depth
self.regularizer = tf.keras.regularizers.l2(0.00004 * 0.5)
self.initializer = tf.truncated_normal_initializer(stddev=0.09)
def _FixedPaddingLayer(self, kernel_size): def _FixedPaddingLayer(self, kernel_size):
return tf.keras.layers.Lambda(lambda x: ops.fixed_padding(x, kernel_size)) return tf.keras.layers.Lambda(lambda x: ops.fixed_padding(x, kernel_size))
...@@ -114,6 +116,9 @@ class _LayersOverride(object): ...@@ -114,6 +116,9 @@ class _LayersOverride(object):
if self._conv_hyperparams: if self._conv_hyperparams:
kwargs = self._conv_hyperparams.params(**kwargs) kwargs = self._conv_hyperparams.params(**kwargs)
else:
kwargs['kernel_regularizer'] = self.regularizer
kwargs['kernel_initializer'] = self.initializer
kwargs['padding'] = 'same' kwargs['padding'] = 'same'
kernel_size = kwargs.get('kernel_size') kernel_size = kwargs.get('kernel_size')
...@@ -144,6 +149,8 @@ class _LayersOverride(object): ...@@ -144,6 +149,8 @@ class _LayersOverride(object):
""" """
if self._conv_hyperparams: if self._conv_hyperparams:
kwargs = self._conv_hyperparams.params(**kwargs) kwargs = self._conv_hyperparams.params(**kwargs)
else:
kwargs['depthwise_initializer'] = self.initializer
kwargs['padding'] = 'same' kwargs['padding'] = 'same'
kernel_size = kwargs.get('kernel_size') kernel_size = kwargs.get('kernel_size')
......
...@@ -31,11 +31,10 @@ slim = tf.contrib.slim ...@@ -31,11 +31,10 @@ slim = tf.contrib.slim
# A modified config of mobilenet v1 that makes it more detection friendly, # A modified config of mobilenet v1 that makes it more detection friendly,
def _create_modified_mobilenet_config(): def _create_modified_mobilenet_config():
conv_defs = copy.copy(mobilenet_v1.MOBILENETV1_CONV_DEFS) conv_defs = copy.deepcopy(mobilenet_v1.MOBILENETV1_CONV_DEFS)
conv_defs[-2] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512) conv_defs[-2] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512)
conv_defs[-1] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256) conv_defs[-1] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256)
return conv_defs return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
...@@ -98,6 +97,9 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -98,6 +97,9 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_min_level = fpn_min_level self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth self._additional_layer_depth = additional_layer_depth
self._conv_defs = None
if self._use_depthwise:
self._conv_defs = _create_modified_mobilenet_config()
def preprocess(self, resized_inputs): def preprocess(self, resized_inputs):
"""SSD preprocessing. """SSD preprocessing.
...@@ -141,7 +143,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -141,7 +143,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
final_endpoint='Conv2d_13_pointwise', final_endpoint='Conv2d_13_pointwise',
min_depth=self._min_depth, min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier, depth_multiplier=self._depth_multiplier,
conv_defs=_CONV_DEFS if self._use_depthwise else None, conv_defs=self._conv_defs,
use_explicit_padding=self._use_explicit_padding, use_explicit_padding=self._use_explicit_padding,
scope=scope) scope=scope)
...@@ -159,7 +161,8 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -159,7 +161,8 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_features = feature_map_generators.fpn_top_down_feature_maps( fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list], [(key, image_features[key]) for key in feature_block_list],
depth=depth_fn(self._additional_layer_depth), depth=depth_fn(self._additional_layer_depth),
use_depthwise=self._use_depthwise) use_depthwise=self._use_depthwise,
use_explicit_padding=self._use_explicit_padding)
feature_maps = [] feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1): for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(fpn_features['top_down_{}'.format( feature_maps.append(fpn_features['top_down_{}'.format(
...@@ -167,18 +170,23 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -167,18 +170,23 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
last_feature_map = fpn_features['top_down_{}'.format( last_feature_map = fpn_features['top_down_{}'.format(
feature_blocks[base_fpn_max_level - 2])] feature_blocks[base_fpn_max_level - 2])]
# Construct coarse features # Construct coarse features
padding = 'VALID' if self._use_explicit_padding else 'SAME'
kernel_size = 3
for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1): for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
if self._use_depthwise: if self._use_depthwise:
conv_op = functools.partial( conv_op = functools.partial(
slim.separable_conv2d, depth_multiplier=1) slim.separable_conv2d, depth_multiplier=1)
else: else:
conv_op = slim.conv2d conv_op = slim.conv2d
if self._use_explicit_padding:
last_feature_map = ops.fixed_padding(
last_feature_map, kernel_size)
last_feature_map = conv_op( last_feature_map = conv_op(
last_feature_map, last_feature_map,
num_outputs=depth_fn(self._additional_layer_depth), num_outputs=depth_fn(self._additional_layer_depth),
kernel_size=[3, 3], kernel_size=[kernel_size, kernel_size],
stride=2, stride=2,
padding='SAME', padding=padding,
scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13)) scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13))
feature_maps.append(last_feature_map) feature_maps.append(last_feature_map)
return feature_maps return feature_maps
...@@ -30,17 +30,14 @@ from nets.mobilenet import mobilenet_v2 ...@@ -30,17 +30,14 @@ from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim slim = tf.contrib.slim
# A modified config of mobilenet v2 that makes it more detection friendly, # A modified config of mobilenet v2 that makes it more detection friendly.
def _create_modified_mobilenet_config(): def _create_modified_mobilenet_config():
conv_defs = copy.copy(mobilenet_v2.V2_DEF) conv_defs = copy.deepcopy(mobilenet_v2.V2_DEF)
conv_defs['spec'][-1] = mobilenet.op( conv_defs['spec'][-1] = mobilenet.op(
slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256) slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256)
return conv_defs return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""SSD Feature Extractor using MobilenetV2 FPN features.""" """SSD Feature Extractor using MobilenetV2 FPN features."""
...@@ -100,6 +97,9 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -100,6 +97,9 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_min_level = fpn_min_level self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth self._additional_layer_depth = additional_layer_depth
self._conv_defs = None
if self._use_depthwise:
self._conv_defs = _create_modified_mobilenet_config()
def preprocess(self, resized_inputs): def preprocess(self, resized_inputs):
"""SSD preprocessing. """SSD preprocessing.
...@@ -142,7 +142,7 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -142,7 +142,7 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
final_endpoint='layer_19', final_endpoint='layer_19',
depth_multiplier=self._depth_multiplier, depth_multiplier=self._depth_multiplier,
conv_defs=_CONV_DEFS if self._use_depthwise else None, conv_defs=self._conv_defs,
use_explicit_padding=self._use_explicit_padding, use_explicit_padding=self._use_explicit_padding,
scope=scope) scope=scope)
depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth) depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
...@@ -158,7 +158,8 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -158,7 +158,8 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_features = feature_map_generators.fpn_top_down_feature_maps( fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list], [(key, image_features[key]) for key in feature_block_list],
depth=depth_fn(self._additional_layer_depth), depth=depth_fn(self._additional_layer_depth),
use_depthwise=self._use_depthwise) use_depthwise=self._use_depthwise,
use_explicit_padding=self._use_explicit_padding)
feature_maps = [] feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1): for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(fpn_features['top_down_{}'.format( feature_maps.append(fpn_features['top_down_{}'.format(
...@@ -166,18 +167,23 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -166,18 +167,23 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
last_feature_map = fpn_features['top_down_{}'.format( last_feature_map = fpn_features['top_down_{}'.format(
feature_blocks[base_fpn_max_level - 2])] feature_blocks[base_fpn_max_level - 2])]
# Construct coarse features # Construct coarse features
padding = 'VALID' if self._use_explicit_padding else 'SAME'
kernel_size = 3
for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1): for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
if self._use_depthwise: if self._use_depthwise:
conv_op = functools.partial( conv_op = functools.partial(
slim.separable_conv2d, depth_multiplier=1) slim.separable_conv2d, depth_multiplier=1)
else: else:
conv_op = slim.conv2d conv_op = slim.conv2d
if self._use_explicit_padding:
last_feature_map = ops.fixed_padding(
last_feature_map, kernel_size)
last_feature_map = conv_op( last_feature_map = conv_op(
last_feature_map, last_feature_map,
num_outputs=depth_fn(self._additional_layer_depth), num_outputs=depth_fn(self._additional_layer_depth),
kernel_size=[3, 3], kernel_size=[kernel_size, kernel_size],
stride=2, stride=2,
padding='SAME', padding=padding,
scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19)) scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
feature_maps.append(last_feature_map) feature_maps.append(last_feature_map)
return feature_maps return feature_maps
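The copy.copy to copy.deepcopy change matters most for MobileNet V2, whose V2_DEF holds a nested 'spec' list: a shallow copy shares that list, so editing its last entry would silently mutate the shared module-level definition. A generic illustration (plain dict/list, not the actual MobileNet defs):

import copy

SHARED_DEF = {'spec': ['op_a', 'op_b', 'op_c']}

shallow = copy.copy(SHARED_DEF)
shallow['spec'][-1] = 'modified'   # the shared nested list is mutated too
print(SHARED_DEF['spec'][-1])      # 'modified' -- unintended side effect

SHARED_DEF['spec'][-1] = 'op_c'    # reset for the comparison
deep = copy.deepcopy(SHARED_DEF)
deep['spec'][-1] = 'modified'      # only the deep copy changes
print(SHARED_DEF['spec'][-1])      # 'op_c'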