Merged commit includes the following changes:

184048729 by Zhichao Lu: Modify target_assigner so that it creates regression targets taking keypoints into account. -- 184027183 by Zhichao Lu: Resnet V1 FPN based feature extractors for SSD meta architecture in Object Detection V2 API. -- 184004730 by Zhichao Lu: Expose a lever to override the configured mask_type. -- 183933113 by Zhichao Lu: Weight shared convolutional box predictor as described in https://arxiv.org/abs/1708.02002 -- 183929669 by Zhichao Lu: Expanding box list operations for future data augmentations. -- 183916792 by Zhichao Lu: Fix unrecognized assertion function in tests. -- 183906851 by Zhichao Lu: - Change ssd meta architecture to use regression weights to compute loss normalizer. -- 183871003 by Zhichao Lu: Fix config_util_test wrong dependency. -- 183782120 by Zhichao Lu: Add __init__ file to third_party directories. -- 183779109 by Zhichao Lu: Setup regular version s...

Merged commit includes the following changes:
184048729 by Zhichao Lu: Modify target_assigner so that it creates regression targets taking keypoints into account. -- 184027183 by Zhichao Lu: Resnet V1 FPN based feature extractors for SSD meta architecture in Object Detection V2 API. -- 184004730 by Zhichao Lu: Expose a lever to override the configured mask_type. -- 183933113 by Zhichao Lu: Weight shared convolutional box predictor as described in https://arxiv.org/abs/1708.02002 -- 183929669 by Zhichao Lu: Expanding box list operations for future data augmentations. -- 183916792 by Zhichao Lu: Fix unrecognized assertion function in tests. -- 183906851 by Zhichao Lu: - Change ssd meta architecture to use regression weights to compute loss normalizer. -- 183871003 by Zhichao Lu: Fix config_util_test wrong dependency. -- 183782120 by Zhichao Lu: Add __init__ file to third_party directories. -- 183779109 by Zhichao Lu: Setup regular version s...
7a9934df · Zhichao Lu · lzc5123016 · 7ef602be · 7a9934df · 7a9934df
Commit 7a9934df authored Jan 31, 2018 by Zhichao Lu Committed by lzc5123016 Jan 31, 2018
20 changed files
--- a/research/object_detection/meta_architectures/ssd_meta_arch_test.py
+++ b/research/object_detection/meta_architectures/ssd_meta_arch_test.py
@@ -24,6 +24,7 @@ from object_detection.core import losses
 from object_detection.core import post_processing
 from object_detection.core import region_similarity_calculator as sim_calc
 from object_detection.meta_architectures import ssd_meta_arch
+from object_detection.utils import test_case
 from object_detection.utils import test_utils

 slim = tf.contrib.slim
@@ -46,7 +47,7 @@ class FakeSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  def extract_features(self, preprocessed_inputs):
    with tf.variable_scope('mock_model'):
      features = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
-                             kernel_size=[1, 1], scope='layer1')
+                             kernel_size=1, scope='layer1')
      return [features]


@@ -64,37 +65,31 @@ class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
        tf.constant([[0, 0, .5, .5],
                     [0, .5, .5, 1],
                     [.5, 0, 1, .5],
-                     [.5, .5, 1, 1]], tf.float32))
+                     [1., 1., 1.5, 1.5]  # Anchor that is outside clip_window.
+                    ], tf.float32))

+  def num_anchors(self):
+    return 4

-class SsdMetaArchTest(tf.test.TestCase):

-  def setUp(self):
-    """Set up mock SSD model.
+class SsdMetaArchTest(test_case.TestCase):

-    Here we set up a simple mock SSD model that will always predict 4
-    detections that happen to always be exactly the anchors that are set up
-    in the above MockAnchorGenerator.  Because we let max_detections=5,
-    we will also always end up with an extra padded row in the detection
-    results.
-    """
+  def _create_model(self, apply_hard_mining=True):
    is_training = False
-    self._num_classes = 1
+    num_classes = 1
    mock_anchor_generator = MockAnchorGenerator2x2()
    mock_box_predictor = test_utils.MockBoxPredictor(
-        is_training, self._num_classes)
+        is_training, num_classes)
    mock_box_coder = test_utils.MockBoxCoder()
    fake_feature_extractor = FakeSSDFeatureExtractor()
    mock_matcher = test_utils.MockMatcher()
    region_similarity_calculator = sim_calc.IouSimilarity()

    def image_resizer_fn(image):
-      return tf.identity(image)
+      return [tf.identity(image), tf.shape(image)]

-    classification_loss = losses.WeightedSigmoidClassificationLoss(
-        anchorwise_output=True)
-    localization_loss = losses.WeightedSmoothL1LocalizationLoss(
-        anchorwise_output=True)
+    classification_loss = losses.WeightedSigmoidClassificationLoss()
+    localization_loss = losses.WeightedSmoothL1LocalizationLoss()
    non_max_suppression_fn = functools.partial(
        post_processing.batch_multiclass_non_max_suppression,
        score_thresh=-20.0,
@@ -105,48 +100,56 @@ class SsdMetaArchTest(tf.test.TestCase):
    localization_loss_weight = 1.0
    normalize_loss_by_num_matches = False

-    # This hard example miner is expected to be a no-op.
-    hard_example_miner = losses.HardExampleMiner(
-        num_hard_examples=None,
-        iou_threshold=1.0)
+    hard_example_miner = None
+    if apply_hard_mining:
+      # This hard example miner is expected to be a no-op.
+      hard_example_miner = losses.HardExampleMiner(
+          num_hard_examples=None,
+          iou_threshold=1.0)

-    self._num_anchors = 4
-    self._code_size = 4
-    self._model = ssd_meta_arch.SSDMetaArch(
+    code_size = 4
+    model = ssd_meta_arch.SSDMetaArch(
        is_training, mock_anchor_generator, mock_box_predictor, mock_box_coder,
        fake_feature_extractor, mock_matcher, region_similarity_calculator,
        image_resizer_fn, non_max_suppression_fn, tf.identity,
        classification_loss, localization_loss, classification_loss_weight,
        localization_loss_weight, normalize_loss_by_num_matches,
-        hard_example_miner)
+        hard_example_miner, add_summaries=False)
+    return model, num_classes, mock_anchor_generator.num_anchors(), code_size

-  def test_preprocess_preserves_input_shapes(self):
+  def test_preprocess_preserves_shapes_with_dynamic_input_image(self):
    image_shapes = [(3, None, None, 3),
                    (None, 10, 10, 3),
                    (None, None, None, 3)]
+    model, _, _, _ = self._create_model()
    for image_shape in image_shapes:
      image_placeholder = tf.placeholder(tf.float32, shape=image_shape)
-      preprocessed_inputs = self._model.preprocess(image_placeholder)
+      preprocessed_inputs, _ = model.preprocess(image_placeholder)
      self.assertAllEqual(preprocessed_inputs.shape.as_list(), image_shape)

-  def test_predict_results_have_correct_keys_and_shapes(self):
+  def test_preprocess_preserves_shape_with_static_input_image(self):
+    def graph_fn(input_image):
+      model, _, _, _ = self._create_model()
+      return model.preprocess(input_image)
+    input_image = np.random.rand(2, 3, 3, 3).astype(np.float32)
+    preprocessed_inputs, _ = self.execute(graph_fn, [input_image])
+    self.assertAllEqual(preprocessed_inputs.shape, [2, 3, 3, 3])
+
+  def test_predict_result_shapes_on_image_with_dynamic_shape(self):
    batch_size = 3
    image_size = 2
-    input_shapes = [(batch_size, image_size, image_size, 3),
-                    (None, image_size, image_size, 3),
+    input_shapes = [(None, image_size, image_size, 3),
                    (batch_size, None, None, 3),
                    (None, None, None, 3)]
-    expected_box_encodings_shape_out = (
-        batch_size, self._num_anchors, self._code_size)
-    expected_class_predictions_with_background_shape_out = (
-        batch_size, self._num_anchors, self._num_classes+1)

    for input_shape in input_shapes:
      tf_graph = tf.Graph()
      with tf_graph.as_default():
+        model, num_classes, num_anchors, code_size = self._create_model()
        preprocessed_input_placeholder = tf.placeholder(tf.float32,
                                                        shape=input_shape)
-        prediction_dict = self._model.predict(preprocessed_input_placeholder)
+        prediction_dict = model.predict(
+            preprocessed_input_placeholder, true_image_shapes=None)

        self.assertTrue('box_encodings' in prediction_dict)
        self.assertTrue('class_predictions_with_background' in prediction_dict)
@@ -161,12 +164,42 @@ class SsdMetaArchTest(tf.test.TestCase):
                                      preprocessed_input_placeholder:
                                      np.random.uniform(
                                          size=(batch_size, 2, 2, 3))})
+      expected_box_encodings_shape_out = (batch_size, num_anchors, code_size)
+      expected_class_predictions_with_background_shape_out = (batch_size,
+                                                              num_anchors,
+                                                              num_classes + 1)
+
      self.assertAllEqual(prediction_out['box_encodings'].shape,
                          expected_box_encodings_shape_out)
      self.assertAllEqual(
          prediction_out['class_predictions_with_background'].shape,
          expected_class_predictions_with_background_shape_out)

+  def test_predict_result_shapes_on_image_with_static_shape(self):
+
+    with tf.Graph().as_default():
+      _, num_classes, num_anchors, code_size = self._create_model()
+
+    def graph_fn(input_image):
+      model, _, _, _ = self._create_model()
+      predictions = model.predict(input_image, true_image_shapes=None)
+      return (predictions['box_encodings'],
+              predictions['class_predictions_with_background'],
+              predictions['feature_maps'],
+              predictions['anchors'])
+    batch_size = 3
+    image_size = 2
+    channels = 3
+    input_image = np.random.rand(batch_size, image_size, image_size,
+                                 channels).astype(np.float32)
+    expected_box_encodings_shape = (batch_size, num_anchors, code_size)
+    expected_class_predictions_shape = (batch_size, num_anchors, num_classes+1)
+    (box_encodings, class_predictions, _, _) = self.execute(graph_fn,
+                                                            [input_image])
+    self.assertAllEqual(box_encodings.shape, expected_box_encodings_shape)
+    self.assertAllEqual(class_predictions.shape,
+                        expected_class_predictions_shape)
+
  def test_postprocess_results_are_correct(self):
    batch_size = 2
    image_size = 2
@@ -178,26 +211,30 @@ class SsdMetaArchTest(tf.test.TestCase):
    expected_boxes = np.array([[[0, 0, .5, .5],
                                [0, .5, .5, 1],
                                [.5, 0, 1, .5],
-                                [.5, .5, 1, 1],
-                                [0, 0, 0, 0]],
+                                [0, 0, 0, 0],   # pruned prediction
+                                [0, 0, 0, 0]],  # padding
                               [[0, 0, .5, .5],
                                [0, .5, .5, 1],
                                [.5, 0, 1, .5],
-                                [.5, .5, 1, 1],
-                                [0, 0, 0, 0]]])
+                                [0, 0, 0, 0],  # pruned prediction
+                                [0, 0, 0, 0]]  # padding
+                              ])
    expected_scores = np.array([[0, 0, 0, 0, 0],
                                [0, 0, 0, 0, 0]])
    expected_classes = np.array([[0, 0, 0, 0, 0],
                                 [0, 0, 0, 0, 0]])
-    expected_num_detections = np.array([4, 4])
+    expected_num_detections = np.array([3, 3])

    for input_shape in input_shapes:
      tf_graph = tf.Graph()
      with tf_graph.as_default():
-        preprocessed_input_placeholder = tf.placeholder(tf.float32,
-                                                        shape=input_shape)
-        prediction_dict = self._model.predict(preprocessed_input_placeholder)
-        detections = self._model.postprocess(prediction_dict)
+        model, _, _, _ = self._create_model()
+        input_placeholder = tf.placeholder(tf.float32, shape=input_shape)
+        preprocessed_inputs, true_image_shapes = model.preprocess(
+            input_placeholder)
+        prediction_dict = model.predict(preprocessed_inputs,
+                                        true_image_shapes)
+        detections = model.postprocess(prediction_dict, true_image_shapes)
        self.assertTrue('detection_boxes' in detections)
        self.assertTrue('detection_scores' in detections)
        self.assertTrue('detection_classes' in detections)
@@ -207,7 +244,7 @@ class SsdMetaArchTest(tf.test.TestCase):
        sess.run(init_op)
        detections_out = sess.run(detections,
                                  feed_dict={
-                                      preprocessed_input_placeholder:
+                                      input_placeholder:
                                      np.random.uniform(
                                          size=(batch_size, 2, 2, 3))})
      self.assertAllClose(detections_out['detection_boxes'], expected_boxes)
@@ -217,47 +254,91 @@ class SsdMetaArchTest(tf.test.TestCase):
                          expected_num_detections)

  def test_loss_results_are_correct(self):
-    batch_size = 2
-    preprocessed_input = tf.random_uniform((batch_size, 2, 2, 3),
-                                           dtype=tf.float32)
-    groundtruth_boxes_list = [tf.constant([[0, 0, .5, .5]], dtype=tf.float32),
-                              tf.constant([[0, 0, .5, .5]], dtype=tf.float32)]
-    groundtruth_classes_list = [tf.constant([[1]], dtype=tf.float32),
-                                tf.constant([[1]], dtype=tf.float32)]
-    self._model.provide_groundtruth(groundtruth_boxes_list,
-                                    groundtruth_classes_list)
-    prediction_dict = self._model.predict(preprocessed_input)
-    loss_dict = self._model.loss(prediction_dict)
-
-    self.assertTrue('localization_loss' in loss_dict)
-    self.assertTrue('classification_loss' in loss_dict)

+    with tf.Graph().as_default():
+      _, num_classes, num_anchors, _ = self._create_model()
+    def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
+                 groundtruth_classes1, groundtruth_classes2):
+      groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2]
+      groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2]
+      model, _, _, _ = self._create_model(apply_hard_mining=False)
+      model.provide_groundtruth(groundtruth_boxes_list,
+                                groundtruth_classes_list)
+      prediction_dict = model.predict(preprocessed_tensor,
+                                      true_image_shapes=None)
+      loss_dict = model.loss(prediction_dict, true_image_shapes=None)
+      return (loss_dict['localization_loss'], loss_dict['classification_loss'])
+
+    batch_size = 2
+    preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
+    groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
+    groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
+    groundtruth_classes1 = np.array([[1]], dtype=np.float32)
+    groundtruth_classes2 = np.array([[1]], dtype=np.float32)
    expected_localization_loss = 0.0
-    expected_classification_loss = (batch_size * self._num_anchors
-                                    * (self._num_classes+1) * np.log(2.0))
-    init_op = tf.global_variables_initializer()
-    with self.test_session() as sess:
-      sess.run(init_op)
-      losses_out = sess.run(loss_dict)
+    expected_classification_loss = (batch_size * num_anchors
+                                    * (num_classes+1) * np.log(2.0))
+    (localization_loss,
+     classification_loss) = self.execute(graph_fn, [preprocessed_input,
+                                                    groundtruth_boxes1,
+                                                    groundtruth_boxes2,
+                                                    groundtruth_classes1,
+                                                    groundtruth_classes2])
+    self.assertAllClose(localization_loss, expected_localization_loss)
+    self.assertAllClose(classification_loss, expected_classification_loss)
+
+  def test_loss_results_are_correct_with_hard_example_mining(self):
+
+    with tf.Graph().as_default():
+      _, num_classes, num_anchors, _ = self._create_model()
+    def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
+                 groundtruth_classes1, groundtruth_classes2):
+      groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2]
+      groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2]
+      model, _, _, _ = self._create_model()
+      model.provide_groundtruth(groundtruth_boxes_list,
+                                groundtruth_classes_list)
+      prediction_dict = model.predict(preprocessed_tensor,
+                                      true_image_shapes=None)
+      loss_dict = model.loss(prediction_dict, true_image_shapes=None)
+      return (loss_dict['localization_loss'], loss_dict['classification_loss'])

-      self.assertAllClose(losses_out['localization_loss'],
-                          expected_localization_loss)
-      self.assertAllClose(losses_out['classification_loss'],
-                          expected_classification_loss)
+    batch_size = 2
+    preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
+    groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
+    groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
+    groundtruth_classes1 = np.array([[1]], dtype=np.float32)
+    groundtruth_classes2 = np.array([[1]], dtype=np.float32)
+    expected_localization_loss = 0.0
+    expected_classification_loss = (batch_size * num_anchors
+                                    * (num_classes+1) * np.log(2.0))
+    (localization_loss, classification_loss) = self.execute_cpu(
+        graph_fn, [
+            preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
+            groundtruth_classes1, groundtruth_classes2
+        ])
+    self.assertAllClose(localization_loss, expected_localization_loss)
+    self.assertAllClose(classification_loss, expected_classification_loss)

  def test_restore_map_for_detection_ckpt(self):
+    model, _, _, _ = self._create_model()
+    model.predict(tf.constant(np.array([[[0, 0], [1, 1]], [[1, 0], [0, 1]]],
+                                       dtype=np.float32)),
+                  true_image_shapes=None)
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()
    save_path = self.get_temp_dir()
    with self.test_session() as sess:
      sess.run(init_op)
      saved_model_path = saver.save(sess, save_path)
-      var_map = self._model.restore_map(from_detection_checkpoint=True)
+      var_map = model.restore_map(
+          from_detection_checkpoint=True,
+          load_all_detection_checkpoint_vars=False)
      self.assertIsInstance(var_map, dict)
      saver = tf.train.Saver(var_map)
      saver.restore(sess, saved_model_path)
      for var in sess.run(tf.report_uninitialized_variables()):
-        self.assertNotIn('FeatureExtractor', var.name)
+        self.assertNotIn('FeatureExtractor', var)

  def test_restore_map_for_classification_ckpt(self):
    # Define mock tensorflow classification graph and save variables.
@@ -271,7 +352,7 @@ class SsdMetaArchTest(tf.test.TestCase):
      init_op = tf.global_variables_initializer()
      saver = tf.train.Saver()
      save_path = self.get_temp_dir()
-      with self.test_session() as sess:
+      with self.test_session(graph=test_graph_classification) as sess:
        sess.run(init_op)
        saved_model_path = saver.save(sess, save_path)

@@ -279,19 +360,39 @@ class SsdMetaArchTest(tf.test.TestCase):
    # classification checkpoint.
    test_graph_detection = tf.Graph()
    with test_graph_detection.as_default():
+      model, _, _, _ = self._create_model()
      inputs_shape = [2, 2, 2, 3]
      inputs = tf.to_float(tf.random_uniform(
          inputs_shape, minval=0, maxval=255, dtype=tf.int32))
-      preprocessed_inputs = self._model.preprocess(inputs)
-      prediction_dict = self._model.predict(preprocessed_inputs)
-      self._model.postprocess(prediction_dict)
-      var_map = self._model.restore_map(from_detection_checkpoint=False)
+      preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      model.postprocess(prediction_dict, true_image_shapes)
+      another_variable = tf.Variable([17.0], name='another_variable')  # pylint: disable=unused-variable
+      var_map = model.restore_map(from_detection_checkpoint=False)
+      self.assertNotIn('another_variable', var_map)
      self.assertIsInstance(var_map, dict)
      saver = tf.train.Saver(var_map)
-      with self.test_session() as sess:
+      with self.test_session(graph=test_graph_detection) as sess:
        saver.restore(sess, saved_model_path)
        for var in sess.run(tf.report_uninitialized_variables()):
-          self.assertNotIn('FeatureExtractor', var.name)
+          self.assertNotIn('FeatureExtractor', var)
+
+  def test_load_all_det_checkpoint_vars(self):
+    test_graph_detection = tf.Graph()
+    with test_graph_detection.as_default():
+      model, _, _, _ = self._create_model()
+      inputs_shape = [2, 2, 2, 3]
+      inputs = tf.to_float(
+          tf.random_uniform(inputs_shape, minval=0, maxval=255, dtype=tf.int32))
+      preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      model.postprocess(prediction_dict, true_image_shapes)
+      another_variable = tf.Variable([17.0], name='another_variable')  # pylint: disable=unused-variable
+      var_map = model.restore_map(
+          from_detection_checkpoint=True,
+          load_all_detection_checkpoint_vars=True)
+      self.assertIsInstance(var_map, dict)
+      self.assertIn('another_variable', var_map)


 if __name__ == '__main__':

--- a/research/object_detection/metrics/BUILD
+++ b/research/object_detection/metrics/BUILD
@@ -8,6 +8,57 @@ licenses(["notice"])

 # Apache 2.0

+py_library(
+    name = "coco_tools",
+    srcs = [
+        "coco_tools.py",
+    ],
+    deps = [
+        "//file/localfile",
+        "//file/placer",
+        "//pycocotools",
+        "//tensorflow",
+        "//tensorflow/models/research/object_detection/utils:json_utils",
+    ],
+)
+
+py_test(
+    name = "coco_tools_test",
+    srcs = [
+        "coco_tools_test.py",
+    ],
+    deps = [
+        ":coco_tools",
+        "//testing/pybase",
+        "//numpy",
+    ],
+)
+
+py_library(
+    name = "coco_evaluation",
+    srcs = [
+        "coco_evaluation.py",
+    ],
+    deps = [
+        ":coco_tools",
+        "//tensorflow",
+        "//tensorflow/models/research/object_detection/core:standard_fields",
+        "//tensorflow/models/research/object_detection/utils:object_detection_evaluation",
+    ],
+)
+
+py_test(
+    name = "coco_evaluation_test",
+    srcs = [
+        "coco_evaluation_test.py",
+    ],
+    deps = [
+        ":coco_evaluation",
+        "//tensorflow",
+        "//tensorflow/models/research/object_detection/core:standard_fields",
+    ],
+)
+
 py_binary(
    name = "offline_eval_map_corloc",
    srcs = [
@@ -15,11 +66,11 @@ py_binary(
    ],
    deps = [
        ":tf_example_parser",
-        "//tensorflow_models/object_detection:evaluator",
-        "//tensorflow_models/object_detection/builders:input_reader_builder",
-        "//tensorflow_models/object_detection/core:standard_fields",
-        "//tensorflow_models/object_detection/utils:config_util",
-        "//tensorflow_models/object_detection/utils:label_map_util",
+        "//tensorflow/models/research/object_detection:evaluator",
+        "//tensorflow/models/research/object_detection/builders:input_reader_builder",
+        "//tensorflow/models/research/object_detection/core:standard_fields",
+        "//tensorflow/models/research/object_detection/utils:config_util",
+        "//tensorflow/models/research/object_detection/utils:label_map_util",
    ],
 )

@@ -39,8 +90,8 @@ py_library(
    srcs = ["tf_example_parser.py"],
    deps = [
        "//tensorflow",
-        "//tensorflow_models/object_detection/core:data_parser",
-        "//tensorflow_models/object_detection/core:standard_fields",
+        "//tensorflow/models/research/object_detection/core:data_parser",
+        "//tensorflow/models/research/object_detection/core:standard_fields",
    ],
 )

@@ -50,6 +101,6 @@ py_test(
    deps = [
        ":tf_example_parser",
        "//tensorflow",
-        "//tensorflow_models/object_detection/core:standard_fields",
+        "//tensorflow/models/research/object_detection/core:standard_fields",
    ],
 )
--- a/research/object_detection/metrics/__init__.py
+++ b/research/object_detection/metrics/__init__.py
--- a/research/object_detection/metrics/coco_evaluation.py
+++ b/research/object_detection/metrics/coco_evaluation.py
+"""Class for evaluating object detections with COCO metrics."""
+import numpy as np
+import tensorflow as tf
+
+from object_detection.core import standard_fields
+from object_detection.metrics import coco_tools
+from object_detection.utils import object_detection_evaluation
+
+
+class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
+  """Class to evaluate COCO detection metrics."""
+
+  def __init__(self, categories, all_metrics_per_category=False):
+    """Initializes an evaluator with empty groundtruth/detection state.
+
+    Args:
+      categories: A list of dicts, each of which has the following keys -
+        'id': (required) an integer id uniquely identifying this category.
+        'name': (required) string representing category name e.g., 'cat', 'dog'.
+      all_metrics_per_category: Whether to include all the summary metrics for
+        each category in per_category_ap. Be careful with setting it to true if
+        you have more than a handful of categories, because the number of
+        reported metrics grows with the number of categories and will clutter
+        your metrics dashboard.
+    """
+    super(CocoDetectionEvaluator, self).__init__(categories)
+    self._all_metrics_per_category = all_metrics_per_category
+    self._metrics = None
+    # Annotation ids handed to the COCO exporter start at 1 and grow by the
+    # number of groundtruth boxes of every image that gets added.
+    self._annotation_id = 1
+    # Reads self._categories -- presumably populated by the base-class
+    # constructor from `categories`; confirm in DetectionEvaluator.
+    self._category_id_set = {cat['id'] for cat in self._categories}
+    self._groundtruth_list = []
+    self._detection_boxes_list = []
+    # Maps each unique image id to a Boolean that records whether a detection
+    # has already been added for that image.
+    self._image_ids = {}
+
+  def clear(self):
+    """Drops all accumulated groundtruth and detections.
+
+    The image-id bookkeeping dict is emptied in place, while the groundtruth
+    and detection lists are rebound to fresh empty lists. The running
+    annotation id counter is not reset by this method.
+    """
+    self._detection_boxes_list = []
+    self._groundtruth_list = []
+    self._image_ids.clear()
+
+  def add_single_ground_truth_image_info(self,
+                                         image_id,
+                                         groundtruth_dict):
+    """Registers the groundtruth of one image for later evaluation.
+
+    The boxes and classes are converted with
+    coco_tools.ExportSingleImageGroundtruthToCoco and appended to the
+    accumulated groundtruth list. If the image id was registered before, a
+    warning is logged and the new groundtruth is dropped.
+
+    Args:
+      image_id: A unique string/integer identifier for the image.
+      groundtruth_dict: A dictionary containing -
+        InputDataFields.groundtruth_boxes: float32 numpy array of shape
+          [num_boxes, 4] containing `num_boxes` groundtruth boxes of the format
+          [ymin, xmin, ymax, xmax] in absolute image coordinates.
+        InputDataFields.groundtruth_classes: integer numpy array of shape
+          [num_boxes] containing 1-indexed groundtruth classes for the boxes.
+    """
+    if image_id in self._image_ids:
+      tf.logging.warning('Ignoring ground truth with image id %s since it was '
+                         'previously added', image_id)
+      return
+
+    input_fields = standard_fields.InputDataFields
+    boxes = groundtruth_dict[input_fields.groundtruth_boxes]
+    classes = groundtruth_dict[input_fields.groundtruth_classes]
+    coco_annotations = coco_tools.ExportSingleImageGroundtruthToCoco(
+        image_id=image_id,
+        next_annotation_id=self._annotation_id,
+        category_id_set=self._category_id_set,
+        groundtruth_boxes=boxes,
+        groundtruth_classes=classes)
+    self._groundtruth_list.extend(coco_annotations)
+    # Advance the counter by the number of boxes so that the next image is
+    # exported with a fresh starting annotation id.
+    self._annotation_id += boxes.shape[0]
+    # False: groundtruth is known, but no detection has been added yet.
+    self._image_ids[image_id] = False
+
+  def add_single_detected_image_info(self,
+                                     image_id,
+                                     detections_dict):
+    """Registers the detections of one image for later evaluation.
+
+    The boxes, scores and classes are converted with
+    coco_tools.ExportSingleImageDetectionBoxesToCoco and appended to the
+    accumulated detection list. If detections were already added for this
+    image id, a warning is logged and the new detections are dropped.
+
+    Args:
+      image_id: A unique string/integer identifier for the image.
+      detections_dict: A dictionary containing -
+        DetectionResultFields.detection_boxes: float32 numpy array of shape
+          [num_boxes, 4] containing `num_boxes` detection boxes of the format
+          [ymin, xmin, ymax, xmax] in absolute image coordinates.
+        DetectionResultFields.detection_scores: float32 numpy array of shape
+          [num_boxes] containing detection scores for the boxes.
+        DetectionResultFields.detection_classes: integer numpy array of shape
+          [num_boxes] containing 1-indexed detection classes for the boxes.
+        DetectionResultFields.detection_masks: optional uint8 numpy array of
+          shape [num_boxes, image_height, image_width] containing instance
+          masks for the boxes. This method does not read the masks.
+
+    Raises:
+      ValueError: If groundtruth for the image_id is not available.
+    """
+    if image_id not in self._image_ids:
+      raise ValueError('Missing groundtruth for image id: {}'.format(image_id))
+
+    detection_already_added = self._image_ids[image_id]
+    if detection_already_added:
+      tf.logging.warning('Ignoring detection with image id %s since it was '
+                         'previously added', image_id)
+      return
+
+    result_fields = standard_fields.DetectionResultFields
+    boxes = detections_dict[result_fields.detection_boxes]
+    scores = detections_dict[result_fields.detection_scores]
+    classes = detections_dict[result_fields.detection_classes]
+    coco_detections = coco_tools.ExportSingleImageDetectionBoxesToCoco(
+        image_id=image_id,
+        category_id_set=self._category_id_set,
+        detection_boxes=boxes,
+        detection_scores=scores,
+        detection_classes=classes)
+    self._detection_boxes_list.extend(coco_detections)
+    # True: a detection is now on record for this image id.
+    self._image_ids[image_id] = True
+
+  def evaluate(self):
+    """Evaluates the detection boxes and returns a dictionary of coco metrics.
+
+    The accumulated groundtruth and detections are wrapped with
+    coco_tools.COCOWrapper / COCOEvalWrapper, the metrics are computed, and
+    every resulting key is prefixed with 'DetectionBoxes_'.
+
+    Returns:
+      A dictionary holding -
+
+      1. summary_metrics:
+      'DetectionBoxes_Precision/mAP': mean average precision over classes
+        averaged over IOU thresholds ranging from .5 to .95 with .05
+        increments.
+      'DetectionBoxes_Precision/mAP@.50IOU': mean average precision at 50% IOU
+      'DetectionBoxes_Precision/mAP@.75IOU': mean average precision at 75% IOU
+      'DetectionBoxes_Precision/mAP (small)': mean average precision for small
+        objects (area < 32^2 pixels).
+      'DetectionBoxes_Precision/mAP (medium)': mean average precision for
+        medium sized objects (32^2 pixels < area < 96^2 pixels).
+      'DetectionBoxes_Precision/mAP (large)': mean average precision for large
+        objects (96^2 pixels < area < 10000^2 pixels).
+      'DetectionBoxes_Recall/AR@1': average recall with 1 detection.
+      'DetectionBoxes_Recall/AR@10': average recall with 10 detections.
+      'DetectionBoxes_Recall/AR@100': average recall with 100 detections.
+      'DetectionBoxes_Recall/AR@100 (small)': average recall for small objects
+        with 100 detections.
+      'DetectionBoxes_Recall/AR@100 (medium)': average recall for medium objects
+        with 100 detections.
+      'DetectionBoxes_Recall/AR@100 (large)': average recall for large objects
+        with 100 detections.
+
+      2. per_category_ap: category specific results with keys of the form:
+      'Precision mAP ByCategory/category' (without the supercategory part if
+      no supercategories exist). For backward compatibility
+      'PerformanceByCategory' is included in the output regardless of
+      all_metrics_per_category.
+    """
+    groundtruth_dict = {
+        'annotations': self._groundtruth_list,
+        'images': [{'id': image_id} for image_id in self._image_ids],
+        'categories': self._categories
+    }
+    coco_wrapped_groundtruth = coco_tools.COCOWrapper(groundtruth_dict)
+    coco_wrapped_detections = coco_wrapped_groundtruth.LoadAnnotations(
+        self._detection_boxes_list)
+    box_evaluator = coco_tools.COCOEvalWrapper(
+        coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False)
+    box_metrics, box_per_category_ap = box_evaluator.ComputeMetrics(
+        all_metrics_per_category=self._all_metrics_per_category)
+    # Merge the per-category results into the summary metrics before all keys
+    # receive the common prefix.
+    box_metrics.update(box_per_category_ap)
+    # dict.items() works on both Python 2 and Python 3; dict.iteritems() only
+    # exists on Python 2 and raises AttributeError on Python 3.
+    box_metrics = {'DetectionBoxes_' + key: value
+                   for key, value in box_metrics.items()}
+    return box_metrics
+
+  def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes,
+                                    groundtruth_classes, detection_boxes,
+                                    detection_scores, detection_classes):
+    """Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
+
+    Note that once value_op is called, the detections and groundtruth added via
+    update_op are cleared.
+
+    The ops are thin `tf.py_func` wrappers around this evaluator's Python
+    methods: the shared update op feeds one image worth of groundtruth and
+    detections into the evaluator, the value op of the first metric runs
+    `evaluate()` and caches the result, and the value ops of all other metrics
+    read from that cache.
+
+    Args:
+      image_id: Unique string/integer identifier for the image.
+      groundtruth_boxes: float32 tensor of shape [num_boxes, 4] containing
+        `num_boxes` groundtruth boxes of the format
+        [ymin, xmin, ymax, xmax] in absolute image coordinates.
+      groundtruth_classes: int32 tensor of shape [num_boxes] containing
+        1-indexed groundtruth classes for the boxes.
+      detection_boxes: float32 tensor of shape [num_boxes, 4] containing
+        `num_boxes` detection boxes of the format [ymin, xmin, ymax, xmax]
+        in absolute image coordinates.
+      detection_scores: float32 tensor of shape [num_boxes] containing
+        detection scores for the boxes.
+      detection_classes: int32 tensor of shape [num_boxes] containing
+        1-indexed detection classes for the boxes.
+
+    Returns:
+      a dictionary of metric names to tuple of value_op and update_op that can
+      be used as eval metric ops in tf.EstimatorSpec. Note that all update ops
+      must be run together and similarly all value ops must be run together to
+      guarantee correct behaviour.
+    """
+    # Python-side callback executed by the update op. Groundtruth is recorded
+    # before the detections of the same image id.
+    def update_op(
+        image_id,
+        groundtruth_boxes,
+        groundtruth_classes,
+        detection_boxes,
+        detection_scores,
+        detection_classes):
+      self.add_single_ground_truth_image_info(
+          image_id,
+          {'groundtruth_boxes': groundtruth_boxes,
+           'groundtruth_classes': groundtruth_classes})
+      self.add_single_detected_image_info(
+          image_id,
+          {'detection_boxes': detection_boxes,
+           'detection_scores': detection_scores,
+           'detection_classes': detection_classes})
+
+    # Wrap the callback as a graph op. The empty output-type list means the op
+    # yields no tensors. This rebinds the name `update_op` from the Python
+    # function above to the resulting TensorFlow op.
+    update_op = tf.py_func(update_op, [image_id,
+                                       groundtruth_boxes,
+                                       groundtruth_classes,
+                                       detection_boxes,
+                                       detection_scores,
+                                       detection_classes], [])
+    # These names are used to index the dictionary returned by evaluate(),
+    # which prefixes every key with 'DetectionBoxes_'.
+    metric_names = ['DetectionBoxes_Precision/mAP',
+                    'DetectionBoxes_Precision/mAP@.50IOU',
+                    'DetectionBoxes_Precision/mAP@.75IOU',
+                    'DetectionBoxes_Precision/mAP (large)',
+                    'DetectionBoxes_Precision/mAP (medium)',
+                    'DetectionBoxes_Precision/mAP (small)',
+                    'DetectionBoxes_Recall/AR@1',
+                    'DetectionBoxes_Recall/AR@10',
+                    'DetectionBoxes_Recall/AR@100',
+                    'DetectionBoxes_Recall/AR@100 (large)',
+                    'DetectionBoxes_Recall/AR@100 (medium)',
+                    'DetectionBoxes_Recall/AR@100 (small)']
+    # One additional per-category mAP metric for every configured category.
+    for category_dict in self._categories:
+      metric_names.append('DetectionBoxes_PerformanceByCategory/mAP/' +
+                          category_dict['name'])
+
+    # Runs the evaluation once, caches all metrics on self._metrics and then
+    # resets the accumulated state via clear(); returns the first metric.
+    def first_value_func():
+      self._metrics = self.evaluate()
+      self.clear()
+      return np.float32(self._metrics[metric_names[0]])
+
+    # Factory so that each closure captures its own metric_name rather than
+    # the loop variable of the loop below.
+    def value_func_factory(metric_name):
+      def value_func():
+        return np.float32(self._metrics[metric_name])
+      return value_func
+
+    first_value_op = tf.py_func(first_value_func, [], tf.float32)
+    eval_metric_ops = {metric_names[0]: (first_value_op, update_op)}
+    # The remaining value ops only read the cached self._metrics, so they are
+    # created under a control dependency on first_value_op, which fills it.
+    with tf.control_dependencies([first_value_op]):
+      for metric_name in metric_names[1:]:
+        eval_metric_ops[metric_name] = (tf.py_func(
+            value_func_factory(metric_name), [], np.float32), update_op)
+    return eval_metric_ops
+
+
+def _check_mask_type_and_value(array_name, masks):
+  """Checks whether mask dtype is uint8 anf the values are either 0 or 1."""
+  if masks.dtype != np.uint8:
+    raise ValueError('{} must be of type np.uint8. Found {}.'.format(
+        array_name, masks.dtype))
+  if np.any(np.logical_and(masks != 0, masks != 1)):
+    raise ValueError('{} elements can only be either 0 or 1.'.format(
+        array_name))
+
+
+class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
+  """Class to evaluate COCO detection metrics."""
+
+  def __init__(self, categories):
+    """Constructor.
+
+    Args:
+      categories: A list of dicts, each of which has the following keys -
+        'id': (required) an integer id uniquely identifying this category.
+        'name': (required) string representing category name e.g., 'cat', 'dog'.
+    """
+    super(CocoMaskEvaluator, self).__init__(categories)
+    self._image_id_to_mask_shape_map = {}
+    self._image_ids_with_detections = set([])
+    self._groundtruth_list = []
+    self._detection_masks_list = []
+    self._category_id_set = set([cat['id'] for cat in self._categories])
+    self._annotation_id = 1
+
+  def clear(self):
+    """Clears the state to prepare for a fresh evaluation."""
+    self._image_id_to_mask_shape_map.clear()
+    self._image_ids_with_detections.clear()
+    self._groundtruth_list = []
+    self._detection_masks_list = []
+
+  def add_single_ground_truth_image_info(self,
+                                         image_id,
+                                         groundtruth_dict):
+    """Adds groundtruth for a single image to be used for evaluation.
+
+    Args:
+      image_id: A unique string/integer identifier for the image.
+      groundtruth_dict: A dictionary containing -
+        InputDataFields.groundtruth_boxes: float32 numpy array of shape
+          [num_boxes, 4] containing `num_boxes` groundtruth boxes of the format
+          [ymin, xmin, ymax, xmax] in absolute image coordinates.
+        InputDataFields.groundtruth_classes: integer numpy array of shape
+          [num_boxes] containing 1-indexed groundtruth classes for the boxes.
+        InputDataFields.groundtruth_instance_masks: uint8 numpy array of shape
+          [num_boxes, image_height, image_width] containing groundtruth masks
+          corresponding to the boxes. The elements of the array must be in
+          {0, 1}.
+    """
+    if image_id in self._image_id_to_mask_shape_map:
+      tf.logging.warning('Ignoring ground truth with image id %s since it was '
+                         'previously added', image_id)
+      return
+
+    groundtruth_instance_masks = groundtruth_dict[
+        standard_fields.InputDataFields.groundtruth_instance_masks]
+    _check_mask_type_and_value(standard_fields.InputDataFields.
+                               groundtruth_instance_masks,
+                               groundtruth_instance_masks)
+    self._groundtruth_list.extend(
+        coco_tools.
+        ExportSingleImageGroundtruthToCoco(
+            image_id=image_id,
+            next_annotation_id=self._annotation_id,
+            category_id_set=self._category_id_set,
+            groundtruth_boxes=groundtruth_dict[standard_fields.InputDataFields.
+                                               groundtruth_boxes],
+            groundtruth_classes=groundtruth_dict[standard_fields.
+                                                 InputDataFields.
+                                                 groundtruth_classes],
+            groundtruth_masks=groundtruth_instance_masks))
+    self._annotation_id += groundtruth_dict[standard_fields.InputDataFields.
+                                            groundtruth_boxes].shape[0]
+    self._image_id_to_mask_shape_map[image_id] = groundtruth_dict[
+        standard_fields.InputDataFields.groundtruth_instance_masks].shape
+
+  def add_single_detected_image_info(self,
+                                     image_id,
+                                     detections_dict):
+    """Adds detections for a single image to be used for evaluation.
+
+    Args:
+      image_id: A unique string/integer identifier for the image.
+      detections_dict: A dictionary containing -
+        DetectionResultFields.detection_scores: float32 numpy array of shape
+          [num_boxes] containing detection scores for the boxes.
+        DetectionResultFields.detection_classes: integer numpy array of shape
+          [num_boxes] containing 1-indexed detection classes for the boxes.
+        DetectionResultFields.detection_masks: optional uint8 numpy array of
+          shape [num_boxes, image_height, image_width] containing instance
+          masks corresponding to the boxes. The elements of the array must be
+          in {0, 1}.
+
+    Raises:
+      ValueError: If groundtruth for the image_id is not available or if
+        spatial shapes of groundtruth_instance_masks and detection_masks are
+        incompatible.
+    """
+    if image_id not in self._image_id_to_mask_shape_map:
+      raise ValueError('Missing groundtruth for image id: {}'.format(image_id))
+
+    if image_id in self._image_ids_with_detections:
+      tf.logging.warning('Ignoring detection with image id %s since it was '
+                         'previously added', image_id)
+      return
+
+    groundtruth_masks_shape = self._image_id_to_mask_shape_map[image_id]
+    detection_masks = detections_dict[standard_fields.DetectionResultFields.
+                                      detection_masks]
+    if groundtruth_masks_shape[1:] != detection_masks.shape[1:]:
+      raise ValueError('Spatial shape of groundtruth masks and detection masks '
+                       'are incompatible: {} vs {}'.format(
+                           groundtruth_masks_shape,
+                           detection_masks.shape))
+    _check_mask_type_and_value(standard_fields.DetectionResultFields.
+                               detection_masks,
+                               detection_masks)
+    self._detection_masks_list.extend(
+        coco_tools.ExportSingleImageDetectionMasksToCoco(
+            image_id=image_id,
+            category_id_set=self._category_id_set,
+            detection_masks=detection_masks,
+            detection_scores=detections_dict[standard_fields.
+                                             DetectionResultFields.
+                                             detection_scores],
+            detection_classes=detections_dict[standard_fields.
+                                              DetectionResultFields.
+                                              detection_classes]))
+    self._image_ids_with_detections.update([image_id])
+
+  def evaluate(self):
+    """Evaluates the detection masks and returns a dictionary of coco metrics.
+
+    Returns:
+      A dictionary holding -
+
+      1. summary_metrics:
+      'Precision/mAP': mean average precision over classes averaged over IOU
+        thresholds ranging from .5 to .95 with .05 increments
+      'Precision/mAP@.50IOU': mean average precision at 50% IOU
+      'Precision/mAP@.75IOU': mean average precision at 75% IOU
+      'Precision/mAP (small)': mean average precision for small objects
+                      (area < 32^2 pixels)
+      'Precision/mAP (medium)': mean average precision for medium sized
+                      objects (32^2 pixels < area < 96^2 pixels)
+      'Precision/mAP (large)': mean average precision for large objects
+                      (96^2 pixels < area < 10000^2 pixels)
+      'Recall/AR@1': average recall with 1 detection
+      'Recall/AR@10': average recall with 10 detections
+      'Recall/AR@100': average recall with 100 detections
+      'Recall/AR@100 (small)': average recall for small objects with 100
+        detections
+      'Recall/AR@100 (medium)': average recall for medium objects with 100
+        detections
+      'Recall/AR@100 (large)': average recall for large objects with 100
+        detections
+
+      2. per_category_ap: category specific results with keys of the form:
+      'Precision mAP ByCategory/category' (without the supercategory part if
+      no supercategories exist). For backward compatibility
+      'PerformanceByCategory' is included in the output regardless of
+      all_metrics_per_category.
+    """
+    groundtruth_dict = {
+        'annotations': self._groundtruth_list,
+        'images': [{'id': image_id, 'height': shape[1], 'width': shape[2]}
+                   for image_id, shape in self._image_id_to_mask_shape_map.
+                   iteritems()],
+        'categories': self._categories
+    }
+    coco_wrapped_groundtruth = coco_tools.COCOWrapper(
+        groundtruth_dict, detection_type='segmentation')
+    coco_wrapped_detection_masks = coco_wrapped_groundtruth.LoadAnnotations(
+        self._detection_masks_list)
+    mask_evaluator = coco_tools.COCOEvalWrapper(
+        coco_wrapped_groundtruth, coco_wrapped_detection_masks,
+        agnostic_mode=False, iou_type='segm')
+    mask_metrics, mask_per_category_ap = mask_evaluator.ComputeMetrics()
+    mask_metrics.update(mask_per_category_ap)
+    mask_metrics = {'DetectionMasks_'+ key: value
+                    for key, value in mask_metrics.iteritems()}
+    return mask_metrics
--- a/research/object_detection/metrics/coco_evaluation_test.py
+++ b/research/object_detection/metrics/coco_evaluation_test.py
+"""Tests for image.understanding.object_detection.metrics.coco_evaluation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+import tensorflow as tf
+from object_detection.core import standard_fields
+from object_detection.metrics import coco_evaluation
+
+
+class CocoDetectionEvaluationTest(tf.test.TestCase):
+
+  def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
+    """Tests that mAP is calculated correctly on GT and Detections."""
+    category_list = [{'id': 0, 'name': 'person'},
+                     {'id': 1, 'name': 'cat'},
+                     {'id': 2, 'name': 'dog'}]
+    coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image1',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1])
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image1',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1])
+        })
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image2',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[50., 50., 100., 100.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1])
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image2',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[50., 50., 100., 100.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1])
+        })
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image3',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[25., 25., 50., 50.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1])
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image3',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[25., 25., 50., 50.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1])
+        })
+    metrics = coco_evaluator.evaluate()
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
+
+  def testReturnAllMetricsPerCategory(self):
+    """Tests that mAP is calculated correctly on GT and Detections."""
+    category_list = [{'id': 0, 'name': 'person'}]
+    coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
+        category_list, all_metrics_per_category=True)
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image1',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1])
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image1',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1])
+        })
+    metrics = coco_evaluator.evaluate()
+    expected_metrics = [
+        'DetectionBoxes_Recall AR@10 ByCategory/person',
+        'DetectionBoxes_Precision mAP (medium) ByCategory/person',
+        'DetectionBoxes_Precision mAP ByCategory/person',
+        'DetectionBoxes_Precision mAP@.50IOU ByCategory/person',
+        'DetectionBoxes_Precision mAP (small) ByCategory/person',
+        'DetectionBoxes_Precision mAP (large) ByCategory/person',
+        'DetectionBoxes_Recall AR@1 ByCategory/person',
+        'DetectionBoxes_Precision mAP@.75IOU ByCategory/person',
+        'DetectionBoxes_Recall AR@100 ByCategory/person',
+        'DetectionBoxes_Recall AR@100 (medium) ByCategory/person',
+        'DetectionBoxes_Recall AR@100 (large) ByCategory/person']
+    self.assertTrue(set(expected_metrics).issubset(set(metrics)))
+
+  def testRejectionOnDuplicateGroundtruth(self):
+    """Tests that groundtruth cannot be added more than once for an image."""
+    categories = [{'id': 1, 'name': 'cat'},
+                  {'id': 2, 'name': 'dog'},
+                  {'id': 3, 'name': 'elephant'}]
+    #  Add groundtruth
+    coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
+    image_key1 = 'img1'
+    groundtruth_boxes1 = np.array([[0, 0, 1, 1], [0, 0, 2, 2], [0, 0, 3, 3]],
+                                  dtype=float)
+    groundtruth_class_labels1 = np.array([1, 3, 1], dtype=int)
+    coco_evaluator.add_single_ground_truth_image_info(image_key1, {
+        standard_fields.InputDataFields.groundtruth_boxes:
+            groundtruth_boxes1,
+        standard_fields.InputDataFields.groundtruth_classes:
+            groundtruth_class_labels1
+    })
+    groundtruth_lists_len = len(coco_evaluator._groundtruth_list)
+
+    # Add groundtruth with the same image id.
+    coco_evaluator.add_single_ground_truth_image_info(image_key1, {
+        standard_fields.InputDataFields.groundtruth_boxes:
+            groundtruth_boxes1,
+        standard_fields.InputDataFields.groundtruth_classes:
+            groundtruth_class_labels1
+    })
+    self.assertEqual(groundtruth_lists_len,
+                     len(coco_evaluator._groundtruth_list))
+
+  def testRejectionOnDuplicateDetections(self):
+    """Tests that detections cannot be added more than once for an image."""
+    categories = [{'id': 1, 'name': 'cat'},
+                  {'id': 2, 'name': 'dog'},
+                  {'id': 3, 'name': 'elephant'}]
+    #  Add groundtruth
+    coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image1',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[99., 100., 200., 200.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1])
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image1',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1])
+        })
+    detections_lists_len = len(coco_evaluator._detection_boxes_list)
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image1',  # Note that this image id was previously added.
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1])
+        })
+    self.assertEqual(detections_lists_len,
+                     len(coco_evaluator._detection_boxes_list))
+
+  def testExceptionRaisedWithMissingGroundtruth(self):
+    """Tests that exception is raised for detection with missing groundtruth."""
+    categories = [{'id': 1, 'name': 'cat'},
+                  {'id': 2, 'name': 'dog'},
+                  {'id': 3, 'name': 'elephant'}]
+    coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories)
+    with self.assertRaises(ValueError):
+      coco_evaluator.add_single_detected_image_info(
+          image_id='image1',
+          detections_dict={
+              standard_fields.DetectionResultFields.detection_boxes:
+                  np.array([[100., 100., 200., 200.]]),
+              standard_fields.DetectionResultFields.detection_scores:
+                  np.array([.8]),
+              standard_fields.DetectionResultFields.detection_classes:
+                  np.array([1])
+          })
+
+
+class CocoEvaluationPyFuncTest(tf.test.TestCase):
+
+  def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
+    category_list = [{'id': 0, 'name': 'person'},
+                     {'id': 1, 'name': 'cat'},
+                     {'id': 2, 'name': 'dog'}]
+    coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
+    image_id = tf.placeholder(tf.string, shape=())
+    groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
+    groundtruth_classes = tf.placeholder(tf.float32, shape=(None))
+    detection_boxes = tf.placeholder(tf.float32, shape=(None, 4))
+    detection_scores = tf.placeholder(tf.float32, shape=(None))
+    detection_classes = tf.placeholder(tf.float32, shape=(None))
+
+    eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
+        image_id, groundtruth_boxes,
+        groundtruth_classes,
+        detection_boxes,
+        detection_scores,
+        detection_classes)
+
+    _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
+
+    with self.test_session() as sess:
+      sess.run(update_op,
+               feed_dict={
+                   image_id: 'image1',
+                   groundtruth_boxes: np.array([[100., 100., 200., 200.]]),
+                   groundtruth_classes: np.array([1]),
+                   detection_boxes: np.array([[100., 100., 200., 200.]]),
+                   detection_scores: np.array([.8]),
+                   detection_classes: np.array([1])
+               })
+      sess.run(update_op,
+               feed_dict={
+                   image_id: 'image2',
+                   groundtruth_boxes: np.array([[50., 50., 100., 100.]]),
+                   groundtruth_classes: np.array([3]),
+                   detection_boxes: np.array([[50., 50., 100., 100.]]),
+                   detection_scores: np.array([.7]),
+                   detection_classes: np.array([3])
+               })
+      sess.run(update_op,
+               feed_dict={
+                   image_id: 'image3',
+                   groundtruth_boxes: np.array([[25., 25., 50., 50.]]),
+                   groundtruth_classes: np.array([2]),
+                   detection_boxes: np.array([[25., 25., 50., 50.]]),
+                   detection_scores: np.array([.9]),
+                   detection_classes: np.array([2])
+               })
+    metrics = {}
+    for key, (value_op, _) in eval_metric_ops.iteritems():
+      metrics[key] = value_op
+    metrics = sess.run(metrics)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
+                           -1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
+                           -1.0)
+    self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
+    self.assertAlmostEqual(metrics[
+        'DetectionBoxes_PerformanceByCategory/mAP/dog'], 1.0)
+    self.assertAlmostEqual(metrics[
+        'DetectionBoxes_PerformanceByCategory/mAP/cat'], 1.0)
+    self.assertTrue(math.isnan(metrics[
+        'DetectionBoxes_PerformanceByCategory/mAP/person']))
+    self.assertFalse(coco_evaluator._groundtruth_list)
+    self.assertFalse(coco_evaluator._detection_boxes_list)
+    self.assertFalse(coco_evaluator._image_ids)
+
+
+class CocoMaskEvaluationTest(tf.test.TestCase):
+
+  def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
+    category_list = [{'id': 0, 'name': 'person'},
+                     {'id': 1, 'name': 'cat'},
+                     {'id': 2, 'name': 'dog'}]
+    coco_evaluator = coco_evaluation.CocoMaskEvaluator(category_list)
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image1',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1]),
+            standard_fields.InputDataFields.groundtruth_instance_masks:
+            np.pad(np.ones([1, 100, 100], dtype=np.uint8),
+                   ((0, 0), (10, 10), (10, 10)), mode='constant')
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image1',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[100., 100., 200., 200.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1]),
+            standard_fields.DetectionResultFields.detection_masks:
+            np.pad(np.ones([1, 100, 100], dtype=np.uint8),
+                   ((0, 0), (10, 10), (10, 10)), mode='constant')
+        })
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image2',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[50., 50., 100., 100.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1]),
+            standard_fields.InputDataFields.groundtruth_instance_masks:
+            np.pad(np.ones([1, 50, 50], dtype=np.uint8),
+                   ((0, 0), (10, 10), (10, 10)), mode='constant')
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image2',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[50., 50., 100., 100.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1]),
+            standard_fields.DetectionResultFields.detection_masks:
+            np.pad(np.ones([1, 50, 50], dtype=np.uint8),
+                   ((0, 0), (10, 10), (10, 10)), mode='constant')
+        })
+    coco_evaluator.add_single_ground_truth_image_info(
+        image_id='image3',
+        groundtruth_dict={
+            standard_fields.InputDataFields.groundtruth_boxes:
+            np.array([[25., 25., 50., 50.]]),
+            standard_fields.InputDataFields.groundtruth_classes: np.array([1]),
+            standard_fields.InputDataFields.groundtruth_instance_masks:
+            np.pad(np.ones([1, 25, 25], dtype=np.uint8),
+                   ((0, 0), (10, 10), (10, 10)), mode='constant')
+        })
+    coco_evaluator.add_single_detected_image_info(
+        image_id='image3',
+        detections_dict={
+            standard_fields.DetectionResultFields.detection_boxes:
+            np.array([[25., 25., 50., 50.]]),
+            standard_fields.DetectionResultFields.detection_scores:
+            np.array([.8]),
+            standard_fields.DetectionResultFields.detection_classes:
+            np.array([1]),
+            standard_fields.DetectionResultFields.detection_masks:
+            np.pad(np.ones([1, 25, 25], dtype=np.uint8),
+                   ((0, 0), (10, 10), (10, 10)), mode='constant')
+        })
+    metrics = coco_evaluator.evaluate()
+    self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0)
+    coco_evaluator.clear()
+    self.assertFalse(coco_evaluator._image_id_to_mask_shape_map)
+    self.assertFalse(coco_evaluator._image_ids_with_detections)
+    self.assertFalse(coco_evaluator._groundtruth_list)
+    self.assertFalse(coco_evaluator._detection_masks_list)
+
+
+# Run the test cases of this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/object_detection/metrics/coco_tools.py
+++ b/research/object_detection/metrics/coco_tools.py
+"""Wrappers for third party pycocotools to be used within i/u/object_detection.
+
+Note that nothing in this file is tensorflow related and thus cannot
+be called directly as a slim metric, for example.
+
+TODO: wrap as a slim metric in metrics.py
+
+
+Usage example: given a set of images with ids in the list image_ids
+and corresponding lists of numpy arrays encoding groundtruth (boxes and classes)
+and detections (boxes, scores and classes), where elements of each list
+correspond to detections/annotations of a single image,
+then evaluation (in multi-class mode) can be invoked as follows:
+
+  groundtruth_dict = coco_tools.ExportGroundtruthToCOCO(
+      image_ids, groundtruth_boxes_list, groundtruth_classes_list,
+      categories, output_path=None)
+  detections_list = coco_tools.ExportDetectionsToCOCO(
+      image_ids, detection_boxes_list, detection_scores_list,
+      detection_classes_list, categories, output_path=None)
+  groundtruth = coco_tools.COCOWrapper(groundtruth_dict)
+  detections = groundtruth.LoadAnnotations(detections_list)
+  evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections,
+                                         agnostic_mode=False)
+  metrics = evaluator.ComputeMetrics()
+
+"""
+from collections import OrderedDict
+import copy
+import time
+import numpy as np
+
+from pycocotools import coco
+from pycocotools import cocoeval
+from pycocotools import mask
+
+import tensorflow as tf
+
+from object_detection.utils import json_utils
+
+
+class COCOWrapper(coco.COCO):
+  """pycocotools COCO subclass that is populated from in-memory data."""
+
+  def __init__(self, dataset, detection_type='bbox'):
+    """Builds the wrapper from an already-parsed COCO dictionary.
+
+    The coco.COCO constructor reads a JSON file by default. Here the
+    dictionary is attached directly and then indexed, so evaluation can run
+    without writing to external storage. The expected layout is described at
+    http://mscoco.org/dataset/#format.
+
+    Args:
+      dataset: a dictionary holding bounding box annotations in the COCO format.
+      detection_type: kind of detections handled by this wrapper. One of
+        ['bbox', 'segmentation'].
+
+    Raises:
+      ValueError: if detection_type is not one of the supported values.
+    """
+    allowed_types = ['bbox', 'segmentation']
+    if detection_type not in allowed_types:
+      message = ('Unsupported detection type: {}. '
+                 'Supported values are: {}').format(detection_type,
+                                                    allowed_types)
+      raise ValueError(message)
+    self._detection_type = detection_type
+    coco.COCO.__init__(self)
+    self.dataset = dataset
+    self.createIndex()
+
+  def LoadAnnotations(self, annotations):
+    """Wraps a list of detection results in a new coco.COCO object.
+
+    Mirrors the result-loading behavior of the COCO API without going through
+    external storage. See http://mscoco.org/dataset/#format for the format of
+    the individual entries.
+
+    The dicts inside `annotations` are updated in place: 'area', 'id' and
+    'iscrowd' are written to every entry, and 'bbox' is additionally written
+    when the detection type is 'segmentation'.
+
+    Args:
+      annotations: python list holding object detection results where each
+        detection is encoded as a dict with required keys ['image_id',
+        'category_id', 'score'] and one of ['bbox', 'segmentation'] based on
+        `detection_type`.
+
+    Returns:
+      a coco.COCO datastructure holding object detection annotations results
+
+    Raises:
+      ValueError: if annotations is not a list
+      ValueError: if annotations refer to image ids that are not contained
+        in self.
+    """
+    detections = coco.COCO()
+    detections.dataset['images'] = list(self.dataset['images'])
+
+    tf.logging.info('Loading and preparing annotation results...')
+    start_time = time.time()
+
+    if not isinstance(annotations, list):
+      raise ValueError('annotations is not a list of objects')
+    # Every image id referenced by a result must be known to this dataset.
+    result_image_ids = set(ann['image_id'] for ann in annotations)
+    if not result_image_ids.issubset(self.getImgIds()):
+      raise ValueError('Results do not correspond to current coco set')
+    detections.dataset['categories'] = copy.deepcopy(
+        self.dataset['categories'])
+    # Annotation ids are numbered from 1 in list order.
+    if self._detection_type == 'bbox':
+      for ann_id, ann in enumerate(annotations, start=1):
+        box = ann['bbox']
+        # Indices 2 and 3 of a COCO box are its width and height.
+        ann['area'] = box[2] * box[3]
+        ann['id'] = ann_id
+        ann['iscrowd'] = 0
+    elif self._detection_type == 'segmentation':
+      for ann_id, ann in enumerate(annotations, start=1):
+        ann['area'] = mask.area(ann['segmentation'])
+        ann['bbox'] = mask.toBbox(ann['segmentation'])
+        ann['id'] = ann_id
+        ann['iscrowd'] = 0
+    elapsed = time.time() - start_time
+    tf.logging.info('DONE (t=%0.2fs)', elapsed)
+
+    detections.dataset['annotations'] = annotations
+    detections.createIndex()
+    return detections
+
+
+class COCOEvalWrapper(cocoeval.COCOeval):
+  """Wrapper for the pycocotools COCOeval class.
+
+  To evaluate, create two objects (groundtruth_dict and detections_list)
+  using the conventions listed at http://mscoco.org/dataset/#format.
+  Then call evaluation as follows:
+
+    groundtruth = coco_tools.COCOWrapper(groundtruth_dict)
+    detections = groundtruth.LoadAnnotations(detections_list)
+    evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections,
+                                           agnostic_mode=False)
+
+    metrics = evaluator.ComputeMetrics()
+  """
+
+  def __init__(self, groundtruth=None, detections=None, agnostic_mode=False,
+               iou_type='bbox'):
+    """COCOEvalWrapper constructor.
+
+    Note that for the area-based metrics to be meaningful, detection and
+    groundtruth boxes must be in image coordinates measured in pixels.
+
+    Args:
+      groundtruth: a coco.COCO (or coco_tools.COCOWrapper) object holding
+        groundtruth annotations
+      detections: a coco.COCO (or coco_tools.COCOWrapper) object holding
+        detections
+      agnostic_mode: boolean (default: False).  If True, evaluation ignores
+        class labels, treating all detections as proposals.
+      iou_type: IOU type to use for evaluation. Supports `bbox` or `segm`.
+    """
+    cocoeval.COCOeval.__init__(self, groundtruth, detections,
+                               iouType=iou_type)
+    if agnostic_mode:
+      # Class-agnostic evaluation is expressed by disabling category use.
+      self.params.useCats = 0
+
+  def GetCategory(self, category_id):
+    """Fetches dictionary holding category information given category id.
+
+    Args:
+      category_id: integer id
+    Returns:
+      dictionary holding 'id', 'name'.
+    """
+    return self.cocoGt.cats[category_id]
+
+  def GetAgnosticMode(self):
+    """Returns true if COCO Eval is configured to evaluate in agnostic mode."""
+    return self.params.useCats == 0
+
+  def GetCategoryIdList(self):
+    """Returns list of valid category ids."""
+    return self.params.catIds
+
+  def ComputeMetrics(self, all_metrics_per_category=False):
+    """Computes detection metrics.
+
+    Runs evaluate(), accumulate() and summarize() and repackages the
+    resulting statistics into dictionaries with descriptive keys.
+
+    Args:
+      all_metrics_per_category: If true, include all the summary metrics for
+        each category in per_category_ap. Be careful with setting it to true if
+        you have more than a handful of categories, because it produces twelve
+        additional entries per category.
+    Returns:
+      1. summary_metrics: a dictionary holding:
+        'Precision/mAP': mean average precision over classes averaged over IOU
+          thresholds ranging from .5 to .95 with .05 increments
+        'Precision/mAP@.50IOU': mean average precision at 50% IOU
+        'Precision/mAP@.75IOU': mean average precision at 75% IOU
+        'Precision/mAP (small)': mean average precision for small objects
+                        (area < 32^2 pixels)
+        'Precision/mAP (medium)': mean average precision for medium sized
+                        objects (32^2 pixels < area < 96^2 pixels)
+        'Precision/mAP (large)': mean average precision for large objects
+                        (96^2 pixels < area < 10000^2 pixels)
+        'Recall/AR@1': average recall with 1 detection
+        'Recall/AR@10': average recall with 10 detections
+        'Recall/AR@100': average recall with 100 detections
+        'Recall/AR@100 (small)': average recall for small objects with 100
+          detections
+        'Recall/AR@100 (medium)': average recall for medium objects with 100
+          detections
+        'Recall/AR@100 (large)': average recall for large objects with 100
+          detections
+      2. per_category_ap: a dictionary holding category specific results with
+        keys of the form: 'Precision mAP ByCategory/category'
+        (without the supercategory part if no supercategories exist).
+        For backward compatibility 'PerformanceByCategory/mAP/category' is
+        included in the output regardless of all_metrics_per_category.
+        If evaluating class-agnostic mode, or if the evaluator exposes no
+        `category_stats` attribute, per_category_ap is an empty dictionary.
+    """
+    self.evaluate()
+    self.accumulate()
+    self.summarize()
+
+    # Position i of self.stats is reported under summary_names[i].
+    summary_names = (
+        'Precision/mAP',
+        'Precision/mAP@.50IOU',
+        'Precision/mAP@.75IOU',
+        'Precision/mAP (small)',
+        'Precision/mAP (medium)',
+        'Precision/mAP (large)',
+        'Recall/AR@1',
+        'Recall/AR@10',
+        'Recall/AR@100',
+        'Recall/AR@100 (small)',
+        'Recall/AR@100 (medium)',
+        'Recall/AR@100 (large)',
+    )
+    summary_metrics = OrderedDict(
+        (name, self.stats[index]) for index, name in enumerate(summary_names))
+    per_category_ap = OrderedDict([])
+    if self.GetAgnosticMode():
+      return summary_metrics, per_category_ap
+
+    # `category_stats` is never assigned in this class, so it has to be
+    # provided by the underlying COCOeval implementation. When it is missing,
+    # report the summary metrics only instead of failing with AttributeError.
+    category_stats = getattr(self, 'category_stats', None)
+    if category_stats is None:
+      tf.logging.warning(
+          'Category stats do not exist; skipping per-category metrics.')
+      return summary_metrics, per_category_ap
+
+    # Position i of category_stats is reported under per_category_names[i].
+    per_category_names = (
+        'Precision mAP',
+        'Precision mAP@.50IOU',
+        'Precision mAP@.75IOU',
+        'Precision mAP (small)',
+        'Precision mAP (medium)',
+        'Precision mAP (large)',
+        'Recall AR@1',
+        'Recall AR@10',
+        'Recall AR@100',
+        'Recall AR@100 (small)',
+        'Recall AR@100 (medium)',
+        'Recall AR@100 (large)',
+    )
+    for category_index, category_id in enumerate(self.GetCategoryIdList()):
+      category = self.GetCategory(category_id)['name']
+      # Kept for backward compatibility.
+      legacy_key = 'PerformanceByCategory/mAP/{}'.format(category)
+      per_category_ap[legacy_key] = category_stats[0][category_index]
+      if all_metrics_per_category:
+        for stat_index, metric_name in enumerate(per_category_names):
+          key = '{} ByCategory/{}'.format(metric_name, category)
+          per_category_ap[key] = category_stats[stat_index][category_index]
+
+    return summary_metrics, per_category_ap
+
+
+def _ConvertBoxToCOCOFormat(box):
+  """Translates a [ymin, xmin, ymax, xmax] box into the COCO box layout.
+
+  The COCO API describes a box by its top-left corner and its extent, i.e.
+  [xmin, ymin, width, height], whereas this codebase stores the two corners
+  as [ymin, xmin, ymax, xmax].
+
+  Args:
+    box: a [ymin, xmin, ymax, xmax] numpy array
+
+  Returns:
+    a list of floats representing [xmin, ymin, width, height]
+  """
+  ymin, xmin = box[0], box[1]
+  width = box[3] - xmin
+  height = box[2] - ymin
+  return [float(xmin), float(ymin), float(width), float(height)]
+
+
+def _RleCompress(masks):
+  """Run-length encodes a binary mask with pycocotools.
+
+  Args:
+    masks: uint8 numpy array of shape [mask_height, mask_width] with values in
+    {0, 1}.
+
+  Returns:
+    A pycocotools Run-length encoding of the mask.
+  """
+  # The array is converted to Fortran (column-major) order before encoding.
+  fortran_ordered_mask = np.asfortranarray(masks)
+  return mask.encode(fortran_ordered_mask)
+
+
+def ExportSingleImageGroundtruthToCoco(image_id,
+                                       next_annotation_id,
+                                       category_id_set,
+                                       groundtruth_boxes,
+                                       groundtruth_classes,
+                                       groundtruth_masks=None):
+  """Export groundtruth of a single image to COCO format.
+
+  This function converts groundtruth detection annotations represented as numpy
+  arrays to dictionaries that can be ingested by the COCO evaluation API. Note
+  that the image_ids provided here must match the ones given to
+  ExportSingleImageDetectionBoxesToCoco. We assume that boxes and classes are
+  in correspondence - that is: groundtruth_boxes[i, :], and
+  groundtruth_classes[i] are associated with the same groundtruth annotation.
+
+  In the exported result, "area" fields are always set to the area of the
+  groundtruth bounding box and "iscrowd" fields are always set to 0.
+  TODO: pass in "iscrowd" array for evaluating on COCO dataset.
+
+  Args:
+    image_id: a unique image identifier either of type integer or string.
+    next_annotation_id: integer specifying the first id to use for the
+      groundtruth annotations. Box i receives id next_annotation_id + i, so
+      ids of boxes that are dropped are left unused.
+    category_id_set: A set of valid class ids. Groundtruth with classes not in
+      category_id_set are dropped.
+    groundtruth_boxes: numpy array (float32) with shape [num_gt_boxes, 4]
+    groundtruth_classes: numpy array (int) with shape [num_gt_boxes]
+    groundtruth_masks: optional uint8 numpy array of shape [num_gt_boxes,
+      image_height, image_width] containing groundtruth instance masks.
+
+  Returns:
+    a list of groundtruth annotations for a single image in the COCO format.
+
+  Raises:
+    ValueError: if groundtruth_classes is not of rank 1, if groundtruth_boxes
+      is not of shape [num_gt_boxes, 4], or if the two do not agree on the
+      number of boxes.
+  """
+
+  if len(groundtruth_classes.shape) != 1:
+    raise ValueError('groundtruth_classes is '
+                     'expected to be of rank 1.')
+  if len(groundtruth_boxes.shape) != 2:
+    raise ValueError('groundtruth_boxes is expected to be of '
+                     'rank 2.')
+  if groundtruth_boxes.shape[1] != 4:
+    raise ValueError('groundtruth_boxes should have '
+                     'shape[1] == 4.')
+  num_boxes = groundtruth_classes.shape[0]
+  if num_boxes != groundtruth_boxes.shape[0]:
+    raise ValueError('Corresponding entries in groundtruth_classes, '
+                     'and groundtruth_boxes should have '
+                     'compatible shapes (i.e., agree on the 0th dimension). '
+                     'Classes shape: %d. Boxes shape: %d. Image ID: %s' % (
+                         groundtruth_classes.shape[0],
+                         groundtruth_boxes.shape[0], image_id))
+  groundtruth_list = []
+  for i in range(num_boxes):
+    if groundtruth_classes[i] not in category_id_set:
+      continue
+    ymin, xmin, ymax, xmax = (groundtruth_boxes[i, 0], groundtruth_boxes[i, 1],
+                              groundtruth_boxes[i, 2], groundtruth_boxes[i, 3])
+    export_dict = {
+        'id': next_annotation_id + i,
+        'image_id': image_id,
+        'category_id': int(groundtruth_classes[i]),
+        'bbox': _ConvertBoxToCOCOFormat(groundtruth_boxes[i, :]),
+        'area': float((ymax - ymin) * (xmax - xmin)),
+        'iscrowd': 0
+    }
+    if groundtruth_masks is not None:
+      export_dict['segmentation'] = _RleCompress(groundtruth_masks[i])
+    groundtruth_list.append(export_dict)
+  return groundtruth_list
+
+
+def ExportGroundtruthToCOCO(image_ids,
+                            groundtruth_boxes,
+                            groundtruth_classes,
+                            categories,
+                            output_path=None):
+  """Builds a COCO groundtruth dictionary from per-image numpy arrays.
+
+  The three list arguments are parallel: entry k of each list describes the
+  same image. Within one image, groundtruth_boxes[k][i, :] and
+  groundtruth_classes[k][i] describe the same annotation. The image ids must
+  match the ones later passed to ExportDetectionsToCOCO for evaluation to work
+  properly.
+
+  In the exported result, "area" fields are always set to the area of the
+  groundtruth bounding box and "iscrowd" fields are always set to 0.
+  TODO: pass in "iscrowd" array for evaluating on COCO dataset.
+
+  Args:
+    image_ids: a list of unique image identifier either of type integer or
+      string.
+    groundtruth_boxes: list of numpy arrays with shape [num_gt_boxes, 4]
+      (note that num_gt_boxes can be different for each entry in the list)
+    groundtruth_classes: list of numpy arrays (int) with shape [num_gt_boxes]
+      (note that num_gt_boxes can be different for each entry in the list)
+    categories: a list of dictionaries representing all possible categories.
+        Each dict in this list has the following keys:
+          'id': (required) an integer id uniquely identifying this category
+          'name': (required) string representing category name
+            e.g., 'cat', 'dog', 'pizza'
+          'supercategory': (optional) string representing the supercategory
+            e.g., 'animal', 'vehicle', 'food', etc
+    output_path: (optional) path for exporting result to JSON
+  Returns:
+    dictionary that can be read by COCO API
+  Raises:
+    ValueError: if the input lists do not have the same length, or if the
+      arrays of an individual image do not have the expected shapes.
+  """
+  valid_category_ids = {category['id'] for category in categories}
+  if (len(image_ids) != len(groundtruth_boxes) or
+      len(groundtruth_boxes) != len(groundtruth_classes)):
+    raise ValueError('Input lists must have the same length')
+
+  images = []
+  annotations = []
+  # For reasons internal to the COCO API, annotation ids must not be zero,
+  # so numbering starts at 1.
+  next_id = 1
+  for image_id, boxes, classes in zip(image_ids, groundtruth_boxes,
+                                      groundtruth_classes):
+    images.append({'id': image_id})
+    annotations += ExportSingleImageGroundtruthToCoco(
+        image_id, next_id, valid_category_ids, boxes, classes)
+    # Reserve one id per box of this image, whether or not it was exported.
+    next_id += classes.shape[0]
+
+  groundtruth_dict = {
+      'annotations': annotations,
+      'images': images,
+      'categories': categories
+  }
+  if output_path:
+    with tf.gfile.GFile(output_path, 'w') as fid:
+      json_utils.Dump(groundtruth_dict, fid, float_digits=4, indent=2)
+  return groundtruth_dict
+
+
+def ExportSingleImageDetectionBoxesToCoco(image_id,
+                                          category_id_set,
+                                          detection_boxes,
+                                          detection_scores,
+                                          detection_classes):
+  """Export detections of a single image to COCO format.
+
+  This function converts detections represented as numpy arrays to dictionaries
+  that can be ingested by the COCO evaluation API. Note that the image_ids
+  provided here must match the ones given to
+  ExportSingleImageGroundtruthToCoco. We assume that boxes, scores and classes
+  are in correspondence - that is: detection_boxes[i, :], detection_scores[i]
+  and detection_classes[i] are associated with the same detection.
+
+  Args:
+    image_id: unique image identifier either of type integer or string.
+    category_id_set: A set of valid class ids. Detections with classes not in
+      category_id_set are dropped.
+    detection_boxes: float numpy array of shape [num_detections, 4] containing
+      detection boxes.
+    detection_scores: float numpy array of shape [num_detections] containing
+      scores for the detection boxes.
+    detection_classes: integer numpy array of shape [num_detections] containing
+      the classes for detection boxes.
+
+  Returns:
+    a list of detection annotations for a single image in the COCO format.
+
+  Raises:
+    ValueError: if detection_classes or detection_scores is not of rank 1, if
+      detection_boxes is not of shape [num_detections, 4], or if the three
+      arrays do not agree on the number of detections.
+  """
+
+  if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1:
+    raise ValueError('All entries in detection_classes and detection_scores '
+                     'expected to be of rank 1.')
+  if len(detection_boxes.shape) != 2:
+    raise ValueError('All entries in detection_boxes expected to be of '
+                     'rank 2.')
+  if detection_boxes.shape[1] != 4:
+    raise ValueError('All entries in detection_boxes should have '
+                     'shape[1] == 4.')
+  num_boxes = detection_classes.shape[0]
+  if not num_boxes == detection_boxes.shape[0] == detection_scores.shape[0]:
+    raise ValueError('Corresponding entries in detection_classes, '
+                     'detection_scores and detection_boxes should have '
+                     'compatible shapes (i.e., agree on the 0th dimension). '
+                     'Classes shape: %d. Boxes shape: %d. '
+                     'Scores shape: %d' % (
+                         detection_classes.shape[0], detection_boxes.shape[0],
+                         detection_scores.shape[0]
+                     ))
+  # The three arrays were checked above to have the same leading dimension,
+  # so iterating them in lockstep visits every detection exactly once.
+  return [
+      {
+          'image_id': image_id,
+          'category_id': int(class_id),
+          'bbox': _ConvertBoxToCOCOFormat(box),
+          'score': float(score)
+      }
+      for box, score, class_id in zip(detection_boxes, detection_scores,
+                                      detection_classes)
+      if class_id in category_id_set
+  ]
+
+
+def ExportSingleImageDetectionMasksToCoco(image_id,
+                                          category_id_set,
+                                          detection_masks,
+                                          detection_scores,
+                                          detection_classes):
+  """Export detection masks of a single image to COCO format.
+
+  This function converts detections represented as numpy arrays to dictionaries
+  that can be ingested by the COCO evaluation API. We assume that
+  detection_masks, detection_scores, and detection_classes are in correspondence
+  - that is: detection_masks[i, :], detection_classes[i] and detection_scores[i]
+    are associated with the same annotation.
+
+  Args:
+    image_id: unique image identifier either of type integer or string.
+    category_id_set: A set of valid class ids. Detections with classes not in
+      category_id_set are dropped.
+    detection_masks: uint8 numpy array of shape [num_detections, image_height,
+      image_width] containing detection_masks.
+    detection_scores: float numpy array of shape [num_detections] containing
+      scores for detection masks.
+    detection_classes: integer numpy array of shape [num_detections] containing
+      the classes for detection masks.
+
+  Returns:
+    a list of detection mask annotations for a single image in the COCO format.
+
+  Raises:
+    ValueError: if detection_classes or detection_scores is not of rank 1, or
+      if detection_masks, detection_scores and detection_classes do not agree
+      on the number of detections.
+  """
+
+  if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1:
+    raise ValueError('All entries in detection_classes and detection_scores '
+                     'expected to be of rank 1.')
+  num_boxes = detection_classes.shape[0]
+  if not num_boxes == len(detection_masks) == detection_scores.shape[0]:
+    raise ValueError('Corresponding entries in detection_classes, '
+                     'detection_scores and detection_masks should have '
+                     'compatible lengths and shapes '
+                     'Classes length: %d.  Masks length: %d. '
+                     'Scores length: %d' % (
+                         detection_classes.shape[0], len(detection_masks),
+                         detection_scores.shape[0]
+                     ))
+  # Lengths were checked above, so lockstep iteration covers every detection.
+  return [
+      {
+          'image_id': image_id,
+          'category_id': int(class_id),
+          'segmentation': _RleCompress(detection_mask),
+          'score': float(score)
+      }
+      for detection_mask, score, class_id in zip(
+          detection_masks, detection_scores, detection_classes)
+      if class_id in category_id_set
+  ]
+
+
+def ExportDetectionsToCOCO(image_ids,
+                           detection_boxes,
+                           detection_scores,
+                           detection_classes,
+                           categories,
+                           output_path=None):
+  """Builds a list of COCO detection entries from per-image numpy arrays.
+
+  The list arguments are parallel: entry k of each list describes the same
+  image. Within one image, detection_boxes[k][i, :], detection_scores[k][i]
+  and detection_classes[k][i] describe the same detection. The image ids must
+  match the ones given to ExportGroundtruthToCOCO in order for evaluation to
+  work properly.
+
+  Args:
+    image_ids: a list of unique image identifier either of type integer or
+      string.
+    detection_boxes: list of numpy arrays with shape [num_detection_boxes, 4]
+    detection_scores: list of numpy arrays (float) with shape
+      [num_detection_boxes]. Note that num_detection_boxes can be different
+      for each entry in the list.
+    detection_classes: list of numpy arrays (int) with shape
+      [num_detection_boxes]. Note that num_detection_boxes can be different
+      for each entry in the list.
+    categories: a list of dictionaries representing all possible categories.
+      Each dict in this list must have an integer 'id' key uniquely identifying
+      this category.
+    output_path: (optional) path for exporting result to JSON
+
+  Returns:
+    list of dictionaries that can be read by COCO API, where each entry
+    corresponds to a single detection and has keys from:
+    ['image_id', 'category_id', 'bbox', 'score'].
+  Raises:
+    ValueError: if the input lists do not have the same length, or if the
+      arrays of an individual image do not have the expected shapes.
+  """
+  valid_category_ids = {category['id'] for category in categories}
+  if (len(image_ids) != len(detection_boxes) or
+      len(detection_boxes) != len(detection_scores) or
+      len(detection_scores) != len(detection_classes)):
+    raise ValueError('Input lists must have the same length')
+
+  exported = []
+  per_image_inputs = zip(image_ids, detection_boxes, detection_scores,
+                         detection_classes)
+  for image_id, boxes, scores, classes in per_image_inputs:
+    exported += ExportSingleImageDetectionBoxesToCoco(
+        image_id, valid_category_ids, boxes, scores, classes)
+  if output_path:
+    with tf.gfile.GFile(output_path, 'w') as fid:
+      json_utils.Dump(exported, fid, float_digits=4, indent=2)
+  return exported
+
+
+def ExportSegmentsToCOCO(image_ids,
+                         detection_masks,
+                         detection_scores,
+                         detection_classes,
+                         categories,
+                         output_path=None):
+  """Export segmentation masks in numpy arrays to COCO API.
+
+  This function converts a set of predicted instance masks represented
+  as numpy arrays to dictionaries that can be ingested by the COCO API.
+  Inputs to this function are lists, consisting of segments, scores and
+  classes, respectively, corresponding to each image for which detections
+  have been produced.
+
+  Note this function is recommended to use for small dataset.
+  For large dataset, it should be used with a merge function
+  (e.g. in map reduce), otherwise the memory consumption is large.
+
+  We assume that for each image, masks, scores and classes are in
+  correspondence --- that is: detection_masks[i, :, :, :], detection_scores[i]
+  and detection_classes[i] are associated with the same detection.
+
+  Args:
+    image_ids: list of image ids (typically ints or strings)
+    detection_masks: list of numpy arrays with shape [num_detection, h, w, 1]
+      and type uint8. The height and width should match the shape of
+      corresponding image.
+    detection_scores: list of numpy arrays (float) with shape
+      [num_detection]. Note that num_detection can be different
+      for each entry in the list.
+    detection_classes: list of numpy arrays (int) with shape
+      [num_detection]. Note that num_detection can be different
+      for each entry in the list.
+    categories: a list of dictionaries representing all possible categories.
+      Each dict in this list must have an integer 'id' key uniquely identifying
+      this category.
+    output_path: (optional) path for exporting result to JSON
+
+  Returns:
+    list of dictionaries that can be read by COCO API, where each entry
+    corresponds to a single detection and has keys from:
+    ['image_id', 'category_id', 'segmentation', 'score'].
+
+  Raises:
+    ValueError: if the input lists do not have the same length, if an entry of
+      detection_classes or detection_scores is not of rank 1, if an entry of
+      detection_masks is not of rank 4, or if the arrays of one image do not
+      agree on the number of detections.
+  """
+  if not (len(image_ids) == len(detection_masks) == len(detection_scores) ==
+          len(detection_classes)):
+    raise ValueError('Input lists must have the same length')
+
+  # The set of valid ids does not depend on the image, so build it once.
+  category_id_set = {cat['id'] for cat in categories}
+  segment_export_list = []
+  for image_id, masks, scores, classes in zip(image_ids, detection_masks,
+                                              detection_scores,
+                                              detection_classes):
+
+    if len(classes.shape) != 1 or len(scores.shape) != 1:
+      raise ValueError('All entries in detection_classes and detection_scores '
+                       'expected to be of rank 1.')
+    if len(masks.shape) != 4:
+      raise ValueError('All entries in masks expected to be of '
+                       'rank 4. Given {}'.format(masks.shape))
+
+    num_boxes = classes.shape[0]
+    if not num_boxes == masks.shape[0] == scores.shape[0]:
+      raise ValueError('Corresponding entries in detection_classes, '
+                       'detection_scores and detection_masks should have '
+                       'compatible shapes (i.e., agree on the 0th dimension).')
+
+    # Drop the trailing singleton axis: the per-image exporter indexes masks
+    # as [num_detection, h, w].
+    segment_export_list.extend(ExportSingleImageDetectionMasksToCoco(
+        image_id, category_id_set, np.squeeze(masks, axis=3), scores, classes))
+
+  if output_path:
+    with tf.gfile.GFile(output_path, 'w') as fid:
+      json_utils.Dump(segment_export_list, fid, float_digits=4, indent=2)
+  return segment_export_list
+
+
+def ExportKeypointsToCOCO(image_ids,
+                          detection_keypoints,
+                          detection_scores,
+                          detection_classes,
+                          categories,
+                          output_path=None):
+  """Exports keypoints in numpy arrays to COCO API.
+
+  This function converts a set of predicted keypoints represented
+  as numpy arrays to dictionaries that can be ingested by the COCO API.
+  Inputs to this function are lists, consisting of keypoints, scores and
+  classes, respectively, corresponding to each image for which detections
+  have been produced.
+
+  We assume that for each image, keypoints, scores and classes are in
+  correspondence --- that is: detection_keypoints[i, :, :],
+  detection_scores[i] and detection_classes[i] are associated with the same
+  detection.
+
+  Detections whose category does not define 'num_keypoints' are silently
+  omitted from the output. Keypoint coordinates are truncated to integers
+  and every exported keypoint is marked with a visibility flag of 1.
+
+  Args:
+    image_ids: list of image ids (typically ints or strings)
+    detection_keypoints: list of numpy arrays with shape
+      [num_detection, num_keypoints, 2] and type float32 in absolute
+      x-y coordinates.
+    detection_scores: list of numpy arrays (float) with shape
+      [num_detection]. Note that num_detection can be different
+      for each entry in the list.
+    detection_classes: list of numpy arrays (int) with shape
+      [num_detection]. Note that num_detection can be different
+      for each entry in the list.
+    categories: a list of dictionaries representing all possible categories.
+      Each dict in this list must have an integer 'id' key uniquely identifying
+      this category and an integer 'num_keypoints' key specifying the number of
+      keypoints the category has.
+    output_path: (optional) path for exporting result to JSON
+
+  Returns:
+    list of dictionaries that can be read by COCO API, where each entry
+    corresponds to a single detection and has keys from:
+    ['image_id', 'category_id', 'keypoints', 'score'].
+
+  Raises:
+    ValueError: if detection_keypoints and detection_classes do not have the
+      right lengths, if each of the elements inside these lists do not
+      have the correct shapes, if a class id is not listed in categories, or
+      if fewer keypoints are provided than the category requires.
+  """
+  if not (len(image_ids) == len(detection_keypoints) ==
+          len(detection_scores) == len(detection_classes)):
+    raise ValueError('Input lists must have the same length')
+
+  # The category lookups do not depend on the image, so build them only once.
+  category_id_set = {cat['id'] for cat in categories}
+  category_id_to_num_keypoints_map = {
+      cat['id']: cat['num_keypoints'] for cat in categories
+      if 'num_keypoints' in cat}
+
+  keypoints_export_list = []
+  for image_id, keypoints, scores, classes in zip(
+      image_ids, detection_keypoints, detection_scores, detection_classes):
+
+    if len(classes.shape) != 1 or len(scores.shape) != 1:
+      raise ValueError('All entries in detection_classes and detection_scores'
+                       ' expected to be of rank 1.')
+    if len(keypoints.shape) != 3:
+      raise ValueError('All entries in keypoints expected to be of '
+                       'rank 3. Given {}'.format(keypoints.shape))
+
+    num_boxes = classes.shape[0]
+    if not num_boxes == keypoints.shape[0] == scores.shape[0]:
+      raise ValueError('Corresponding entries in detection_classes, '
+                       'detection_keypoints, and detection_scores should have '
+                       'compatible shapes (i.e., agree on the 0th dimension).')
+
+    for i in range(num_boxes):
+      if classes[i] not in category_id_set:
+        raise ValueError('class id should be in category_id_set\n')
+
+      # Categories without a keypoint count produce no keypoint annotation.
+      if classes[i] in category_id_to_num_keypoints_map:
+        num_keypoints = category_id_to_num_keypoints_map[classes[i]]
+        if keypoints.shape[1] < num_keypoints:
+          # Fail with a readable message instead of a numpy shape mismatch.
+          raise ValueError(
+              'Category {} requires {} keypoints, but only {} were '
+              'provided.'.format(
+                  int(classes[i]), num_keypoints, keypoints.shape[1]))
+        # Adds extra ones to indicate the visibility for each keypoint as is
+        # recommended by MSCOCO.
+        instance_keypoints = np.concatenate(
+            [keypoints[i, 0:num_keypoints, :],
+             np.expand_dims(np.ones(num_keypoints), axis=1)],
+            axis=1).astype(int)
+
+        # COCO expects a flat [x1, y1, v1, x2, y2, v2, ...] list.
+        instance_keypoints = instance_keypoints.flatten().tolist()
+        keypoints_export_list.append({
+            'image_id': image_id,
+            'category_id': int(classes[i]),
+            'keypoints': instance_keypoints,
+            'score': float(scores[i])
+        })
+
+  if output_path:
+    with tf.gfile.GFile(output_path, 'w') as fid:
+      json_utils.Dump(keypoints_export_list, fid, float_digits=4, indent=2)
+  return keypoints_export_list
--- a/research/object_detection/metrics/coco_tools_test.py
+++ b/research/object_detection/metrics/coco_tools_test.py
+"""Tests for google3.image.understanding.object_detection.metrics.coco_tools."""
+import json
+import os
+import re
+import numpy as np
+
+from pycocotools import mask
+
+import tensorflow as tf
+
+from object_detection.metrics import coco_tools
+
+
+class CocoToolsTest(tf.test.TestCase):
+  """Tests for the COCO wrappers and export helpers in coco_tools."""
+
+  def setUp(self):
+    """Builds groundtruth and exactly matching detections for two images."""
+    groundtruth_annotations_list = [
+        {
+            'id': 1,
+            'image_id': 'first',
+            'category_id': 1,
+            'bbox': [100., 100., 100., 100.],
+            'area': 100.**2,
+            'iscrowd': 0
+        },
+        {
+            'id': 2,
+            'image_id': 'second',
+            'category_id': 1,
+            'bbox': [50., 50., 50., 50.],
+            'area': 50.**2,
+            'iscrowd': 0
+        },
+    ]
+    image_list = [{'id': 'first'}, {'id': 'second'}]
+    category_list = [{'id': 0, 'name': 'person'},
+                     {'id': 1, 'name': 'cat'},
+                     {'id': 2, 'name': 'dog'}]
+    self._groundtruth_dict = {
+        'annotations': groundtruth_annotations_list,
+        'images': image_list,
+        'categories': category_list
+    }
+
+    self._detections_list = [
+        {
+            'image_id': 'first',
+            'category_id': 1,
+            'bbox': [100., 100., 100., 100.],
+            'score': .8
+        },
+        {
+            'image_id': 'second',
+            'category_id': 1,
+            'bbox': [50., 50., 50., 50.],
+            'score': .7
+        },
+    ]
+
+  def testCocoWrappers(self):
+    """Detections identical to the groundtruth should reach a mAP of 1.0."""
+    groundtruth = coco_tools.COCOWrapper(self._groundtruth_dict)
+    detections = groundtruth.LoadAnnotations(self._detections_list)
+    evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections)
+    summary_metrics, _ = evaluator.ComputeMetrics()
+    self.assertAlmostEqual(1.0, summary_metrics['Precision/mAP'])
+
+  def testExportGroundtruthToCOCO(self):
+    """Exported groundtruth matches the fixture and survives a JSON trip."""
+    image_ids = ['first', 'second']
+    # np.float was only an alias of the builtin float and has been removed
+    # from recent NumPy releases; np.float64 is the same dtype.
+    groundtruth_boxes = [np.array([[100, 100, 200, 200]], np.float64),
+                         np.array([[50, 50, 100, 100]], np.float64)]
+    groundtruth_classes = [np.array([1], np.int32), np.array([1], np.int32)]
+    categories = [{'id': 0, 'name': 'person'},
+                  {'id': 1, 'name': 'cat'},
+                  {'id': 2, 'name': 'dog'}]
+    output_path = os.path.join(tf.test.get_temp_dir(), 'groundtruth.json')
+    result = coco_tools.ExportGroundtruthToCOCO(
+        image_ids,
+        groundtruth_boxes,
+        groundtruth_classes,
+        categories,
+        output_path=output_path)
+    self.assertDictEqual(result, self._groundtruth_dict)
+    with tf.gfile.GFile(output_path, 'r') as f:
+      written_result = f.read()
+      # The json output should have floats written to 4 digits of precision.
+      matcher = re.compile(r'"bbox":\s+\[\n\s+\d+\.\d\d\d\d,', re.MULTILINE)
+      self.assertTrue(matcher.findall(written_result))
+      written_result = json.loads(written_result)
+      # Containers are compared for equality; assertAlmostEqual cannot
+      # compute a difference between dicts or lists.
+      self.assertEqual(result, written_result)
+
+  def testExportDetectionsToCOCO(self):
+    """Exported box detections match the fixture and the written JSON."""
+    image_ids = ['first', 'second']
+    detections_boxes = [np.array([[100, 100, 200, 200]], np.float64),
+                        np.array([[50, 50, 100, 100]], np.float64)]
+    detections_scores = [np.array([.8], np.float64),
+                         np.array([.7], np.float64)]
+    detections_classes = [np.array([1], np.int32), np.array([1], np.int32)]
+    categories = [{'id': 0, 'name': 'person'},
+                  {'id': 1, 'name': 'cat'},
+                  {'id': 2, 'name': 'dog'}]
+    output_path = os.path.join(tf.test.get_temp_dir(), 'detections.json')
+    result = coco_tools.ExportDetectionsToCOCO(
+        image_ids,
+        detections_boxes,
+        detections_scores,
+        detections_classes,
+        categories,
+        output_path=output_path)
+    self.assertListEqual(result, self._detections_list)
+    with tf.gfile.GFile(output_path, 'r') as f:
+      written_result = f.read()
+      # The json output should have floats written to 4 digits of precision.
+      matcher = re.compile(r'"bbox":\s+\[\n\s+\d+\.\d\d\d\d,', re.MULTILINE)
+      self.assertTrue(matcher.findall(written_result))
+      written_result = json.loads(written_result)
+      self.assertEqual(result, written_result)
+
+  def testExportSegmentsToCOCO(self):
+    """Exported masks decode back to the input and match the written JSON."""
+    image_ids = ['first', 'second']
+    detection_masks = [np.array(
+        [[[0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 1], [0, 1, 0, 1]]],
+        dtype=np.uint8), np.array(
+            [[[0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 1], [0, 1, 0, 1]]],
+            dtype=np.uint8)]
+
+    # Append the trailing channel axis so each entry has rank 4.
+    for i, detection_mask in enumerate(detection_masks):
+      detection_masks[i] = detection_mask[:, :, :, None]
+
+    detection_scores = [np.array([.8], np.float64),
+                        np.array([.7], np.float64)]
+    detection_classes = [np.array([1], np.int32), np.array([1], np.int32)]
+
+    categories = [{'id': 0, 'name': 'person'},
+                  {'id': 1, 'name': 'cat'},
+                  {'id': 2, 'name': 'dog'}]
+    output_path = os.path.join(tf.test.get_temp_dir(), 'segments.json')
+    result = coco_tools.ExportSegmentsToCOCO(
+        image_ids,
+        detection_masks,
+        detection_scores,
+        detection_classes,
+        categories,
+        output_path=output_path)
+    with tf.gfile.GFile(output_path, 'r') as f:
+      written_result = f.read()
+      written_result = json.loads(written_result)
+      mask_load = mask.decode([written_result[0]['segmentation']])
+      self.assertTrue(np.allclose(mask_load, detection_masks[0]))
+      self.assertEqual(result, written_result)
+
+  def testExportKeypointsToCOCO(self):
+    """Exported keypoints carry visibility flags and match the JSON."""
+    image_ids = ['first', 'second']
+    detection_keypoints = [
+        np.array(
+            [[[100, 200], [300, 400], [500, 600]],
+             [[50, 150], [250, 350], [450, 550]]], dtype=np.int32),
+        np.array(
+            [[[110, 210], [310, 410], [510, 610]],
+             [[60, 160], [260, 360], [460, 560]]], dtype=np.int32)]
+
+    detection_scores = [np.array([.8, 0.2], np.float64),
+                        np.array([.7, 0.3], np.float64)]
+    detection_classes = [np.array([1, 1], np.int32), np.array([1, 1], np.int32)]
+
+    categories = [{'id': 1, 'name': 'person', 'num_keypoints': 3},
+                  {'id': 2, 'name': 'cat'},
+                  {'id': 3, 'name': 'dog'}]
+
+    output_path = os.path.join(tf.test.get_temp_dir(), 'keypoints.json')
+    result = coco_tools.ExportKeypointsToCOCO(
+        image_ids,
+        detection_keypoints,
+        detection_scores,
+        detection_classes,
+        categories,
+        output_path=output_path)
+
+    # Each detection becomes a flat list of (x, y, visibility=1) triplets.
+    expected_keypoints = [
+        [100, 200, 1, 300, 400, 1, 500, 600, 1],
+        [50, 150, 1, 250, 350, 1, 450, 550, 1],
+        [110, 210, 1, 310, 410, 1, 510, 610, 1],
+        [60, 160, 1, 260, 360, 1, 460, 560, 1]]
+    self.assertEqual(len(result), 4)
+    self.assertListEqual(
+        [entry['keypoints'] for entry in result], expected_keypoints)
+
+    with tf.gfile.GFile(output_path, 'r') as f:
+      written_result = f.read()
+      written_result = json.loads(written_result)
+      self.assertEqual(result, written_result)
+
+  def testSingleImageDetectionBoxesExport(self):
+    """Per-image box export yields the expected bbox, class and score."""
+    boxes = np.array([[0, 0, 1, 1],
+                      [0, 0, .5, .5],
+                      [.5, .5, 1, 1]], dtype=np.float32)
+    classes = np.array([1, 2, 3], dtype=np.int32)
+    scores = np.array([0.8, 0.2, 0.7], dtype=np.float32)
+    coco_boxes = np.array([[0, 0, 1, 1],
+                           [0, 0, .5, .5],
+                           [.5, .5, .5, .5]], dtype=np.float32)
+    coco_annotations = coco_tools.ExportSingleImageDetectionBoxesToCoco(
+        image_id='first_image',
+        category_id_set={1, 2, 3},
+        detection_boxes=boxes,
+        detection_classes=classes,
+        detection_scores=scores)
+    for i, annotation in enumerate(coco_annotations):
+      self.assertEqual(annotation['image_id'], 'first_image')
+      self.assertEqual(annotation['category_id'], classes[i])
+      self.assertAlmostEqual(annotation['score'], scores[i])
+      self.assertTrue(np.all(np.isclose(annotation['bbox'], coco_boxes[i])))
+
+  def testSingleImageDetectionMaskExport(self):
+    """Per-image mask export yields the expected RLE counts and metadata."""
+    masks = np.array(
+        [[[1, 1], [1, 1]],
+         [[0, 0], [0, 1]],
+         [[0, 0], [0, 0]]], dtype=np.uint8)
+    classes = np.array([1, 2, 3], dtype=np.int32)
+    scores = np.array([0.8, 0.2, 0.7], dtype=np.float32)
+    coco_annotations = coco_tools.ExportSingleImageDetectionMasksToCoco(
+        image_id='first_image',
+        category_id_set={1, 2, 3},
+        detection_classes=classes,
+        detection_scores=scores,
+        detection_masks=masks)
+    expected_counts = ['04', '31', '4']
+    for i, mask_annotation in enumerate(coco_annotations):
+      self.assertEqual(mask_annotation['segmentation']['counts'],
+                       expected_counts[i])
+      self.assertTrue(np.all(np.equal(mask.decode(
+          mask_annotation['segmentation']), masks[i])))
+      self.assertEqual(mask_annotation['image_id'], 'first_image')
+      self.assertEqual(mask_annotation['category_id'], classes[i])
+      self.assertAlmostEqual(mask_annotation['score'], scores[i])
+
+  def testSingleImageGroundtruthExport(self):
+    """Per-image groundtruth export yields masks, boxes and running ids."""
+    masks = np.array(
+        [[[1, 1], [1, 1]],
+         [[0, 0], [0, 1]],
+         [[0, 0], [0, 0]]], dtype=np.uint8)
+    boxes = np.array([[0, 0, 1, 1],
+                      [0, 0, .5, .5],
+                      [.5, .5, 1, 1]], dtype=np.float32)
+    coco_boxes = np.array([[0, 0, 1, 1],
+                           [0, 0, .5, .5],
+                           [.5, .5, .5, .5]], dtype=np.float32)
+    classes = np.array([1, 2, 3], dtype=np.int32)
+    next_annotation_id = 1
+    coco_annotations = coco_tools.ExportSingleImageGroundtruthToCoco(
+        image_id='first_image',
+        category_id_set={1, 2, 3},
+        next_annotation_id=next_annotation_id,
+        groundtruth_boxes=boxes,
+        groundtruth_classes=classes,
+        groundtruth_masks=masks)
+    expected_counts = ['04', '31', '4']
+    for i, annotation in enumerate(coco_annotations):
+      self.assertEqual(annotation['segmentation']['counts'],
+                       expected_counts[i])
+      self.assertTrue(np.all(np.equal(mask.decode(
+          annotation['segmentation']), masks[i])))
+      self.assertTrue(np.all(np.isclose(annotation['bbox'], coco_boxes[i])))
+      self.assertEqual(annotation['image_id'], 'first_image')
+      self.assertEqual(annotation['category_id'], classes[i])
+      self.assertEqual(annotation['id'], i + next_annotation_id)
+
+
+# Hand control to the TensorFlow test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/object_detection/metrics/offline_eval_map_corloc.py
+++ b/research/object_detection/metrics/offline_eval_map_corloc.py
@@ -22,7 +22,7 @@ The evaluation metrics set is supplied in object_detection.protos.EvalConfig
 in metrics_set field.
 Currently two set of metrics are supported:
 - pascal_voc_metrics: standard PASCAL VOC 2007 metric
- open_images_metrics: Open Image V2 metric
+- open_images_detection_metrics: Open Image V2 metric
 All other field of object_detection.protos.EvalConfig are ignored.

 Example usage:

--- a/research/object_detection/models/BUILD
+++ b/research/object_detection/models/BUILD
@@ -15,6 +15,7 @@ py_library(
    ],
    deps = [
        "//tensorflow",
+        "//tensorflow/models/research/object_detection/utils:ops",
    ],
 )

@@ -36,6 +37,7 @@ py_library(
    ],
    deps = [
        "//tensorflow",
+        "//tensorflow/models/research/object_detection/utils:test_case",
    ],
 )

@@ -47,9 +49,10 @@ py_library(
    deps = [
        ":feature_map_generators",
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
-        "//tensorflow_models/object_detection/utils:ops",
-        "//tensorflow_models/slim:inception_v2",
+        "//tensorflow/models/research/object_detection/meta_architectures:ssd_meta_arch",
+        "//tensorflow/models/research/object_detection/utils:ops",
+        "//tensorflow/models/research/object_detection/utils:shape_utils",
+        "//third_party/tensorflow_models/slim:inception_v2",
    ],
 )

@@ -61,9 +64,10 @@ py_library(
    deps = [
        ":feature_map_generators",
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
-        "//tensorflow_models/object_detection/utils:ops",
-        "//tensorflow_models/slim:inception_v3",
+        "//tensorflow/models/research/object_detection/meta_architectures:ssd_meta_arch",
+        "//tensorflow/models/research/object_detection/utils:ops",
+        "//tensorflow/models/research/object_detection/utils:shape_utils",
+        "//third_party/tensorflow_models/slim:inception_v3",
    ],
 )

@@ -73,9 +77,10 @@ py_library(
    deps = [
        ":feature_map_generators",
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
-        "//tensorflow_models/object_detection/utils:ops",
-        "//tensorflow_models/slim:mobilenet_v1",
+        "//tensorflow/models/research/object_detection/meta_architectures:ssd_meta_arch",
+        "//tensorflow/models/research/object_detection/utils:ops",
+        "//tensorflow/models/research/object_detection/utils:shape_utils",
+        "//third_party/tensorflow_models/slim:mobilenet_v1",
    ],
 )

@@ -86,8 +91,39 @@ py_library(
        ":feature_map_generators",
        ":ssd_mobilenet_v1_feature_extractor",
        "//tensorflow",
-        "//tensorflow_models/object_detection/utils:ops",
-        "//tensorflow_models/slim:mobilenet_v1",
+        "//tensorflow/models/research/object_detection/utils:ops",
+        "//third_party/tensorflow_models/slim:mobilenet_v1",
+    ],
+)
+
+py_library(
+    name = "ssd_resnet_v1_fpn_feature_extractor",
+    srcs = ["ssd_resnet_v1_fpn_feature_extractor.py"],
+    deps = [
+        ":feature_map_generators",
+        "//tensorflow",
+        "//tensorflow/models/research/object_detection/meta_architectures:ssd_meta_arch",
+        "//tensorflow/models/research/object_detection/utils:ops",
+        "//tensorflow/models/research/object_detection/utils:shape_utils",
+        "//third_party/tensorflow_models/slim:resnet_v1",
+    ],
+)
+
+py_library(
+    name = "ssd_resnet_v1_fpn_feature_extractor_testbase",
+    srcs = ["ssd_resnet_v1_fpn_feature_extractor_testbase.py"],
+    deps = [
+        "//tensorflow/models/research/object_detection/models:ssd_feature_extractor_test",
+    ],
+)
+
+py_test(
+    name = "ssd_resnet_v1_fpn_feature_extractor_test",
+    srcs = ["ssd_resnet_v1_fpn_feature_extractor_test.py"],
+    deps = [
+        ":ssd_resnet_v1_fpn_feature_extractor",
+        ":ssd_resnet_v1_fpn_feature_extractor_testbase",
+        "//tensorflow",
    ],
 )

@@ -153,8 +189,8 @@ py_library(
    ],
    deps = [
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
-        "//tensorflow_models/slim:nasnet",
+        "//tensorflow/models/research/object_detection/meta_architectures:faster_rcnn_meta_arch",
+        "//third_party/tensorflow_models/slim:nasnet",
    ],
 )

@@ -165,8 +201,8 @@ py_library(
    ],
    deps = [
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
-        "//tensorflow_models/slim:inception_resnet_v2",
+        "//tensorflow/models/research/object_detection/meta_architectures:faster_rcnn_meta_arch",
+        "//third_party/tensorflow_models/slim:inception_resnet_v2",
    ],
 )

@@ -188,8 +224,8 @@ py_library(
    ],
    deps = [
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
-        "//tensorflow_models/slim:inception_v2",
+        "//tensorflow/models/research/object_detection/meta_architectures:faster_rcnn_meta_arch",
+        "//third_party/tensorflow_models/slim:inception_v2",
    ],
 )

@@ -211,9 +247,9 @@ py_library(
    ],
    deps = [
        "//tensorflow",
-        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
-        "//tensorflow_models/slim:resnet_utils",
-        "//tensorflow_models/slim:resnet_v1",
+        "//tensorflow/models/research/object_detection/meta_architectures:faster_rcnn_meta_arch",
+        "//third_party/tensorflow_models/slim:resnet_utils",
+        "//third_party/tensorflow_models/slim:resnet_v1",
    ],
 )


--- a/research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py
+++ b/research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py
@@ -51,7 +51,8 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
               pad_to_multiple,
               conv_hyperparams,
               batch_norm_trainable=True,
-               reuse_weights=None):
+               reuse_weights=None,
+               use_explicit_padding=False):
    """MobileNetV1 Feature Extractor for Embedded-friendly SSD Models.

    Args:
@@ -66,6 +67,8 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
        (e.g. 1), it is desirable to disable batch norm update and use
        pretrained batch norm params.
      reuse_weights: Whether to reuse variables. Default is None.
+      use_explicit_padding: Whether to use explicit padding when extracting
+        features. Default is False.

    Raises:
      ValueError: upon invalid `pad_to_multiple` values.
@@ -76,7 +79,8 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(

    super(EmbeddedSSDMobileNetV1FeatureExtractor, self).__init__(
        is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable, reuse_weights)
+        conv_hyperparams, batch_norm_trainable, reuse_weights,
+        use_explicit_padding)

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.
@@ -88,13 +92,25 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
+
+    Raises:
+      ValueError: if image height or width are not 256 pixels.
    """
-    preprocessed_inputs.get_shape().assert_has_rank(4)
-    shape_assert = tf.Assert(
-        tf.logical_and(
-            tf.equal(tf.shape(preprocessed_inputs)[1], 256),
-            tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
-        ['image size must be 256 in both height and width.'])
+    image_shape = preprocessed_inputs.get_shape()
+    image_shape.assert_has_rank(4)
+    image_height = image_shape[1].value
+    image_width = image_shape[2].value
+
+    if image_height is None or image_width is None:
+      shape_assert = tf.Assert(
+          tf.logical_and(tf.equal(tf.shape(preprocessed_inputs)[1], 256),
+                         tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
+          ['image size must be 256 in both height and width.'])
+      with tf.control_dependencies([shape_assert]):
+        preprocessed_inputs = tf.identity(preprocessed_inputs)
+    elif image_height != 256 or image_width != 256:
+      raise ValueError('image size must be = 256 in both height and width;'
+                       ' image dim = %d,%d' % (image_height, image_width))

    feature_map_layout = {
        'from_layer': [
@@ -102,10 +118,11 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
        ],
        'layer_depth': [-1, -1, 512, 256, 256],
        'conv_kernel_size': [-1, -1, 3, 3, 2],
+        'use_explicit_padding': self._use_explicit_padding,
    }

-    with tf.control_dependencies([shape_assert]):
-      with slim.arg_scope(self._conv_hyperparams):
+    with slim.arg_scope(self._conv_hyperparams):
+      with slim.arg_scope([slim.batch_norm], fused=False):
        with tf.variable_scope('MobilenetV1',
                               reuse=self._reuse_weights) as scope:
          _, image_features = mobilenet_v1.mobilenet_v1_base(

--- a/research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py
+++ b/research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py
@@ -22,7 +22,7 @@ from object_detection.models import ssd_feature_extractor_test


 class EmbeddedSSDMobileNetV1FeatureExtractorTest(
-    ssd_feature_extractor_test.SsdFeatureExtractorTestBase, tf.test.TestCase):
+    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, batch_norm_trainable=True):
@@ -51,11 +51,23 @@ class EmbeddedSSDMobileNetV1FeatureExtractorTest(
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 16, 16, 512), (4, 8, 8, 1024),
-                                  (4, 4, 4, 512), (4, 2, 2, 256),
-                                  (4, 1, 1, 256)]
+    expected_feature_map_shape = [(2, 16, 16, 512), (2, 8, 8, 1024),
+                                  (2, 4, 4, 512), (2, 2, 2, 256),
+                                  (2, 1, 1, 256)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)
+
+  def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self):
+    image_height = 256
+    image_width = 256
+    depth_multiplier = 1.0
+    pad_to_multiple = 1
+    expected_feature_map_shape = [(2, 16, 16, 512), (2, 8, 8, 1024),
+                                  (2, 4, 4, 512), (2, 2, 2, 256),
+                                  (2, 1, 1, 256)]
+    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
@@ -63,10 +75,10 @@ class EmbeddedSSDMobileNetV1FeatureExtractorTest(
    image_width = 256
    depth_multiplier = 0.5**12
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 16, 16, 32), (4, 8, 8, 32), (4, 4, 4, 32),
-                                  (4, 2, 2, 32), (4, 1, 1, 32)]
+    expected_feature_map_shape = [(2, 16, 16, 32), (2, 8, 8, 32), (2, 4, 4, 32),
+                                  (2, 2, 2, 32), (2, 1, 1, 32)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple_of_1(
@@ -75,11 +87,11 @@ class EmbeddedSSDMobileNetV1FeatureExtractorTest(
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 16, 16, 512), (4, 8, 8, 1024),
-                                  (4, 4, 4, 512), (4, 2, 2, 256),
-                                  (4, 1, 1, 256)]
+    expected_feature_map_shape = [(2, 16, 16, 512), (2, 8, 8, 1024),
+                                  (2, 4, 4, 512), (2, 2, 2, 256),
+                                  (2, 1, 1, 256)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_raises_error_with_pad_to_multiple_not_1(self):

--- a/research/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py
+++ b/research/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py
@@ -180,7 +180,7 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for
    InceptionResnetV2 checkpoints.

-    TODO: revisit whether it's possible to force the
+    TODO(jonathanhuang,rathodv): revisit whether it's possible to force the
    `Repeat` namescope as created in `_extract_box_classifier_features` to
    start counting at 2 (e.g. `Repeat_2`) so that the default restore_fn can
    be used.

--- a/research/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py
+++ b/research/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py
@@ -111,7 +111,8 @@ class FasterRCNNResnetV1FeatureExtractor(

    with tf.control_dependencies([shape_assert]):
      # Disables batchnorm for fine-tuning with smaller batch sizes.
-      # TODO: Figure out if it is needed when image batch size is bigger.
+      # TODO: Figure out if it is needed when image
+      # batch size is bigger.
      with slim.arg_scope(
          resnet_utils.resnet_arg_scope(
              batch_norm_epsilon=1e-5,

--- a/research/object_detection/models/feature_map_generators.py
+++ b/research/object_detection/models/feature_map_generators.py
@@ -25,6 +25,7 @@ of final feature maps.
 """
 import collections
 import tensorflow as tf
+from object_detection.utils import ops
 slim = tf.contrib.slim


@@ -115,6 +116,9 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
  feature_map_keys = []
  feature_maps = []
  base_from_layer = ''
+  use_explicit_padding = False
+  if 'use_explicit_padding' in feature_map_layout:
+    use_explicit_padding = feature_map_layout['use_explicit_padding']
  use_depthwise = False
  if 'use_depthwise' in feature_map_layout:
    use_depthwise = feature_map_layout['use_depthwise']
@@ -139,16 +143,21 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
            padding='SAME',
            stride=1,
            scope=layer_name)
-      stride = 2
      layer_name = '{}_2_Conv2d_{}_{}x{}_s2_{}'.format(
          base_from_layer, index, conv_kernel_size, conv_kernel_size,
          depth_fn(layer_depth))
+      stride = 2
+      padding = 'SAME'
+      if use_explicit_padding:
+        padding = 'VALID'
+        intermediate_layer = ops.fixed_padding(
+            intermediate_layer, conv_kernel_size)
      if use_depthwise:
        feature_map = slim.separable_conv2d(
            intermediate_layer,
            None, [conv_kernel_size, conv_kernel_size],
            depth_multiplier=1,
-            padding='SAME',
+            padding=padding,
            stride=stride,
            scope=layer_name + '_depthwise')
        feature_map = slim.conv2d(
@@ -161,10 +170,56 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
        feature_map = slim.conv2d(
            intermediate_layer,
            depth_fn(layer_depth), [conv_kernel_size, conv_kernel_size],
-            padding='SAME',
+            padding=padding,
            stride=stride,
            scope=layer_name)
      feature_map_keys.append(layer_name)
    feature_maps.append(feature_map)
  return collections.OrderedDict(
      [(x, y) for (x, y) in zip(feature_map_keys, feature_maps)])
+
+
+def fpn_top_down_feature_maps(image_features, depth, scope=None):
+  """Generates `top-down` feature maps for Feature Pyramid Networks.
+
+  See https://arxiv.org/abs/1612.03144 for details.
+
+  The last (coarsest) input is projected to `depth` channels with a 1x1
+  convolution and emitted as is. Every finer level is then built by upsampling
+  the running top-down map by 2 and averaging it with a 1x1 projection of that
+  level's input. A 3x3-smoothed copy of the merge is emitted, while the
+  unsmoothed merge is carried on to the next finer level.
+
+  Args:
+    image_features: list of image feature tensors, ordered from finest to
+      coarsest. Spatial resolutions of successive tensors must reduce exactly
+      by a factor of 2.
+    depth: depth of output feature maps.
+    scope: A scope name to wrap this op under.
+
+  Returns:
+    feature_maps: an OrderedDict mapping keys (feature map names) to
+      tensors where each tensor has shape [batch, height_i, width_i, depth].
+      Keys are 'top_down_feature_map_<i>', with i indexing `image_features`,
+      and entries are ordered from i = 0 upwards.
+  """
+  with tf.variable_scope(
+      scope, 'top_down', image_features):
+    num_levels = len(image_features)
+    output_feature_maps_list = []
+    output_feature_map_keys = []
+    with slim.arg_scope(
+        [slim.conv2d],
+        activation_fn=None, normalizer_fn=None, padding='SAME', stride=1):
+      # The coarsest level has nothing above it, so its output is just the
+      # 1x1 projection of the last input.
+      top_down = slim.conv2d(
+          image_features[-1],
+          depth, [1, 1], scope='projection_%d' % num_levels)
+      output_feature_maps_list.append(top_down)
+      output_feature_map_keys.append(
+          'top_down_feature_map_%d' % (num_levels - 1))
+
+      # Walk from the second-coarsest level down to level 0.
+      for level in reversed(range(num_levels - 1)):
+        top_down = ops.nearest_neighbor_upsampling(top_down, 2)
+        residual = slim.conv2d(
+            image_features[level], depth, [1, 1],
+            scope='projection_%d' % (level + 1))
+        top_down = 0.5 * top_down + 0.5 * residual
+        output_feature_maps_list.append(slim.conv2d(
+            top_down,
+            depth, [3, 3],
+            activation_fn=None,
+            scope='smoothing_%d' % (level + 1)))
+        output_feature_map_keys.append('top_down_feature_map_%d' % level)
+      # Outputs were collected coarsest-first; flip them so the dict runs from
+      # level 0 upwards. The pairs are materialized with list() because a
+      # Python 3 zip iterator cannot be passed to reversed().
+      return collections.OrderedDict(reversed(
+          list(zip(output_feature_map_keys, output_feature_maps_list))))
--- a/research/object_detection/models/feature_map_generators_test.py
+++ b/research/object_detection/models/feature_map_generators_test.py
@@ -40,7 +40,7 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = {
 }


-# TODO(rathodv): add tests with different anchor strides.
+# TODO: add tests with different anchor strides.
 class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):

  def test_get_expected_feature_map_shapes_with_inception_v2(self):
@@ -134,6 +134,34 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
      self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)


+class FPNFeatureMapGeneratorTest(tf.test.TestCase):
+  """Shape checks for `fpn_top_down_feature_maps`."""
+
+  def test_get_expected_feature_map_shapes(self):
+    # Four inputs with 256 channels; each side is half the previous one.
+    level_sizes = (8, 4, 2, 1)
+    pyramid_inputs = [
+        tf.random_uniform([4, size, size, 256], dtype=tf.float32)
+        for size in level_sizes
+    ]
+    top_down_maps = feature_map_generators.fpn_top_down_feature_maps(
+        image_features=pyramid_inputs, depth=128)
+
+    # Each output keeps its input's spatial size with 128 channels.
+    expected_shapes = {}
+    for level, size in enumerate(level_sizes):
+      expected_shapes['top_down_feature_map_%d' % level] = (4, size, size, 128)
+
+    initializer = tf.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run(initializer)
+      evaluated_maps = sess.run(top_down_maps)
+      actual_shapes = dict(
+          (name, array.shape) for name, array in evaluated_maps.items())
+      self.assertDictEqual(actual_shapes, expected_shapes)
+
+
 class GetDepthFunctionTest(tf.test.TestCase):

  def test_return_min_depth_when_multiplier_is_small(self):

--- a/research/object_detection/models/ssd_feature_extractor_test.py
+++ b/research/object_detection/models/ssd_feature_extractor_test.py
@@ -17,33 +17,14 @@

 from abc import abstractmethod

+import itertools
 import numpy as np
 import tensorflow as tf

+from object_detection.utils import test_case

-class SsdFeatureExtractorTestBase(object):

-  def _validate_features_shape(self,
-                               feature_extractor,
-                               preprocessed_inputs,
-                               expected_feature_map_shapes):
-    """Checks the extracted features are of correct shape.
-
-    Args:
-      feature_extractor: The feature extractor to test.
-      preprocessed_inputs: A [batch, height, width, 3] tensor to extract
-                           features with.
-      expected_feature_map_shapes: The expected shape of the extracted features.
-    """
-    feature_maps = feature_extractor.extract_features(preprocessed_inputs)
-    feature_map_shapes = [tf.shape(feature_map) for feature_map in feature_maps]
-    init_op = tf.global_variables_initializer()
-    with self.test_session() as sess:
-      sess.run(init_op)
-      feature_map_shapes_out = sess.run(feature_map_shapes)
-      for shape_out, exp_shape_out in zip(
-          feature_map_shapes_out, expected_feature_map_shapes):
-        self.assertAllEqual(shape_out, exp_shape_out)
+class SsdFeatureExtractorTestBase(test_case.TestCase):

  @abstractmethod
  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple):
@@ -59,14 +40,39 @@ class SsdFeatureExtractorTestBase(object):
    pass

  def check_extract_features_returns_correct_shape(
-      self, image_height, image_width, depth_multiplier, pad_to_multiple,
-      expected_feature_map_shapes_out):
-    feature_extractor = self._create_feature_extractor(depth_multiplier,
-                                                       pad_to_multiple)
-    preprocessed_inputs = tf.random_uniform(
-        [4, image_height, image_width, 3], dtype=tf.float32)
-    self._validate_features_shape(
-        feature_extractor, preprocessed_inputs, expected_feature_map_shapes_out)
+      self, batch_size, image_height, image_width, depth_multiplier,
+      pad_to_multiple, expected_feature_map_shapes):
+    """Checks extracted feature map shapes for a statically shaped input.
+
+    Args:
+      batch_size: number of images in the random input batch.
+      image_height: height of the random input images.
+      image_width: width of the random input images.
+      depth_multiplier: forwarded to `_create_feature_extractor`.
+      pad_to_multiple: forwarded to `_create_feature_extractor`.
+      expected_feature_map_shapes: expected shapes, one per feature map, in
+        the order `extract_features` returns them.
+    """
+    def graph_fn(image_tensor):
+      feature_extractor = self._create_feature_extractor(depth_multiplier,
+                                                         pad_to_multiple)
+      feature_maps = feature_extractor.extract_features(image_tensor)
+      return feature_maps
+
+    image_tensor = np.random.rand(batch_size, image_height, image_width,
+                                  3).astype(np.float32)
+    feature_maps = self.execute(graph_fn, [image_tensor])
+    # Built-in zip works on both Python 2 and 3; itertools.izip is Python 2
+    # only.
+    for feature_map, expected_shape in zip(
+        feature_maps, expected_feature_map_shapes):
+      self.assertAllEqual(feature_map.shape, expected_shape)
+
+  def check_extract_features_returns_correct_shapes_with_dynamic_inputs(
+      self, batch_size, image_height, image_width, depth_multiplier,
+      pad_to_multiple, expected_feature_map_shapes):
+    """Checks extracted feature map shapes when image size is a graph input.
+
+    Unlike the static variant, the image height and width are passed to the
+    graph function as int32 scalars and the random image is created inside
+    the graph from them.
+
+    Args:
+      batch_size: number of images in the random input batch.
+      image_height: height handed to the graph as an int32 scalar.
+      image_width: width handed to the graph as an int32 scalar.
+      depth_multiplier: forwarded to `_create_feature_extractor`.
+      pad_to_multiple: forwarded to `_create_feature_extractor`.
+      expected_feature_map_shapes: expected shapes, one per feature map, in
+        the order `extract_features` returns them.
+    """
+    def graph_fn(image_height, image_width):
+      feature_extractor = self._create_feature_extractor(depth_multiplier,
+                                                         pad_to_multiple)
+      image_tensor = tf.random_uniform([batch_size, image_height, image_width,
+                                        3], dtype=tf.float32)
+      feature_maps = feature_extractor.extract_features(image_tensor)
+      return feature_maps
+
+    # NOTE(review): presumably execute_cpu feeds these arrays through
+    # placeholders so the shapes are unknown at graph construction time;
+    # confirm against test_case.TestCase.
+    feature_maps = self.execute_cpu(graph_fn, [
+        np.array(image_height, dtype=np.int32),
+        np.array(image_width, dtype=np.int32)
+    ])
+    # Built-in zip works on both Python 2 and 3; itertools.izip is Python 2
+    # only.
+    for feature_map, expected_shape in zip(
+        feature_maps, expected_feature_map_shapes):
+      self.assertAllEqual(feature_map.shape, expected_shape)

  def check_extract_features_raises_error_with_invalid_image_size(
      self, image_height, image_width, depth_multiplier, pad_to_multiple):

--- a/research/object_detection/models/ssd_inception_v2_feature_extractor.py
+++ b/research/object_detection/models/ssd_inception_v2_feature_extractor.py
@@ -19,6 +19,7 @@ import tensorflow as tf
 from object_detection.meta_architectures import ssd_meta_arch
 from object_detection.models import feature_map_generators
 from object_detection.utils import ops
+from object_detection.utils import shape_utils
 from nets import inception_v2

 slim = tf.contrib.slim
@@ -34,7 +35,8 @@ class SSDInceptionV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
               pad_to_multiple,
               conv_hyperparams,
               batch_norm_trainable=True,
-               reuse_weights=None):
+               reuse_weights=None,
+               use_explicit_padding=False):
    """InceptionV2 Feature Extractor for SSD Models.

    Args:
@@ -49,10 +51,13 @@ class SSDInceptionV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
        (e.g. 1), it is desirable to disable batch norm update and use
        pretrained batch norm params.
      reuse_weights: Whether to reuse variables. Default is None.
+      use_explicit_padding: Whether to use explicit padding when extracting
+        features. Default is False.
    """
    super(SSDInceptionV2FeatureExtractor, self).__init__(
        is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable, reuse_weights)
+        conv_hyperparams, batch_norm_trainable, reuse_weights,
+        use_explicit_padding)

  def preprocess(self, resized_inputs):
    """SSD preprocessing.
@@ -80,32 +85,29 @@ class SSDInceptionV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
-    preprocessed_inputs.get_shape().assert_has_rank(4)
-    shape_assert = tf.Assert(
-        tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
-                       tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
-        ['image size must at least be 33 in both height and width.'])
+    preprocessed_inputs = shape_utils.check_min_image_dim(
+        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
+        'use_explicit_padding': self._use_explicit_padding,
    }

-    with tf.control_dependencies([shape_assert]):
-      with slim.arg_scope(self._conv_hyperparams):
-        with tf.variable_scope('InceptionV2',
-                               reuse=self._reuse_weights) as scope:
-          _, image_features = inception_v2.inception_v2_base(
-              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
-              final_endpoint='Mixed_5c',
-              min_depth=self._min_depth,
-              depth_multiplier=self._depth_multiplier,
-              scope=scope)
-          feature_maps = feature_map_generators.multi_resolution_feature_maps(
-              feature_map_layout=feature_map_layout,
-              depth_multiplier=self._depth_multiplier,
-              min_depth=self._min_depth,
-              insert_1x1_conv=True,
-              image_features=image_features)
+    with slim.arg_scope(self._conv_hyperparams):
+      with tf.variable_scope('InceptionV2',
+                             reuse=self._reuse_weights) as scope:
+        _, image_features = inception_v2.inception_v2_base(
+            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
+            final_endpoint='Mixed_5c',
+            min_depth=self._min_depth,
+            depth_multiplier=self._depth_multiplier,
+            scope=scope)
+        feature_maps = feature_map_generators.multi_resolution_feature_maps(
+            feature_map_layout=feature_map_layout,
+            depth_multiplier=self._depth_multiplier,
+            min_depth=self._min_depth,
+            insert_1x1_conv=True,
+            image_features=image_features)

    return feature_maps.values()
--- a/research/object_detection/models/ssd_inception_v2_feature_extractor_test.py
+++ b/research/object_detection/models/ssd_inception_v2_feature_extractor_test.py
@@ -22,7 +22,7 @@ from object_detection.models import ssd_inception_v2_feature_extractor


 class SsdInceptionV2FeatureExtractorTest(
-    ssd_feature_extractor_test.SsdFeatureExtractorTestBase, tf.test.TestCase):
+    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, batch_norm_trainable=True):
@@ -49,11 +49,23 @@ class SsdInceptionV2FeatureExtractorTest(
    image_width = 128
    depth_multiplier = 1.0
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 8, 8, 576), (4, 4, 4, 1024),
-                                  (4, 2, 2, 512), (4, 1, 1, 256),
-                                  (4, 1, 1, 256), (4, 1, 1, 128)]
+    expected_feature_map_shape = [(2, 8, 8, 576), (2, 4, 4, 1024),
+                                  (2, 2, 2, 512), (2, 1, 1, 256),
+                                  (2, 1, 1, 256), (2, 1, 1, 128)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)
+
+  def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self):
+    # 128x128 input, batch of 2, routed through the dynamic-input variant of
+    # the shape check.
+    image_height = image_width = 128
+    depth_multiplier, pad_to_multiple = 1.0, 1
+    # (spatial size, depth) of each expected feature map, in output order.
+    size_and_depth = [(8, 576), (4, 1024), (2, 512),
+                      (1, 256), (1, 256), (1, 128)]
+    expected_feature_map_shape = [
+        (2, size, size, depth) for size, depth in size_and_depth]
+    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_299(self):
@@ -61,11 +73,11 @@ class SsdInceptionV2FeatureExtractorTest(
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 19, 19, 576), (4, 10, 10, 1024),
-                                  (4, 5, 5, 512), (4, 3, 3, 256),
-                                  (4, 2, 2, 256), (4, 1, 1, 128)]
+    expected_feature_map_shape = [(2, 19, 19, 576), (2, 10, 10, 1024),
+                                  (2, 5, 5, 512), (2, 3, 3, 256),
+                                  (2, 2, 2, 256), (2, 1, 1, 128)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
@@ -73,11 +85,11 @@ class SsdInceptionV2FeatureExtractorTest(
    image_width = 299
    depth_multiplier = 0.5**12
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 19, 19, 128), (4, 10, 10, 128),
-                                  (4, 5, 5, 32), (4, 3, 3, 32),
-                                  (4, 2, 2, 32), (4, 1, 1, 32)]
+    expected_feature_map_shape = [(2, 19, 19, 128), (2, 10, 10, 128),
+                                  (2, 5, 5, 32), (2, 3, 3, 32),
+                                  (2, 2, 2, 32), (2, 1, 1, 32)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
@@ -85,11 +97,11 @@ class SsdInceptionV2FeatureExtractorTest(
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 32
-    expected_feature_map_shape = [(4, 20, 20, 576), (4, 10, 10, 1024),
-                                  (4, 5, 5, 512), (4, 3, 3, 256),
-                                  (4, 2, 2, 256), (4, 1, 1, 128)]
+    expected_feature_map_shape = [(2, 20, 20, 576), (2, 10, 10, 1024),
+                                  (2, 5, 5, 512), (2, 3, 3, 256),
+                                  (2, 2, 2, 256), (2, 1, 1, 128)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_raises_error_with_invalid_image_size(self):

--- a/research/object_detection/models/ssd_inception_v3_feature_extractor.py
+++ b/research/object_detection/models/ssd_inception_v3_feature_extractor.py
@@ -19,6 +19,7 @@ import tensorflow as tf
 from object_detection.meta_architectures import ssd_meta_arch
 from object_detection.models import feature_map_generators
 from object_detection.utils import ops
+from object_detection.utils import shape_utils
 from nets import inception_v3

 slim = tf.contrib.slim
@@ -34,7 +35,8 @@ class SSDInceptionV3FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
               pad_to_multiple,
               conv_hyperparams,
               batch_norm_trainable=True,
-               reuse_weights=None):
+               reuse_weights=None,
+               use_explicit_padding=False):
    """InceptionV3 Feature Extractor for SSD Models.

    Args:
@@ -49,10 +51,13 @@ class SSDInceptionV3FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
        (e.g. 1), it is desirable to disable batch norm update and use
        pretrained batch norm params.
      reuse_weights: Whether to reuse variables. Default is None.
+      use_explicit_padding: Whether to use explicit padding when extracting
+        features. Default is False.
    """
    super(SSDInceptionV3FeatureExtractor, self).__init__(
        is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable, reuse_weights)
+        conv_hyperparams, batch_norm_trainable, reuse_weights,
+        use_explicit_padding)

  def preprocess(self, resized_inputs):
    """SSD preprocessing.
@@ -80,32 +85,28 @@ class SSDInceptionV3FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
-    preprocessed_inputs.get_shape().assert_has_rank(4)
-    shape_assert = tf.Assert(
-        tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
-                       tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
-        ['image size must at least be 33 in both height and width.'])
+    preprocessed_inputs = shape_utils.check_min_image_dim(
+        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128],
+        'use_explicit_padding': self._use_explicit_padding,
    }

-    with tf.control_dependencies([shape_assert]):
-      with slim.arg_scope(self._conv_hyperparams):
-        with tf.variable_scope('InceptionV3',
-                               reuse=self._reuse_weights) as scope:
-          _, image_features = inception_v3.inception_v3_base(
-              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
-              final_endpoint='Mixed_7c',
-              min_depth=self._min_depth,
-              depth_multiplier=self._depth_multiplier,
-              scope=scope)
-          feature_maps = feature_map_generators.multi_resolution_feature_maps(
-              feature_map_layout=feature_map_layout,
-              depth_multiplier=self._depth_multiplier,
-              min_depth=self._min_depth,
-              insert_1x1_conv=True,
-              image_features=image_features)
+    with slim.arg_scope(self._conv_hyperparams):
+      with tf.variable_scope('InceptionV3', reuse=self._reuse_weights) as scope:
+        _, image_features = inception_v3.inception_v3_base(
+            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
+            final_endpoint='Mixed_7c',
+            min_depth=self._min_depth,
+            depth_multiplier=self._depth_multiplier,
+            scope=scope)
+        feature_maps = feature_map_generators.multi_resolution_feature_maps(
+            feature_map_layout=feature_map_layout,
+            depth_multiplier=self._depth_multiplier,
+            min_depth=self._min_depth,
+            insert_1x1_conv=True,
+            image_features=image_features)

    return feature_maps.values()
--- a/research/object_detection/models/ssd_inception_v3_feature_extractor_test.py
+++ b/research/object_detection/models/ssd_inception_v3_feature_extractor_test.py
@@ -22,7 +22,7 @@ from object_detection.models import ssd_inception_v3_feature_extractor


 class SsdInceptionV3FeatureExtractorTest(
-    ssd_feature_extractor_test.SsdFeatureExtractorTestBase, tf.test.TestCase):
+    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, batch_norm_trainable=True):
@@ -49,11 +49,23 @@ class SsdInceptionV3FeatureExtractorTest(
    image_width = 128
    depth_multiplier = 1.0
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 13, 13, 288), (4, 6, 6, 768),
-                                  (4, 2, 2, 2048), (4, 1, 1, 512),
-                                  (4, 1, 1, 256), (4, 1, 1, 128)]
+    expected_feature_map_shape = [(2, 13, 13, 288), (2, 6, 6, 768),
+                                  (2, 2, 2, 2048), (2, 1, 1, 512),
+                                  (2, 1, 1, 256), (2, 1, 1, 128)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)
+
+  def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self):
+    # 128x128 input, batch of 2, routed through the dynamic-input variant of
+    # the shape check.
+    image_height = image_width = 128
+    depth_multiplier, pad_to_multiple = 1.0, 1
+    # (spatial size, depth) of each expected feature map, in output order.
+    size_and_depth = [(13, 288), (6, 768), (2, 2048),
+                      (1, 512), (1, 256), (1, 128)]
+    expected_feature_map_shape = [
+        (2, size, size, depth) for size, depth in size_and_depth]
+    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_299(self):
@@ -61,11 +73,11 @@ class SsdInceptionV3FeatureExtractorTest(
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 35, 35, 288), (4, 17, 17, 768),
-                                  (4, 8, 8, 2048), (4, 4, 4, 512),
-                                  (4, 2, 2, 256), (4, 1, 1, 128)]
+    expected_feature_map_shape = [(2, 35, 35, 288), (2, 17, 17, 768),
+                                  (2, 8, 8, 2048), (2, 4, 4, 512),
+                                  (2, 2, 2, 256), (2, 1, 1, 128)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
@@ -73,11 +85,11 @@ class SsdInceptionV3FeatureExtractorTest(
    image_width = 299
    depth_multiplier = 0.5**12
    pad_to_multiple = 1
-    expected_feature_map_shape = [(4, 35, 35, 128), (4, 17, 17, 128),
-                                  (4, 8, 8, 192), (4, 4, 4, 32),
-                                  (4, 2, 2, 32), (4, 1, 1, 32)]
+    expected_feature_map_shape = [(2, 35, 35, 128), (2, 17, 17, 128),
+                                  (2, 8, 8, 192), (2, 4, 4, 32),
+                                  (2, 2, 2, 32), (2, 1, 1, 32)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
@@ -85,11 +97,11 @@ class SsdInceptionV3FeatureExtractorTest(
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 32
-    expected_feature_map_shape = [(4, 37, 37, 288), (4, 18, 18, 768),
-                                  (4, 8, 8, 2048), (4, 4, 4, 512),
-                                  (4, 2, 2, 256), (4, 1, 1, 128)]
+    expected_feature_map_shape = [(2, 37, 37, 288), (2, 18, 18, 768),
+                                  (2, 8, 8, 2048), (2, 4, 4, 512),
+                                  (2, 2, 2, 256), (2, 1, 1, 128)]
    self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier, pad_to_multiple,
+        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_raises_error_with_invalid_image_size(self):