Unverified commit 31ae57eb authored by pkulzc, committed by GitHub

Minor fixes for object detection (#5613)

* Internal change.

PiperOrigin-RevId: 213914693

* Add an original_image_spatial_shape tensor to the input dictionary to store the shape of the original input image (see the sketch after this commit message).

PiperOrigin-RevId: 214018767

* Remove "groundtruth_confidences" from decoders use "groundtruth_weights" to indicate label confidence.

This also fixes a bug that only surfaced now: the random crop routines in core/preprocessor.py did not correctly handle the "groundtruth_weights" tensors returned by the decoders.

PiperOrigin-RevId: 214091843

* Update CocoMaskEvaluator to allow for a batch of image info, rather than a single image.

PiperOrigin-RevId: 214295305

* Add an option to summarize gradients.

PiperOrigin-RevId: 214310875

* Adds FasterRCNN inference on CPU

1. Adds a flag use_static_shapes_for_eval to restrict the model to ops that guarantee static shapes.
2. Does not filter overlapping anchors while clipping anchors when use_static_shapes_for_eval is set to True.
3. A...
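A minimal sketch of the new original_image_spatial_shape entry (the field name comes from this commit; the surrounding dictionary contents are illustrative, not lifted from the diff):

import tensorflow as tf
from object_detection.core import standard_fields as fields

# Illustrative input dictionary; only the new key is specific to this commit.
input_dict = {
    fields.InputDataFields.image: tf.zeros([1, 300, 300, 3]),
    # [height, width] of each image before any resizing.
    fields.InputDataFields.original_image_spatial_shape:
        tf.constant([[1024, 768]], dtype=tf.int32),
}
original_shape = input_dict[
    fields.InputDataFields.original_image_spatial_shape]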
parent 0b0c9cfd
......@@ -15,6 +15,7 @@
"""Tests for ssd resnet v1 feature extractors."""
import abc
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
......@@ -64,12 +65,15 @@ class SSDResnetPpnFeatureExtractorTestBase(
image_width = 128
depth_multiplier = 1
pad_to_multiple = 1
test_image = np.random.rand(4, image_height, image_width, 3)
test_image = tf.constant(np.random.rand(4, image_height, image_width, 3))
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertAllClose(preprocessed_image,
test_image - [[123.68, 116.779, 103.939]])
with self.test_session() as sess:
test_image_out, preprocessed_image_out = sess.run(
[test_image, preprocessed_image])
self.assertAllClose(preprocessed_image_out,
test_image_out - [[123.68, 116.779, 103.939]])
def test_variables_only_created_in_scope(self):
depth_multiplier = 1
......
......@@ -134,26 +134,32 @@ class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
(len(self._prediction_heads[BOX_ENCODINGS]),
len(input_shapes)))
for stack_index, input_shape in enumerate(input_shapes):
net = tf.keras.Sequential(name='PreHeadConvolutions_%d' % stack_index)
self._shared_nets.append(net)
net = []
# Add additional conv layers before the class predictor.
features_depth = static_shape.get_depth(input_shape)
depth = max(min(features_depth, self._max_depth), self._min_depth)
tf.logging.info(
'depth of additional conv before box predictor: {}'.format(depth))
if depth > 0 and self._num_layers_before_predictor > 0:
for i in range(self._num_layers_before_predictor):
net.add(keras.Conv2D(depth, [1, 1],
name='Conv2d_%d_1x1_%d' % (i, depth),
padding='SAME',
**self._conv_hyperparams.params()))
net.add(self._conv_hyperparams.build_batch_norm(
net.append(keras.Conv2D(depth, [1, 1],
name='SharedConvolutions_%d/Conv2d_%d_1x1_%d'
% (stack_index, i, depth),
padding='SAME',
**self._conv_hyperparams.params()))
net.append(self._conv_hyperparams.build_batch_norm(
training=(self._is_training and not self._freeze_batchnorm),
name='Conv2d_%d_1x1_%d_norm' % (i, depth)))
net.add(self._conv_hyperparams.build_activation_layer(
name='Conv2d_%d_1x1_%d_activation' % (i, depth),
name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm'
% (stack_index, i, depth)))
net.append(self._conv_hyperparams.build_activation_layer(
name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation'
% (stack_index, i, depth),
))
# Until certain bugs are fixed in checkpointable lists,
# this net must be appended only once it's been filled with layers.
self._shared_nets.append(net)
self.built = True
def _predict(self, image_features):
......@@ -175,10 +181,11 @@ class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
"""
predictions = collections.defaultdict(list)
for (index, image_feature) in enumerate(image_features):
for (index, net) in enumerate(image_features):
# Apply shared conv layers before the head predictors.
net = self._shared_nets[index](image_feature)
for layer in self._shared_nets[index]:
net = layer(net)
for head_name in self._prediction_heads:
head_obj = self._prediction_heads[head_name][index]
......
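Net effect of the hunks above: the shared pre-head convolutions are now kept in a plain Python list and applied layer by layer in _predict, instead of being wrapped in tf.keras.Sequential. A self-contained sketch of that pattern (layer shapes and names are illustrative):

import tensorflow as tf

# Shared layers kept in a plain list; appended to the parent container only
# after the list is fully built (see the checkpointable-lists comment above).
shared_net = [
    tf.keras.layers.Conv2D(32, [1, 1], padding='SAME',
                           name='SharedConvolutions_0/Conv2d_0_1x1_32'),
    tf.keras.layers.BatchNormalization(
        name='SharedConvolutions_0/Conv2d_0_1x1_32_norm'),
    tf.keras.layers.Activation(
        'relu', name='SharedConvolutions_0/Conv2d_0_1x1_32_activation'),
]

def apply_shared_net(image_feature):
  net = image_feature
  for layer in shared_net:  # mirrors the loop in _predict above
    net = layer(net)
  return net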
......@@ -181,8 +181,8 @@ class ConvolutionalKerasBoxPredictorTest(test_case.TestCase):
self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1])
expected_variable_set = set([
'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/bias',
'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/kernel',
'BoxPredictor/SharedConvolutions_0/Conv2d_0_1x1_32/bias',
'BoxPredictor/SharedConvolutions_0/Conv2d_0_1x1_32/kernel',
'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/bias',
'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/kernel',
'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/bias',
......
......@@ -34,16 +34,18 @@ class MaskRCNNClassHead(head.Head):
https://arxiv.org/abs/1703.06870
"""
def __init__(self, is_training, num_classes, fc_hyperparams_fn,
use_dropout, dropout_keep_prob):
def __init__(self,
is_training,
num_class_slots,
fc_hyperparams_fn,
use_dropout,
dropout_keep_prob):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
num_class_slots: number of class slots. Note that num_class_slots may or
may not include an implicit background category.
fc_hyperparams_fn: A function to generate tf-slim arg_scope with
hyperparameters for fully connected ops.
use_dropout: Option to use dropout or not. Note that a single dropout
......@@ -54,7 +56,7 @@ class MaskRCNNClassHead(head.Head):
"""
super(MaskRCNNClassHead, self).__init__()
self._is_training = is_training
self._num_classes = num_classes
self._num_class_slots = num_class_slots
self._fc_hyperparams_fn = fc_hyperparams_fn
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
......@@ -70,7 +72,7 @@ class MaskRCNNClassHead(head.Head):
Returns:
class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class predictions for
[batch_size, 1, num_class_slots] representing the class predictions for
the proposals.
Raises:
......@@ -91,11 +93,12 @@ class MaskRCNNClassHead(head.Head):
with slim.arg_scope(self._fc_hyperparams_fn()):
class_predictions_with_background = slim.fully_connected(
flattened_roi_pooled_features,
self._num_classes + 1,
self._num_class_slots,
activation_fn=None,
scope='ClassPredictor')
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [-1, 1, self._num_classes + 1])
class_predictions_with_background,
[-1, 1, self._num_class_slots])
return class_predictions_with_background
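Equivalent construction under the new signature; fc_hyperparams_fn below is a hypothetical stand-in for a hyperparams builder, sketched with plain slim:

import tensorflow as tf
from object_detection.predictors.heads import class_head

slim = tf.contrib.slim

def fc_hyperparams_fn():
  """Hypothetical builder returning a slim arg_scope for fully connected ops."""
  with slim.arg_scope([slim.fully_connected],
                      weights_regularizer=slim.l2_regularizer(4e-5)) as sc:
    return sc

head = class_head.MaskRCNNClassHead(
    is_training=False,
    num_class_slots=21,  # was num_classes=20; the +1 background is now explicit
    fc_hyperparams_fn=fc_hyperparams_fn,
    use_dropout=False,
    dropout_keep_prob=1.0)
# head.predict(...) now returns a tensor of shape [batch_size, 1, 21].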
......@@ -104,7 +107,7 @@ class ConvolutionalClassHead(head.Head):
def __init__(self,
is_training,
num_classes,
num_class_slots,
use_dropout,
dropout_keep_prob,
kernel_size,
......@@ -115,7 +118,8 @@ class ConvolutionalClassHead(head.Head):
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: Number of classes.
num_class_slots: number of class slots. Note that num_class_slots may or
may not include an implicit background category.
use_dropout: Option to use dropout or not. Note that a single dropout
op is applied here prior to both box and class predictions, which stands
in contrast to the ConvolutionalBoxPredictor below.
......@@ -137,7 +141,7 @@ class ConvolutionalClassHead(head.Head):
"""
super(ConvolutionalClassHead, self).__init__()
self._is_training = is_training
self._num_classes = num_classes
self._num_class_slots = num_class_slots
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
self._kernel_size = kernel_size
......@@ -156,12 +160,10 @@ class ConvolutionalClassHead(head.Head):
Returns:
class_predictions_with_background: A float tensors of shape
[batch_size, num_anchors, num_classes + 1] representing the class
[batch_size, num_anchors, num_class_slots] representing the class
predictions for the proposals.
"""
net = features
# Add a slot for the background class.
num_class_slots = self._num_classes + 1
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
if self._use_depthwise:
......@@ -171,7 +173,7 @@ class ConvolutionalClassHead(head.Head):
rate=1, scope='ClassPredictor_depthwise')
class_predictions_with_background = slim.conv2d(
class_predictions_with_background,
num_predictions_per_location * num_class_slots, [1, 1],
num_predictions_per_location * self._num_class_slots, [1, 1],
activation_fn=None,
normalizer_fn=None,
normalizer_params=None,
......@@ -179,7 +181,7 @@ class ConvolutionalClassHead(head.Head):
else:
class_predictions_with_background = slim.conv2d(
net,
num_predictions_per_location * num_class_slots,
num_predictions_per_location * self._num_class_slots,
[self._kernel_size, self._kernel_size],
activation_fn=None,
normalizer_fn=None,
......@@ -194,7 +196,8 @@ class ConvolutionalClassHead(head.Head):
if batch_size is None:
batch_size = tf.shape(features)[0]
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
class_predictions_with_background,
[batch_size, -1, self._num_class_slots])
return class_predictions_with_background
......@@ -208,7 +211,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
"""
def __init__(self,
num_classes,
num_class_slots,
kernel_size=3,
class_prediction_bias_init=0.0,
use_dropout=False,
......@@ -218,10 +221,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
"""Constructor.
Args:
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
num_class_slots: number of class slots. Note that num_class_slots may or
may not include an implicit background category.
kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
......@@ -233,7 +234,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
as inputs and returns tensors).
"""
super(WeightSharedConvolutionalClassHead, self).__init__()
self._num_classes = num_classes
self._num_class_slots = num_class_slots
self._kernel_size = kernel_size
self._class_prediction_bias_init = class_prediction_bias_init
self._use_dropout = use_dropout
......@@ -252,12 +253,10 @@ class WeightSharedConvolutionalClassHead(head.Head):
Returns:
class_predictions_with_background: A tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
[batch_size, num_anchors, num_class_slots] representing the class
predictions for the proposals.
"""
class_predictions_net = features
num_class_slots = self._num_classes + 1
# Add a slot for the background class.
if self._use_dropout:
class_predictions_net = slim.dropout(
class_predictions_net, keep_prob=self._dropout_keep_prob)
......@@ -267,7 +266,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
conv_op = slim.conv2d
class_predictions_with_background = conv_op(
class_predictions_net,
num_predictions_per_location * num_class_slots,
num_predictions_per_location * self._num_class_slots,
[self._kernel_size, self._kernel_size],
activation_fn=None, stride=1, padding='SAME',
normalizer_fn=None,
......@@ -280,5 +279,6 @@ class WeightSharedConvolutionalClassHead(head.Head):
class_predictions_with_background = self._score_converter_fn(
class_predictions_with_background)
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
class_predictions_with_background,
[batch_size, -1, self._num_class_slots])
return class_predictions_with_background
......@@ -46,7 +46,7 @@ class MaskRCNNClassHeadTest(test_case.TestCase):
def test_prediction_size(self):
class_prediction_head = class_head.MaskRCNNClassHead(
is_training=False,
num_classes=20,
num_class_slots=20,
fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
use_dropout=True,
dropout_keep_prob=0.5)
......@@ -54,7 +54,7 @@ class MaskRCNNClassHeadTest(test_case.TestCase):
[64, 7, 7, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
prediction = class_prediction_head.predict(
features=roi_pooled_features, num_predictions_per_location=1)
self.assertAllEqual([64, 1, 21], prediction.get_shape().as_list())
self.assertAllEqual([64, 1, 20], prediction.get_shape().as_list())
class ConvolutionalClassPredictorTest(test_case.TestCase):
......@@ -80,7 +80,7 @@ class ConvolutionalClassPredictorTest(test_case.TestCase):
def test_prediction_size(self):
class_prediction_head = class_head.ConvolutionalClassHead(
is_training=True,
num_classes=20,
num_class_slots=20,
use_dropout=True,
dropout_keep_prob=0.5,
kernel_size=3)
......@@ -89,7 +89,7 @@ class ConvolutionalClassPredictorTest(test_case.TestCase):
class_predictions = class_prediction_head.predict(
features=image_feature,
num_predictions_per_location=1)
self.assertAllEqual([64, 323, 21],
self.assertAllEqual([64, 323, 20],
class_predictions.get_shape().as_list())
......@@ -115,13 +115,13 @@ class WeightSharedConvolutionalClassPredictorTest(test_case.TestCase):
def test_prediction_size(self):
class_prediction_head = (
class_head.WeightSharedConvolutionalClassHead(num_classes=20))
class_head.WeightSharedConvolutionalClassHead(num_class_slots=20))
image_feature = tf.random_uniform(
[64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
class_predictions = class_prediction_head.predict(
features=image_feature,
num_predictions_per_location=1)
self.assertAllEqual([64, 323, 21], class_predictions.get_shape().as_list())
self.assertAllEqual([64, 323, 20], class_predictions.get_shape().as_list())
if __name__ == '__main__':
......
......@@ -91,7 +91,7 @@ class ConvolutionalBoxHead(head.KerasHead):
tf.keras.layers.Conv2D(
num_predictions_per_location * self._box_code_size, [1, 1],
name='BoxEncodingPredictor',
**conv_hyperparams.params(activation=None)))
**conv_hyperparams.params(use_bias=True)))
else:
self._box_encoder_layers.append(
tf.keras.layers.Conv2D(
......@@ -99,7 +99,7 @@ class ConvolutionalBoxHead(head.KerasHead):
[self._kernel_size, self._kernel_size],
padding='SAME',
name='BoxEncodingPredictor',
**conv_hyperparams.params(activation=None)))
**conv_hyperparams.params(use_bias=True)))
def _predict(self, features):
"""Predicts boxes.
......
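The hunks above swap params(activation=None) for params(use_bias=True), which suggests the hyperparams object applies per-call keyword overrides on top of the configured Conv2D kwargs. A hedged sketch of that override pattern (the params helper below is a stand-in, not the library implementation):

import tensorflow as tf

def params(use_bias=False):
  """Stand-in for conv_hyperparams.params(): shared Conv2D kwargs plus
  per-call overrides, as assumed from the calls above."""
  return dict(
      kernel_regularizer=tf.keras.regularizers.l2(4e-5),
      use_bias=use_bias)

# Prediction layers keep a bias: no batch norm follows them, so the bias
# is not redundant here.
box_encoder = tf.keras.layers.Conv2D(
    4, [1, 1], padding='SAME', name='BoxEncodingPredictor',
    **params(use_bias=True))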
......@@ -29,7 +29,7 @@ class ConvolutionalClassHead(head.KerasHead):
def __init__(self,
is_training,
num_classes,
num_class_slots,
use_dropout,
dropout_keep_prob,
kernel_size,
......@@ -43,7 +43,8 @@ class ConvolutionalClassHead(head.KerasHead):
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: Number of classes.
num_class_slots: number of class slots. Note that num_class_slots may or
may not include an implicit background category.
use_dropout: Option to use dropout or not. Note that a single dropout
op is applied here prior to both box and class predictions, which stands
in contrast to the ConvolutionalBoxPredictor below.
......@@ -73,13 +74,12 @@ class ConvolutionalClassHead(head.KerasHead):
"""
super(ConvolutionalClassHead, self).__init__(name=name)
self._is_training = is_training
self._num_classes = num_classes
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
self._kernel_size = kernel_size
self._class_prediction_bias_init = class_prediction_bias_init
self._use_depthwise = use_depthwise
self._num_class_slots = self._num_classes + 1
self._num_class_slots = num_class_slots
self._class_predictor_layers = []
......@@ -110,7 +110,7 @@ class ConvolutionalClassHead(head.KerasHead):
tf.keras.layers.Conv2D(
num_predictions_per_location * self._num_class_slots, [1, 1],
name='ClassPredictor',
**conv_hyperparams.params(activation=None)))
**conv_hyperparams.params(use_bias=True)))
else:
self._class_predictor_layers.append(
tf.keras.layers.Conv2D(
......@@ -120,7 +120,7 @@ class ConvolutionalClassHead(head.KerasHead):
name='ClassPredictor',
bias_initializer=tf.constant_initializer(
self._class_prediction_bias_init),
**conv_hyperparams.params(activation=None)))
**conv_hyperparams.params(use_bias=True)))
def _predict(self, features):
"""Predicts boxes.
......@@ -131,7 +131,7 @@ class ConvolutionalClassHead(head.KerasHead):
Returns:
class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
[batch_size, num_anchors, num_class_slots] representing the class
predictions for the proposals.
"""
# Add a slot for the background class.
......
......@@ -45,7 +45,7 @@ class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
conv_hyperparams = self._build_conv_hyperparams()
class_prediction_head = keras_class_head.ConvolutionalClassHead(
is_training=True,
num_classes=20,
num_class_slots=20,
use_dropout=True,
dropout_keep_prob=0.5,
kernel_size=3,
......@@ -56,7 +56,7 @@ class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
image_feature = tf.random_uniform(
[64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
class_predictions = class_prediction_head(image_feature,)
self.assertAllEqual([64, 323, 21],
self.assertAllEqual([64, 323, 20],
class_predictions.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
......
......@@ -124,7 +124,7 @@ class ConvolutionalMaskHead(head.KerasHead):
tf.keras.layers.Conv2D(
num_predictions_per_location * num_mask_channels, [1, 1],
name='MaskPredictor',
**conv_hyperparams.params(activation=None)))
**conv_hyperparams.params(use_bias=True)))
else:
self._mask_predictor_layers.append(
tf.keras.layers.Conv2D(
......@@ -132,7 +132,7 @@ class ConvolutionalMaskHead(head.KerasHead):
[self._kernel_size, self._kernel_size],
padding='SAME',
name='MaskPredictor',
**conv_hyperparams.params(activation=None)))
**conv_hyperparams.params(use_bias=True)))
def _predict(self, features):
"""Predicts boxes.
......
......@@ -23,6 +23,7 @@ import math
import tensorflow as tf
from object_detection.predictors.heads import head
from object_detection.utils import ops
slim = tf.contrib.slim
......@@ -41,7 +42,8 @@ class MaskRCNNMaskHead(head.Head):
mask_width=14,
mask_prediction_num_conv_layers=2,
mask_prediction_conv_depth=256,
masks_are_class_agnostic=False):
masks_are_class_agnostic=False,
convolve_then_upsample=False):
"""Constructor.
Args:
......@@ -62,6 +64,10 @@ class MaskRCNNMaskHead(head.Head):
image features.
masks_are_class_agnostic: Boolean determining if the mask-head is
class-agnostic or not.
convolve_then_upsample: Whether to apply convolutions on mask features
before upsampling using nearest neighbor resizing. Otherwise, mask
features are resized to [`mask_height`, `mask_width`] using bilinear
resizing before applying convolutions.
Raises:
ValueError: conv_hyperparams_fn is None.
......@@ -74,6 +80,7 @@ class MaskRCNNMaskHead(head.Head):
self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
self._mask_prediction_conv_depth = mask_prediction_conv_depth
self._masks_are_class_agnostic = masks_are_class_agnostic
self._convolve_then_upsample = convolve_then_upsample
if conv_hyperparams_fn is None:
raise ValueError('conv_hyperparams_fn is None.')
......@@ -135,17 +142,30 @@ class MaskRCNNMaskHead(head.Head):
num_conv_channels = self._get_mask_predictor_conv_depth(
num_feature_channels, self._num_classes)
with slim.arg_scope(self._conv_hyperparams_fn()):
upsampled_features = tf.image.resize_bilinear(
features, [self._mask_height, self._mask_width],
align_corners=True)
if not self._convolve_then_upsample:
features = tf.image.resize_bilinear(
features, [self._mask_height, self._mask_width],
align_corners=True)
for _ in range(self._mask_prediction_num_conv_layers - 1):
upsampled_features = slim.conv2d(
upsampled_features,
features = slim.conv2d(
features,
num_outputs=num_conv_channels,
kernel_size=[3, 3])
if self._convolve_then_upsample:
# Replace Transposed Convolution with a Nearest Neighbor upsampling step
# followed by 3x3 convolution.
height_scale = self._mask_height / features.shape[1].value
width_scale = self._mask_width / features.shape[2].value
features = ops.nearest_neighbor_upsampling(
features, height_scale=height_scale, width_scale=width_scale)
features = slim.conv2d(
features,
num_outputs=num_conv_channels,
kernel_size=[3, 3])
num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
mask_predictions = slim.conv2d(
upsampled_features,
features,
num_outputs=num_masks,
activation_fn=None,
normalizer_fn=None,
......
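When convolve_then_upsample is set, the bilinear resize is replaced by ops.nearest_neighbor_upsampling followed by 3x3 convolutions. A simplified stand-in for that op, assuming an integer scale factor (the library version may differ in details):

import tensorflow as tf

def nearest_neighbor_upsample(features, scale):
  """Repeats each spatial cell `scale` times along height and width."""
  shape = tf.shape(features)
  batch, height, width = shape[0], shape[1], shape[2]
  channels = features.shape[3].value
  out = tf.reshape(features, [batch, height, 1, width, 1, channels])
  out = tf.tile(out, [1, 1, scale, 1, scale, 1])
  return tf.reshape(out, [batch, height * scale, width * scale, channels])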
......@@ -58,6 +58,22 @@ class MaskRCNNMaskHeadTest(test_case.TestCase):
features=roi_pooled_features, num_predictions_per_location=1)
self.assertAllEqual([64, 1, 20, 14, 14], prediction.get_shape().as_list())
def test_prediction_size_with_convolve_then_upsample(self):
mask_prediction_head = mask_head.MaskRCNNMaskHead(
num_classes=20,
conv_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
mask_height=28,
mask_width=28,
mask_prediction_num_conv_layers=2,
mask_prediction_conv_depth=256,
masks_are_class_agnostic=True,
convolve_then_upsample=True)
roi_pooled_features = tf.random_uniform(
[64, 14, 14, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
prediction = mask_prediction_head.predict(
features=roi_pooled_features, num_predictions_per_location=1)
self.assertAllEqual([64, 1, 1, 28, 28], prediction.get_shape().as_list())
class ConvolutionalMaskPredictorTest(test_case.TestCase):
......
......@@ -138,6 +138,7 @@ message WeightSharedConvolutionalBoxPredictor {
// TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
// head easily.
// Next id: 15
message MaskRCNNBoxPredictor {
// Hyperparameters for fully connected ops used in the box predictor.
optional Hyperparams fc_hyperparams = 1;
......@@ -178,6 +179,12 @@ message MaskRCNNBoxPredictor {
// Whether to use one box for all classes rather than a different box for each
// class.
optional bool share_box_across_classes = 13 [default = false];
// Whether to apply convolutions on mask features before upsampling using
// nearest neighbor resizing.
// By default, mask features are resized to [`mask_height`, `mask_width`]
// before applying convolutions and predicting masks.
optional bool convolve_then_upsample_masks = 14 [default = false];
}
message RfcnBoxPredictor {
......
......@@ -164,6 +164,10 @@ message FasterRcnn {
// Whether the masks present in groundtruth should be resized in the model to
// match the image size.
optional bool resize_masks = 36 [default = true];
// If true, uses implementations of ops with static shape guarantees when
// running evaluation (specifically, when is_training is false).
optional bool use_static_shapes_for_eval = 37 [default = false];
}
......
......@@ -155,6 +155,9 @@ message RandomCropImage {
// value, it is removed from the new image.
optional float overlap_thresh = 6 [default=0.3];
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 8 [default=true];
// Probability of keeping the original image.
optional float random_coef = 7 [default=0.0];
}
......@@ -194,6 +197,9 @@ message RandomCropPadImage {
// value, it is removed from the new image.
optional float overlap_thresh = 6 [default=0.3];
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 11 [default=true];
// Probability of keeping the original image during the crop operation.
optional float random_coef = 7 [default=0.0];
......@@ -217,6 +223,9 @@ message RandomCropToAspectRatio {
// ratio between a cropped bounding box and the original is less than this
// value, it is removed from the new image.
optional float overlap_thresh = 2 [default=0.3];
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 3 [default=true];
}
// Randomly adds black square patches to an image.
......@@ -285,6 +294,9 @@ message SSDRandomCropOperation {
// Cropped box area ratio must be above this threshold to be kept.
optional float overlap_thresh = 6;
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 8 [default=true];
// Probability a crop operation is skipped.
optional float random_coef = 7;
}
......@@ -315,6 +327,9 @@ message SSDRandomCropPadOperation {
// Cropped box area ratio must be above this threshold to be kept.
optional float overlap_thresh = 6;
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 13 [default=true];
// Probability a crop operation is skipped.
optional float random_coef = 7;
......@@ -353,6 +368,9 @@ message SSDRandomCropFixedAspectRatioOperation {
// Cropped box area ratio must be above this threshold to be kept.
optional float overlap_thresh = 6;
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 8 [default=true];
// Probability a crop operation is skipped.
optional float random_coef = 7;
}
......@@ -387,6 +405,9 @@ message SSDRandomCropPadFixedAspectRatioOperation {
// Cropped box area ratio must be above this threshold to be kept.
optional float overlap_thresh = 6;
// Whether to clip the boxes to the cropped image.
optional bool clip_boxes = 8 [default=true];
// Probability a crop operation is skipped.
optional float random_coef = 7;
}
......
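Every crop-style preprocessor message above gains the same clip_boxes knob. A hedged sketch of setting it from Python through the generated protos (message and field names as in the hunks above; the values are illustrative):

from google.protobuf import text_format
from object_detection.protos import preprocessor_pb2

step = preprocessor_pb2.PreprocessingStep()
text_format.Merge("""
  random_crop_image {
    overlap_thresh: 0.3
    clip_boxes: false  # keep boxes that extend past the crop un-clipped
    random_coef: 0.0
  }
""", step)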
......@@ -12,7 +12,7 @@ import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";
// Configuration for Single Shot Detection (SSD) models.
// Next id: 21
// Next id: 22
message Ssd {
// Number of classes to predict.
......@@ -92,11 +92,17 @@ message Ssd {
// Minimum number of effective negative samples.
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float minimum_negative_sampling = 19 [default=0];
optional float min_num_negative_samples = 19 [default=0];
// Desired number of effective negative samples per positive sample.
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float desired_negative_sampling_ratio = 20 [default=3];
// Whether to add an implicit background class to one-hot encodings of
// groundtruth labels. Set to false if using groundtruth labels with an
// explicit background class, using multiclass scores, or if training a single
// class model.
optional bool add_background_class = 21 [default = true];
}
......
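What add_background_class toggles, sketched in plain TensorFlow (the padding below illustrates the documented behavior; it is not code from this diff):

import tensorflow as tf

classes = tf.constant([2, 0, 1])           # labels in {0, .., K-1}, K = 3
one_hot = tf.one_hot(classes, depth=3)     # shape [3, 3]

# add_background_class: true  -> prepend an implicit background slot.
with_background = tf.pad(one_hot, [[0, 0], [1, 0]])   # shape [3, 4]
# add_background_class: false -> use labels as-is (explicit background
# class, multiclass scores, or a single-class model).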
......@@ -6,7 +6,7 @@ import "object_detection/protos/optimizer.proto";
import "object_detection/protos/preprocessor.proto";
// Message for configuring DetectionModel training jobs (train.py).
// Next id: 27
// Next id: 28
message TrainConfig {
// Effective batch size to use for training.
// For TPU (or sync SGD jobs), the batch size per core (or GPU) is going to be
......@@ -115,4 +115,7 @@ message TrainConfig {
// Whether to use bfloat16 for training.
optional bool use_bfloat16 = 26 [default=false];
// Whether to summarize gradients.
optional bool summarize_gradients = 27 [default=false];
}
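A hedged sketch of what the new flag plausibly wires through: slim's create_train_op accepts a summarize_gradients argument that emits one summary per gradient. The loss and optimizer below are assumed to be built elsewhere:

import tensorflow as tf

slim = tf.contrib.slim

train_op = slim.learning.create_train_op(
    total_loss,                # assumed: scalar loss tensor
    optimizer,                 # assumed: a configured tf.train optimizer
    summarize_gradients=True)  # adds per-gradient summaries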
# Quantized trained SSD with Mobilenet v2 on Open Images v4.
# Non-face boxes are dropped during training and non-face groundtruth boxes are
# ignored when evaluating.
#
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 1
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
feature_extractor {
type: "ssd_mobilenet_v2"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 4.0e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.03
}
}
activation: RELU_6
batch_norm {
decay: 0.9997
center: true
scale: true
epsilon: 0.001
train: true
}
}
pad_to_multiple: 32
use_explicit_padding: true
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 4.0e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.03
}
}
activation: RELU_6
batch_norm {
decay: 0.9997
center: true
scale: true
epsilon: 0.001
train: true
}
}
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
height_stride: 16
height_stride: 32
height_stride: 64
height_stride: 128
height_stride: 256
height_stride: 512
width_stride: 16
width_stride: 32
width_stride: 64
width_stride: 128
width_stride: 256
width_stride: 512
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 1.0e-08
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 10
}
classification_weight: 1.0
localization_weight: 1.0
}
}
}
train_config {
batch_size: 32
data_augmentation_options {
random_horizontal_flip {
keypoint_flip_permutation: 1
keypoint_flip_permutation: 0
keypoint_flip_permutation: 2
keypoint_flip_permutation: 3
keypoint_flip_permutation: 5
keypoint_flip_permutation: 4
}
}
data_augmentation_options {
ssd_random_crop_fixed_aspect_ratio {
}
}
optimizer {
rms_prop_optimizer {
learning_rate {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: ""
}
train_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/face_label_map.pbtxt"
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/face_train.record-?????-of-00100"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: true
}
eval_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/face_label_map.pbtxt"
shuffle: false
num_readers: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/face_val.record-?????-of-00010"
}
}
graph_rewriter {
quantization {
delay: 500000
weight_bits: 8
activation_bits: 8
}
}
# Quantized trained SSD with Mobilenet v2 on MSCOCO Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 90
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 1
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'ssd_mobilenet_v2'
min_depth: 16
depth_multiplier: 1.0
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 3
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 24
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
fine_tune_checkpoint_type: "detection"
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient to train the pets dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_train.record-?????-of-00100"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
}
eval_config: {
num_examples: 8000
# Note: The below line limits the evaluation process to 10 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 10
}
eval_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_val.record-?????-of-00010"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
shuffle: false
num_readers: 1
}
graph_rewriter {
quantization {
delay: 48000
weight_bits: 8
activation_bits: 8
}
}
\ No newline at end of file
......@@ -76,12 +76,14 @@ def get_spatial_image_size(image_resizer_config):
raise ValueError("Unknown image resizer type.")
def get_configs_from_pipeline_file(pipeline_config_path):
def get_configs_from_pipeline_file(pipeline_config_path, config_override=None):
"""Reads config from a file containing pipeline_pb2.TrainEvalPipelineConfig.
Args:
pipeline_config_path: Path to pipeline_pb2.TrainEvalPipelineConfig text
proto.
config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
override pipeline_config_path.
Returns:
Dictionary of configuration objects. Keys are `model`, `train_config`,
......@@ -92,6 +94,8 @@ def get_configs_from_pipeline_file(pipeline_config_path):
with tf.gfile.GFile(pipeline_config_path, "r") as f:
proto_str = f.read()
text_format.Merge(proto_str, pipeline_config)
if config_override:
text_format.Merge(config_override, pipeline_config)
return create_configs_from_pipeline_proto(pipeline_config)
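Usage sketch for the new override hook (the file path and override string are illustrative):

from object_detection.utils import config_util

# Override fields on top of the file-based config without editing the file.
configs = config_util.get_configs_from_pipeline_file(
    'pipeline.config',
    config_override='train_config { batch_size: 8 }')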
......@@ -430,7 +434,7 @@ def merge_external_params_with_configs(configs, hparams=None, kwargs_dict=None):
final learning rates.
In this case key can be one of the following formats:
1. legacy update: single string that indicates the attribute to be
updated. E.g. 'lable_map_path', 'eval_input_path', 'shuffle'.
updated. E.g. 'label_map_path', 'eval_input_path', 'shuffle'.
Note that when updating fields (e.g. eval_input_path, eval_shuffle) in
eval_input_configs, the override will only be applied when
eval_input_configs has exactly 1 element.
......
......@@ -633,11 +633,37 @@ class ObjectDetectionEvaluation(object):
nms_max_output_boxes=10000,
use_weighted_mean_ap=False,
label_id_offset=0,
group_of_weight=0.0):
group_of_weight=0.0,
per_image_eval_class=per_image_evaluation.PerImageEvaluation):
"""Constructor.
Args:
num_groundtruth_classes: Number of ground-truth classes.
matching_iou_threshold: IOU threshold used for matching detected boxes
to ground-truth boxes.
nms_iou_threshold: IOU threshold used for non-maximum suppression.
nms_max_output_boxes: Maximum number of boxes returned by non-maximum
suppression.
use_weighted_mean_ap: (optional) boolean which determines if the mean
average precision is computed directly from the scores and tp_fp_labels
of all classes.
label_id_offset: The label id offset.
group_of_weight: Weight of group-of boxes. If set to 0, detections of the
correct class within a group-of box are ignored. If the weight is > 0 and
at least one detection falls within a group-of box at
matching_iou_threshold, a weight of group_of_weight is added to the true
positives; consequently, if no detection falls within a group-of box,
the same weight is added to the false negatives.
per_image_eval_class: The class that contains functions for computing
per image metrics.
Raises:
ValueError: if num_groundtruth_classes is smaller than 1.
"""
if num_groundtruth_classes < 1:
raise ValueError('Need at least 1 groundtruth class for evaluation.')
self.per_image_eval = per_image_evaluation.PerImageEvaluation(
self.per_image_eval = per_image_eval_class(
num_groundtruth_classes=num_groundtruth_classes,
matching_iou_threshold=matching_iou_threshold,
nms_iou_threshold=nms_iou_threshold,
......@@ -659,14 +685,16 @@ class ObjectDetectionEvaluation(object):
self._initialize_detections()
def _initialize_detections(self):
"""Initializes internal data structures."""
self.detection_keys = set()
self.scores_per_class = [[] for _ in range(self.num_class)]
self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]
self.num_images_correctly_detected_per_class = np.zeros(self.num_class)
self.average_precision_per_class = np.empty(self.num_class, dtype=float)
self.average_precision_per_class.fill(np.nan)
self.precisions_per_class = []
self.recalls_per_class = []
self.precisions_per_class = [np.nan] * self.num_class
self.recalls_per_class = [np.nan] * self.num_class
self.corloc_per_class = np.ones(self.num_class, dtype=float)
def clear_detections(self):
......@@ -867,8 +895,8 @@ class ObjectDetectionEvaluation(object):
logging.info(scores)
precision, recall = metrics.compute_precision_recall(
scores, tp_fp_labels, self.num_gt_instances_per_class[class_index])
self.precisions_per_class.append(precision)
self.recalls_per_class.append(recall)
self.precisions_per_class[class_index] = precision
self.recalls_per_class[class_index] = recall
average_precision = metrics.compute_average_precision(precision, recall)
self.average_precision_per_class[class_index] = average_precision
......
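The new per_image_eval_class hook makes the per-image evaluator injectable; a hedged usage sketch (the subclass is a hypothetical placeholder):

from object_detection.utils import object_detection_evaluation
from object_detection.utils import per_image_evaluation

class MyPerImageEvaluation(per_image_evaluation.PerImageEvaluation):
  """Hypothetical subclass; override per-image metric computation here."""

evaluator = object_detection_evaluation.ObjectDetectionEvaluation(
    num_groundtruth_classes=3,
    per_image_eval_class=MyPerImageEvaluation)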