Commit ed4e22b8 (unverified), authored Apr 16, 2018 by pkulzc; committed by GitHub on Apr 16, 2018

Merge pull request #3973 from pkulzc/master

Object detection internal changes

Parents: cac90a0e, 13b89b93
Changes: 61 in total; showing 20 changed files with 1,017 additions and 168 deletions (+1017, -168).
research/object_detection/models/ssd_resnet_v1_fpn_feature_extractor.py        +67   -80
research/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_test.py   +11   -12
research/object_detection/protos/box_predictor.proto                           +1    -0
research/object_detection/protos/ssd.proto                                     +25   -15
research/object_detection/protos/train.proto                                   +8    -1
research/object_detection/samples/configs/ssd_inception_v2_coco.config         +1    -0
research/object_detection/samples/configs/ssd_inception_v2_pets.config         +1    -0
research/object_detection/samples/configs/ssd_inception_v3_pets.config         +1    -0
research/object_detection/trainer.py                                           +34   -5
research/object_detection/trainer_test.py                                      +48   -1
research/object_detection/utils/config_util.py                                 +21   -8
research/object_detection/utils/config_util_test.py                            +20   -0
research/object_detection/utils/label_map_util.py                              +2    -1
research/object_detection/utils/metrics.py                                     +59   -13
research/object_detection/utils/metrics_test.py                                +71   -7
research/object_detection/utils/object_detection_evaluation.py                 +159  -10
research/object_detection/utils/object_detection_evaluation_test.py            +120  -0
research/object_detection/utils/per_image_evaluation.py                        +31   -14
research/object_detection/utils/per_image_evaluation_test.py                   +119  -1
research/object_detection/utils/per_image_vrd_evaluation.py                    +218  -0
research/object_detection/models/ssd_resnet_v1_fpn_feature_extractor.py

@@ -21,6 +21,7 @@ import tensorflow as tf
 from object_detection.meta_architectures import ssd_meta_arch
 from object_detection.models import feature_map_generators
+from object_detection.utils import context_manager
 from object_detection.utils import ops
 from object_detection.utils import shape_utils
 from nets import resnet_v1

@@ -36,15 +37,14 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
                depth_multiplier,
                min_depth,
                pad_to_multiple,
-               conv_hyperparams,
+               conv_hyperparams_fn,
                resnet_base_fn,
                resnet_scope_name,
                fpn_scope_name,
-               batch_norm_trainable=True,
                reuse_weights=None,
                use_explicit_padding=False,
                use_depthwise=False,
-               inplace_batchnorm_update=False):
+               override_base_feature_extractor_hyperparams=False):
     """SSD FPN feature extractor based on Resnet v1 architecture.

     Args:

@@ -54,32 +54,28 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       min_depth: minimum feature extractor depth. UNUSED currently.
       pad_to_multiple: the nearest multiple to zero pad the input height and
         width dimensions to.
-      conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
+      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
+        and separable_conv2d ops in the layers that are added on top of the
+        base feature extractor.
       resnet_base_fn: base resnet network to use.
       resnet_scope_name: scope name under which to construct resnet.
       fpn_scope_name: scope name under which to construct the feature pyramid
         network.
-      batch_norm_trainable: Whether to update batch norm parameters during
-        training or not. When training with a small batch size (e.g. 1), it is
-        desirable to disable batch norm update and use pretrained batch norm
-        params.
       reuse_weights: Whether to reuse variables. Default is None.
       use_explicit_padding: Whether to use explicit padding when extracting
         features. Default is False. UNUSED currently.
       use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
-      inplace_batchnorm_update: Whether to update batch_norm inplace during
-        training. This is required for batch norm to work correctly on TPUs.
-        When this is false, user must add a control dependency on
-        tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
-        norm moving average parameters.
+      override_base_feature_extractor_hyperparams: Whether to override
+        hyperparameters of the base feature extractor with the one from
+        `conv_hyperparams_fn`.

     Raises:
       ValueError: On supplying invalid arguments for unused arguments.
     """
     super(_SSDResnetV1FpnFeatureExtractor, self).__init__(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable, reuse_weights,
-        use_explicit_padding, inplace_batchnorm_update)
+        conv_hyperparams_fn, reuse_weights, use_explicit_padding,
+        override_base_feature_extractor_hyperparams)
     if self._depth_multiplier != 1.0:
       raise ValueError('Only depth 1.0 is supported, found: {}'.format(
           self._depth_multiplier))

@@ -116,7 +112,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       filtered_image_features[feature_name] = feature
     return filtered_image_features

-  def _extract_features(self, preprocessed_inputs):
+  def extract_features(self, preprocessed_inputs):
     """Extract features from preprocessed inputs.

     Args:

@@ -139,19 +135,22 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
     with tf.variable_scope(
         self._resnet_scope_name, reuse=self._reuse_weights) as scope:
       with slim.arg_scope(resnet_v1.resnet_arg_scope()):
-        _, image_features = self._resnet_base_fn(
-            inputs=ops.pad_to_multiple(preprocessed_inputs,
-                                       self._pad_to_multiple),
-            num_classes=None,
-            is_training=self._is_training and self._batch_norm_trainable,
-            global_pool=False,
-            output_stride=None,
-            store_non_strided_activations=True,
-            scope=scope)
+        with (slim.arg_scope(self._conv_hyperparams_fn())
+              if self._override_base_feature_extractor_hyperparams else
+              context_manager.IdentityContextManager()):
+          _, image_features = self._resnet_base_fn(
+              inputs=ops.pad_to_multiple(preprocessed_inputs,
+                                         self._pad_to_multiple),
+              num_classes=None,
+              is_training=None,
+              global_pool=False,
+              output_stride=None,
+              store_non_strided_activations=True,
+              scope=scope)
       image_features = self._filter_features(image_features)
       last_feature_map = image_features['block4']
     with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
-      with slim.arg_scope(self._conv_hyperparams):
+      with slim.arg_scope(self._conv_hyperparams_fn()):
         for i in range(5, 7):
           last_feature_map = slim.conv2d(
               last_feature_map,
@@ -178,40 +177,36 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
                depth_multiplier,
                min_depth,
                pad_to_multiple,
-               conv_hyperparams,
-               batch_norm_trainable=True,
+               conv_hyperparams_fn,
                reuse_weights=None,
                use_explicit_padding=False,
                use_depthwise=False,
-               inplace_batchnorm_update=False):
-    """Resnet50 v1 FPN Feature Extractor for SSD Models.
+               override_base_feature_extractor_hyperparams=False):
+    """SSD Resnet50 V1 FPN feature extractor based on Resnet v1 architecture.

     Args:
       is_training: whether the network is in training mode.
       depth_multiplier: float depth multiplier for feature extractor.
-      min_depth: minimum feature extractor depth.
-        UNUSED currently.
+      min_depth: minimum feature extractor depth. UNUSED currently.
       pad_to_multiple: the nearest multiple to zero pad the input height and
         width dimensions to.
-      conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
-      batch_norm_trainable: Whether to update batch norm parameters during
-        training or not. When training with a small batch size (e.g. 1), it is
-        desirable to disable batch norm update and use pretrained batch norm
-        params.
+      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
+        and separable_conv2d ops in the layers that are added on top of the
+        base feature extractor.
       reuse_weights: Whether to reuse variables. Default is None.
       use_explicit_padding: Whether to use explicit padding when extracting
         features. Default is False. UNUSED currently.
       use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
-      inplace_batchnorm_update: Whether to update batch_norm inplace during
-        training. This is required for batch norm to work correctly on TPUs.
-        When this is false, user must add a control dependency on
-        tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
-        norm moving average parameters.
+      override_base_feature_extractor_hyperparams: Whether to override
+        hyperparameters of the base feature extractor with the one from
+        `conv_hyperparams_fn`.
     """
     super(SSDResnet50V1FpnFeatureExtractor, self).__init__(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, resnet_v1.resnet_v1_50, 'resnet_v1_50', 'fpn',
-        batch_norm_trainable, reuse_weights, use_explicit_padding,
-        inplace_batchnorm_update)
+        conv_hyperparams_fn, resnet_v1.resnet_v1_50, 'resnet_v1_50', 'fpn',
+        reuse_weights, use_explicit_padding,
+        override_base_feature_extractor_hyperparams)


 class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):

@@ -221,40 +216,36 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
                depth_multiplier,
                min_depth,
                pad_to_multiple,
-               conv_hyperparams,
-               batch_norm_trainable=True,
+               conv_hyperparams_fn,
                reuse_weights=None,
                use_explicit_padding=False,
                use_depthwise=False,
-               inplace_batchnorm_update=False):
-    """Resnet101 v1 FPN Feature Extractor for SSD Models.
+               override_base_feature_extractor_hyperparams=False):
+    """SSD Resnet101 V1 FPN feature extractor based on Resnet v1 architecture.

     Args:
       is_training: whether the network is in training mode.
       depth_multiplier: float depth multiplier for feature extractor.
-      min_depth: minimum feature extractor depth.
-        UNUSED currently.
+      min_depth: minimum feature extractor depth. UNUSED currently.
       pad_to_multiple: the nearest multiple to zero pad the input height and
         width dimensions to.
-      conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
-      batch_norm_trainable: Whether to update batch norm parameters during
-        training or not. When training with a small batch size (e.g. 1), it is
-        desirable to disable batch norm update and use pretrained batch norm
-        params.
+      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
+        and separable_conv2d ops in the layers that are added on top of the
+        base feature extractor.
      reuse_weights: Whether to reuse variables. Default is None.
       use_explicit_padding: Whether to use explicit padding when extracting
         features. Default is False. UNUSED currently.
       use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
-      inplace_batchnorm_update: Whether to update batch_norm inplace during
-        training. This is required for batch norm to work correctly on TPUs.
-        When this is false, user must add a control dependency on
-        tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
-        norm moving average parameters.
+      override_base_feature_extractor_hyperparams: Whether to override
+        hyperparameters of the base feature extractor with the one from
+        `conv_hyperparams_fn`.
     """
     super(SSDResnet101V1FpnFeatureExtractor, self).__init__(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, resnet_v1.resnet_v1_101, 'resnet_v1_101', 'fpn',
-        batch_norm_trainable, reuse_weights, use_explicit_padding,
-        inplace_batchnorm_update)
+        conv_hyperparams_fn, resnet_v1.resnet_v1_101, 'resnet_v1_101', 'fpn',
+        reuse_weights, use_explicit_padding,
+        override_base_feature_extractor_hyperparams)


 class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):

@@ -264,37 +255,33 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
                depth_multiplier,
                min_depth,
                pad_to_multiple,
-               conv_hyperparams,
-               batch_norm_trainable=True,
+               conv_hyperparams_fn,
                reuse_weights=None,
                use_explicit_padding=False,
                use_depthwise=False,
-               inplace_batchnorm_update=False):
-    """Resnet152 v1 FPN Feature Extractor for SSD Models.
+               override_base_feature_extractor_hyperparams=False):
+    """SSD Resnet152 V1 FPN feature extractor based on Resnet v1 architecture.

     Args:
       is_training: whether the network is in training mode.
       depth_multiplier: float depth multiplier for feature extractor.
-      min_depth: minimum feature extractor depth.
-        UNUSED currently.
+      min_depth: minimum feature extractor depth. UNUSED currently.
       pad_to_multiple: the nearest multiple to zero pad the input height and
         width dimensions to.
-      conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
-      batch_norm_trainable: Whether to update batch norm parameters during
-        training or not. When training with a small batch size (e.g. 1), it is
-        desirable to disable batch norm update and use pretrained batch norm
-        params.
+      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
+        and separable_conv2d ops in the layers that are added on top of the
+        base feature extractor.
       reuse_weights: Whether to reuse variables. Default is None.
       use_explicit_padding: Whether to use explicit padding when extracting
         features. Default is False. UNUSED currently.
       use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
-      inplace_batchnorm_update: Whether to update batch_norm inplace during
-        training. This is required for batch norm to work correctly on TPUs.
-        When this is false, user must add a control dependency on
-        tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
-        norm moving average parameters.
+      override_base_feature_extractor_hyperparams: Whether to override
+        hyperparameters of the base feature extractor with the one from
+        `conv_hyperparams_fn`.
     """
     super(SSDResnet152V1FpnFeatureExtractor, self).__init__(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, resnet_v1.resnet_v1_152, 'resnet_v1_152', 'fpn',
-        batch_norm_trainable, reuse_weights, use_explicit_padding,
-        inplace_batchnorm_update)
+        conv_hyperparams_fn, resnet_v1.resnet_v1_152, 'resnet_v1_152', 'fpn',
+        reuse_weights, use_explicit_padding,
+        override_base_feature_extractor_hyperparams)
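
The conditional `with` in extract_features selects between a real arg_scope and a no-op context manager. As a hedged illustration (not the actual object_detection.utils.context_manager source), an identity context manager can be as small as:

# Minimal sketch of an identity (no-op) context manager, as assumed from the
# `context_manager.IdentityContextManager()` call in the diff above.
class IdentityContextManager(object):
  """Context manager that does nothing; useful as an `else` branch."""

  def __enter__(self):
    return None

  def __exit__(self, exec_type, exec_value, traceback):
    # Returning False re-raises any exception raised inside the block.
    return False


def maybe_scoped(override, make_scope):
  # Pick the real scope or the no-op one with a single expression, mirroring
  # the pattern used in extract_features above.
  return make_scope() if override else IdentityContextManager()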
research/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_test.py

@@ -27,13 +27,10 @@ class SSDResnet50V1FeatureExtractorTest(
   def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                 use_explicit_padding=False):
     min_depth = 32
-    conv_hyperparams = {}
-    batch_norm_trainable = True
     is_training = True
     return ssd_resnet_v1_fpn_feature_extractor.SSDResnet50V1FpnFeatureExtractor(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable,
+        self.conv_hyperparams_fn,
         use_explicit_padding=use_explicit_padding)

   def _resnet_scope_name(self):
     return 'resnet_v1_50'

@@ -47,13 +44,14 @@ class SSDResnet101V1FeatureExtractorTest(
   def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                 use_explicit_padding=False):
     min_depth = 32
-    conv_hyperparams = {}
-    batch_norm_trainable = True
     is_training = True
     return (ssd_resnet_v1_fpn_feature_extractor.SSDResnet101V1FpnFeatureExtractor(
-        is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable,
+        is_training,
+        depth_multiplier,
+        min_depth,
+        pad_to_multiple,
+        self.conv_hyperparams_fn,
         use_explicit_padding=use_explicit_padding))

   def _resnet_scope_name(self):

@@ -68,13 +66,14 @@ class SSDResnet152V1FeatureExtractorTest(
   def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                 use_explicit_padding=False):
     min_depth = 32
-    conv_hyperparams = {}
-    batch_norm_trainable = True
     is_training = True
     return (ssd_resnet_v1_fpn_feature_extractor.SSDResnet152V1FpnFeatureExtractor(
-        is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, batch_norm_trainable,
+        is_training,
+        depth_multiplier,
+        min_depth,
+        pad_to_multiple,
+        self.conv_hyperparams_fn,
         use_explicit_padding=use_explicit_padding))

   def _resnet_scope_name(self):
research/object_detection/protos/box_predictor.proto

@@ -118,6 +118,7 @@ message MaskRCNNBoxPredictor {
   // The number of convolutions applied to image_features in the mask prediction
   // branch.
   optional int32 mask_prediction_num_conv_layers = 11 [default = 2];
+  optional bool masks_are_class_agnostic = 12 [default = false];
 }

 message RfcnBoxPredictor {
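
For illustration only, a sketch of how the new field could be exercised from Python; the predictor fragment below is a minimal made-up example, not a recommended configuration:

# Hypothetical sketch: parsing a box predictor config that sets the field
# added in the diff above.
from google.protobuf import text_format
from object_detection.protos import box_predictor_pb2

config = text_format.Parse(
    """
    mask_rcnn_box_predictor {
      mask_prediction_num_conv_layers: 2
      masks_are_class_agnostic: true
    }
    """, box_predictor_pb2.BoxPredictor())
assert config.mask_rcnn_box_predictor.masks_are_class_agnostic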
research/object_detection/protos/ssd.proto

@@ -60,6 +60,21 @@ message Ssd {
   // Loss configuration for training.
   optional Loss loss = 11;

+  // Whether to update batch norm parameters during training or not.
+  // When training with a relatively small batch size (e.g. 1), it is
+  // desirable to disable batch norm update and use pretrained batch norm
+  // params.
+  //
+  // Note: Some feature extractors are used with canned arg_scopes
+  // (e.g. resnet arg scopes). In these cases training behavior of batch norm
+  // variables may depend on both values of `batch_norm_trainable` and
+  // `is_training`.
+  //
+  // When canned arg_scopes are used with feature extractors `conv_hyperparams`
+  // will apply only to the additional layers that are added and are outside the
+  // canned arg_scope.
+  optional bool freeze_batchnorm = 16 [default = false];
+
   // Whether to update batch_norm inplace during training. This is required
   // for batch norm to work correctly on TPUs. When this is false, user must add
   // a control dependency on tf.GraphKeys.UPDATE_OPS for train/loss op in order

@@ -69,6 +84,8 @@ message Ssd {
 message SsdFeatureExtractor {
+  reserved 6;
+
   // Type of ssd feature extractor.
   optional string type = 1;

@@ -82,26 +99,19 @@ message SsdFeatureExtractor {
   // of the base feature extractor.
   optional Hyperparams conv_hyperparams = 4;

+  // Normally, SSD feature extractors are constructed by reusing an existing
+  // base feature extractor (that has its own hyperparams) and adding new layers
+  // on top of it. `conv_hyperparams` above normally applies only to the new
+  // layers while the base feature extractor uses its own default hyperparams. If
+  // this value is set to true, the base feature extractor's hyperparams will be
+  // overridden with the `conv_hyperparams`.
+  optional bool override_base_feature_extractor_hyperparams = 9 [default = false];
+
   // The nearest multiple to zero-pad the input height and width dimensions to.
   // For example, if pad_to_multiple = 2, input dimensions are zero-padded
   // until the resulting dimensions are even.
   optional int32 pad_to_multiple = 5 [default = 1];

-  // Whether to update batch norm parameters during training or not.
-  // When training with a relatively small batch size (e.g. 1), it is
-  // desirable to disable batch norm update and use pretrained batch norm
-  // params.
-  //
-  // Note: Some feature extractors are used with canned arg_scopes
-  // (e.g. resnet arg scopes). In these cases training behavior of batch norm
-  // variables may depend on both values of `batch_norm_trainable` and
-  // `is_training`.
-  //
-  // When canned arg_scopes are used with feature extractors `conv_hyperparams`
-  // will apply only to the additional layers that are added and are outside the
-  // canned arg_scope.
-  optional bool batch_norm_trainable = 6 [default = true];
-
   // Whether to use explicit padding when extracting SSD multiresolution
   // features. Note that this does not apply to the base feature extractor.
   optional bool use_explicit_padding = 7 [default = false];
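
A hedged sketch of how the relocated knobs look from Python once this proto change is in; the feature extractor type is an arbitrary example value and the surrounding model config is elided:

# Sketch under the assumption that ssd_pb2 is the generated module for the
# ssd.proto shown above; only fields from the diff are used.
from google.protobuf import text_format
from object_detection.protos import ssd_pb2

ssd_config = text_format.Parse(
    """
    freeze_batchnorm: false
    feature_extractor {
      type: 'ssd_inception_v2'
      override_base_feature_extractor_hyperparams: true
    }
    """, ssd_pb2.Ssd())
assert ssd_config.feature_extractor.override_base_feature_extractor_hyperparams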
research/object_detection/protos/train.proto

@@ -6,8 +6,11 @@ import "object_detection/protos/optimizer.proto";
 import "object_detection/protos/preprocessor.proto";

 // Message for configuring DetectionModel training jobs (train.py).
+// Next id: 25
 message TrainConfig {
-  // Input queue batch size.
+  // Effective batch size to use for training.
+  // For TPU (or sync SGD jobs), the batch size per core (or GPU) is going to be
+  // `batch_size` / number of cores (or `batch_size` / number of GPUs).
   optional uint32 batch_size = 1 [default = 32];

   // Data augmentation options.

@@ -78,6 +81,10 @@ message TrainConfig {
   // Note that only Sigmoid classification losses should be used.
   optional bool merge_multiple_label_boxes = 17 [default = false];

+  // If true, will use multiclass scores from object annotations as ground
+  // truth. Currently only compatible with annotated image inputs.
+  optional bool use_multiclass_scores = 24 [default = false];
+
   // Whether to add regularization loss to `total_loss`. This is true by
   // default and adds all regularization losses defined in the model to
   // `total_loss`.
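
A small worked example of the batch size semantics described in the new comment; the core count is an assumed value:

# The config value is the effective (global) batch size; each core or GPU
# receives an equal slice of it.
effective_batch_size = 32   # value of `batch_size` in TrainConfig
num_cores = 8               # e.g. one TPU with 8 cores, or 8 GPUs

per_core_batch_size = effective_batch_size // num_cores
assert per_core_batch_size == 4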
research/object_detection/samples/configs/ssd_inception_v2_coco.config

@@ -98,6 +98,7 @@ model {
         epsilon: 0.001,
       }
     }
+    override_base_feature_extractor_hyperparams: true
   }
   loss {
     classification_loss {

research/object_detection/samples/configs/ssd_inception_v2_pets.config

@@ -98,6 +98,7 @@ model {
         epsilon: 0.001,
       }
     }
+    override_base_feature_extractor_hyperparams: true
   }
   loss {
     classification_loss {

research/object_detection/samples/configs/ssd_inception_v3_pets.config

@@ -98,6 +98,7 @@ model {
         epsilon: 0.01,
       }
     }
+    override_base_feature_extractor_hyperparams: true
   }
   loss {
     classification_loss {
research/object_detection/trainer.py

@@ -69,10 +69,13 @@ def create_input_queue(batch_size_per_clone, create_tensor_dict_fn,
                             in tensor_dict)
   include_keypoints = (fields.InputDataFields.groundtruth_keypoints
                        in tensor_dict)
+  include_multiclass_scores = (fields.InputDataFields.multiclass_scores
+                               in tensor_dict)
   if data_augmentation_options:
     tensor_dict = preprocessor.preprocess(
         tensor_dict, data_augmentation_options,
         func_arg_map=preprocessor.get_default_func_arg_map(
+            include_multiclass_scores=include_multiclass_scores,
             include_instance_masks=include_instance_masks,
             include_keypoints=include_keypoints))

@@ -85,7 +88,10 @@ def create_input_queue(batch_size_per_clone, create_tensor_dict_fn,
   return input_queue


-def get_inputs(input_queue, num_classes, merge_multiple_label_boxes=False):
+def get_inputs(input_queue,
+               num_classes,
+               merge_multiple_label_boxes=False,
+               use_multiclass_scores=False):
   """Dequeues batch and constructs inputs to object detection model.

   Args:

@@ -95,6 +101,8 @@ def get_inputs(input_queue, num_classes, merge_multiple_label_boxes=False):
       or not. Defaults to false. Merged boxes are represented with a single
       box and a k-hot encoding of the multiple labels associated with the
       boxes.
+    use_multiclass_scores: Whether to use multiclass scores instead of
+      groundtruth_classes.

   Returns:
     images: a list of 3-D float tensor of images.

@@ -123,9 +131,19 @@ def get_inputs(input_queue, num_classes, merge_multiple_label_boxes=False):
     classes_gt = tf.cast(read_data[fields.InputDataFields.groundtruth_classes],
                          tf.int32)
     classes_gt -= label_id_offset
+
+    if merge_multiple_label_boxes and use_multiclass_scores:
+      raise ValueError(
+          'Using both merge_multiple_label_boxes and use_multiclass_scores is '
+          'not supported')
+
     if merge_multiple_label_boxes:
       location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
           location_gt, classes_gt, num_classes)
+    elif use_multiclass_scores:
+      classes_gt = tf.cast(read_data[fields.InputDataFields.multiclass_scores],
+                           tf.float32)
     else:
       classes_gt = util_ops.padded_one_hot_encoding(
           indices=classes_gt, depth=num_classes, left_pad=0)

@@ -155,7 +173,8 @@ def _create_losses(input_queue, create_model_fn, train_config):
    groundtruth_masks_list, groundtruth_keypoints_list, _) = get_inputs(
        input_queue,
        detection_model.num_classes,
-       train_config.merge_multiple_label_boxes)
+       train_config.merge_multiple_label_boxes,
+       train_config.use_multiclass_scores)

   preprocessed_images = []
   true_image_shapes = []

@@ -183,9 +202,19 @@ def _create_losses(input_queue, create_model_fn, train_config):
       tf.losses.add_loss(loss_tensor)


-def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
-          num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name,
-          is_chief, train_dir, graph_hook_fn=None):
+def train(create_tensor_dict_fn,
+          create_model_fn,
+          train_config,
+          master,
+          task,
+          num_clones,
+          worker_replicas,
+          clone_on_cpu,
+          ps_tasks,
+          worker_job_name,
+          is_chief,
+          train_dir,
+          graph_hook_fn=None):
   """Training function for detection models.

   Args:
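
A hedged numpy sketch contrasting the two target formats handled by get_inputs; the values are illustrative and not taken from the diff:

import numpy as np

num_classes = 3

# Default path: integer class ids become a (padded) one-hot encoding.
classes_gt = np.array([0, 2])                 # zero-indexed class ids
one_hot = np.eye(num_classes)[classes_gt]     # [[1, 0, 0], [0, 0, 1]]

# use_multiclass_scores path: the annotation already carries one float score
# per class for each box, and is used directly as the target.
multiclass_scores = np.array([[0.7, 0.2, 0.1],
                              [0.0, 0.4, 0.6]], dtype=np.float32)
assert multiclass_scores.shape == (2, num_classes)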
research/object_detection/trainer_test.py

@@ -37,12 +37,15 @@ def get_input_function():
       [1], minval=0, maxval=NUMBER_OF_CLASSES, dtype=tf.int32)
   box_label = tf.random_uniform(
       [1, 4], minval=0.4, maxval=0.6, dtype=tf.float32)
+  multiclass_scores = tf.random_uniform(
+      [1, NUMBER_OF_CLASSES], minval=0.4, maxval=0.6, dtype=tf.float32)

   return {
       fields.InputDataFields.image: image,
       fields.InputDataFields.key: key,
       fields.InputDataFields.groundtruth_classes: class_label,
-      fields.InputDataFields.groundtruth_boxes: box_label
+      fields.InputDataFields.groundtruth_boxes: box_label,
+      fields.InputDataFields.multiclass_scores: multiclass_scores
   }

@@ -203,6 +206,50 @@ class TrainerTest(tf.test.TestCase):
     train_dir = self.get_temp_dir()

     trainer.train(
         create_tensor_dict_fn=get_input_function,
         create_model_fn=FakeDetectionModel,
         train_config=train_config,
         master='',
         task=0,
         num_clones=1,
         worker_replicas=1,
         clone_on_cpu=True,
         ps_tasks=0,
         worker_job_name='worker',
         is_chief=True,
         train_dir=train_dir)

+  def test_configure_trainer_with_multiclass_scores_and_train_two_steps(self):
+    train_config_text_proto = """
+    optimizer {
+      adam_optimizer {
+        learning_rate {
+          constant_learning_rate {
+            learning_rate: 0.01
+          }
+        }
+      }
+    }
+    data_augmentation_options {
+      random_adjust_brightness {
+        max_delta: 0.2
+      }
+    }
+    data_augmentation_options {
+      random_adjust_contrast {
+        min_delta: 0.7
+        max_delta: 1.1
+      }
+    }
+    num_steps: 2
+    use_multiclass_scores: true
+    """
+    train_config = train_pb2.TrainConfig()
+    text_format.Merge(train_config_text_proto, train_config)
+
+    train_dir = self.get_temp_dir()
+
+    trainer.train(
+        create_tensor_dict_fn=get_input_function,
+        create_model_fn=FakeDetectionModel,
+        train_config=train_config,
research/object_detection/utils/config_util.py

@@ -63,8 +63,10 @@ def get_spatial_image_size(image_resizer_config):
     ValueError: If the model type is not recognized.
   """
   if image_resizer_config.HasField("fixed_shape_resizer"):
-    return [image_resizer_config.fixed_shape_resizer.height,
-            image_resizer_config.fixed_shape_resizer.width]
+    return [
+        image_resizer_config.fixed_shape_resizer.height,
+        image_resizer_config.fixed_shape_resizer.width
+    ]
   if image_resizer_config.HasField("keep_aspect_ratio_resizer"):
     if image_resizer_config.keep_aspect_ratio_resizer.pad_to_max_dimension:
       return [image_resizer_config.keep_aspect_ratio_resizer.max_dimension] * 2

@@ -74,7 +76,7 @@ def get_spatial_image_size(image_resizer_config):
 def get_configs_from_pipeline_file(pipeline_config_path):
-  """Reads configuration from a pipeline_pb2.TrainEvalPipelineConfig.
+  """Reads config from a file containing pipeline_pb2.TrainEvalPipelineConfig.

   Args:
     pipeline_config_path: Path to pipeline_pb2.TrainEvalPipelineConfig text

@@ -89,23 +91,34 @@ def get_configs_from_pipeline_file(pipeline_config_path):
   with tf.gfile.GFile(pipeline_config_path, "r") as f:
     proto_str = f.read()
     text_format.Merge(proto_str, pipeline_config)
+  return create_configs_from_pipeline_proto(pipeline_config)
+
+
+def create_configs_from_pipeline_proto(pipeline_config):
+  """Creates a configs dictionary from pipeline_pb2.TrainEvalPipelineConfig.
+
+  Args:
+    pipeline_config: pipeline_pb2.TrainEvalPipelineConfig proto object.
+
+  Returns:
+    Dictionary of configuration objects. Keys are `model`, `train_config`,
+      `train_input_config`, `eval_config`, `eval_input_config`. Values are the
+      corresponding config objects.
+  """
   configs = {}
   configs["model"] = pipeline_config.model
   configs["train_config"] = pipeline_config.train_config
   configs["train_input_config"] = pipeline_config.train_input_reader
   configs["eval_config"] = pipeline_config.eval_config
   configs["eval_input_config"] = pipeline_config.eval_input_reader
   return configs


 def create_pipeline_proto_from_configs(configs):
   """Creates a pipeline_pb2.TrainEvalPipelineConfig from configs dictionary.

-  This function nearly performs the inverse operation of
-  get_configs_from_pipeline_file(). Instead of returning a file path, it
-  returns a `TrainEvalPipelineConfig` object.
+  This function performs the inverse operation of
+  create_configs_from_pipeline_proto().

   Args:
     configs: Dictionary of configs. See get_configs_from_pipeline_file().

@@ -437,7 +450,7 @@ def _get_classification_loss(model_config):
   if meta_architecture == "faster_rcnn":
     model = model_config.faster_rcnn
     classification_loss = model.second_stage_classification_loss
-  if meta_architecture == "ssd":
+  elif meta_architecture == "ssd":
     model = model_config.ssd
     classification_loss = model.loss.classification_loss
   else:
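
A hedged usage sketch for the refactored helpers; the config path is a placeholder:

# Read a pipeline file into a configs dict, mutate it, then round-trip it.
# create_pipeline_proto_from_configs is the inverse of
# create_configs_from_pipeline_proto, per the docstrings above.
from object_detection.utils import config_util

configs = config_util.get_configs_from_pipeline_file('pipeline.config')
configs['train_config'].batch_size = 64

pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
same_configs = config_util.create_configs_from_pipeline_proto(pipeline_proto)
assert same_configs['train_config'].batch_size == 64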
research/object_detection/utils/config_util_test.py

@@ -93,6 +93,26 @@ class ConfigUtilTest(tf.test.TestCase):
     self.assertProtoEquals(pipeline_config.eval_input_reader,
                            configs["eval_input_config"])

+  def test_create_configs_from_pipeline_proto(self):
+    """Tests creating configs dictionary from pipeline proto."""
+    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+    pipeline_config.model.faster_rcnn.num_classes = 10
+    pipeline_config.train_config.batch_size = 32
+    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
+    pipeline_config.eval_config.num_examples = 20
+    pipeline_config.eval_input_reader.queue_capacity = 100
+
+    configs = config_util.create_configs_from_pipeline_proto(pipeline_config)
+    self.assertProtoEquals(pipeline_config.model, configs["model"])
+    self.assertProtoEquals(pipeline_config.train_config,
+                           configs["train_config"])
+    self.assertProtoEquals(pipeline_config.train_input_reader,
+                           configs["train_input_config"])
+    self.assertProtoEquals(pipeline_config.eval_config, configs["eval_config"])
+    self.assertProtoEquals(pipeline_config.eval_input_reader,
+                           configs["eval_input_config"])
+
   def test_create_pipeline_proto_from_configs(self):
     """Tests that proto can be reconstructed from configs dictionary."""
     pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
research/object_detection/utils/label_map_util.py

@@ -34,7 +34,8 @@ def _validate_label_map(label_map):
   for item in label_map.item:
     if item.id < 0:
       raise ValueError('Label map ids should be >= 0.')
-    if item.id == 0 and item.name != 'background':
+    if (item.id == 0 and item.name != 'background' and
+        item.display_name != 'background'):
       raise ValueError('Label map id 0 is reserved for the background label')
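
A sketch of a label map the relaxed check now accepts, assuming the standard StringIntLabelMap proto with a display_name field; entry values are made up:

# Id 0 is now allowed when either `name` or `display_name` is 'background'.
from google.protobuf import text_format
from object_detection.protos import string_int_label_map_pb2

label_map = text_format.Parse(
    """
    item {
      id: 0
      name: 'none_of_the_above'
      display_name: 'background'
    }
    item {
      id: 1
      name: 'cat'
    }
    """, string_int_label_map_pb2.StringIntLabelMap())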
research/object_detection/utils/metrics.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Functions for computing metrics like precision, recall, CorLoc, etc."""
 from __future__ import division

@@ -24,7 +23,7 @@ def compute_precision_recall(scores, labels, num_gt):
   Args:
     scores: A float numpy array representing detection score
-    labels: A boolean numpy array representing true/false positive labels
+    labels: A float numpy array representing weighted true/false positive labels
     num_gt: Number of ground truth instances

   Raises:

@@ -37,12 +36,13 @@ def compute_precision_recall(scores, labels, num_gt):
     This value is None if no ground truth labels are present.

   """
-  if not isinstance(labels, np.ndarray) or labels.dtype != np.bool or (
-      len(labels.shape) != 1):
-    raise ValueError("labels must be single dimension bool numpy array")
+  if not isinstance(labels, np.ndarray) or len(labels.shape) != 1:
+    raise ValueError("labels must be single dimension numpy array")
+
+  if labels.dtype != np.float and labels.dtype != np.bool:
+    raise ValueError("labels type must be either bool or float")

   if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:
     raise ValueError("scores must be single dimension numpy array")

   if num_gt < np.sum(labels):

@@ -56,9 +56,8 @@ def compute_precision_recall(scores, labels, num_gt):
   sorted_indices = np.argsort(scores)
   sorted_indices = sorted_indices[::-1]
-  labels = labels.astype(int)
   true_positive_labels = labels[sorted_indices]
-  false_positive_labels = 1 - true_positive_labels
+  false_positive_labels = (true_positive_labels <= 0).astype(float)
   cum_true_positives = np.cumsum(true_positive_labels)
   cum_false_positives = np.cumsum(false_positive_labels)
   precision = cum_true_positives.astype(float) / (

@@ -90,8 +89,8 @@ def compute_average_precision(precision, recall):
       raise ValueError("If precision is None, recall must also be None")
     return np.NAN

-  if not isinstance(precision, np.ndarray) or not isinstance(recall,
-                                                             np.ndarray):
+  if not isinstance(precision, np.ndarray) or not isinstance(
+      recall, np.ndarray):
     raise ValueError("precision and recall must be numpy array")
   if precision.dtype != np.float or recall.dtype != np.float:
     raise ValueError("input must be float numpy array.")

@@ -139,6 +138,53 @@ def compute_cor_loc(num_gt_imgs_per_class,
       class
   """
-  return np.where(num_gt_imgs_per_class == 0, np.nan,
-                  num_images_correctly_detected_per_class /
-                  num_gt_imgs_per_class)
+  return np.where(
+      num_gt_imgs_per_class == 0, np.nan,
+      num_images_correctly_detected_per_class / num_gt_imgs_per_class)
+
+
+def compute_median_rank_at_k(tp_fp_list, k):
+  """Computes MedianRank@k, where k is the top-scoring labels.
+
+  Args:
+    tp_fp_list: a list of numpy arrays; each numpy array corresponds to all
+        detections on a single image, where the detections are sorted by score
+        in descending order. Further, each numpy array element can have boolean
+        or float values. True positive elements have either value >0.0 or True;
+        any other value is considered false positive.
+    k: number of top-scoring proposals to take.
+
+  Returns:
+    median_rank: median rank of all true positive proposals among top k by
+      score.
+  """
+  ranks = []
+  for i in range(len(tp_fp_list)):
+    ranks.append(
+        np.where(tp_fp_list[i][0:min(k, tp_fp_list[i].shape[0])] > 0)[0])
+  concatenated_ranks = np.concatenate(ranks)
+  return np.median(concatenated_ranks)
+
+
+def compute_recall_at_k(tp_fp_list, num_gt, k):
+  """Computes Recall@k, where k is the top-scoring labels.
+
+  Args:
+    tp_fp_list: a list of numpy arrays; each numpy array corresponds to all
+        detections on a single image, where the detections are sorted by score
+        in descending order. Further, each numpy array element can have boolean
+        or float values. True positive elements have either value >0.0 or True;
+        any other value is considered false positive.
+    num_gt: number of groundtruth annotations.
+    k: number of top-scoring proposals to take.
+
+  Returns:
+    recall: recall evaluated on the top k by score detections.
+  """
+  tp_fp_eval = []
+  for i in range(len(tp_fp_list)):
+    tp_fp_eval.append(tp_fp_list[i][0:min(k, tp_fp_list[i].shape[0])])
+  tp_fp_eval = np.concatenate(tp_fp_eval)
+  return np.sum(tp_fp_eval) / num_gt
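
A worked example of the two new metrics, reusing the numbers from the tests below:

# Three images with detections sorted by score, and 4 groundtruth boxes.
import numpy as np
from object_detection.utils import metrics

tp_fp = [np.array([1, 0, 0], dtype=float),        # image 1: TP at rank 0
         np.array([0, 1], dtype=float),           # image 2: TP at rank 1
         np.array([0, 0, 0, 0, 0], dtype=float)]  # image 3: no TPs

# The top-3 slices contain 2 true positives out of num_gt=4 -> recall 0.5.
assert metrics.compute_recall_at_k(tp_fp, num_gt=4, k=3) == 0.5
# The two true positives sit at ranks 0 and 1 -> median rank 0.5.
assert metrics.compute_median_rank_at_k(tp_fp, k=3) == 0.5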
research/object_detection/utils/metrics_test.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for object_detection.metrics."""
 import numpy as np

@@ -25,8 +24,8 @@ class MetricsTest(tf.test.TestCase):
   def test_compute_cor_loc(self):
     num_gt_imgs_per_class = np.array([100, 1, 5, 1, 1], dtype=int)
-    num_images_correctly_detected_per_class = np.array([10, 0, 1, 0, 0],
-                                                       dtype=int)
+    num_images_correctly_detected_per_class = np.array(
+        [10, 0, 1, 0, 0], dtype=int)
     corloc = metrics.compute_cor_loc(num_gt_imgs_per_class,
                                      num_images_correctly_detected_per_class)
     expected_corloc = np.array([0.1, 0, 0.2, 0, 0], dtype=float)

@@ -34,8 +33,8 @@ class MetricsTest(tf.test.TestCase):
   def test_compute_cor_loc_nans(self):
     num_gt_imgs_per_class = np.array([100, 0, 0, 1, 1], dtype=int)
-    num_images_correctly_detected_per_class = np.array([10, 0, 1, 0, 0],
-                                                       dtype=int)
+    num_images_correctly_detected_per_class = np.array(
+        [10, 0, 1, 0, 0], dtype=int)
     corloc = metrics.compute_cor_loc(num_gt_imgs_per_class,
                                      num_images_correctly_detected_per_class)
     expected_corloc = np.array([0.1, np.nan, np.nan, 0, 0], dtype=float)

@@ -45,18 +44,37 @@ class MetricsTest(tf.test.TestCase):
     num_gt = 10
     scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
     labels = np.array([0, 1, 1, 0, 0, 1], dtype=bool)
+    labels_float_type = np.array([0, 1, 1, 0, 0, 1], dtype=float)
     accumulated_tp_count = np.array([0, 1, 1, 2, 2, 3], dtype=float)
     expected_precision = accumulated_tp_count / np.array([1, 2, 3, 4, 5, 6])
     expected_recall = accumulated_tp_count / num_gt
+
     precision, recall = metrics.compute_precision_recall(scores, labels, num_gt)
+    precision_float_type, recall_float_type = metrics.compute_precision_recall(
+        scores, labels_float_type, num_gt)
+
     self.assertAllClose(precision, expected_precision)
     self.assertAllClose(recall, expected_recall)
+    self.assertAllClose(precision_float_type, expected_precision)
+    self.assertAllClose(recall_float_type, expected_recall)
+
+  def test_compute_precision_recall_float(self):
+    num_gt = 10
+    scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
+    labels_float = np.array([0, 1, 1, 0.5, 0, 1], dtype=float)
+    expected_precision = np.array(
+        [0., 0.5, 0.33333333, 0.5, 0.55555556, 0.63636364], dtype=float)
+    expected_recall = np.array([0., 0.1, 0.1, 0.2, 0.25, 0.35], dtype=float)
+    precision, recall = metrics.compute_precision_recall(
+        scores, labels_float, num_gt)
+    self.assertAllClose(precision, expected_precision)
+    self.assertAllClose(recall, expected_recall)

   def test_compute_average_precision(self):
     precision = np.array([0.8, 0.76, 0.9, 0.65, 0.7, 0.5, 0.55, 0], dtype=float)
     recall = np.array([0.3, 0.3, 0.4, 0.4, 0.45, 0.45, 0.5, 0.5], dtype=float)
-    processed_precision = np.array([0.9, 0.9, 0.9, 0.7, 0.7, 0.55, 0.55, 0],
-                                   dtype=float)
+    processed_precision = np.array(
+        [0.9, 0.9, 0.9, 0.7, 0.7, 0.55, 0.55, 0], dtype=float)
     recall_interval = np.array([0.3, 0, 0.1, 0, 0.05, 0, 0.05, 0], dtype=float)
     expected_mean_ap = np.sum(recall_interval * processed_precision)
     mean_ap = metrics.compute_average_precision(precision, recall)

@@ -74,6 +92,52 @@ class MetricsTest(tf.test.TestCase):
     ap = metrics.compute_average_precision(precision, recall)
     self.assertTrue(np.isnan(ap))

+  def test_compute_recall_at_k(self):
+    num_gt = 4
+    tp_fp = [
+        np.array([1, 0, 0], dtype=float),
+        np.array([0, 1], dtype=float),
+        np.array([0, 0, 0, 0, 0], dtype=float)
+    ]
+    tp_fp_bool = [
+        np.array([True, False, False], dtype=bool),
+        np.array([False, True], dtype=float),
+        np.array([False, False, False, False, False], dtype=float)
+    ]
+
+    recall_1 = metrics.compute_recall_at_k(tp_fp, num_gt, 1)
+    recall_3 = metrics.compute_recall_at_k(tp_fp, num_gt, 3)
+    recall_5 = metrics.compute_recall_at_k(tp_fp, num_gt, 5)
+    recall_3_bool = metrics.compute_recall_at_k(tp_fp_bool, num_gt, 3)
+
+    self.assertAlmostEqual(recall_1, 0.25)
+    self.assertAlmostEqual(recall_3, 0.5)
+    self.assertAlmostEqual(recall_3_bool, 0.5)
+    self.assertAlmostEqual(recall_5, 0.5)
+
+  def test_compute_median_rank_at_k(self):
+    tp_fp = [
+        np.array([1, 0, 0], dtype=float),
+        np.array([0, 0.1], dtype=float),
+        np.array([0, 0, 0, 0, 0], dtype=float)
+    ]
+    tp_fp_bool = [
+        np.array([True, False, False], dtype=bool),
+        np.array([False, True], dtype=float),
+        np.array([False, False, False, False, False], dtype=float)
+    ]
+
+    median_ranks_1 = metrics.compute_median_rank_at_k(tp_fp, 1)
+    median_ranks_3 = metrics.compute_median_rank_at_k(tp_fp, 3)
+    median_ranks_5 = metrics.compute_median_rank_at_k(tp_fp, 5)
+    median_ranks_3_bool = metrics.compute_median_rank_at_k(tp_fp_bool, 3)
+
+    self.assertEquals(median_ranks_1, 0)
+    self.assertEquals(median_ranks_3, 0.5)
+    self.assertEquals(median_ranks_3_bool, 0.5)
+    self.assertEquals(median_ranks_5, 0.5)
+

 if __name__ == '__main__':
   tf.test.main()
research/object_detection/utils/object_detection_evaluation.py

@@ -110,7 +110,8 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
                evaluate_corlocs=False,
                metric_prefix=None,
                use_weighted_mean_ap=False,
-               evaluate_masks=False):
+               evaluate_masks=False,
+               group_of_weight=0.0):
     """Constructor.

     Args:

@@ -128,6 +129,12 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
         of all classes.
       evaluate_masks: If False, evaluation will be performed based on boxes.
         If True, mask evaluation will be performed instead.
+      group_of_weight: Weight of group-of boxes. If set to 0, detections of the
+        correct class within a group-of box are ignored. If weight is > 0, then
+        if at least one detection falls within a group-of box with
+        matching_iou_threshold, weight group_of_weight is added to true
+        positives. Consequently, if no detection falls within a group-of box,
+        weight group_of_weight is added to false negatives.

     Raises:
       ValueError: If the category ids are not 1-indexed.

@@ -140,11 +147,13 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
     self._use_weighted_mean_ap = use_weighted_mean_ap
     self._label_id_offset = 1
     self._evaluate_masks = evaluate_masks
+    self._group_of_weight = group_of_weight
     self._evaluation = ObjectDetectionEvaluation(
         num_groundtruth_classes=self._num_classes,
         matching_iou_threshold=self._matching_iou_threshold,
         use_weighted_mean_ap=self._use_weighted_mean_ap,
-        label_id_offset=self._label_id_offset)
+        label_id_offset=self._label_id_offset,
+        group_of_weight=self._group_of_weight)
     self._image_ids = set([])
     self._evaluate_corlocs = evaluate_corlocs
     self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''

@@ -383,7 +392,9 @@ class OpenImagesDetectionEvaluator(ObjectDetectionEvaluator):
   def __init__(self,
                categories,
                matching_iou_threshold=0.5,
-               evaluate_corlocs=False):
+               evaluate_corlocs=False,
+               metric_prefix='OpenImagesV2',
+               group_of_weight=0.0):
     """Constructor.

     Args:

@@ -393,12 +404,21 @@ class OpenImagesDetectionEvaluator(ObjectDetectionEvaluator):
       matching_iou_threshold: IOU threshold to use for matching groundtruth
         boxes to detection boxes.
       evaluate_corlocs: if True, additionally evaluates and returns CorLoc.
+      metric_prefix: Prefix name of the metric.
+      group_of_weight: Weight of the group-of bounding box. If set to 0
+        (default for Open Images V2 detection protocol), detections of the
+        correct class within a group-of box are ignored. If weight is > 0,
+        then if at least one detection falls within a group-of box with
+        matching_iou_threshold, weight group_of_weight is added to true
+        positives. Consequently, if no detection falls within a group-of box,
+        weight group_of_weight is added to false negatives.
     """
     super(OpenImagesDetectionEvaluator, self).__init__(
         categories,
         matching_iou_threshold,
         evaluate_corlocs,
-        metric_prefix='OpenImagesV2')
+        metric_prefix=metric_prefix,
+        group_of_weight=group_of_weight)

   def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
     """Adds groundtruth for a single image to be used for evaluation.

@@ -449,6 +469,130 @@ class OpenImagesDetectionEvaluator(ObjectDetectionEvaluator):
     self._image_ids.update([image_id])


+class OpenImagesDetectionChallengeEvaluator(OpenImagesDetectionEvaluator):
+  """A class that implements Open Images Challenge Detection metrics.
+
+  Open Images Challenge Detection metric has two major changes in comparison
+  with Open Images V2 detection metric:
+  - a custom weight might be specified for detecting an object contained in
+    a group-of box.
+  - verified image-level labels should be explicitly provided for
+    evaluation: in case an image has neither positive nor negative image-level
+    label of class c, all detections of this class on this image will be
+    ignored.
+  """
+
+  def __init__(self,
+               categories,
+               matching_iou_threshold=0.5,
+               evaluate_corlocs=False,
+               group_of_weight=1.0):
+    """Constructor.
+
+    Args:
+      categories: A list of dicts, each of which has the following keys -
+        'id': (required) an integer id uniquely identifying this category.
+        'name': (required) string representing category name e.g., 'cat', 'dog'.
+      matching_iou_threshold: IOU threshold to use for matching groundtruth
+        boxes to detection boxes.
+      evaluate_corlocs: if True, additionally evaluates and returns CorLoc.
+      group_of_weight: weight of a group-of box. If set to 0, detections of the
+        correct class within a group-of box are ignored. If weight is > 0
+        (default for Open Images Detection Challenge 2018), then if at least one
+        detection falls within a group-of box with matching_iou_threshold,
+        weight group_of_weight is added to true positives. Consequently, if no
+        detection falls within a group-of box, weight group_of_weight is added
+        to false negatives.
+    """
+    super(OpenImagesDetectionChallengeEvaluator, self).__init__(
+        categories,
+        matching_iou_threshold,
+        evaluate_corlocs,
+        metric_prefix='OpenImagesChallenge2018',
+        group_of_weight=group_of_weight)
+    self._evaluatable_labels = {}
+
+  def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
+    """Adds groundtruth for a single image to be used for evaluation.
+
+    Args:
+      image_id: A unique string/integer identifier for the image.
+      groundtruth_dict: A dictionary containing -
+        standard_fields.InputDataFields.groundtruth_boxes: float32 numpy array
+          of shape [num_boxes, 4] containing `num_boxes` groundtruth boxes of
+          the format [ymin, xmin, ymax, xmax] in absolute image coordinates.
+        standard_fields.InputDataFields.groundtruth_classes: integer numpy array
+          of shape [num_boxes] containing 1-indexed groundtruth classes for the
+          boxes.
+        standard_fields.InputDataFields.verified_labels: integer 1D numpy array
+          containing all classes for which labels are verified.
+        standard_fields.InputDataFields.groundtruth_group_of: Optional length
+          M numpy boolean array denoting whether a groundtruth box contains a
+          group of instances.
+
+    Raises:
+      ValueError: On adding groundtruth for an image more than once.
+    """
+    super(OpenImagesDetectionChallengeEvaluator,
+          self).add_single_ground_truth_image_info(image_id, groundtruth_dict)
+    groundtruth_classes = (
+        groundtruth_dict[standard_fields.InputDataFields.groundtruth_classes] -
+        self._label_id_offset)
+    self._evaluatable_labels[image_id] = np.unique(
+        np.concatenate(((groundtruth_dict.get(
+            standard_fields.InputDataFields.verified_labels,
+            np.array([], dtype=int)) - self._label_id_offset),
+                        groundtruth_classes)))
+
+  def add_single_detected_image_info(self, image_id, detections_dict):
+    """Adds detections for a single image to be used for evaluation.
+
+    Args:
+      image_id: A unique string/integer identifier for the image.
+      detections_dict: A dictionary containing -
+        standard_fields.DetectionResultFields.detection_boxes: float32 numpy
+          array of shape [num_boxes, 4] containing `num_boxes` detection boxes
+          of the format [ymin, xmin, ymax, xmax] in absolute image coordinates.
+        standard_fields.DetectionResultFields.detection_scores: float32 numpy
+          array of shape [num_boxes] containing detection scores for the boxes.
+        standard_fields.DetectionResultFields.detection_classes: integer numpy
+          array of shape [num_boxes] containing 1-indexed detection classes for
+          the boxes.
+
+    Raises:
+      ValueError: If detection masks are not in detections dictionary.
+    """
+    if image_id not in self._image_ids:
+      # Since for the correct work of evaluator it is assumed that groundtruth
+      # is inserted first, we make sure to break the code if it is not the case.
+      self._image_ids.update([image_id])
+      self._evaluatable_labels[image_id] = np.array([])
+
+    detection_classes = (
+        detections_dict[standard_fields.DetectionResultFields.detection_classes]
+        - self._label_id_offset)
+    allowed_classes = np.where(
+        np.isin(detection_classes, self._evaluatable_labels[image_id]))
+    detection_classes = detection_classes[allowed_classes]
+    detected_boxes = detections_dict[
+        standard_fields.DetectionResultFields.detection_boxes][allowed_classes]
+    detected_scores = detections_dict[
+        standard_fields.DetectionResultFields.detection_scores][allowed_classes]
+
+    self._evaluation.add_single_detected_image_info(
+        image_key=image_id,
+        detected_boxes=detected_boxes,
+        detected_scores=detected_scores,
+        detected_class_labels=detection_classes)
+
+  def clear(self):
+    """Clears stored data."""
+    super(OpenImagesDetectionChallengeEvaluator, self).clear()
+    self._evaluatable_labels.clear()
+
+
 ObjectDetectionEvalMetrics = collections.namedtuple(
     'ObjectDetectionEvalMetrics', [
         'average_precisions', 'mean_ap', 'precisions', 'recalls', 'corlocs',

@@ -465,7 +609,8 @@ class ObjectDetectionEvaluation(object):
                nms_iou_threshold=1.0,
                nms_max_output_boxes=10000,
                use_weighted_mean_ap=False,
-               label_id_offset=0):
+               label_id_offset=0,
+               group_of_weight=0.0):
     if num_groundtruth_classes < 1:
       raise ValueError('Need at least 1 groundtruth class for evaluation.')

@@ -473,7 +618,9 @@ class ObjectDetectionEvaluation(object):
         num_groundtruth_classes=num_groundtruth_classes,
         matching_iou_threshold=matching_iou_threshold,
         nms_iou_threshold=nms_iou_threshold,
-        nms_max_output_boxes=nms_max_output_boxes)
+        nms_max_output_boxes=nms_max_output_boxes,
+        group_of_weight=group_of_weight)
+    self.group_of_weight = group_of_weight
     self.num_class = num_groundtruth_classes
     self.use_weighted_mean_ap = use_weighted_mean_ap
     self.label_id_offset = label_id_offset

@@ -483,7 +630,7 @@ class ObjectDetectionEvaluation(object):
     self.groundtruth_masks = {}
     self.groundtruth_is_difficult_list = {}
     self.groundtruth_is_group_of_list = {}
-    self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int)
+    self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=float)
     self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int)

     self._initialize_detections()

@@ -650,7 +797,10 @@ class ObjectDetectionEvaluation(object):
     num_gt_instances = np.sum(groundtruth_class_labels[
         ~groundtruth_is_difficult_list
        & ~groundtruth_is_group_of_list] == class_index)
-    self.num_gt_instances_per_class[class_index] += num_gt_instances
+    num_groupof_gt_instances = self.group_of_weight * np.sum(
+        groundtruth_class_labels[groundtruth_is_group_of_list] == class_index)
+    self.num_gt_instances_per_class[
+        class_index] += num_gt_instances + num_groupof_gt_instances
     if np.any(groundtruth_class_labels == class_index):
       self.num_gt_imgs_per_class[class_index] += 1

@@ -677,13 +827,12 @@ class ObjectDetectionEvaluation(object):
     if self.use_weighted_mean_ap:
       all_scores = np.array([], dtype=float)
       all_tp_fp_labels = np.array([], dtype=bool)
     for class_index in range(self.num_class):
       if self.num_gt_instances_per_class[class_index] == 0:
         continue
       if not self.scores_per_class[class_index]:
         scores = np.array([], dtype=float)
-        tp_fp_labels = np.array([], dtype=bool)
+        tp_fp_labels = np.array([], dtype=float)
       else:
         scores = np.concatenate(self.scores_per_class[class_index])
         tp_fp_labels = np.concatenate(self.tp_fp_labels_per_class[class_index])
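
A small worked example of the group-of accounting introduced above:

# With group_of_weight = 0.5, each group-of box of a class contributes 0.5
# (rather than 0 or 1) to that class's groundtruth instance count, which is
# why num_gt_instances_per_class changed from dtype=int to dtype=float.
group_of_weight = 0.5
num_ordinary_gt = 2      # non-difficult, non-group-of boxes of the class
num_group_of_gt = 1      # group-of boxes of the class

num_gt_instances = num_ordinary_gt + group_of_weight * num_group_of_gt
assert num_gt_instances == 2.5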
research/object_detection/utils/object_detection_evaluation_test.py
View file @
ed4e22b8
...
...
@@ -100,6 +100,126 @@ class OpenImagesV2EvaluationTest(tf.test.TestCase):
self
.
assertFalse
(
oiv2_evaluator
.
_image_ids
)
class OpenImagesDetectionChallengeEvaluatorTest(tf.test.TestCase):

  def test_returns_correct_metric_values(self):
    categories = [{'id': 1, 'name': 'cat'},
                  {'id': 2, 'name': 'dog'},
                  {'id': 3, 'name': 'elephant'}]
    oivchallenge_evaluator = (
        object_detection_evaluation.OpenImagesDetectionChallengeEvaluator(
            categories, group_of_weight=0.5))

    image_key = 'img1'
    groundtruth_boxes = np.array(
        [[0, 0, 1, 1], [0, 0, 2, 2], [0, 0, 3, 3]], dtype=float)
    groundtruth_class_labels = np.array([1, 3, 1], dtype=int)
    groundtruth_is_group_of_list = np.array([False, False, True], dtype=bool)
    groundtruth_verified_labels = np.array([1, 2, 3], dtype=int)
    oivchallenge_evaluator.add_single_ground_truth_image_info(
        image_key, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_boxes,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_labels,
            standard_fields.InputDataFields.groundtruth_group_of:
                groundtruth_is_group_of_list,
            standard_fields.InputDataFields.verified_labels:
                groundtruth_verified_labels,
        })
    image_key = 'img2'
    groundtruth_boxes = np.array(
        [[10, 10, 11, 11], [500, 500, 510, 510], [10, 10, 12, 12]],
        dtype=float)
    groundtruth_class_labels = np.array([1, 1, 3], dtype=int)
    groundtruth_is_group_of_list = np.array([False, False, True], dtype=bool)
    oivchallenge_evaluator.add_single_ground_truth_image_info(
        image_key, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_boxes,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_labels,
            standard_fields.InputDataFields.groundtruth_group_of:
                groundtruth_is_group_of_list
        })
    image_key = 'img3'
    groundtruth_boxes = np.array([[0, 0, 1, 1]], dtype=float)
    groundtruth_class_labels = np.array([2], dtype=int)
    oivchallenge_evaluator.add_single_ground_truth_image_info(
        image_key, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_boxes,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_labels
        })
    image_key = 'img1'
    detected_boxes = np.array(
        [[10, 10, 11, 11], [100, 100, 120, 120]], dtype=float)
    detected_class_labels = np.array([2, 2], dtype=int)
    detected_scores = np.array([0.7, 0.8], dtype=float)
    oivchallenge_evaluator.add_single_detected_image_info(
        image_key, {
            standard_fields.DetectionResultFields.detection_boxes:
                detected_boxes,
            standard_fields.DetectionResultFields.detection_scores:
                detected_scores,
            standard_fields.DetectionResultFields.detection_classes:
                detected_class_labels
        })
    image_key = 'img2'
    detected_boxes = np.array(
        [[10, 10, 11, 11], [100, 100, 120, 120], [100, 100, 220, 220],
         [10, 10, 11, 11]], dtype=float)
    detected_class_labels = np.array([1, 1, 2, 3], dtype=int)
    detected_scores = np.array([0.7, 0.8, 0.5, 0.9], dtype=float)
    oivchallenge_evaluator.add_single_detected_image_info(
        image_key, {
            standard_fields.DetectionResultFields.detection_boxes:
                detected_boxes,
            standard_fields.DetectionResultFields.detection_scores:
                detected_scores,
            standard_fields.DetectionResultFields.detection_classes:
                detected_class_labels
        })
    image_key = 'img3'
    detected_boxes = np.array([[0, 0, 1, 1]], dtype=float)
    detected_class_labels = np.array([2], dtype=int)
    detected_scores = np.array([0.5], dtype=float)
    oivchallenge_evaluator.add_single_detected_image_info(
        image_key, {
            standard_fields.DetectionResultFields.detection_boxes:
                detected_boxes,
            standard_fields.DetectionResultFields.detection_scores:
                detected_scores,
            standard_fields.DetectionResultFields.detection_classes:
                detected_class_labels
        })
    metrics = oivchallenge_evaluator.evaluate()

    self.assertAlmostEqual(
        metrics['OpenImagesChallenge2018_PerformanceByCategory/AP@0.5IOU/dog'],
        0.3333333333)
    self.assertAlmostEqual(
        metrics[
            'OpenImagesChallenge2018_PerformanceByCategory/AP@0.5IOU/elephant'],
        0.333333333333)
    self.assertAlmostEqual(
        metrics['OpenImagesChallenge2018_PerformanceByCategory/AP@0.5IOU/cat'],
        0.142857142857)
    self.assertAlmostEqual(
        metrics['OpenImagesChallenge2018_Precision/mAP@0.5IOU'], 0.269841269)
    oivchallenge_evaluator.clear()
    self.assertFalse(oivchallenge_evaluator._image_ids)


class PascalEvaluationTest(tf.test.TestCase):

  def test_returns_correct_metric_values_on_boxes(self):
...
...
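As a sanity check on the dog AP of 1/3 asserted above: on img1, 'dog' is a verified label with no dog boxes, so both class-2 detections (0.8, 0.7) are false positives; img2 never lists 'dog' among its labels, so its class-2 detection is apparently dropped from scoring; img3's detection (0.5) exactly matches the single dog box. A rough recomputation, skipping the precision interpolation the real metric applies (which changes nothing here) and valid because every dog label is a plain 0/1:

import numpy as np

tp_fp = np.array([0., 0., 1.])  # dog detections sorted by score: 0.8, 0.7, 0.5
num_gt = 1.0                    # the single dog box on img3
cum_tp = np.cumsum(tp_fp)
precision = cum_tp / np.arange(1, len(tp_fp) + 1)  # [0.    0.    0.333]
recall = cum_tp / num_gt                           # [0.    0.    1.   ]
deltas = np.diff(np.concatenate(([0.], recall)))
print(np.sum(deltas * precision))                  # 0.3333..., as asserted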
research/object_detection/utils/per_image_evaluation.py
...
...
@@ -35,7 +35,8 @@ class PerImageEvaluation(object):
               num_groundtruth_classes,
               matching_iou_threshold=0.5,
               nms_iou_threshold=0.3,
-               nms_max_output_boxes=50):
+               nms_max_output_boxes=50,
+               group_of_weight=0.0):
    """Initializes PerImageEvaluation with evaluation parameters.

    Args:
...
...
@@ -44,24 +45,26 @@ class PerImageEvaluation(object):
        the threshold to consider whether a detection is true positive or not
      nms_iou_threshold: IOU threshold used in Non Maximum Suppression.
      nms_max_output_boxes: Number of maximum output boxes in NMS.
+      group_of_weight: Weight of the group-of boxes.
    """
    self.matching_iou_threshold = matching_iou_threshold
    self.nms_iou_threshold = nms_iou_threshold
    self.nms_max_output_boxes = nms_max_output_boxes
    self.num_groundtruth_classes = num_groundtruth_classes
+    self.group_of_weight = group_of_weight

  def compute_object_detection_metrics(
      self, detected_boxes, detected_scores, detected_class_labels,
      groundtruth_boxes, groundtruth_class_labels,
      groundtruth_is_difficult_list, groundtruth_is_group_of_list,
      detected_masks=None, groundtruth_masks=None):
-    """Evaluates detections as being tp, fp or ignored from a single image.
+    """Evaluates detections as being tp, fp or weighted from a single image.

    The evaluation is done in two stages:
     1. All detections are matched to non group-of boxes; true positives are
        determined and detections matched to difficult boxes are ignored.
     2. Detections that are determined as false positives are matched against
-        group-of boxes and ignored if matched.
+        group-of boxes and weighted if matched.

    Args:
      detected_boxes: A float numpy array of shape [N, 4], representing N
...
...
@@ -339,7 +342,8 @@ class PerImageEvaluation(object):
          box_data=groundtruth_boxes[groundtruth_is_group_of_list],
          mask_data=groundtruth_masks[groundtruth_is_group_of_list])
      iou = np_box_mask_list_ops.iou(detected_boxlist, gt_non_group_of_boxlist)
-      ioa = np_box_mask_list_ops.ioa(gt_group_of_boxlist, detected_boxlist)
+      ioa = np.transpose(
+          np_box_mask_list_ops.ioa(gt_group_of_boxlist, detected_boxlist))
      scores = detected_boxlist.get_field('scores')
      num_boxes = detected_boxlist.num_boxes()
    return iou, ioa, scores, num_boxes
...
...
@@ -380,7 +384,8 @@ class PerImageEvaluation(object):
      gt_group_of_boxlist = np_box_list.BoxList(
          groundtruth_boxes[groundtruth_is_group_of_list])
      iou = np_box_list_ops.iou(detected_boxlist, gt_non_group_of_boxlist)
-      ioa = np_box_list_ops.ioa(gt_group_of_boxlist, detected_boxlist)
+      ioa = np.transpose(
+          np_box_list_ops.ioa(gt_group_of_boxlist, detected_boxlist))
      scores = detected_boxlist.get_field('scores')
      num_boxes = detected_boxlist.num_boxes()
    return iou, ioa, scores, num_boxes
...
...
@@ -455,7 +460,8 @@ class PerImageEvaluation(object):
    # 1. All detections are matched to non group-of boxes; true positives are
    #    determined and detections matched to difficult boxes are ignored.
    # 2. Detections that are determined as false positives are matched against
-    #    group-of boxes and ignored if matched.
+    #    group-of boxes and scored with weight w per groundtruth box that is
+    #    matched.

    # Tp-fp evaluation for non-group of boxes (if any).
    if iou.shape[1] > 0:
...
...
@@ -473,18 +479,29 @@ class PerImageEvaluation(object):
      else:
        is_matched_to_difficult_box[i] = True

+    scores_group_of = np.zeros(ioa.shape[1], dtype=float)
+    tp_fp_labels_group_of = self.group_of_weight * np.ones(
+        ioa.shape[1], dtype=float)
    # Tp-fp evaluation for group of boxes.
-    if ioa.shape[0] > 0:
-      max_overlap_group_of_gt = np.max(ioa, axis=0)
+    if ioa.shape[1] > 0:
+      max_overlap_group_of_gt_ids = np.argmax(ioa, axis=1)
      for i in range(num_detected_boxes):
+        gt_id = max_overlap_group_of_gt_ids[i]
        if (not tp_fp_labels[i] and not is_matched_to_difficult_box[i] and
-            max_overlap_group_of_gt[i] >= self.matching_iou_threshold):
+            ioa[i, gt_id] >= self.matching_iou_threshold):
          is_matched_to_group_of_box[i] = True
-    return scores[~is_matched_to_difficult_box
-                  & ~is_matched_to_group_of_box], tp_fp_labels[
-                      ~is_matched_to_difficult_box & ~is_matched_to_group_of_box]
+          scores_group_of[gt_id] = max(scores_group_of[gt_id], scores[i])
+      selector = np.where((scores_group_of > 0) & (tp_fp_labels_group_of > 0))
+      scores_group_of = scores_group_of[selector]
+      tp_fp_labels_group_of = tp_fp_labels_group_of[selector]
+    return np.concatenate(
+        (scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box],
+         scores_group_of)), np.concatenate(
+             (tp_fp_labels[~is_matched_to_difficult_box
+                           & ~is_matched_to_group_of_box].astype(float),
+              tp_fp_labels_group_of))

  def _get_ith_class_arrays(self, detected_boxes, detected_scores,
                            detected_masks, detected_class_labels,
...
...
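Before the tests, a minimal usage sketch of the weighted matching, with invented boxes; `_compute_tp_fp_for_single_class` is the private helper the weighted tests below exercise directly, so this mirrors their calling convention rather than a public API:

import numpy as np
from object_detection.utils import per_image_evaluation

evaluator = per_image_evaluation.PerImageEvaluation(
    1,     # num_groundtruth_classes
    0.5,   # matching_iou_threshold
    1.0,   # nms_iou_threshold (1.0 keeps all boxes here)
    100,   # nms_max_output_boxes
    0.5)   # group_of_weight

detected_boxes = np.array([[0, 0, 1, 1], [0, 0, 3, 1]], dtype=float)
detected_scores = np.array([0.8, 0.5], dtype=float)
groundtruth_boxes = np.array([[0, 0, 1, 1], [0, 0, 5, 5]], dtype=float)
is_difficult = np.array([False, False], dtype=bool)
is_group_of = np.array([False, True], dtype=bool)  # second box is group-of

scores, tp_fp_labels = evaluator._compute_tp_fp_for_single_class(
    detected_boxes, detected_scores, groundtruth_boxes,
    is_difficult, is_group_of)
print(scores)        # [0.8 0.5]
print(tp_fp_labels)  # [1.  0.5] -- the group-of match earns group_of_weight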
research/object_detection/utils/per_image_evaluation_test.py
...
...
@@ -173,6 +173,7 @@ class SingleClassTpFpWithGroupOfBoxesTest(tf.test.TestCase):
        self.detected_boxes, self.detected_scores, self.groundtruth_boxes,
        groundtruth_groundtruth_is_difficult_list,
        groundtruth_groundtruth_is_group_of_list)

    self.assertTrue(np.allclose(expected_scores, scores))
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))
...
...
@@ -191,6 +192,7 @@ class SingleClassTpFpWithGroupOfBoxesTest(tf.test.TestCase):
        groundtruth_groundtruth_is_group_of_list,
        detected_masks=self.detected_masks,
        groundtruth_masks=self.groundtruth_masks)

    self.assertTrue(np.allclose(expected_scores, scores))
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))
...
...
@@ -227,6 +229,122 @@ class SingleClassTpFpWithGroupOfBoxesTest(tf.test.TestCase):
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))


class SingleClassTpFpWithGroupOfBoxesTestWeighted(tf.test.TestCase):

  def setUp(self):
    num_groundtruth_classes = 1
    matching_iou_threshold = 0.5
    nms_iou_threshold = 1.0
    nms_max_output_boxes = 10000
    self.group_of_weight = 0.5
    self.eval = per_image_evaluation.PerImageEvaluation(
        num_groundtruth_classes, matching_iou_threshold, nms_iou_threshold,
        nms_max_output_boxes, self.group_of_weight)

    self.detected_boxes = np.array(
        [[0, 0, 1, 1], [0, 0, 2, 1], [0, 0, 3, 1]], dtype=float)
    self.detected_scores = np.array([0.8, 0.6, 0.5], dtype=float)
    detected_masks_0 = np.array(
        [[0, 1, 1, 0], [0, 0, 1, 0], [0, 0, 0, 0]], dtype=np.uint8)
    detected_masks_1 = np.array(
        [[1, 0, 0, 0], [1, 1, 0, 0], [0, 0, 0, 0]], dtype=np.uint8)
    detected_masks_2 = np.array(
        [[0, 0, 0, 0], [0, 1, 1, 0], [0, 1, 0, 0]], dtype=np.uint8)
    self.detected_masks = np.stack(
        [detected_masks_0, detected_masks_1, detected_masks_2], axis=0)

    self.groundtruth_boxes = np.array(
        [[0, 0, 1, 1], [0, 0, 5, 5], [10, 10, 20, 20]], dtype=float)
    groundtruth_masks_0 = np.array(
        [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.uint8)
    groundtruth_masks_1 = np.array(
        [[0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0]], dtype=np.uint8)
    groundtruth_masks_2 = np.array(
        [[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]], dtype=np.uint8)
    self.groundtruth_masks = np.stack(
        [groundtruth_masks_0, groundtruth_masks_1, groundtruth_masks_2],
        axis=0)

  def test_match_to_non_group_of_and_group_of_box(self):
    groundtruth_groundtruth_is_difficult_list = np.array(
        [False, False, False], dtype=bool)
    groundtruth_groundtruth_is_group_of_list = np.array(
        [False, True, True], dtype=bool)
    expected_scores = np.array([0.8, 0.6], dtype=float)
    expected_tp_fp_labels = np.array([1.0, self.group_of_weight], dtype=float)
    scores, tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
        self.detected_boxes, self.detected_scores, self.groundtruth_boxes,
        groundtruth_groundtruth_is_difficult_list,
        groundtruth_groundtruth_is_group_of_list)

    self.assertTrue(np.allclose(expected_scores, scores))
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))

  def test_mask_match_to_non_group_of_and_group_of_box(self):
    groundtruth_groundtruth_is_difficult_list = np.array(
        [False, False, False], dtype=bool)
    groundtruth_groundtruth_is_group_of_list = np.array(
        [False, True, True], dtype=bool)
    expected_scores = np.array([0.6, 0.8, 0.5], dtype=float)
    expected_tp_fp_labels = np.array(
        [1.0, self.group_of_weight, self.group_of_weight], dtype=float)
    scores, tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
        self.detected_boxes, self.detected_scores, self.groundtruth_boxes,
        groundtruth_groundtruth_is_difficult_list,
        groundtruth_groundtruth_is_group_of_list,
        detected_masks=self.detected_masks,
        groundtruth_masks=self.groundtruth_masks)
    tf.logging.info(
        "test_mask_match_to_non_group_of_and_group_of_box {} {}".format(
            tp_fp_labels, expected_tp_fp_labels))
    self.assertTrue(np.allclose(expected_scores, scores))
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))

  def test_match_two_to_group_of_box(self):
    groundtruth_groundtruth_is_difficult_list = np.array(
        [False, False, False], dtype=bool)
    groundtruth_groundtruth_is_group_of_list = np.array(
        [True, False, True], dtype=bool)
    expected_scores = np.array([0.5, 0.8], dtype=float)
    expected_tp_fp_labels = np.array([0.0, self.group_of_weight], dtype=float)
    scores, tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
        self.detected_boxes, self.detected_scores, self.groundtruth_boxes,
        groundtruth_groundtruth_is_difficult_list,
        groundtruth_groundtruth_is_group_of_list)
    tf.logging.info("test_match_two_to_group_of_box {} {}".format(
        tp_fp_labels, expected_tp_fp_labels))
    self.assertTrue(np.allclose(expected_scores, scores))
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))

  def test_mask_match_two_to_group_of_box(self):
    groundtruth_groundtruth_is_difficult_list = np.array(
        [False, False, False], dtype=bool)
    groundtruth_groundtruth_is_group_of_list = np.array(
        [True, False, True], dtype=bool)
    expected_scores = np.array([0.8, 0.6, 0.5], dtype=float)
    expected_tp_fp_labels = np.array(
        [1.0, self.group_of_weight, self.group_of_weight], dtype=float)
    scores, tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
        self.detected_boxes, self.detected_scores, self.groundtruth_boxes,
        groundtruth_groundtruth_is_difficult_list,
        groundtruth_groundtruth_is_group_of_list,
        detected_masks=self.detected_masks,
        groundtruth_masks=self.groundtruth_masks)
    tf.logging.info("test_mask_match_two_to_group_of_box {} {}".format(
        tp_fp_labels, expected_tp_fp_labels))
    self.assertTrue(np.allclose(expected_scores, scores))
    self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))


class SingleClassTpFpNoDifficultBoxesTest(tf.test.TestCase):

  def setUp(self):
...
...
@@ -439,5 +557,5 @@ class CorLocTest(tf.test.TestCase):
            is_class_correctly_detected_in_image))


-if __name__ == '__main__':
+if __name__ == "__main__":
  tf.test.main()
research/object_detection/utils/per_image_vrd_evaluation.py
0 → 100644
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Evaluates Visual Relations Detection(VRD) result evaluation on an image.
Annotate each VRD result as true positives or false positive according to
a predefined IOU ratio. Multi-class detection is supported by default.
Based on the settings, per image evaluation is performed either on phrase
detection subtask or on relation detection subtask.
"""
import numpy as np

from object_detection.utils import np_box_list
from object_detection.utils import np_box_list_ops


class PerImageVRDEvaluation(object):
  """Evaluates the VRD result of a single image."""
  def __init__(self, matching_iou_threshold=0.5):
    """Initializes PerImageVRDEvaluation with evaluation parameters.

    Args:
      matching_iou_threshold: A ratio of area intersection to union, which is
        the threshold to consider whether a detection is true positive or not,
        used in the phrase detection subtask.
    """
    self.matching_iou_threshold = matching_iou_threshold
  def compute_detection_tp_fp(self, detected_box_tuples, detected_scores,
                              detected_class_tuples, groundtruth_box_tuples,
                              groundtruth_class_tuples):
    """Evaluates VRD detections on a single image as tp/fp.

    Args:
      detected_box_tuples: A numpy array of structures with shape [N,],
        representing N tuples, each tuple containing the same number of named
        bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].
      detected_scores: A float numpy array of shape [N,], representing
        the confidence scores of the detected N object instances.
      detected_class_tuples: A numpy array of structures of shape [N,],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
      groundtruth_box_tuples: A float numpy array of structures with the shape
        [M,], representing M tuples, each tuple containing the same number
        of named bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].
      groundtruth_class_tuples: A numpy array of structures of shape [M,],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.

    Returns:
      scores: A single numpy array with shape [N,], representing N scores
        detected with object class, sorted in descending order.
      tp_fp_labels: A single boolean numpy array of shape [N,], representing N
        True/False positive labels, one label per tuple. The labels are sorted
        so that the order of the labels matches the order of the scores.
    """
    scores, tp_fp_labels = self._compute_tp_fp(
        detected_box_tuples=detected_box_tuples,
        detected_scores=detected_scores,
        detected_class_tuples=detected_class_tuples,
        groundtruth_box_tuples=groundtruth_box_tuples,
        groundtruth_class_tuples=groundtruth_class_tuples)

    return scores, tp_fp_labels
  def _compute_tp_fp(self, detected_box_tuples, detected_scores,
                     detected_class_tuples, groundtruth_box_tuples,
                     groundtruth_class_tuples):
    """Labels detection tuples as true/false positives across all classes.

    Args:
      detected_box_tuples: A numpy array of structures with shape [N,],
        representing N tuples, each tuple containing the same number of named
        bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].
      detected_scores: A float numpy array of shape [N,], representing
        the confidence scores of the detected N object instances.
      detected_class_tuples: A numpy array of structures of shape [N,],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
      groundtruth_box_tuples: A float numpy array of structures with the shape
        [M,], representing M tuples, each tuple containing the same number
        of named bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].
      groundtruth_class_tuples: A numpy array of structures of shape [M,],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.

    Returns:
      scores: A single numpy array with shape [N,], representing N scores
        detected with object class, sorted in descending order.
      tp_fp_labels: A single boolean numpy array of shape [N,], representing N
        True/False positive labels, one label per tuple. The labels are sorted
        so that the order of the labels matches the order of the scores.
    """
    unique_gt_tuples = np.unique(
        np.concatenate((groundtruth_class_tuples, detected_class_tuples)))
    result_scores = []
    result_tp_fp_labels = []
    for unique_tuple in unique_gt_tuples:
      detections_selector = (detected_class_tuples == unique_tuple)
      gt_selector = (groundtruth_class_tuples == unique_tuple)
      scores, tp_fp_labels = self._compute_tp_fp_for_single_class(
          detected_box_tuples=detected_box_tuples[detections_selector],
          detected_scores=detected_scores[detections_selector],
          groundtruth_box_tuples=groundtruth_box_tuples[gt_selector])
      result_scores.append(scores)
      result_tp_fp_labels.append(tp_fp_labels)
    result_scores = np.concatenate(result_scores)
    result_tp_fp_labels = np.concatenate(result_tp_fp_labels)
    sorted_indices = np.argsort(result_scores)
    sorted_indices = sorted_indices[::-1]
    return result_scores[sorted_indices], result_tp_fp_labels[sorted_indices]
  def _get_overlaps_and_scores_relation_tuples(self, detected_box_tuples,
                                               detected_scores,
                                               groundtruth_box_tuples):
    """Computes overlaps and scores between detected and groundtruth tuples.

    Both detections and groundtruth boxes have the same class tuples.

    Args:
      detected_box_tuples: A numpy array of structures with shape [N,],
        representing N tuples, each tuple containing the same number of named
        bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].
      detected_scores: A float numpy array of shape [N,], representing
        the confidence scores of the detected N object instances.
      groundtruth_box_tuples: A float numpy array of structures with the shape
        [M,], representing M tuples, each tuple containing the same number
        of named bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].

    Returns:
      result_iou: A float numpy array of size
        [num_detected_tuples, num_gt_box_tuples].
      scores: The score of the detected boxlist.
    """
    result_iou = np.ones(
        (detected_box_tuples.shape[0], groundtruth_box_tuples.shape[0]),
        dtype=float)
    for field in detected_box_tuples.dtype.fields:
      detected_boxlist_field = np_box_list.BoxList(detected_box_tuples[field])
      detected_boxlist_field.add_field('scores', detected_scores)
      detected_boxlist_field = np_box_list_ops.sort_by_field(
          detected_boxlist_field, 'scores')
      gt_boxlist_field = np_box_list.BoxList(groundtruth_box_tuples[field])
      iou_field = np_box_list_ops.iou(detected_boxlist_field,
                                      gt_boxlist_field)
      result_iou = np.minimum(iou_field, result_iou)
    scores = detected_boxlist_field.get_field('scores')
    return result_iou, scores
  def _compute_tp_fp_for_single_class(self, detected_box_tuples,
                                      detected_scores,
                                      groundtruth_box_tuples):
    """Labels boxes detected with the same class from the same image as tp/fp.

    Args:
      detected_box_tuples: A numpy array of structures with shape [N,],
        representing N tuples, each tuple containing the same number of named
        bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].
      detected_scores: A float numpy array of shape [N,], representing
        the confidence scores of the detected N object instances.
      groundtruth_box_tuples: A float numpy array of structures with the shape
        [M,], representing M tuples, each tuple containing the same number
        of named bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].

    Returns:
      Two arrays of the same size, containing true/false for N boxes that were
      evaluated as being true positives or false positives:
      scores: A numpy array representing the detection scores.
      tp_fp_labels: A boolean numpy array indicating whether a detection is a
        true positive.
    """
    if detected_box_tuples.size == 0:
      return np.array([], dtype=float), np.array([], dtype=bool)

    min_iou, scores = self._get_overlaps_and_scores_relation_tuples(
        detected_box_tuples=detected_box_tuples,
        detected_scores=detected_scores,
        groundtruth_box_tuples=groundtruth_box_tuples)

    num_detected_tuples = detected_box_tuples.shape[0]
    tp_fp_labels = np.zeros(num_detected_tuples, dtype=bool)
    if min_iou.shape[1] > 0:
      max_overlap_gt_ids = np.argmax(min_iou, axis=1)
      is_gt_tuple_detected = np.zeros(min_iou.shape[1], dtype=bool)
      for i in range(num_detected_tuples):
        gt_id = max_overlap_gt_ids[i]
        if min_iou[i, gt_id] >= self.matching_iou_threshold:
          if not is_gt_tuple_detected[gt_id]:
            tp_fp_labels[i] = True
            is_gt_tuple_detected[gt_id] = True

    return scores, tp_fp_labels
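A minimal sketch of feeding this evaluator, under the assumption that the box and class tuples are numpy structured arrays whose field names (here 'subject' and 'object', both invented) are arbitrary; the loop over `dtype.fields` above takes the per-field minimum IoU, so a detection counts only if every named box overlaps its groundtruth counterpart at once:

import numpy as np
from object_detection.utils import per_image_vrd_evaluation

# Hypothetical structured dtypes; any named boxes with the same fields work.
box_tuple_dtype = np.dtype([('subject', 'f4', (4,)), ('object', 'f4', (4,))])
label_tuple_dtype = np.dtype([('subject', 'i4'), ('object', 'i4')])

detected_box_tuples = np.array(
    [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=box_tuple_dtype)
detected_scores = np.array([0.8], dtype=float)
detected_class_tuples = np.array([(1, 2)], dtype=label_tuple_dtype)
groundtruth_box_tuples = np.array(
    [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=box_tuple_dtype)
groundtruth_class_tuples = np.array([(1, 2)], dtype=label_tuple_dtype)

evaluator = per_image_vrd_evaluation.PerImageVRDEvaluation(
    matching_iou_threshold=0.5)
scores, tp_fp_labels = evaluator.compute_detection_tp_fp(
    detected_box_tuples, detected_scores, detected_class_tuples,
    groundtruth_box_tuples, groundtruth_class_tuples)
print(scores, tp_fp_labels)  # [0.8] [ True] -- both named boxes overlap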