ModelZoo / ResNet50_tensorflow / Commits / 97760186

Unverified commit 97760186, authored Jun 05, 2018 by Jonathan Huang, committed by GitHub on Jun 05, 2018.

Merge pull request #4460 from pkulzc/master
Release evaluation code for OI Challenge 2018 and minor fixes.

Parents: ed901b73, a703fc0c
Showing 19 changed files with 1234 additions and 118 deletions (+1234 −118).
research/object_detection/models/ssd_mobilenet_v2_feature_extractor.py  +12 −18
research/object_detection/models/ssd_mobilenet_v2_feature_extractor_test.py  +3 −3
research/object_detection/protos/image_resizer.proto  +4 −0
research/object_detection/protos/input_reader.proto  +4 −0
research/object_detection/trainer.py  +13 −2
research/object_detection/utils/config_util.py  +87 −20
research/object_detection/utils/config_util_test.py  +46 −0
research/object_detection/utils/dataset_util.py  +12 −6
research/object_detection/utils/dataset_util_test.py  +45 −0
research/object_detection/utils/ops.py  +33 −19
research/object_detection/utils/ops_test.py  +10 −0
research/object_detection/utils/per_image_vrd_evaluation.py  +32 −31
research/object_detection/utils/per_image_vrd_evaluation_test.py  +9 −13
research/object_detection/utils/test_utils.py  +33 −0
research/object_detection/utils/test_utils_test.py  +16 −0
research/object_detection/utils/visualization_utils.py  +19 −6
research/object_detection/utils/visualization_utils_test.py  +29 −0
research/object_detection/utils/vrd_evaluation.py  +572 −0
research/object_detection/utils/vrd_evaluation_test.py  +255 −0
research/object_detection/models/ssd_mobilenet_v2_feature_extractor.py

@@ -112,24 +112,18 @@ class SSDMobileNetV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       with (slim.arg_scope(self._conv_hyperparams_fn())
             if self._override_base_feature_extractor_hyperparams
             else context_manager.IdentityContextManager()):
-        # TODO(b/68150321): Enable fused batch norm once quantization
-        # supports it.
-        with slim.arg_scope([slim.batch_norm], fused=False):
-          _, image_features = mobilenet_v2.mobilenet_base(
-              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
-              final_endpoint='layer_19',
-              depth_multiplier=self._depth_multiplier,
-              use_explicit_padding=self._use_explicit_padding,
-              scope=scope)
+        _, image_features = mobilenet_v2.mobilenet_base(
+            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
+            final_endpoint='layer_19',
+            depth_multiplier=self._depth_multiplier,
+            use_explicit_padding=self._use_explicit_padding,
+            scope=scope)
       with slim.arg_scope(self._conv_hyperparams_fn()):
-        # TODO(b/68150321): Enable fused batch norm once quantization
-        # supports it.
-        with slim.arg_scope([slim.batch_norm], fused=False):
-          feature_maps = feature_map_generators.multi_resolution_feature_maps(
-              feature_map_layout=feature_map_layout,
-              depth_multiplier=self._depth_multiplier,
-              min_depth=self._min_depth,
-              insert_1x1_conv=True,
-              image_features=image_features)
+        feature_maps = feature_map_generators.multi_resolution_feature_maps(
+            feature_map_layout=feature_map_layout,
+            depth_multiplier=self._depth_multiplier,
+            min_depth=self._min_depth,
+            insert_1x1_conv=True,
+            image_features=image_features)
     return feature_maps.values()
research/object_detection/models/ssd_mobilenet_v2_feature_extractor_test.py

@@ -135,7 +135,7 @@ class SsdMobilenetV2FeatureExtractorTest(
     self.check_feature_extractor_variables_under_scope(
         depth_multiplier, pad_to_multiple, scope_name)

-  def test_nofused_batchnorm(self):
+  def test_has_fused_batchnorm(self):
     image_height = 40
     image_width = 40
     depth_multiplier = 1
@@ -146,8 +146,8 @@ class SsdMobilenetV2FeatureExtractorTest(
         pad_to_multiple)
     preprocessed_image = feature_extractor.preprocess(image_placeholder)
     _ = feature_extractor.extract_features(preprocessed_image)
-    self.assertFalse(any(op.type == 'FusedBatchNorm'
-                         for op in tf.get_default_graph().get_operations()))
+    self.assertTrue(any(op.type == 'FusedBatchNorm'
+                        for op in tf.get_default_graph().get_operations()))

 if __name__ == '__main__':
research/object_detection/protos/image_resizer.proto

@@ -37,6 +37,10 @@ message KeepAspectRatioResizer {
   // Whether to also resize the image channels from 3 to 1 (RGB to grayscale).
   optional bool convert_to_grayscale = 5 [default = false];
+
+  // Per-channel pad value. This is only used when pad_to_max_dimension is True.
+  // If unspecified, a default pad value of 0 is applied to all channels.
+  repeated float per_channel_pad_value = 6;
 }

 // Configuration proto for image resizer that resizes to a fixed shape.
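The new field can be exercised from Python through the generated proto bindings. A minimal sketch, assuming ImageNet-style channel means as pad values (the numbers are illustrative, not from this commit):

    from object_detection.protos import image_resizer_pb2

    resizer = image_resizer_pb2.KeepAspectRatioResizer()
    resizer.pad_to_max_dimension = True
    # Hypothetical per-channel pad values; all channels default to 0
    # when this repeated field is left unset.
    resizer.per_channel_pad_value.extend([123.7, 116.8, 103.9])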
research/object_detection/protos/input_reader.proto

@@ -69,6 +69,10 @@ message InputReader {
   // Type of instance mask.
   optional InstanceMaskType mask_type = 10 [default = NUMERICAL_MASKS];

+  // Whether to use the display name when decoding examples. This is only used
+  // when mapping class text strings to integers.
+  optional bool use_display_name = 17 [default = false];
+
   oneof input_reader {
     TFRecordInputReader tf_record_input_reader = 8;
     ExternalInputReader external_input_reader = 9;
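Analogously, a hedged sketch of toggling the new InputReader flag from Python (field number and default as in the hunk above):

    from object_detection.protos import input_reader_pb2

    reader_config = input_reader_pb2.InputReader()
    # Map label-map display_name strings to integers when decoding examples.
    reader_config.use_display_name = True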
research/object_detection/trainer.py

@@ -235,6 +235,9 @@ def train(create_tensor_dict_fn,
       built (before optimization). This is helpful to perform additional changes
       to the training graph such as adding FakeQuant ops. The function should
       modify the default graph.
+
+  Raises:
+    ValueError: If both num_clones > 1 and train_config.sync_replicas is true.
   """

   detection_model = create_model_fn()
@@ -256,9 +259,16 @@ def train(create_tensor_dict_fn,
     with tf.device(deploy_config.variables_device()):
       global_step = slim.create_global_step()

+    if num_clones != 1 and train_config.sync_replicas:
+      raise ValueError('In Synchronous SGD mode num_clones must ',
+                       'be 1. Found num_clones: {}'.format(num_clones))
+    batch_size = train_config.batch_size // num_clones
+    if train_config.sync_replicas:
+      batch_size //= train_config.replicas_to_aggregate
     with tf.device(deploy_config.inputs_device()):
-      input_queue = create_input_queue(train_config.batch_size // num_clones,
-                                       create_tensor_dict_fn,
+      input_queue = create_input_queue(
+          batch_size, create_tensor_dict_fn,
           train_config.batch_queue_capacity,
           train_config.num_batch_queue_threads,
           train_config.prefetch_queue_capacity,
           data_augmentation_options)
@@ -377,7 +387,8 @@ def train(create_tensor_dict_fn,
             train_config.load_all_detection_checkpoint_vars))
       available_var_map = (
           variables_helper.get_variables_available_in_checkpoint(
-              var_map, train_config.fine_tune_checkpoint))
+              var_map, train_config.fine_tune_checkpoint,
+              include_global_step=False))
       init_saver = tf.train.Saver(available_var_map)
       def initializer_fn(sess):
         init_saver.restore(sess, train_config.fine_tune_checkpoint)
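The new queue sizing divides the configured batch size by the number of clones and, under synchronous SGD, by replicas_to_aggregate. A standalone restatement of that arithmetic with made-up numbers:

    def per_clone_batch_size(configured, num_clones, sync_replicas,
                             replicas_to_aggregate):
      # Mirrors the logic added to train() above; num_clones must be 1
      # when sync_replicas is enabled, or train() raises ValueError.
      batch_size = configured // num_clones
      if sync_replicas:
        batch_size //= replicas_to_aggregate
      return batch_size

    # 64 images per step, synchronous SGD over 4 aggregated replicas:
    assert per_clone_batch_size(64, 1, True, 4) == 16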
research/object_detection/utils/config_util.py

@@ -278,6 +278,19 @@ def get_learning_rate_type(optimizer_config):
   return optimizer_config.learning_rate.WhichOneof("learning_rate")

+def _is_generic_key(key):
+  """Determines whether the key starts with a generic config dictionary key."""
+  for prefix in [
+      "graph_rewriter_config",
+      "model",
+      "train_input_config",
+      "train_input_config",
+      "train_config"]:
+    if key.startswith(prefix + "."):
+      return True
+  return False
+
 def merge_external_params_with_configs(configs, hparams=None, **kwargs):
   """Updates `configs` dictionary based on supplied parameters.
@@ -287,6 +300,16 @@ def merge_external_params_with_configs(configs, hparams=None, **kwargs):
   experiment, one can use a single base config file, and update particular
   values.

+  There are two types of field overrides:
+  1. Strategy-based overrides, which update multiple relevant configuration
+  options. For example, updating `learning_rate` will update both the warmup and
+  final learning rates.
+  2. Generic key/value, which update a specific parameter based on namespaced
+  configuration keys. For example,
+  `model.ssd.loss.hard_example_miner.max_negatives_per_positive` will update the
+  hard example miner configuration for an SSD model config. Generic overrides
+  are automatically detected based on the namespaced keys.
+
   Args:
     configs: Dictionary of configuration objects. See outputs from
       get_configs_from_pipeline_file() or get_configs_from_multiple_files().
@@ -302,44 +325,42 @@ def merge_external_params_with_configs(configs, hparams=None, **kwargs):
   if hparams:
     kwargs.update(hparams.values())
   for key, value in kwargs.items():
+    tf.logging.info("Maybe overwriting %s: %s", key, value)
     # pylint: disable=g-explicit-bool-comparison
     if value == "" or value is None:
       continue
     # pylint: enable=g-explicit-bool-comparison
     if key == "learning_rate":
       _update_initial_learning_rate(configs, value)
       tf.logging.info("Overwriting learning rate: %f", value)
-    if key == "batch_size":
+    elif key == "batch_size":
       _update_batch_size(configs, value)
       tf.logging.info("Overwriting batch size: %d", value)
-    if key == "momentum_optimizer_value":
+    elif key == "momentum_optimizer_value":
       _update_momentum_optimizer_value(configs, value)
       tf.logging.info("Overwriting momentum optimizer value: %f", value)
-    if key == "classification_localization_weight_ratio":
+    elif key == "classification_localization_weight_ratio":
       # Localization weight is fixed to 1.0.
       _update_classification_localization_weight_ratio(configs, value)
-    if key == "focal_loss_gamma":
+    elif key == "focal_loss_gamma":
       _update_focal_loss_gamma(configs, value)
-    if key == "focal_loss_alpha":
+    elif key == "focal_loss_alpha":
       _update_focal_loss_alpha(configs, value)
-    if key == "train_steps":
+    elif key == "train_steps":
       _update_train_steps(configs, value)
       tf.logging.info("Overwriting train steps: %d", value)
-    if key == "eval_steps":
+    elif key == "eval_steps":
       _update_eval_steps(configs, value)
       tf.logging.info("Overwriting eval steps: %d", value)
-    if key == "train_input_path":
+    elif key == "train_input_path":
       _update_input_path(configs["train_input_config"], value)
       tf.logging.info("Overwriting train input path: %s", value)
-    if key == "eval_input_path":
+    elif key == "eval_input_path":
       _update_input_path(configs["eval_input_config"], value)
       tf.logging.info("Overwriting eval input path: %s", value)
-    if key == "label_map_path":
+    elif key == "label_map_path":
       _update_label_map_path(configs, value)
       tf.logging.info("Overwriting label map path: %s", value)
-    if key == "mask_type":
+    elif key == "mask_type":
       _update_mask_type(configs, value)
       tf.logging.info("Overwritten mask type: %s", value)
+    elif key == "eval_with_moving_averages":
+      _update_use_moving_averages(configs, value)
+    elif _is_generic_key(key):
+      _update_generic(configs, key, value)
+    else:
+      tf.logging.info("Ignoring config override key: %s", key)
   return configs
@@ -411,6 +432,38 @@ def _update_batch_size(configs, batch_size):
   configs["train_config"].batch_size = max(1, int(round(batch_size)))

+def _validate_message_has_field(message, field):
+  if not message.HasField(field):
+    raise ValueError("Expecting message to have field %s" % field)
+
+
+def _update_generic(configs, key, value):
+  """Update a pipeline configuration parameter based on a generic key/value.
+
+  Args:
+    configs: Dictionary of pipeline configuration protos.
+    key: A string key, dot-delimited to represent the argument key.
+      e.g. "model.ssd.train_config.batch_size"
+    value: A value to set the argument to. The type of the value must match the
+      type for the protocol buffer. Note that setting the wrong type will
+      result in a TypeError.
+      e.g. 42
+
+  Raises:
+    ValueError if the message key does not match the existing proto fields.
+    TypeError the value type doesn't match the protobuf field type.
+  """
+  fields = key.split(".")
+  first_field = fields.pop(0)
+  last_field = fields.pop()
+  message = configs[first_field]
+  for field in fields:
+    _validate_message_has_field(message, field)
+    message = getattr(message, field)
+  _validate_message_has_field(message, last_field)
+  setattr(message, last_field, value)
+
+
 def _update_momentum_optimizer_value(configs, momentum):
   """Updates `configs` to reflect the new momentum value.
@@ -587,3 +640,17 @@ def _update_mask_type(configs, mask_type):
   """
   configs["train_input_config"].mask_type = mask_type
   configs["eval_input_config"].mask_type = mask_type
+
+
+def _update_use_moving_averages(configs, use_moving_averages):
+  """Updates the eval config option to use or not use moving averages.
+
+  The configs dictionary is updated in place, and hence not returned.
+
+  Args:
+    configs: Dictionary of configuration objects. See outputs from
+      get_configs_from_pipeline_file() or get_configs_from_multiple_files().
+    use_moving_averages: Boolean indicating whether moving average variables
+      should be loaded during evaluation.
+  """
+  configs["eval_config"].use_moving_averages = use_moving_averages
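A sketch of the new generic override path, assuming a pipeline config has already been loaded; the dotted key below is illustrative:

    from object_detection.utils import config_util

    configs = config_util.get_configs_from_pipeline_file('pipeline.config')
    # 'train_config.batch_size' matches a generic prefix, so it is routed
    # through _update_generic; 'learning_rate' keeps its strategy handler.
    configs = config_util.merge_external_params_with_configs(
        configs,
        learning_rate=0.003,
        **{'train_config.batch_size': 8})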
research/object_detection/utils/config_util_test.py

@@ -69,6 +69,11 @@ def _update_optimizer_with_cosine_decay_learning_rate(
 class ConfigUtilTest(tf.test.TestCase):

+  def _create_and_load_test_configs(self, pipeline_config):
+    pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
+    _write_config(pipeline_config, pipeline_config_path)
+    return config_util.get_configs_from_pipeline_file(pipeline_config_path)
+
   def test_get_configs_from_pipeline_file(self):
     """Test that proto configs can be read from pipeline config file."""
     pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
@@ -307,6 +312,34 @@ class ConfigUtilTest(tf.test.TestCase):
     new_batch_size = configs["train_config"].batch_size
     self.assertEqual(1, new_batch_size)  # Clipped to 1.0.

+  def testOverwriteBatchSizeWithKeyValue(self):
+    """Tests that batch size is overwritten based on key/value."""
+    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+    pipeline_config.train_config.batch_size = 2
+    configs = self._create_and_load_test_configs(pipeline_config)
+    hparams = tf.contrib.training.HParams(**{"train_config.batch_size": 10})
+    configs = config_util.merge_external_params_with_configs(configs, hparams)
+    new_batch_size = configs["train_config"].batch_size
+    self.assertEqual(10, new_batch_size)
+
+  def testKeyValueOverrideBadKey(self):
+    """Tests that overwriting with a bad key causes an exception."""
+    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+    configs = self._create_and_load_test_configs(pipeline_config)
+    hparams = tf.contrib.training.HParams(**{"train_config.no_such_field": 10})
+    with self.assertRaises(ValueError):
+      config_util.merge_external_params_with_configs(configs, hparams)
+
+  def testOverwriteBatchSizeWithBadValueType(self):
+    """Tests that overwriting with a bad value type causes an exception."""
+    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+    pipeline_config.train_config.batch_size = 2
+    configs = self._create_and_load_test_configs(pipeline_config)
+    # Type should be an integer, but we're passing a string "10".
+    hparams = tf.contrib.training.HParams(**{"train_config.batch_size": "10"})
+    with self.assertRaises(TypeError):
+      config_util.merge_external_params_with_configs(configs, hparams)
+
   def testNewMomentumOptimizerValue(self):
     """Tests that new momentum value is updated appropriately."""
     original_momentum_value = 0.4
@@ -501,6 +534,19 @@ class ConfigUtilTest(tf.test.TestCase):
     self.assertEqual(new_mask_type, configs["train_input_config"].mask_type)
     self.assertEqual(new_mask_type, configs["eval_input_config"].mask_type)

+  def testUseMovingAverageForEval(self):
+    use_moving_averages_orig = False
+    pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
+    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+    pipeline_config.eval_config.use_moving_averages = use_moving_averages_orig
+    _write_config(pipeline_config, pipeline_config_path)
+
+    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
+    configs = config_util.merge_external_params_with_configs(
+        configs, eval_with_moving_averages=True)
+    self.assertEqual(True, configs["eval_config"].use_moving_averages)
+
   def test_get_image_resizer_config(self):
     """Tests that number of classes can be retrieved."""
     model_config = model_pb2.DetectionModel()
research/object_detection/utils/dataset_util.py

@@ -117,13 +117,17 @@ def read_dataset(file_read_func, decode_func, input_files, config):
     A tf.data.Dataset based on config.
   """
   # Shard, shuffle, and read files.
-  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
-                        0)
-  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
+  filenames = tf.gfile.Glob(input_files)
+  num_readers = config.num_readers
+  if num_readers > len(filenames):
+    num_readers = len(filenames)
+    tf.logging.warning('num_readers has been reduced to %d to match input file '
+                       'shards.' % num_readers)
+  filename_dataset = tf.data.Dataset.from_tensor_slices(tf.unstack(filenames))
   if config.shuffle:
     filename_dataset = filename_dataset.shuffle(
         config.filenames_shuffle_buffer_size)
-  elif config.num_readers > 1:
+  elif num_readers > 1:
     tf.logging.warning('`shuffle` is false, but the input data stream is '
                        'still slightly shuffled since `num_readers` > 1.')
@@ -131,8 +135,10 @@ def read_dataset(file_read_func, decode_func, input_files, config):
   records_dataset = filename_dataset.apply(
       tf.contrib.data.parallel_interleave(
-          file_read_func, cycle_length=config.num_readers,
-          block_length=config.read_block_length, sloppy=config.shuffle))
+          file_read_func,
+          cycle_length=num_readers,
+          block_length=config.read_block_length,
+          sloppy=config.shuffle))
   if config.shuffle:
     records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
   tensor_dataset = records_dataset.map(
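For illustration, the effect of the clamp on a hypothetical shard layout (pattern and counts invented):

    import tensorflow as tf

    filenames = tf.gfile.Glob(['data/train-*.record'])  # e.g. 4 shards on disk
    num_readers = 16                                    # e.g. config.num_readers
    if num_readers > len(filenames):
      # read_dataset now warns and clamps instead of creating idle readers.
      num_readers = len(filenames)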
research/object_detection/utils/dataset_util_test.py

@@ -16,6 +16,7 @@
 """Tests for object_detection.utils.dataset_util."""
 import os
+import numpy as np
 import tensorflow as tf

 from object_detection.protos import input_reader_pb2
@@ -32,6 +33,13 @@ class DatasetUtilTest(tf.test.TestCase):
       with tf.gfile.Open(path, 'wb') as f:
         f.write('\n'.join([str(i + 1), str((i + 1) * 10)]))

+    self._shuffle_path_template = os.path.join(self.get_temp_dir(),
+                                               'shuffle_%s.txt')
+    for i in range(2):
+      path = self._shuffle_path_template % i
+      with tf.gfile.Open(path, 'wb') as f:
+        f.write('\n'.join([str(i)] * 5))
+
   def _get_dataset_next(self, files, config, batch_size):
     def decode_func(value):
       return [tf.string_to_number(value, out_type=tf.int32)]
@@ -78,6 +86,43 @@ class DatasetUtilTest(tf.test.TestCase):
           [[1, 10, 2, 20, 3, 30, 4, 40, 5, 50,
             1, 10, 2, 20, 3, 30, 4, 40, 5, 50]])

+  def test_reduce_num_reader(self):
+    config = input_reader_pb2.InputReader()
+    config.num_readers = 10
+    config.shuffle = False
+
+    data = self._get_dataset_next([self._path_template % '*'], config,
+                                  batch_size=20)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(data),
+                          [[1, 10, 2, 20, 3, 30, 4, 40, 5, 50,
+                            1, 10, 2, 20, 3, 30, 4, 40, 5, 50]])
+
+  def test_enable_shuffle(self):
+    config = input_reader_pb2.InputReader()
+    config.num_readers = 1
+    config.shuffle = True
+
+    data = self._get_dataset_next(
+        [self._shuffle_path_template % '*'], config, batch_size=10)
+    expected_non_shuffle_output = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    with self.test_session() as sess:
+      self.assertTrue(
+          np.any(np.not_equal(sess.run(data), expected_non_shuffle_output)))
+
+  def test_disable_shuffle_(self):
+    config = input_reader_pb2.InputReader()
+    config.num_readers = 1
+    config.shuffle = False
+
+    data = self._get_dataset_next(
+        [self._shuffle_path_template % '*'], config, batch_size=10)
+    expected_non_shuffle_output = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(data), [expected_non_shuffle_output])
+
   def test_read_dataset_single_epoch(self):
     config = input_reader_pb2.InputReader()
     config.num_epochs = 1
research/object_detection/utils/ops.py

@@ -318,8 +318,9 @@ def retain_groundtruth(tensor_dict, valid_indices):
   Args:
     tensor_dict: a dictionary of following groundtruth tensors -
       fields.InputDataFields.groundtruth_boxes
-      fields.InputDataFields.groundtruth_instance_masks
       fields.InputDataFields.groundtruth_classes
+      fields.InputDataFields.groundtruth_keypoints
+      fields.InputDataFields.groundtruth_instance_masks
       fields.InputDataFields.groundtruth_is_crowd
       fields.InputDataFields.groundtruth_area
       fields.InputDataFields.groundtruth_label_types
@@ -347,6 +348,7 @@ def retain_groundtruth(tensor_dict, valid_indices):
     for key in tensor_dict:
       if key in [fields.InputDataFields.groundtruth_boxes,
                  fields.InputDataFields.groundtruth_classes,
+                 fields.InputDataFields.groundtruth_keypoints,
                  fields.InputDataFields.groundtruth_instance_masks]:
         valid_dict[key] = tf.gather(tensor_dict[key], valid_indices)
       # Input decoder returns empty tensor when these fields are not provided.
@@ -374,6 +376,8 @@ def retain_groundtruth_with_positive_classes(tensor_dict):
     tensor_dict: a dictionary of following groundtruth tensors -
       fields.InputDataFields.groundtruth_boxes
       fields.InputDataFields.groundtruth_classes
+      fields.InputDataFields.groundtruth_keypoints
+      fields.InputDataFields.groundtruth_instance_masks
       fields.InputDataFields.groundtruth_is_crowd
       fields.InputDataFields.groundtruth_area
       fields.InputDataFields.groundtruth_label_types
@@ -413,6 +417,8 @@ def filter_groundtruth_with_crowd_boxes(tensor_dict):
     tensor_dict: a dictionary of following groundtruth tensors -
       fields.InputDataFields.groundtruth_boxes
       fields.InputDataFields.groundtruth_classes
+      fields.InputDataFields.groundtruth_keypoints
+      fields.InputDataFields.groundtruth_instance_masks
       fields.InputDataFields.groundtruth_is_crowd
       fields.InputDataFields.groundtruth_area
       fields.InputDataFields.groundtruth_label_types
@@ -435,8 +441,9 @@ def filter_groundtruth_with_nan_box_coordinates(tensor_dict):
   Args:
     tensor_dict: a dictionary of following groundtruth tensors -
       fields.InputDataFields.groundtruth_boxes
-      fields.InputDataFields.groundtruth_instance_masks
       fields.InputDataFields.groundtruth_classes
+      fields.InputDataFields.groundtruth_keypoints
+      fields.InputDataFields.groundtruth_instance_masks
       fields.InputDataFields.groundtruth_is_crowd
       fields.InputDataFields.groundtruth_area
       fields.InputDataFields.groundtruth_label_types
@@ -703,23 +710,30 @@ def reframe_box_masks_to_image_masks(box_masks, boxes, image_height,
     A tf.float32 tensor of size [num_masks, image_height, image_width].
   """
   # TODO(rathodv): Make this a public function.
-  def transform_boxes_relative_to_boxes(boxes, reference_boxes):
-    boxes = tf.reshape(boxes, [-1, 2, 2])
-    min_corner = tf.expand_dims(reference_boxes[:, 0:2], 1)
-    max_corner = tf.expand_dims(reference_boxes[:, 2:4], 1)
-    transformed_boxes = (boxes - min_corner) / (max_corner - min_corner)
-    return tf.reshape(transformed_boxes, [-1, 4])
-
-  box_masks = tf.expand_dims(box_masks, axis=3)
-  num_boxes = tf.shape(box_masks)[0]
-  unit_boxes = tf.concat(
-      [tf.zeros([num_boxes, 2]), tf.ones([num_boxes, 2])], axis=1)
-  reverse_boxes = transform_boxes_relative_to_boxes(unit_boxes, boxes)
-  image_masks = tf.image.crop_and_resize(image=box_masks,
-                                         boxes=reverse_boxes,
-                                         box_ind=tf.range(num_boxes),
-                                         crop_size=[image_height, image_width],
-                                         extrapolation_value=0.0)
+  def reframe_box_masks_to_image_masks_default():
+    """The default function when there are more than 0 box masks."""
+    def transform_boxes_relative_to_boxes(boxes, reference_boxes):
+      boxes = tf.reshape(boxes, [-1, 2, 2])
+      min_corner = tf.expand_dims(reference_boxes[:, 0:2], 1)
+      max_corner = tf.expand_dims(reference_boxes[:, 2:4], 1)
+      transformed_boxes = (boxes - min_corner) / (max_corner - min_corner)
+      return tf.reshape(transformed_boxes, [-1, 4])
+
+    box_masks_expanded = tf.expand_dims(box_masks, axis=3)
+    num_boxes = tf.shape(box_masks_expanded)[0]
+    unit_boxes = tf.concat(
+        [tf.zeros([num_boxes, 2]), tf.ones([num_boxes, 2])], axis=1)
+    reverse_boxes = transform_boxes_relative_to_boxes(unit_boxes, boxes)
+    return tf.image.crop_and_resize(
+        image=box_masks_expanded,
+        boxes=reverse_boxes,
+        box_ind=tf.range(num_boxes),
+        crop_size=[image_height, image_width],
+        extrapolation_value=0.0)
+  image_masks = tf.cond(
+      tf.shape(box_masks)[0] > 0,
+      reframe_box_masks_to_image_masks_default,
+      lambda: tf.zeros([0, image_height, image_width, 1], dtype=tf.float32))
   return tf.squeeze(image_masks, axis=3)
research/object_detection/utils/ops_test.py

@@ -1100,6 +1100,16 @@ class ReframeBoxMasksToImageMasksTest(tf.test.TestCase):
       np_image_masks = sess.run(image_masks)
       self.assertAllClose(np_image_masks, np_expected_image_masks)

+  def testZeroBoxMasks(self):
+    box_masks = tf.zeros([0, 3, 3], dtype=tf.float32)
+    boxes = tf.zeros([0, 4], dtype=tf.float32)
+    image_masks = ops.reframe_box_masks_to_image_masks(box_masks, boxes,
+                                                       image_height=4,
+                                                       image_width=4)
+    with self.test_session() as sess:
+      np_image_masks = sess.run(image_masks)
+      self.assertAllEqual(np_image_masks.shape, np.array([0, 4, 4]))
+
   def testMaskIsCenteredInImageWhenBoxIsCentered(self):
     box_masks = tf.constant([[[1, 1], [1, 1]]], dtype=tf.float32)
research/object_detection/utils/per_image_vrd_evaluation.py

@@ -67,16 +67,18 @@ class PerImageVRDEvaluation(object):
       tp_fp_labels: A single boolean numpy array of shape [N,], representing N
         True/False positive label, one label per tuple. The labels are sorted
         so that the order of the labels matches the order of the scores.
+      result_mapping: A numpy array with shape [N,] with original index of each
+        entry.
     """
-    scores, tp_fp_labels = self._compute_tp_fp(
+    scores, tp_fp_labels, result_mapping = self._compute_tp_fp(
         detected_box_tuples=detected_box_tuples,
         detected_scores=detected_scores,
         detected_class_tuples=detected_class_tuples,
         groundtruth_box_tuples=groundtruth_box_tuples,
         groundtruth_class_tuples=groundtruth_class_tuples)

-    return scores, tp_fp_labels
+    return scores, tp_fp_labels, result_mapping

   def _compute_tp_fp(self, detected_box_tuples, detected_scores,
                      detected_class_tuples, groundtruth_box_tuples,
@@ -107,33 +109,46 @@ class PerImageVRDEvaluation(object):
       tp_fp_labels: A single boolean numpy array of shape [N,], representing N
         True/False positive label, one label per tuple. The labels are sorted
         so that the order of the labels matches the order of the scores.
+      result_mapping: A numpy array with shape [N,] with original index of each
+        entry.
     """
     unique_gt_tuples = np.unique(
         np.concatenate((groundtruth_class_tuples, detected_class_tuples)))
     result_scores = []
     result_tp_fp_labels = []
+    result_mapping = []

     for unique_tuple in unique_gt_tuples:
       detections_selector = (detected_class_tuples == unique_tuple)
       gt_selector = (groundtruth_class_tuples == unique_tuple)

-      scores, tp_fp_labels = self._compute_tp_fp_for_single_class(
-          detected_box_tuples=detected_box_tuples[detections_selector],
-          detected_scores=detected_scores[detections_selector],
+      selector_mapping = np.where(detections_selector)[0]
+
+      detection_scores_per_tuple = detected_scores[detections_selector]
+      detection_box_per_tuple = detected_box_tuples[detections_selector]
+
+      sorted_indices = np.argsort(detection_scores_per_tuple)
+      sorted_indices = sorted_indices[::-1]
+
+      tp_fp_labels = self._compute_tp_fp_for_single_class(
+          detected_box_tuples=detection_box_per_tuple[sorted_indices],
           groundtruth_box_tuples=groundtruth_box_tuples[gt_selector])

-      result_scores.append(scores)
+      result_scores.append(detection_scores_per_tuple[sorted_indices])
       result_tp_fp_labels.append(tp_fp_labels)
+      result_mapping.append(selector_mapping[sorted_indices])

     result_scores = np.concatenate(result_scores)
     result_tp_fp_labels = np.concatenate(result_tp_fp_labels)
+    result_mapping = np.concatenate(result_mapping)

     sorted_indices = np.argsort(result_scores)
     sorted_indices = sorted_indices[::-1]

-    return result_scores[sorted_indices], result_tp_fp_labels[sorted_indices]
+    return result_scores[sorted_indices], result_tp_fp_labels[
+        sorted_indices], result_mapping[sorted_indices]

-  def _get_overlaps_and_scores_relation_tuples(self, detected_box_tuples,
-                                               detected_scores,
-                                               groundtruth_box_tuples):
+  def _get_overlaps_and_scores_relation_tuples(self, detected_box_tuples,
+                                               groundtruth_box_tuples):
     """Computes overlaps and scores between detected and groundtruth tuples.

     Both detections and groundtruth boxes have the same class tuples.
@@ -143,8 +158,6 @@ class PerImageVRDEvaluation(object):
         representing N tuples, each tuple containing the same number of named
         bounding boxes.
         Each box is of the format [y_min, x_min, y_max, x_max]
-      detected_scores: A float numpy array of shape [N,], representing
-        the confidence scores of the detected N object instances.
       groundtruth_box_tuples: A float numpy array of structures with the shape
         [M,], representing M tuples, each tuple containing the same number
         of named bounding boxes.
@@ -153,7 +166,6 @@ class PerImageVRDEvaluation(object):
     Returns:
       result_iou: A float numpy array of size
         [num_detected_tuples, num_gt_box_tuples].
-      scores: The score of the detected boxlist.
     """
     result_iou = np.ones(
@@ -161,46 +173,35 @@ class PerImageVRDEvaluation(object):
         dtype=float)
     for field in detected_box_tuples.dtype.fields:
       detected_boxlist_field = np_box_list.BoxList(detected_box_tuples[field])
-      detected_boxlist_field.add_field('scores', detected_scores)
-      detected_boxlist_field = np_box_list_ops.sort_by_field(
-          detected_boxlist_field, 'scores')
       gt_boxlist_field = np_box_list.BoxList(groundtruth_box_tuples[field])
       iou_field = np_box_list_ops.iou(detected_boxlist_field, gt_boxlist_field)
       result_iou = np.minimum(iou_field, result_iou)
-    scores = detected_boxlist_field.get_field('scores')
-    return result_iou, scores
+    return result_iou

-  def _compute_tp_fp_for_single_class(self, detected_box_tuples,
-                                      detected_scores,
-                                      groundtruth_box_tuples):
+  def _compute_tp_fp_for_single_class(self, detected_box_tuples,
+                                      groundtruth_box_tuples):
     """Labels boxes detected with the same class from the same image as tp/fp.

+    Detection boxes are expected to be already sorted by score.
+
     Args:
       detected_box_tuples: A numpy array of structures with shape [N,],
         representing N tuples, each tuple containing the same number of named
         bounding boxes.
         Each box is of the format [y_min, x_min, y_max, x_max]
-      detected_scores: A float numpy array of shape [N,], representing
-        the confidence scores of the detected N object instances.
       groundtruth_box_tuples: A float numpy array of structures with the shape
         [M,], representing M tuples, each tuple containing the same number
         of named bounding boxes.
         Each box is of the format [y_min, x_min, y_max, x_max]

     Returns:
-      Two arrays of the same size, containing true/false for N boxes that were
-      evaluated as being true positives or false positives;
-      scores: A numpy array representing the detection scores.
       tp_fp_labels: a boolean numpy array indicating whether a detection is a
         true positive.
     """
     if detected_box_tuples.size == 0:
-      return np.array([], dtype=float), np.array([], dtype=bool)
+      return np.array([], dtype=bool)

-    min_iou, scores = self._get_overlaps_and_scores_relation_tuples(
-        detected_box_tuples=detected_box_tuples,
-        detected_scores=detected_scores,
-        groundtruth_box_tuples=groundtruth_box_tuples)
+    min_iou = self._get_overlaps_and_scores_relation_tuples(
+        detected_box_tuples, groundtruth_box_tuples)

     num_detected_tuples = detected_box_tuples.shape[0]
     tp_fp_labels = np.zeros(num_detected_tuples, dtype=bool)
@@ -215,4 +216,4 @@ class PerImageVRDEvaluation(object):
           tp_fp_labels[i] = True
           is_gt_tuple_detected[gt_id] = True

-    return scores, tp_fp_labels
+    return tp_fp_labels
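The extra result_mapping return value records the original index of each score-sorted entry, which lets callers scatter labels back to input order. A small numpy sketch using the values from the updated test below:

    import numpy as np

    scores = np.array([0.8, 0.5, 0.2, 0.1])
    tp_fp_labels = np.array([True, True, False, False])
    result_mapping = np.array([1, 3, 0, 2])

    # Recover per-detection labels in the original detection order.
    labels_in_input_order = np.empty_like(tp_fp_labels)
    labels_in_input_order[result_mapping] = tp_fp_labels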
research/object_detection/utils/per_image_vrd_evaluation_test.py

@@ -28,31 +28,25 @@ class SingleClassPerImageVrdEvaluationTest(tf.test.TestCase):
     box_data_type = np.dtype([('subject', 'f4', (4,)), ('object', 'f4', (4,))])
     self.detected_box_tuples = np.array(
-        [([0, 0, 1, 1], [1, 1, 2, 2]), ([0, 0, 1.1, 1], [1, 1, 2, 2]),
+        [([0, 0, 1.1, 1], [1, 1, 2, 2]), ([0, 0, 1, 1], [1, 1, 2, 2]),
          ([1, 1, 2, 2], [0, 0, 1.1, 1])],
         dtype=box_data_type)
-    self.detected_scores = np.array([0.2, 0.8, 0.1], dtype=float)
+    self.detected_scores = np.array([0.8, 0.2, 0.1], dtype=float)
     self.groundtruth_box_tuples = np.array(
         [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=box_data_type)

   def test_tp_fp_eval(self):
-    scores, tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
-        self.detected_box_tuples, self.detected_scores,
-        self.groundtruth_box_tuples)
-    expected_scores = np.array([0.8, 0.2, 0.1], dtype=float)
+    tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
+        self.detected_box_tuples, self.groundtruth_box_tuples)
     expected_tp_fp_labels = np.array([True, False, False], dtype=bool)
-    self.assertTrue(np.allclose(expected_scores, scores))
     self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))

   def test_tp_fp_eval_empty_gt(self):
     box_data_type = np.dtype([('subject', 'f4', (4,)), ('object', 'f4', (4,))])
-    scores, tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
-        self.detected_box_tuples, self.detected_scores,
-        np.array([], dtype=box_data_type))
-    expected_scores = np.array([0.8, 0.2, 0.1], dtype=float)
+    tp_fp_labels = self.eval._compute_tp_fp_for_single_class(
+        self.detected_box_tuples, np.array([], dtype=box_data_type))
     expected_tp_fp_labels = np.array([False, False, False], dtype=bool)
-    self.assertTrue(np.allclose(expected_scores, scores))
     self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))
@@ -82,16 +76,18 @@ class MultiClassPerImageVrdEvaluationTest(tf.test.TestCase):
         [(1, 2, 3), (1, 7, 3), (1, 4, 5)], dtype=label_data_type)

   def test_tp_fp_eval(self):
-    scores, tp_fp_labels = self.eval.compute_detection_tp_fp(
+    scores, tp_fp_labels, mapping = self.eval.compute_detection_tp_fp(
         self.detected_box_tuples, self.detected_scores,
         self.detected_class_tuples, self.groundtruth_box_tuples,
         self.groundtruth_class_tuples)

     expected_scores = np.array([0.8, 0.5, 0.2, 0.1], dtype=float)
     expected_tp_fp_labels = np.array([True, True, False, False], dtype=bool)
+    expected_mapping = np.array([1, 3, 0, 2])

     self.assertTrue(np.allclose(expected_scores, scores))
     self.assertTrue(np.allclose(expected_tp_fp_labels, tp_fp_labels))
+    self.assertTrue(np.allclose(expected_mapping, mapping))

 if __name__ == '__main__':
research/object_detection/utils/test_utils.py

@@ -138,3 +138,36 @@ def create_random_boxes(num_boxes, max_height, max_width):
   boxes[:, 3] = np.maximum(x_1, x_2)

   return boxes.astype(np.float32)
+
+
+def first_rows_close_as_set(a, b, k=None, rtol=1e-6, atol=1e-6):
+  """Checks if first K entries of two lists are close, up to permutation.
+
+  Inputs to this assert are lists of items which can be compared via
+  numpy.allclose(...) and can be sorted.
+
+  Args:
+    a: list of items which can be compared via numpy.allclose(...) and are
+      sortable.
+    b: list of items which can be compared via numpy.allclose(...) and are
+      sortable.
+    k: a non-negative integer. If not provided, k is set to be len(a).
+    rtol: relative tolerance.
+    atol: absolute tolerance.
+
+  Returns:
+    boolean, True if input lists a and b have the same length and
+    the first k entries of the inputs satisfy numpy.allclose() after
+    sorting entries.
+  """
+  if not isinstance(a, list) or not isinstance(b, list) or len(a) != len(b):
+    return False
+  if not k:
+    k = len(a)
+  k = min(k, len(a))
+  a_sorted = sorted(a[:k])
+  b_sorted = sorted(b[:k])
+  return all([
+      np.allclose(entry_a, entry_b, rtol, atol)
+      for (entry_a, entry_b) in zip(a_sorted, b_sorted)
+  ])
research/object_detection/utils/test_utils_test.py

@@ -68,6 +68,22 @@ class TestUtilsTest(tf.test.TestCase):
     self.assertTrue(boxes[:, 2].max() <= max_height)
     self.assertTrue(boxes[:, 3].max() <= max_width)

+  def test_first_rows_close_as_set(self):
+    a = [1, 2, 3, 0, 0]
+    b = [3, 2, 1, 0, 0]
+    k = 3
+    self.assertTrue(test_utils.first_rows_close_as_set(a, b, k))
+
+    a = [[1, 2], [1, 4], [0, 0]]
+    b = [[1, 4 + 1e-9], [1, 2], [0, 0]]
+    k = 2
+    self.assertTrue(test_utils.first_rows_close_as_set(a, b, k))
+
+    a = [[1, 2], [1, 4], [0, 0]]
+    b = [[1, 4 + 1e-9], [2, 2], [0, 0]]
+    k = 2
+    self.assertFalse(test_utils.first_rows_close_as_set(a, b, k))
+
 if __name__ == '__main__':
   tf.test.main()
research/object_detection/utils/visualization_utils.py

@@ -315,11 +315,13 @@ def draw_bounding_boxes_on_image_tensors(images,
                                          instance_masks=None,
                                          keypoints=None,
                                          max_boxes_to_draw=20,
-                                         min_score_thresh=0.2):
+                                         min_score_thresh=0.2,
+                                         use_normalized_coordinates=True):
   """Draws bounding boxes, masks, and keypoints on batch of image tensors.

   Args:
-    images: A 4D uint8 image tensor of shape [N, H, W, C].
+    images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional
+      channels will be ignored.
     boxes: [N, max_detections, 4] float32 tensor of detection boxes.
     classes: [N, max_detections] int tensor of detection classes. Note that
       classes are 1-indexed.
@@ -332,12 +334,17 @@ def draw_bounding_boxes_on_image_tensors(images,
       with keypoints.
     max_boxes_to_draw: Maximum number of boxes to draw on an image. Default 20.
     min_score_thresh: Minimum score threshold for visualization. Default 0.2.
+    use_normalized_coordinates: Whether to assume boxes and keypoints are in
+      normalized coordinates (as opposed to absolute coordinates).
+      Default is True.

   Returns:
     4D image tensor of type uint8, with boxes drawn on top.
   """
+  # Additional channels are being ignored.
+  images = images[:, :, :, 0:3]
   visualization_keyword_args = {
-      'use_normalized_coordinates': True,
+      'use_normalized_coordinates': use_normalized_coordinates,
       'max_boxes_to_draw': max_boxes_to_draw,
       'min_score_thresh': min_score_thresh,
       'agnostic_mode': False,
@@ -382,7 +389,8 @@ def draw_bounding_boxes_on_image_tensors(images,
 def draw_side_by_side_evaluation_image(eval_dict,
                                        category_index,
                                        max_boxes_to_draw=20,
-                                       min_score_thresh=0.2):
+                                       min_score_thresh=0.2,
+                                       use_normalized_coordinates=True):
   """Creates a side-by-side image with detections and groundtruth.

   Bounding boxes (and instance masks, if available) are visualized on both
@@ -394,6 +402,9 @@ def draw_side_by_side_evaluation_image(eval_dict,
     category_index: A category index (dictionary) produced from a labelmap.
     max_boxes_to_draw: The maximum number of boxes to draw for detections.
     min_score_thresh: The minimum score threshold for showing detections.
+    use_normalized_coordinates: Whether to assume boxes and keypoints are in
+      normalized coordinates (as opposed to absolute coordinates).
+      Default is True.

   Returns:
     A [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to
@@ -425,7 +436,8 @@ def draw_side_by_side_evaluation_image(eval_dict,
       instance_masks=instance_masks,
       keypoints=keypoints,
       max_boxes_to_draw=max_boxes_to_draw,
-      min_score_thresh=min_score_thresh)
+      min_score_thresh=min_score_thresh,
+      use_normalized_coordinates=use_normalized_coordinates)
   images_with_groundtruth = draw_bounding_boxes_on_image_tensors(
       eval_dict[input_data_fields.original_image],
       tf.expand_dims(eval_dict[input_data_fields.groundtruth_boxes], axis=0),
@@ -439,7 +451,8 @@ def draw_side_by_side_evaluation_image(eval_dict,
       instance_masks=groundtruth_instance_masks,
       keypoints=None,
       max_boxes_to_draw=None,
-      min_score_thresh=0.0)
+      min_score_thresh=0.0,
+      use_normalized_coordinates=use_normalized_coordinates)
   return tf.concat([images_with_detections, images_with_groundtruth], axis=2)
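A hedged sketch of the new keyword, assuming detection tensors built as in the tests below but with box coordinates in absolute pixels:

    from object_detection.utils import visualization_utils

    # images, boxes, classes, scores, category_index shaped as documented above.
    images_with_boxes = visualization_utils.draw_bounding_boxes_on_image_tensors(
        images, boxes, classes, scores, category_index,
        use_normalized_coordinates=False)  # boxes are absolute, not in [0, 1]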
research/object_detection/utils/visualization_utils_test.py

@@ -48,6 +48,9 @@ class VisualizationUtilsTest(tf.test.TestCase):
     image = np.concatenate((imu, imd), axis=0)
     return image

+  def create_test_image_with_five_channels(self):
+    return np.full([100, 200, 5], 255, dtype=np.uint8)
+
   def test_draw_bounding_box_on_image(self):
     test_image = self.create_colorful_test_image()
     test_image = Image.fromarray(test_image)
@@ -144,6 +147,32 @@ class VisualizationUtilsTest(tf.test.TestCase):
       image_pil = Image.fromarray(images_with_boxes_np[i, ...])
       image_pil.save(output_file)

+  def test_draw_bounding_boxes_on_image_tensors_with_additional_channels(self):
+    """Tests the case where input image tensor has more than 3 channels."""
+    category_index = {1: {'id': 1, 'name': 'dog'}}
+    image_np = self.create_test_image_with_five_channels()
+    images_np = np.stack((image_np, image_np), axis=0)
+
+    with tf.Graph().as_default():
+      images_tensor = tf.constant(value=images_np, dtype=tf.uint8)
+      boxes = tf.constant(0, dtype=tf.float32, shape=[2, 0, 4])
+      classes = tf.constant(0, dtype=tf.int64, shape=[2, 0])
+      scores = tf.constant(0, dtype=tf.float32, shape=[2, 0])
+      images_with_boxes = (
+          visualization_utils.draw_bounding_boxes_on_image_tensors(
+              images_tensor, boxes, classes, scores, category_index,
+              min_score_thresh=0.2))
+
+      with self.test_session() as sess:
+        sess.run(tf.global_variables_initializer())
+        final_images_np = sess.run(images_with_boxes)
+        self.assertEqual((2, 100, 200, 3), final_images_np.shape)
+
   def test_draw_keypoints_on_image(self):
     test_image = self.create_colorful_test_image()
     test_image = Image.fromarray(test_image)
research/object_detection/utils/vrd_evaluation.py
0 → 100644
View file @
97760186
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Evaluator class for Visual Relations Detection.
VRDDetectionEvaluator is a class which manages ground truth information of a
visual relations detection (vrd) dataset, and computes frequently used detection
metrics such as Precision, Recall, Recall@k, of the provided vrd detection
results.
It supports the following operations:
1) Adding ground truth information of images sequentially.
2) Adding detection results of images sequentially.
3) Evaluating detection metrics on already inserted detection results.
Note1: groundtruth should be inserted before evaluation.
Note2: This module operates on numpy boxes and box lists.
"""
from
abc
import
abstractmethod
import
collections
import
logging
import
numpy
as
np
from
object_detection.core
import
standard_fields
from
object_detection.utils
import
metrics
from
object_detection.utils
import
object_detection_evaluation
from
object_detection.utils
import
per_image_vrd_evaluation
# Below standard input numpy datatypes are defined:
# box_data_type - datatype of the groundtruth visual relations box annotations;
# this datatype consists of two named boxes: subject bounding box and object
# bounding box. Each box is of the format [y_min, x_min, y_max, x_max], each
# coordinate being of type float32.
# label_data_type - corresponding datatype of the visual relations label
# annotaions; it consists of three numerical class labels: subject class label,
# object class label and relation class label, each class label being of type
# int32.
vrd_box_data_type
=
np
.
dtype
([(
'subject'
,
'f4'
,
(
4
,)),
(
'object'
,
'f4'
,
(
4
,))])
single_box_data_type
=
np
.
dtype
([(
'box'
,
'f4'
,
(
4
,))])
label_data_type
=
np
.
dtype
([(
'subject'
,
'i4'
),
(
'object'
,
'i4'
),
(
'relation'
,
'i4'
)])
class
VRDDetectionEvaluator
(
object_detection_evaluation
.
DetectionEvaluator
):
"""A class to evaluate VRD detections.
This class serves as a base class for VRD evaluation in two settings:
- phrase detection
- relation detection.
"""
def
__init__
(
self
,
matching_iou_threshold
=
0.5
,
metric_prefix
=
None
):
"""Constructor.
Args:
matching_iou_threshold: IOU threshold to use for matching groundtruth
boxes to detection boxes.
metric_prefix: (optional) string prefix for metric name; if None, no
prefix is used.
"""
super
(
VRDDetectionEvaluator
,
self
).
__init__
([])
self
.
_matching_iou_threshold
=
matching_iou_threshold
self
.
_evaluation
=
_VRDDetectionEvaluation
(
matching_iou_threshold
=
self
.
_matching_iou_threshold
)
self
.
_image_ids
=
set
([])
self
.
_metric_prefix
=
(
metric_prefix
+
'_'
)
if
metric_prefix
else
''
self
.
_evaluatable_labels
=
{}
self
.
_negative_labels
=
{}
@
abstractmethod
def
_process_groundtruth_boxes
(
self
,
groundtruth_box_tuples
):
"""Pre-processes boxes before adding them to the VRDDetectionEvaluation.
Phrase detection and Relation detection subclasses re-implement this method
depending on the task.
Args:
groundtruth_box_tuples: A numpy array of structures with the shape
[M, 1], each structure containing the same number of named bounding
boxes. Each box is of the format [y_min, x_min, y_max, x_max] (see
datatype vrd_box_data_type, single_box_data_type above).
"""
raise
NotImplementedError
(
'_process_groundtruth_boxes method should be implemented in subclasses'
'of VRDDetectionEvaluator.'
)
@
abstractmethod
def
_process_detection_boxes
(
self
,
detections_box_tuples
):
"""Pre-processes boxes before adding them to the VRDDetectionEvaluation.
Phrase detection and Relation detection subclasses re-implement this method
depending on the task.
Args:
detections_box_tuples: A numpy array of structures with the shape
[M, 1], each structure containing the same number of named bounding
boxes. Each box is of the format [y_min, x_min, y_max, x_max] (see
datatype vrd_box_data_type, single_box_data_type above).
"""
raise
NotImplementedError
(
'_process_detection_boxes method should be implemented in subclasses'
'of VRDDetectionEvaluator.'
)
def
add_single_ground_truth_image_info
(
self
,
image_id
,
groundtruth_dict
):
"""Adds groundtruth for a single image to be used for evaluation.
Args:
image_id: A unique string/integer identifier for the image.
groundtruth_dict: A dictionary containing -
standard_fields.InputDataFields.groundtruth_boxes: A numpy array
of structures with the shape [M, 1], representing M tuples, each tuple
containing the same number of named bounding boxes.
Each box is of the format [y_min, x_min, y_max, x_max] (see
datatype vrd_box_data_type, single_box_data_type above).
standard_fields.InputDataFields.groundtruth_classes: A numpy array of
structures shape [M, 1], representing the class labels of the
corresponding bounding boxes and possibly additional classes (see
datatype label_data_type above).
standard_fields.InputDataFields.verified_labels: numpy array
of shape [K] containing verified labels.
Raises:
ValueError: On adding groundtruth for an image more than once.
"""
if
image_id
in
self
.
_image_ids
:
raise
ValueError
(
'Image with id {} already added.'
.
format
(
image_id
))
groundtruth_class_tuples
=
(
groundtruth_dict
[
standard_fields
.
InputDataFields
.
groundtruth_classes
])
groundtruth_box_tuples
=
(
groundtruth_dict
[
standard_fields
.
InputDataFields
.
groundtruth_boxes
])
self
.
_evaluation
.
add_single_ground_truth_image_info
(
image_key
=
image_id
,
groundtruth_box_tuples
=
self
.
_process_groundtruth_boxes
(
groundtruth_box_tuples
),
groundtruth_class_tuples
=
groundtruth_class_tuples
)
self
.
_image_ids
.
update
([
image_id
])
all_classes
=
[]
for
field
in
groundtruth_box_tuples
.
dtype
.
fields
:
all_classes
.
append
(
groundtruth_class_tuples
[
field
])
groudtruth_positive_classes
=
np
.
unique
(
np
.
concatenate
(
all_classes
))
verified_labels
=
groundtruth_dict
.
get
(
standard_fields
.
InputDataFields
.
verified_labels
,
np
.
array
(
[],
dtype
=
int
))
self
.
_evaluatable_labels
[
image_id
]
=
np
.
unique
(
np
.
concatenate
((
verified_labels
,
groudtruth_positive_classes
)))
self
.
_negative_labels
[
image_id
]
=
np
.
setdiff1d
(
verified_labels
,
groudtruth_positive_classes
)
def
add_single_detected_image_info
(
self
,
image_id
,
detections_dict
):
"""Adds detections for a single image to be used for evaluation.
Args:
image_id: A unique string/integer identifier for the image.
detections_dict: A dictionary containing -
standard_fields.DetectionResultFields.detection_boxes: A numpy array of
structures with shape [N, 1], representing N tuples, each tuple
containing the same number of named bounding boxes.
Each box is of the format [y_min, x_min, y_max, x_max] (as an example
see datatype vrd_box_data_type, single_box_data_type above).
standard_fields.DetectionResultFields.detection_scores: float32 numpy
array of shape [N] containing detection scores for the boxes.
standard_fields.DetectionResultFields.detection_classes: A numpy array
of structures shape [N, 1], representing the class labels of the
corresponding bounding boxes and possibly additional classes (see
datatype label_data_type above).
"""
num_detections
=
detections_dict
[
standard_fields
.
DetectionResultFields
.
detection_boxes
].
shape
[
0
]
detection_class_tuples
=
detections_dict
[
standard_fields
.
DetectionResultFields
.
detection_classes
]
detection_box_tuples
=
detections_dict
[
standard_fields
.
DetectionResultFields
.
detection_boxes
]
selector
=
np
.
ones
(
num_detections
,
dtype
=
bool
)
# Only check boxable labels
for
field
in
detection_box_tuples
.
dtype
.
fields
:
# Verify if one of the labels is negative (this is sure FP)
selector
|=
np
.
isin
(
detection_class_tuples
[
field
],
self
.
_negative_labels
[
image_id
])
# Verify if all labels are verified
selector
|=
np
.
isin
(
detection_class_tuples
[
field
],
self
.
_evaluatable_labels
[
image_id
])
self
.
_evaluation
.
add_single_detected_image_info
(
image_key
=
image_id
,
detected_box_tuples
=
self
.
_process_detection_boxes
(
detection_box_tuples
[
selector
]),
detected_scores
=
detections_dict
[
standard_fields
.
DetectionResultFields
.
detection_scores
][
selector
],
detected_class_tuples
=
detection_class_tuples
[
selector
])
def
evaluate
(
self
,
relationships
=
None
):
"""Compute evaluation result.
Args:
relationships: A dictionary of numerical label-text label mapping; if
specified, returns per-relationship AP.
Returns:
A dictionary of metrics with the following fields -
summary_metrics:
'weightedAP@<matching_iou_threshold>IOU' : weighted average precision
at the specified IOU threshold.
'AP@<matching_iou_threshold>IOU/<relationship>' : AP per relationship.
'mAP@<matching_iou_threshold>IOU': mean average precision at the
specified IOU threshold.
'Recall@50@<matching_iou_threshold>IOU': recall@50 at the specified IOU
threshold.
'Recall@100@<matching_iou_threshold>IOU': recall@100 at the specified
IOU threshold.
if relationships is specified, returns <relationship> in AP metrics as
readable names, otherwise the names correspond to class numbers.
"""
(
weighted_average_precision
,
mean_average_precision
,
average_precisions
,
_
,
_
,
recall_50
,
recall_100
,
_
,
_
)
=
(
self
.
_evaluation
.
evaluate
())
vrd_metrics
=
{
(
self
.
_metric_prefix
+
'weightedAP@{}IOU'
.
format
(
self
.
_matching_iou_threshold
)):
weighted_average_precision
,
self
.
_metric_prefix
+
'mAP@{}IOU'
.
format
(
self
.
_matching_iou_threshold
):
mean_average_precision
,
self
.
_metric_prefix
+
'Recall@50@{}IOU'
.
format
(
self
.
_matching_iou_threshold
):
recall_50
,
self
.
_metric_prefix
+
'Recall@100@{}IOU'
.
format
(
self
.
_matching_iou_threshold
):
recall_100
,
}
if
relationships
:
for
key
,
average_precision
in
average_precisions
.
iteritems
():
vrd_metrics
[
self
.
_metric_prefix
+
'AP@{}IOU/{}'
.
format
(
self
.
_matching_iou_threshold
,
relationships
[
key
])]
=
average_precision
else
:
for
key
,
average_precision
in
average_precisions
.
iteritems
():
vrd_metrics
[
self
.
_metric_prefix
+
'AP@{}IOU/{}'
.
format
(
self
.
_matching_iou_threshold
,
key
)]
=
average_precision
return
vrd_metrics
def
clear
(
self
):
"""Clears the state to prepare for a fresh evaluation."""
self
.
_evaluation
=
_VRDDetectionEvaluation
(
matching_iou_threshold
=
self
.
_matching_iou_threshold
)
self
.
_image_ids
.
clear
()
self
.
_negative_labels
.
clear
()
self
.
_evaluatable_labels
.
clear
()
class VRDRelationDetectionEvaluator(VRDDetectionEvaluator):
  """A class to evaluate VRD detections in the relation setting.

  Expected groundtruth box datatype is vrd_box_data_type, expected groundtruth
  labels datatype is label_data_type.
  Expected detection box datatype is vrd_box_data_type, expected detection
  labels datatype is label_data_type.
  """

  def __init__(self, matching_iou_threshold=0.5):
    super(VRDRelationDetectionEvaluator, self).__init__(
        matching_iou_threshold=matching_iou_threshold,
        metric_prefix='VRDMetric_Relationships')

  def _process_groundtruth_boxes(self, groundtruth_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    Args:
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max].

    Returns:
      Unchanged input.
    """
    return groundtruth_box_tuples

  def _process_detection_boxes(self, detections_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    Phrase detection and Relation detection subclasses re-implement this
    method depending on the task.

    Args:
      detections_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max] (see
        datatypes vrd_box_data_type and single_box_data_type above).

    Returns:
      Unchanged input.
    """
    return detections_box_tuples
class VRDPhraseDetectionEvaluator(VRDDetectionEvaluator):
  """A class to evaluate VRD detections in the phrase setting.

  Expected groundtruth box datatype is vrd_box_data_type, expected groundtruth
  labels datatype is label_data_type.
  Expected detection box datatype is single_box_data_type, expected detection
  labels datatype is label_data_type.
  """

  def __init__(self, matching_iou_threshold=0.5):
    super(VRDPhraseDetectionEvaluator, self).__init__(
        matching_iou_threshold=matching_iou_threshold,
        metric_prefix='VRDMetric_Phrases')
  def _process_groundtruth_boxes(self, groundtruth_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    In the phrase evaluation task, evaluation expects exactly one bounding
    box containing all objects in the phrase. This bounding box is computed
    as an enclosing box of all groundtruth boxes of a phrase.

    Args:
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max]. See
        vrd_box_data_type for an example of the structure.

    Returns:
      result: A numpy array of structures with the shape [M, 1], each
        structure containing exactly one named bounding box. i-th output
        structure corresponds to the result of processing i-th input
        structure, where the named bounding box is computed as an enclosing
        bounding box of all bounding boxes of the i-th input structure.
    """
    first_box_key = next(iter(groundtruth_box_tuples.dtype.fields))
    miny = groundtruth_box_tuples[first_box_key][:, 0]
    minx = groundtruth_box_tuples[first_box_key][:, 1]
    maxy = groundtruth_box_tuples[first_box_key][:, 2]
    maxx = groundtruth_box_tuples[first_box_key][:, 3]
    # Take the elementwise min/max of corners across all named boxes to get
    # the enclosing box of each tuple.
    for field in groundtruth_box_tuples.dtype.fields:
      miny = np.minimum(groundtruth_box_tuples[field][:, 0], miny)
      minx = np.minimum(groundtruth_box_tuples[field][:, 1], minx)
      maxy = np.maximum(groundtruth_box_tuples[field][:, 2], maxy)
      maxx = np.maximum(groundtruth_box_tuples[field][:, 3], maxx)

    data_result = []
    for i in range(groundtruth_box_tuples.shape[0]):
      data_result.append(([miny[i], minx[i], maxy[i], maxx[i]],))
    result = np.array(data_result, dtype=[('box', 'f4', (4,))])
    return result
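  # Worked example of the enclosing-box computation above (values made up):
  # a phrase tuple with subject box [0, 0, 1, 1] and object box [1, 2, 2, 3]
  # yields min-of-mins / max-of-maxes, i.e. the single box [0, 0, 2, 3]:
  #
  #   gt = np.array([([0, 0, 1, 1], [1, 2, 2, 3])], dtype=vrd_box_data_type)
  #   self._process_groundtruth_boxes(gt)['box']  # -> [[0., 0., 2., 3.]]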
  def _process_detection_boxes(self, detections_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    In the phrase evaluation task, evaluation expects exactly one bounding
    box containing all objects in the phrase. This bounding box is computed
    as an enclosing box of all detection boxes of a phrase.

    Args:
      detections_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max]. See
        vrd_box_data_type for an example of this structure.

    Returns:
      result: A numpy array of structures with the shape [M, 1], each
        structure containing exactly one named bounding box. i-th output
        structure corresponds to the result of processing i-th input
        structure, where the named bounding box is computed as an enclosing
        bounding box of all bounding boxes of the i-th input structure.
    """
    first_box_key = next(iter(detections_box_tuples.dtype.fields))
    miny = detections_box_tuples[first_box_key][:, 0]
    minx = detections_box_tuples[first_box_key][:, 1]
    maxy = detections_box_tuples[first_box_key][:, 2]
    maxx = detections_box_tuples[first_box_key][:, 3]
    for field in detections_box_tuples.dtype.fields:
      miny = np.minimum(detections_box_tuples[field][:, 0], miny)
      minx = np.minimum(detections_box_tuples[field][:, 1], minx)
      maxy = np.maximum(detections_box_tuples[field][:, 2], maxy)
      maxx = np.maximum(detections_box_tuples[field][:, 3], maxx)

    data_result = []
    for i in range(detections_box_tuples.shape[0]):
      data_result.append(([miny[i], minx[i], maxy[i], maxx[i]],))
    result = np.array(data_result, dtype=[('box', 'f4', (4,))])
    return result
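  # Note that _process_groundtruth_boxes and _process_detection_boxes above
  # share the same enclosing-box logic; a vectorized shared helper could look
  # like the following sketch (hypothetical, not part of this file):
  #
  #   def _enclosing_boxes(box_tuples):
  #     field_names = list(box_tuples.dtype.fields)
  #     stacked = np.stack([box_tuples[f] for f in field_names])  # [F, M, 4]
  #     boxes = np.stack([stacked[..., 0].min(0), stacked[..., 1].min(0),
  #                       stacked[..., 2].max(0), stacked[..., 3].max(0)],
  #                      axis=-1)                                 # [M, 4]
  #     return np.array([(b,) for b in boxes], dtype=[('box', 'f4', (4,))])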
VRDDetectionEvalMetrics = collections.namedtuple('VRDDetectionEvalMetrics', [
    'weighted_average_precision', 'mean_average_precision',
    'average_precisions', 'precisions', 'recalls', 'recall_50', 'recall_100',
    'median_rank_50', 'median_rank_100'
])
class _VRDDetectionEvaluation(object):
  """Performs metric computation for the VRD task. This class is internal."""

  def __init__(self, matching_iou_threshold=0.5):
    """Constructor.

    Args:
      matching_iou_threshold: IOU threshold to use for matching groundtruth
        boxes to detection boxes.
    """
    self._per_image_eval = per_image_vrd_evaluation.PerImageVRDEvaluation(
        matching_iou_threshold=matching_iou_threshold)

    self._groundtruth_box_tuples = {}
    self._groundtruth_class_tuples = {}
    self._num_gt_instances = 0
    self._num_gt_imgs = 0
    self._num_gt_instances_per_relationship = {}

    self.clear_detections()
  def clear_detections(self):
    """Clears detections."""
    self._detection_keys = set()
    self._scores = []
    self._relation_field_values = []
    self._tp_fp_labels = []
    self._average_precisions = {}
    self._precisions = []
    self._recalls = []
  def add_single_ground_truth_image_info(
      self, image_key, groundtruth_box_tuples, groundtruth_class_tuples):
    """Adds groundtruth for a single image to be used for evaluation.

    Args:
      image_key: A unique string/integer identifier for the image.
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], representing M tuples, each tuple containing the same number
        of named bounding boxes.
        Each box is of the format [y_min, x_min, y_max, x_max].
      groundtruth_class_tuples: A numpy array of structures shape [M, 1],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
    """
    if image_key in self._groundtruth_box_tuples:
      logging.warning(
          'image %s has already been added to the ground truth database.',
          image_key)
      return

    self._groundtruth_box_tuples[image_key] = groundtruth_box_tuples
    self._groundtruth_class_tuples[image_key] = groundtruth_class_tuples

    self._update_groundtruth_statistics(groundtruth_class_tuples)
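  # Sketch of the expected inputs (using the structured dtypes declared at the
  # top of this file; values made up): one entry per relationship tuple.
  #
  #   gt_boxes = np.array([([0, 0, 1, 1], [1, 1, 2, 2])],  # subject, object
  #                       dtype=vrd_box_data_type)
  #   gt_classes = np.array([(1, 2, 3)],  # subject, object, relation labels
  #                         dtype=label_data_type)
  #   evaluation.add_single_ground_truth_image_info('img1', gt_boxes,
  #                                                 gt_classes)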
  def add_single_detected_image_info(self, image_key, detected_box_tuples,
                                     detected_scores, detected_class_tuples):
    """Adds detections for a single image to be used for evaluation.

    Args:
      image_key: A unique string/integer identifier for the image.
      detected_box_tuples: A numpy array of structures with shape [N, 1],
        representing N tuples, each tuple containing the same number of named
        bounding boxes.
        Each box is of the format [y_min, x_min, y_max, x_max].
      detected_scores: A float numpy array of shape [N], representing the
        confidence scores of the detected N object instances.
      detected_class_tuples: A numpy array of structures shape [N, 1],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
    """
    self._detection_keys.add(image_key)
    if image_key in self._groundtruth_box_tuples:
      groundtruth_box_tuples = self._groundtruth_box_tuples[image_key]
      groundtruth_class_tuples = self._groundtruth_class_tuples[image_key]
    else:
      groundtruth_box_tuples = np.empty(shape=[0, 4], dtype=float)
      groundtruth_class_tuples = np.array([], dtype=int)

    scores, tp_fp_labels, mapping = (
        self._per_image_eval.compute_detection_tp_fp(
            detected_box_tuples=detected_box_tuples,
            detected_scores=detected_scores,
            detected_class_tuples=detected_class_tuples,
            groundtruth_box_tuples=groundtruth_box_tuples,
            groundtruth_class_tuples=groundtruth_class_tuples))

    self._scores += [scores]
    self._tp_fp_labels += [tp_fp_labels]
    # `mapping` reorders the detections to match the returned scores; keep the
    # relation label of each scored detection for per-relation metrics.
    self._relation_field_values += [detected_class_tuples[mapping]['relation']]
  def _update_groundtruth_statistics(self, groundtruth_class_tuples):
    """Updates ground truth statistics.

    Args:
      groundtruth_class_tuples: A numpy array of structures shape [M, 1],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
    """
    self._num_gt_instances += groundtruth_class_tuples.shape[0]
    self._num_gt_imgs += 1
    for relation_field_value in np.unique(
        groundtruth_class_tuples['relation']):
      if relation_field_value not in self._num_gt_instances_per_relationship:
        self._num_gt_instances_per_relationship[relation_field_value] = 0
      self._num_gt_instances_per_relationship[relation_field_value] += np.sum(
          groundtruth_class_tuples['relation'] == relation_field_value)
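  # The loop above is equivalent to a single np.unique call with counts
  # (a sketch of the same bookkeeping):
  #
  #   values, counts = np.unique(groundtruth_class_tuples['relation'],
  #                              return_counts=True)
  #   for value, count in zip(values, counts):
  #     self._num_gt_instances_per_relationship[value] = (
  #         self._num_gt_instances_per_relationship.get(value, 0) + count)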
  def evaluate(self):
    """Computes evaluation result.

    Returns:
      A named tuple with the following fields -
        average_precision: a float number corresponding to average precision.
        precisions: an array of precisions.
        recalls: an array of recalls.
        recall@50: recall computed on 50 top-scoring samples.
        recall@100: recall computed on 100 top-scoring samples.
        median_rank@50: median rank computed on 50 top-scoring samples.
        median_rank@100: median rank computed on 100 top-scoring samples.
    """
    if self._num_gt_instances == 0:
      logging.warning('No ground truth instances')

    if not self._scores:
      scores = np.array([], dtype=float)
      tp_fp_labels = np.array([], dtype=bool)
      relation_field_values = np.array([], dtype=int)
    else:
      scores = np.concatenate(self._scores)
      tp_fp_labels = np.concatenate(self._tp_fp_labels)
      relation_field_values = np.concatenate(self._relation_field_values)

    for relation_field_value in self._num_gt_instances_per_relationship:
      precisions, recalls = metrics.compute_precision_recall(
          scores[relation_field_values == relation_field_value],
          tp_fp_labels[relation_field_values == relation_field_value],
          self._num_gt_instances_per_relationship[relation_field_value])
      self._average_precisions[
          relation_field_value] = metrics.compute_average_precision(
              precisions, recalls)

    self._mean_average_precision = np.mean(
        list(self._average_precisions.values()))

    self._precisions, self._recalls = metrics.compute_precision_recall(
        scores, tp_fp_labels, self._num_gt_instances)
    self._weighted_average_precision = metrics.compute_average_precision(
        self._precisions, self._recalls)

    # Recall@k and median rank@k are computed over the per-image tp/fp lists.
    self._recall_50 = metrics.compute_recall_at_k(
        self._tp_fp_labels, self._num_gt_instances, 50)
    self._median_rank_50 = metrics.compute_median_rank_at_k(
        self._tp_fp_labels, 50)
    self._recall_100 = metrics.compute_recall_at_k(
        self._tp_fp_labels, self._num_gt_instances, 100)
    self._median_rank_100 = metrics.compute_median_rank_at_k(
        self._tp_fp_labels, 100)

    return VRDDetectionEvalMetrics(
        self._weighted_average_precision, self._mean_average_precision,
        self._average_precisions, self._precisions, self._recalls,
        self._recall_50, self._recall_100, self._median_rank_50,
        self._median_rank_100)
research/object_detection/utils/vrd_evaluation_test.py
0 → 100644
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow_models.object_detection.utils.vrd_evaluation."""
import numpy as np
import tensorflow as tf

from object_detection.core import standard_fields
from object_detection.utils import vrd_evaluation
class VRDRelationDetectionEvaluatorTest(tf.test.TestCase):

  def test_vrdrelation_evaluator(self):
    self.vrd_eval = vrd_evaluation.VRDRelationDetectionEvaluator()

    image_key1 = 'img1'
    groundtruth_box_tuples1 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2]), ([0, 0, 1, 1], [1, 2, 2, 3])],
        dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples1 = np.array(
        [(1, 2, 3), (1, 4, 3)], dtype=vrd_evaluation.label_data_type)
    groundtruth_verified_labels1 = np.array([1, 2, 3, 4, 5], dtype=int)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key1, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_box_tuples1,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_tuples1,
            standard_fields.InputDataFields.verified_labels:
                groundtruth_verified_labels1,
        })

    image_key2 = 'img2'
    groundtruth_box_tuples2 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples2 = np.array(
        [(1, 4, 3)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key2, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_box_tuples2,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_tuples2,
        })

    image_key3 = 'img3'
    groundtruth_box_tuples3 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples3 = np.array(
        [(1, 2, 4)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key3, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_box_tuples3,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_tuples3,
        })

    image_key = 'img1'
    detected_box_tuples = np.array(
        [([0, 0.3, 1, 1], [1.1, 1, 2, 2]), ([0, 0, 1, 1], [1, 1, 2, 2])],
        dtype=vrd_evaluation.vrd_box_data_type)
    detected_class_tuples = np.array(
        [(1, 2, 5), (1, 2, 3)], dtype=vrd_evaluation.label_data_type)
    detected_scores = np.array([0.7, 0.8], dtype=float)
    self.vrd_eval.add_single_detected_image_info(
        image_key, {
            standard_fields.DetectionResultFields.detection_boxes:
                detected_box_tuples,
            standard_fields.DetectionResultFields.detection_scores:
                detected_scores,
            standard_fields.DetectionResultFields.detection_classes:
                detected_class_tuples,
        })

    metrics = self.vrd_eval.evaluate()

    self.assertAlmostEqual(
        metrics['VRDMetric_Relationships_weightedAP@0.5IOU'], 0.25)
    self.assertAlmostEqual(
        metrics['VRDMetric_Relationships_mAP@0.5IOU'], 0.1666666666666666)
    self.assertAlmostEqual(
        metrics['VRDMetric_Relationships_AP@0.5IOU/3'], 0.3333333333333333)
    self.assertAlmostEqual(metrics['VRDMetric_Relationships_AP@0.5IOU/4'], 0)
    self.assertAlmostEqual(
        metrics['VRDMetric_Relationships_Recall@50@0.5IOU'], 0.25)
    self.assertAlmostEqual(
        metrics['VRDMetric_Relationships_Recall@100@0.5IOU'], 0.25)
    self.vrd_eval.clear()
    self.assertFalse(self.vrd_eval._image_ids)
class VRDPhraseDetectionEvaluatorTest(tf.test.TestCase):

  def test_vrdphrase_evaluator(self):
    self.vrd_eval = vrd_evaluation.VRDPhraseDetectionEvaluator()

    image_key1 = 'img1'
    groundtruth_box_tuples1 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2]), ([0, 0, 1, 1], [1, 2, 2, 3])],
        dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples1 = np.array(
        [(1, 2, 3), (1, 4, 3)], dtype=vrd_evaluation.label_data_type)
    groundtruth_verified_labels1 = np.array([1, 2, 3, 4, 5], dtype=int)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key1, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_box_tuples1,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_tuples1,
            standard_fields.InputDataFields.verified_labels:
                groundtruth_verified_labels1,
        })

    image_key2 = 'img2'
    groundtruth_box_tuples2 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples2 = np.array(
        [(1, 4, 3)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key2, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_box_tuples2,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_tuples2,
        })

    image_key3 = 'img3'
    groundtruth_box_tuples3 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples3 = np.array(
        [(1, 2, 4)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key3, {
            standard_fields.InputDataFields.groundtruth_boxes:
                groundtruth_box_tuples3,
            standard_fields.InputDataFields.groundtruth_classes:
                groundtruth_class_tuples3,
        })

    image_key = 'img1'
    detected_box_tuples = np.array(
        [([0, 0.3, 0.5, 0.5], [0.3, 0.3, 1.0, 1.0]),
         ([0, 0, 1.2, 1.2], [0.0, 0.0, 2.0, 2.0])],
        dtype=vrd_evaluation.vrd_box_data_type)
    detected_class_tuples = np.array(
        [(1, 2, 5), (1, 2, 3)], dtype=vrd_evaluation.label_data_type)
    detected_scores = np.array([0.7, 0.8], dtype=float)
    self.vrd_eval.add_single_detected_image_info(
        image_key, {
            standard_fields.DetectionResultFields.detection_boxes:
                detected_box_tuples,
            standard_fields.DetectionResultFields.detection_scores:
                detected_scores,
            standard_fields.DetectionResultFields.detection_classes:
                detected_class_tuples,
        })

    metrics = self.vrd_eval.evaluate()

    self.assertAlmostEqual(metrics['VRDMetric_Phrases_weightedAP@0.5IOU'],
                           0.25)
    self.assertAlmostEqual(metrics['VRDMetric_Phrases_mAP@0.5IOU'],
                           0.1666666666666666)
    self.assertAlmostEqual(metrics['VRDMetric_Phrases_AP@0.5IOU/3'],
                           0.3333333333333333)
    self.assertAlmostEqual(metrics['VRDMetric_Phrases_AP@0.5IOU/4'], 0)
    self.assertAlmostEqual(metrics['VRDMetric_Phrases_Recall@50@0.5IOU'], 0.25)
    self.assertAlmostEqual(metrics['VRDMetric_Phrases_Recall@100@0.5IOU'],
                           0.25)
    self.vrd_eval.clear()
    self.assertFalse(self.vrd_eval._image_ids)
class VRDDetectionEvaluationTest(tf.test.TestCase):

  def setUp(self):
    """Initializes the evaluation and adds groundtruth and detections."""
    self.vrd_eval = vrd_evaluation._VRDDetectionEvaluation(
        matching_iou_threshold=0.5)

    image_key1 = 'img1'
    groundtruth_box_tuples1 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2]), ([0, 0, 1, 1], [1, 2, 2, 3])],
        dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples1 = np.array(
        [(1, 2, 3), (1, 4, 3)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key1, groundtruth_box_tuples1, groundtruth_class_tuples1)

    image_key2 = 'img2'
    groundtruth_box_tuples2 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples2 = np.array(
        [(1, 4, 3)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key2, groundtruth_box_tuples2, groundtruth_class_tuples2)

    image_key3 = 'img3'
    groundtruth_box_tuples3 = np.array(
        [([0, 0, 1, 1], [1, 1, 2, 2])], dtype=vrd_evaluation.vrd_box_data_type)
    groundtruth_class_tuples3 = np.array(
        [(1, 2, 4)], dtype=vrd_evaluation.label_data_type)
    self.vrd_eval.add_single_ground_truth_image_info(
        image_key3, groundtruth_box_tuples3, groundtruth_class_tuples3)

    image_key = 'img1'
    detected_box_tuples = np.array(
        [([0, 0.3, 1, 1], [1.1, 1, 2, 2]), ([0, 0, 1, 1], [1, 1, 2, 2])],
        dtype=vrd_evaluation.vrd_box_data_type)
    detected_class_tuples = np.array(
        [(1, 2, 3), (1, 2, 3)], dtype=vrd_evaluation.label_data_type)
    detected_scores = np.array([0.7, 0.8], dtype=float)
    self.vrd_eval.add_single_detected_image_info(
        image_key, detected_box_tuples, detected_scores, detected_class_tuples)

  def test_evaluate(self):
    metrics = self.vrd_eval.evaluate()

    expected_weighted_average_precision = 0.25
    expected_mean_average_precision = 0.16666666666666
    expected_precision = np.array([1., 0.5], dtype=float)
    expected_recall = np.array([0.25, 0.25], dtype=float)
    expected_recall_50 = 0.25
    expected_recall_100 = 0.25
    expected_median_rank_50 = 0
    expected_median_rank_100 = 0

    self.assertAlmostEqual(expected_weighted_average_precision,
                           metrics.weighted_average_precision)
    self.assertAlmostEqual(expected_mean_average_precision,
                           metrics.mean_average_precision)
    self.assertAllClose(expected_precision, metrics.precisions)
    self.assertAllClose(expected_recall, metrics.recalls)
    self.assertAlmostEqual(expected_recall_50, metrics.recall_50)
    self.assertAlmostEqual(expected_recall_100, metrics.recall_100)
    self.assertAlmostEqual(expected_median_rank_50, metrics.median_rank_50)
    self.assertAlmostEqual(expected_median_rank_100, metrics.median_rank_100)
if __name__ == '__main__':
  tf.test.main()