Commit 8a72df2d authored by Vivek Rathod

* Change evaluator.py and eval_util.py to use the new eval
interface defined in utils/object_detection_evaluation.py.
* Update eval.py to use routines from utils/config_util.py
to parse config files.
parent a3c7d7e8
......@@ -57,8 +57,13 @@ py_library(
],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:box_list_ops",
"//tensorflow_models/object_detection/core:keypoint_ops",
"//tensorflow_models/object_detection/core:standard_fields",
"//tensorflow_models/object_detection/utils:label_map_util",
"//tensorflow_models/object_detection/utils:object_detection_evaluation",
"//tensorflow_models/object_detection/utils:ops",
"//tensorflow_models/object_detection/utils:visualization_utils",
],
)
......@@ -69,11 +74,10 @@ py_library(
deps = [
"//tensorflow",
"//tensorflow_models/object_detection:eval_util",
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:box_list_ops",
"//tensorflow_models/object_detection/core:prefetcher",
"//tensorflow_models/object_detection/core:standard_fields",
"//tensorflow_models/object_detection/protos:eval_py_pb2",
"//tensorflow_models/object_detection/utils:object_detection_evaluation",
],
)
......@@ -87,10 +91,7 @@ py_binary(
"//tensorflow",
"//tensorflow_models/object_detection/builders:input_reader_builder",
"//tensorflow_models/object_detection/builders:model_builder",
"//tensorflow_models/object_detection/protos:eval_py_pb2",
"//tensorflow_models/object_detection/protos:input_reader_py_pb2",
"//tensorflow_models/object_detection/protos:model_py_pb2",
"//tensorflow_models/object_detection/protos:pipeline_py_pb2",
"//tensorflow_models/object_detection/utils:config_util",
"//tensorflow_models/object_detection/utils:label_map_util",
],
)
......
......@@ -44,18 +44,16 @@ Example usage:
--input_config_path=eval_input_config.pbtxt
"""
import functools
import os
import tensorflow as tf
from google.protobuf import text_format
from object_detection import evaluator
from object_detection.builders import input_reader_builder
from object_detection.builders import model_builder
from object_detection.protos import eval_pb2
from object_detection.protos import input_reader_pb2
from object_detection.protos import model_pb2
from object_detection.protos import pipeline_pb2
from object_detection.utils import config_util
from object_detection.utils import label_map_util
tf.logging.set_verbosity(tf.logging.INFO)
flags = tf.app.flags
......@@ -75,69 +73,37 @@ flags.DEFINE_string('input_config_path', '',
'Path to an input_reader_pb2.InputReader config file.')
flags.DEFINE_string('model_config_path', '',
'Path to a model_pb2.DetectionModel config file.')
flags.DEFINE_boolean('run_once', False, 'Option to only run a single pass of '
'evaluation. Overrides the `max_evals` parameter in the '
'provided config.')
FLAGS = flags.FLAGS
def get_configs_from_pipeline_file():
"""Reads evaluation configuration from a pipeline_pb2.TrainEvalPipelineConfig.
Reads evaluation config from file specified by pipeline_config_path flag.
Returns:
model_config: a model_pb2.DetectionModel
eval_config: a eval_pb2.EvalConfig
input_config: a input_reader_pb2.InputReader
"""
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
with tf.gfile.GFile(FLAGS.pipeline_config_path, 'r') as f:
text_format.Merge(f.read(), pipeline_config)
model_config = pipeline_config.model
if FLAGS.eval_training_data:
eval_config = pipeline_config.train_config
else:
eval_config = pipeline_config.eval_config
input_config = pipeline_config.eval_input_reader
return model_config, eval_config, input_config
def get_configs_from_multiple_files():
"""Reads evaluation configuration from multiple config files.
Reads the evaluation config from the following files:
model_config: Read from --model_config_path
eval_config: Read from --eval_config_path
input_config: Read from --input_config_path
Returns:
model_config: a model_pb2.DetectionModel
eval_config: a eval_pb2.EvalConfig
input_config: a input_reader_pb2.InputReader
"""
eval_config = eval_pb2.EvalConfig()
with tf.gfile.GFile(FLAGS.eval_config_path, 'r') as f:
text_format.Merge(f.read(), eval_config)
model_config = model_pb2.DetectionModel()
with tf.gfile.GFile(FLAGS.model_config_path, 'r') as f:
text_format.Merge(f.read(), model_config)
input_config = input_reader_pb2.InputReader()
with tf.gfile.GFile(FLAGS.input_config_path, 'r') as f:
text_format.Merge(f.read(), input_config)
return model_config, eval_config, input_config
def main(unused_argv):
assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
assert FLAGS.eval_dir, '`eval_dir` is missing.'
tf.gfile.MakeDirs(FLAGS.eval_dir)
if FLAGS.pipeline_config_path:
model_config, eval_config, input_config = get_configs_from_pipeline_file()
configs = config_util.get_configs_from_pipeline_file(
FLAGS.pipeline_config_path)
tf.gfile.Copy(FLAGS.pipeline_config_path,
os.path.join(FLAGS.eval_dir, 'pipeline.config'),
overwrite=True)
else:
model_config, eval_config, input_config = get_configs_from_multiple_files()
configs = config_util.get_configs_from_multiple_files(
model_config_path=FLAGS.model_config_path,
eval_config_path=FLAGS.eval_config_path,
eval_input_config_path=FLAGS.input_config_path)
for name, config in [('model.config', FLAGS.model_config_path),
('eval.config', FLAGS.eval_config_path),
('input.config', FLAGS.input_config_path)]:
tf.gfile.Copy(config,
os.path.join(FLAGS.eval_dir, name),
overwrite=True)
model_config = configs['model']
eval_config = configs['eval_config']
input_config = configs['eval_input_config']
model_fn = functools.partial(
model_builder.build,
......@@ -153,6 +119,9 @@ def main(unused_argv):
categories = label_map_util.convert_label_map_to_categories(
label_map, max_num_classes)
if FLAGS.run_once:
eval_config.max_evals = 1
evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
FLAGS.checkpoint_dir, FLAGS.eval_dir)
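
For reference, a minimal sketch of the new config-parsing flow used in main() above, assuming config_util returns a dict keyed by 'model', 'eval_config' and 'eval_input_config' exactly as this diff shows (the pipeline path below is illustrative only):

```python
# Illustrative only: the config_util-based flow that replaces the removed
# get_configs_from_pipeline_file() helper above.
from object_detection.utils import config_util

configs = config_util.get_configs_from_pipeline_file(
    '/path/to/pipeline.config')  # hypothetical path
model_config = configs['model']
eval_config = configs['eval_config']
input_config = configs['eval_input_config']
```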
......
......@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common functions for repeatedly evaluating a checkpoint.
"""
import copy
"""Common functions for repeatedly evaluating a checkpoint."""
import logging
import os
import time
......@@ -23,8 +20,12 @@ import time
import numpy as np
import tensorflow as tf
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import keypoint_ops
from object_detection.core import standard_fields as fields
from object_detection.utils import label_map_util
from object_detection.utils import object_detection_evaluation
from object_detection.utils import ops
from object_detection.utils import visualization_utils as vis_utils
slim = tf.contrib.slim
......@@ -50,117 +51,6 @@ def write_metrics(metrics, global_step, summary_dir):
logging.info('Metrics written to tf summary.')
def evaluate_detection_results_pascal_voc(result_lists,
categories,
label_id_offset=0,
iou_thres=0.5,
corloc_summary=False):
"""Computes Pascal VOC detection metrics given groundtruth and detections.
This function computes Pascal VOC metrics. This function by default
takes detections and groundtruth boxes encoded in result_lists and writes
evaluation results to tf summaries which can be viewed on tensorboard.
Args:
result_lists: a dictionary holding lists of groundtruth and detection
data corresponding to each image being evaluated. The following keys
are required:
'image_id': a list of string ids
'detection_boxes': a list of float32 numpy arrays of shape [N, 4]
'detection_scores': a list of float32 numpy arrays of shape [N]
'detection_classes': a list of int32 numpy arrays of shape [N]
'groundtruth_boxes': a list of float32 numpy arrays of shape [M, 4]
'groundtruth_classes': a list of int32 numpy arrays of shape [M]
and the remaining fields below are optional:
'difficult': a list of boolean arrays of shape [M] indicating the
difficulty of groundtruth boxes. Some datasets like PASCAL VOC provide
this information and it is used to remove difficult examples from eval
in order to not penalize the models on them.
Note that it is okay to have additional fields in result_lists --- they
are simply ignored.
categories: a list of dictionaries representing all possible categories.
Each dict in this list has the following keys:
'id': (required) an integer id uniquely identifying this category
'name': (required) string representing category name
e.g., 'cat', 'dog', 'pizza'
label_id_offset: an integer offset for the label space.
iou_thres: float determining the IoU threshold at which a box is considered
correct. Defaults to the standard 0.5.
corloc_summary: boolean. If True, also outputs CorLoc metrics.
Returns:
A dictionary of metric names to scalar values.
Raises:
ValueError: if the set of keys in result_lists is not a superset of the
expected list of keys. Unexpected keys are ignored.
ValueError: if the lists in result_lists have inconsistent sizes.
"""
# check for expected keys in result_lists
expected_keys = [
'detection_boxes', 'detection_scores', 'detection_classes', 'image_id'
]
expected_keys += ['groundtruth_boxes', 'groundtruth_classes']
if not set(expected_keys).issubset(set(result_lists.keys())):
raise ValueError('result_lists does not have expected key set.')
num_results = len(result_lists[expected_keys[0]])
for key in expected_keys:
if len(result_lists[key]) != num_results:
raise ValueError('Inconsistent list sizes in result_lists')
# Pascal VOC evaluator assumes foreground index starts from zero.
categories = copy.deepcopy(categories)
for idx in range(len(categories)):
categories[idx]['id'] -= label_id_offset
# num_classes (maybe encoded as categories)
num_classes = max([cat['id'] for cat in categories]) + 1
logging.info('Computing Pascal VOC metrics on results.')
if all(image_id.isdigit() for image_id in result_lists['image_id']):
image_ids = [int(image_id) for image_id in result_lists['image_id']]
else:
image_ids = range(num_results)
evaluator = object_detection_evaluation.ObjectDetectionEvaluation(
num_classes, matching_iou_threshold=iou_thres)
difficult_lists = None
if 'difficult' in result_lists and result_lists['difficult']:
difficult_lists = result_lists['difficult']
for idx, image_id in enumerate(image_ids):
difficult = None
if difficult_lists is not None and difficult_lists[idx].size:
difficult = difficult_lists[idx].astype(np.bool)
evaluator.add_single_ground_truth_image_info(
image_id, result_lists['groundtruth_boxes'][idx],
result_lists['groundtruth_classes'][idx] - label_id_offset,
difficult)
evaluator.add_single_detected_image_info(
image_id, result_lists['detection_boxes'][idx],
result_lists['detection_scores'][idx],
result_lists['detection_classes'][idx] - label_id_offset)
per_class_ap, mean_ap, _, _, per_class_corloc, mean_corloc = (
evaluator.evaluate())
metrics = {'Precision/mAP@{}IOU'.format(iou_thres): mean_ap}
category_index = label_map_util.create_category_index(categories)
for idx in range(per_class_ap.size):
if idx in category_index:
display_name = ('PerformanceByCategory/mAP@{}IOU/{}'
.format(iou_thres, category_index[idx]['name']))
metrics[display_name] = per_class_ap[idx]
if corloc_summary:
metrics['CorLoc/CorLoc@{}IOU'.format(iou_thres)] = mean_corloc
for idx in range(per_class_corloc.size):
if idx in category_index:
display_name = (
'PerformanceByCategory/CorLoc@{}IOU/{}'.format(
iou_thres, category_index[idx]['name']))
metrics[display_name] = per_class_corloc[idx]
return metrics
# TODO: Add tests.
def visualize_detection_results(result_dict,
tag,
......@@ -265,9 +155,11 @@ def visualize_detection_results(result_dict,
vis_utils.save_image_array_as_png(image, export_path)
summary = tf.Summary(value=[
tf.Summary.Value(tag=tag, image=tf.Summary.Image(
encoded_image_string=vis_utils.encode_image_array_as_png_str(
image)))
tf.Summary.Value(
tag=tag,
image=tf.Summary.Image(
encoded_image_string=vis_utils.encode_image_array_as_png_str(
image)))
])
summary_writer = tf.summary.FileWriter(summary_dir)
summary_writer.add_summary(summary, global_step)
......@@ -276,57 +168,41 @@ def visualize_detection_results(result_dict,
logging.info('Detection visualizations written to summary with tag %s.', tag)
# TODO: Add tests.
# TODO: Have an argument called `aggregated_processor_tensor_keys` that contains
# a whitelist of tensors used by the `aggregated_result_processor` instead of a
# blacklist. This will prevent us from inadvertently adding any evaluated
# tensors into the `results_list` data structure that are not needed by
# `aggregated_result_preprocessor`.
def run_checkpoint_once(tensor_dict,
update_op,
summary_dir,
aggregated_result_processor=None,
batch_processor=None,
checkpoint_dirs=None,
variables_to_restore=None,
restore_fn=None,
num_batches=1,
master='',
save_graph=False,
save_graph_dir='',
metric_names_to_values=None,
keys_to_exclude_from_results=()):
"""Evaluates both python metrics and tensorflow slim metrics.
Python metrics are processed in batch by the aggregated_result_processor,
while tensorflow slim metrics statistics are computed by running
metric_names_to_updates tensors and aggregated using metric_names_to_values
tensor.
def _run_checkpoint_once(tensor_dict,
evaluators=None,
batch_processor=None,
checkpoint_dirs=None,
variables_to_restore=None,
restore_fn=None,
num_batches=1,
master='',
save_graph=False,
save_graph_dir=''):
"""Evaluates metrics defined in evaluators.
This function loads the latest checkpoint in checkpoint_dirs and evaluates
all metrics defined in evaluators. Each batch is run through the
batch_processor before its results are added to the evaluators.
Args:
tensor_dict: a dictionary holding tensors representing a batch of detections
and corresponding groundtruth annotations.
update_op: a tensorflow update op that will run for each batch along with
the tensors in tensor_dict..
summary_dir: a directory to write metrics summaries.
aggregated_result_processor: a function taking one arguments:
1. result_lists: a dictionary with keys matching those in tensor_dict
and corresponding values being the list of results for each tensor
in tensor_dict. The length of each such list is num_batches.
evaluators: a list of objects of type DetectionEvaluator to be used for
evaluation. Note that the metric names produced by different evaluators
must be unique.
batch_processor: a function taking four arguments:
1. tensor_dict: the same tensor_dict that is passed in as the first
argument to this function.
2. sess: a tensorflow session
3. batch_index: an integer representing the index of the batch amongst
all batches
4. counters: a dictionary holding 'success' and 'skipped' counts that the
processor is expected to update.
and returns result_dict, a dictionary of results for that batch.
By default, batch_processor is None, which defaults to running:
return sess.run(tensor_dict)
To skip an image, it suffices to return an empty dictionary in place of
result_dict.
checkpoint_dirs: list of directories to load into an EnsembleModel. If it
has only one directory, EnsembleModel will not be used -- a DetectionModel
has only one directory, EnsembleModel will not be used --
a DetectionModel
will be instantiated directly. Not used if restore_fn is set.
variables_to_restore: None, or a dictionary mapping variable names found in
a checkpoint to model variables. The dictionary would normally be
......@@ -340,14 +216,10 @@ def run_checkpoint_once(tensor_dict,
save_graph: whether or not the Tensorflow graph is stored as a pbtxt file.
save_graph_dir: where to store the Tensorflow graph on disk. If save_graph
is True this must be non-empty.
metric_names_to_values: A dictionary containing metric names to tensors
which will be evaluated after processing all batches
of [tensor_dict, update_op]. If any metrics depend on statistics computed
during each batch ensure that `update_op` tensor has a control dependency
on the update ops that compute the statistics.
keys_to_exclude_from_results: keys in tensor_dict that will be excluded
from results_list. Note that the tensors corresponding to these keys will
still be evaluated for each batch, but won't be added to results_list.
Returns:
global_step: the count of global steps.
all_evaluator_metrics: A dictionary containing metric names and values.
Raises:
ValueError: if restore_fn is None and checkpoint_dirs doesn't have at least
......@@ -359,6 +231,7 @@ def run_checkpoint_once(tensor_dict,
sess = tf.Session(master, graph=tf.get_default_graph())
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
sess.run(tf.tables_initializer())
if restore_fn:
restore_fn(sess)
else:
......@@ -371,10 +244,7 @@ def run_checkpoint_once(tensor_dict,
if save_graph:
tf.train.write_graph(sess.graph_def, save_graph_dir, 'eval.pbtxt')
valid_keys = list(set(tensor_dict.keys()) - set(keys_to_exclude_from_results))
result_lists = {key: [] for key in valid_keys}
counters = {'skipped': 0, 'success': 0}
other_metrics = None
with tf.contrib.slim.queues.QueueRunners(sess):
try:
for batch in range(int(num_batches)):
......@@ -382,40 +252,46 @@ def run_checkpoint_once(tensor_dict,
logging.info('Running eval ops batch %d/%d', batch + 1, num_batches)
if not batch_processor:
try:
(result_dict, _) = sess.run([tensor_dict, update_op])
result_dict = sess.run(tensor_dict)
counters['success'] += 1
except tf.errors.InvalidArgumentError:
logging.info('Skipping image')
counters['skipped'] += 1
result_dict = {}
else:
result_dict = batch_processor(
tensor_dict, sess, batch, counters, update_op)
for key in result_dict:
if key in valid_keys:
result_lists[key].append(result_dict[key])
if metric_names_to_values is not None:
other_metrics = sess.run(metric_names_to_values)
result_dict = batch_processor(tensor_dict, sess, batch, counters)
for evaluator in evaluators:
# TODO: Use image_id tensor once we fix the input data
# decoders to return correct image_id.
# TODO: result_dict contains batches of images, while
# add_single_ground_truth_image_info expects a single image. Fix
evaluator.add_single_ground_truth_image_info(
image_id=batch, groundtruth_dict=result_dict)
evaluator.add_single_detected_image_info(
image_id=batch, detections_dict=result_dict)
logging.info('Running eval batches done.')
except tf.errors.OutOfRangeError:
logging.info('Done evaluating -- epoch limit reached')
finally:
# When done, ask the threads to stop.
metrics = aggregated_result_processor(result_lists)
if other_metrics is not None:
metrics.update(other_metrics)
global_step = tf.train.global_step(sess, slim.get_global_step())
write_metrics(metrics, global_step, summary_dir)
logging.info('# success: %d', counters['success'])
logging.info('# skipped: %d', counters['skipped'])
all_evaluator_metrics = {}
for evaluator in evaluators:
metrics = evaluator.evaluate()
evaluator.clear()
if any(key in all_evaluator_metrics for key in metrics):
raise ValueError('Metric names between evaluators must not collide.')
all_evaluator_metrics.update(metrics)
global_step = tf.train.global_step(sess, tf.train.get_global_step())
sess.close()
return (global_step, all_evaluator_metrics)
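
A minimal batch_processor sketch matching the four-argument signature that _run_checkpoint_once now calls with (tensor_dict, sess, batch_index, counters); it mirrors the default behaviour described in the docstring and is illustrative only:

```python
import tensorflow as tf

def simple_batch_processor(tensor_dict, sess, batch_index, counters):
  """Runs one batch and updates the success/skipped counters."""
  try:
    result_dict = sess.run(tensor_dict)
    counters['success'] += 1
  except tf.errors.InvalidArgumentError:
    # Returning an empty dict skips the image, as the docstring above notes.
    counters['skipped'] += 1
    result_dict = {}
  return result_dict
```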
# TODO: Add tests.
def repeated_checkpoint_run(tensor_dict,
update_op,
summary_dir,
aggregated_result_processor=None,
evaluators,
batch_processor=None,
checkpoint_dirs=None,
variables_to_restore=None,
......@@ -425,9 +301,7 @@ def repeated_checkpoint_run(tensor_dict,
max_number_of_evaluations=None,
master='',
save_graph=False,
save_graph_dir='',
metric_names_to_values=None,
keys_to_exclude_from_results=()):
save_graph_dir=''):
"""Periodically evaluates desired tensors using checkpoint_dirs or restore_fn.
This function repeatedly loads a checkpoint and evaluates a desired
......@@ -438,21 +312,16 @@ def repeated_checkpoint_run(tensor_dict,
Args:
tensor_dict: a dictionary holding tensors representing a batch of detections
and corresponding groundtruth annotations.
update_op: a tensorflow update op that will run for each batch along with
the tensors in tensor_dict.
summary_dir: a directory to write metrics summaries.
aggregated_result_processor: a function taking one argument:
1. result_lists: a dictionary with keys matching those in tensor_dict
and corresponding values being the list of results for each tensor
in tensor_dict. The length of each such list is num_batches.
evaluators: a list of objects of type DetectionEvaluator to be used for
evaluation. Note that the metric names produced by different evaluators
must be unique.
batch_processor: a function taking four arguments:
1. tensor_dict: the same tensor_dict that is passed in as the first
argument to this function.
2. sess: a tensorflow session
3. batch_index: an integer representing the index of the batch amongst
all batches
4. counters: a dictionary holding 'success' and 'skipped' counts that the
processor is expected to update.
and returns result_dict, a dictionary of results for that batch.
By default, batch_processor is None, which defaults to running:
return sess.run(tensor_dict)
checkpoint_dirs: list of directories to load into a DetectionModel or an
......@@ -472,14 +341,10 @@ def repeated_checkpoint_run(tensor_dict,
save_graph: whether or not the Tensorflow graph is saved as a pbtxt file.
save_graph_dir: where to save on disk the Tensorflow graph. If store_graph
is True this must be non-empty.
metric_names_to_values: A dictionary containing metric names to tensors
which will be evaluated after processing all batches
of [tensor_dict, update_op]. If any metrics depend on statistics computed
during each batch ensure that `update_op` tensor has a control dependency
on the update ops that compute the statistics.
keys_to_exclude_from_results: keys in tensor_dict that will be excluded
from results_list. Note that the tensors corresponding to these keys will
still be evaluated for each batch, but won't be added to results_list.
Returns:
metrics: A dictionary containing metric names and values in the latest
evaluation.
Raises:
ValueError: if max_number_of_evaluations is neither None nor a positive number.
......@@ -496,8 +361,8 @@ def repeated_checkpoint_run(tensor_dict,
number_of_evaluations = 0
while True:
start = time.time()
logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
time.gmtime()))
logging.info('Starting evaluation at ' + time.strftime(
'%Y-%m-%d-%H:%M:%S', time.gmtime()))
model_path = tf.train.latest_checkpoint(checkpoint_dirs[0])
if not model_path:
logging.info('No model found in %s. Will try again in %d seconds',
......@@ -507,12 +372,14 @@ def repeated_checkpoint_run(tensor_dict,
'seconds', eval_interval_secs)
else:
last_evaluated_model_path = model_path
run_checkpoint_once(tensor_dict, update_op, summary_dir,
aggregated_result_processor,
batch_processor, checkpoint_dirs,
variables_to_restore, restore_fn, num_batches, master,
save_graph, save_graph_dir, metric_names_to_values,
keys_to_exclude_from_results)
global_step, metrics = _run_checkpoint_once(tensor_dict, evaluators,
batch_processor,
checkpoint_dirs,
variables_to_restore,
restore_fn, num_batches,
master, save_graph,
save_graph_dir)
write_metrics(metrics, global_step, summary_dir)
number_of_evaluations += 1
if (max_number_of_evaluations and
......@@ -522,3 +389,128 @@ def repeated_checkpoint_run(tensor_dict,
time_to_next_eval = start + eval_interval_secs - time.time()
if time_to_next_eval > 0:
time.sleep(time_to_next_eval)
return metrics
def result_dict_for_single_example(image,
key,
detections,
groundtruth=None,
class_agnostic=False,
scale_to_absolute=False):
"""Merges all detection and groundtruth information for a single example.
Note that evaluation tools require classes that are 1-indexed, and so this
function performs the offset. If `class_agnostic` is True, all output classes
have label 1.
Args:
image: A single 4D image tensor of shape [1, H, W, C].
key: A single string tensor identifying the image.
detections: A dictionary of detections, returned from
DetectionModel.postprocess().
groundtruth: (Optional) Dictionary of groundtruth items, with fields:
'groundtruth_boxes': [num_boxes, 4] float32 tensor of boxes, in
normalized coordinates.
'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes.
'groundtruth_area': [num_boxes] float32 tensor of bbox area. (Optional)
'groundtruth_is_crowd': [num_boxes] int64 tensor. (Optional)
'groundtruth_difficult': [num_boxes] int64 tensor. (Optional)
'groundtruth_group_of': [num_boxes] int64 tensor. (Optional)
'groundtruth_instance_masks': 3D int64 tensor of instance masks
(Optional).
class_agnostic: Boolean indicating whether the detections are class-agnostic
(i.e. binary). Default False.
scale_to_absolute: Boolean indicating whether boxes, masks, keypoints should
be scaled to absolute coordinates. Note that for IoU based evaluations,
it does not matter whether boxes are expressed in absolute or relative
coordinates. Default False.
Returns:
A dictionary with:
'original_image': A [1, H, W, C] uint8 image tensor.
'key': A string tensor with image identifier.
'detection_boxes': [max_detections, 4] float32 tensor of boxes, in
normalized or absolute coordinates, depending on the value of
`scale_to_absolute`.
'detection_scores': [max_detections] float32 tensor of scores.
'detection_classes': [max_detections] int64 tensor of 1-indexed classes.
'detection_masks': [max_detections, None, None] float32 tensor of binarized
masks. (Only present if available in `detections`)
'groundtruth_boxes': [num_boxes, 4] float32 tensor of boxes, in
normalized or absolute coordinates, depending on the value of
`scale_to_absolute`. (Optional)
'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes.
(Optional)
'groundtruth_area': [num_boxes] float32 tensor of bbox area. (Optional)
'groundtruth_is_crowd': [num_boxes] int64 tensor. (Optional)
'groundtruth_difficult': [num_boxes] int64 tensor. (Optional)
'groundtruth_group_of': [num_boxes] int64 tensor. (Optional)
'groundtruth_instance_masks': 3D int64 tensor of instance masks
(Optional).
"""
label_id_offset = 1 # Applying label id offset (b/63711816)
input_data_fields = fields.InputDataFields()
output_dict = {
input_data_fields.original_image: image,
input_data_fields.key: key,
}
detection_fields = fields.DetectionResultFields
detection_boxes = detections[detection_fields.detection_boxes][0]
output_dict[detection_fields.detection_boxes] = detection_boxes
image_shape = tf.shape(image)
if scale_to_absolute:
absolute_detection_boxlist = box_list_ops.to_absolute_coordinates(
box_list.BoxList(detection_boxes), image_shape[1], image_shape[2])
output_dict[detection_fields.detection_boxes] = (
absolute_detection_boxlist.get())
detection_scores = detections[detection_fields.detection_scores][0]
output_dict[detection_fields.detection_scores] = detection_scores
if class_agnostic:
detection_classes = tf.ones_like(detection_scores, dtype=tf.int64)
else:
detection_classes = (
tf.to_int64(detections[detection_fields.detection_classes][0]) +
label_id_offset)
output_dict[detection_fields.detection_classes] = detection_classes
if detection_fields.detection_masks in detections:
detection_masks = detections[detection_fields.detection_masks][0]
output_dict[detection_fields.detection_masks] = detection_masks
if scale_to_absolute:
# TODO: This should be done in model's postprocess
# function ideally.
detection_masks_reframed = ops.reframe_box_masks_to_image_masks(
detection_masks, detection_boxes, image_shape[1], image_shape[2])
detection_masks_reframed = tf.to_float(
tf.greater(detection_masks_reframed, 0.5))
output_dict[detection_fields.detection_masks] = detection_masks_reframed
if detection_fields.detection_keypoints in detections:
detection_keypoints = detections[detection_fields.detection_keypoints][0]
output_dict[detection_fields.detection_keypoints] = detection_keypoints
if scale_to_absolute:
absolute_detection_keypoints = keypoint_ops.scale(
detection_keypoints, image_shape[1], image_shape[2])
output_dict[detection_fields.detection_keypoints] = (
absolute_detection_keypoints)
if groundtruth:
output_dict.update(groundtruth)
if scale_to_absolute:
groundtruth_boxes = groundtruth[input_data_fields.groundtruth_boxes]
absolute_gt_boxlist = box_list_ops.to_absolute_coordinates(
box_list.BoxList(groundtruth_boxes), image_shape[1], image_shape[2])
output_dict[input_data_fields.groundtruth_boxes] = (
absolute_gt_boxlist.get())
# For class-agnostic models, groundtruth classes all become 1.
if class_agnostic:
groundtruth_classes = groundtruth[input_data_fields.groundtruth_classes]
groundtruth_classes = tf.ones_like(groundtruth_classes, dtype=tf.int64)
output_dict[input_data_fields.groundtruth_classes] = groundtruth_classes
return output_dict
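
A hedged usage sketch for the new result_dict_for_single_example helper, mirroring how evaluator._extract_prediction_tensors (diffed below) calls it; `image`, `detections` (from DetectionModel.postprocess) and `input_dict` are assumed to already exist:

```python
from object_detection import eval_util
from object_detection.core import standard_fields as fields

tensor_dict = eval_util.result_dict_for_single_example(
    image,
    input_dict[fields.InputDataFields.source_id],
    detections,
    groundtruth={
        fields.InputDataFields.groundtruth_boxes:
            input_dict[fields.InputDataFields.groundtruth_boxes],
        fields.InputDataFields.groundtruth_classes:
            input_dict[fields.InputDataFields.groundtruth_classes],
    },
    scale_to_absolute=True)
```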
......@@ -12,26 +12,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Detection model evaluator.
This file provides a generic evaluation method that can be used to evaluate a
DetectionModel.
"""
import logging
import tensorflow as tf
from object_detection import eval_util
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import prefetcher
from object_detection.core import standard_fields as fields
from object_detection.utils import ops
slim = tf.contrib.slim
EVAL_METRICS_FN_DICT = {
'pascal_voc_metrics': eval_util.evaluate_detection_results_pascal_voc
from object_detection.utils import object_detection_evaluation
# A dictionary of metric names to classes that implement the metric. The classes
# in the dictionary must implement the
# utils.object_detection_evaluation.DetectionEvaluator interface.
EVAL_METRICS_CLASS_DICT = {
'pascal_voc_metrics':
object_detection_evaluation.PascalDetectionEvaluator,
'weighted_pascal_voc_metrics':
object_detection_evaluation.WeightedPascalDetectionEvaluator,
'open_images_metrics':
object_detection_evaluation.OpenImagesDetectionEvaluator
}
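
A rough sketch of how these evaluator classes are driven (see eval_util._run_checkpoint_once above); `categories` and the `per_image_results` iterable are assumed to exist, and a custom evaluator would implement the same four methods of DetectionEvaluator:

```python
evaluator = object_detection_evaluation.PascalDetectionEvaluator(
    categories=categories)
for image_id, result_dict in enumerate(per_image_results):
  # Groundtruth must be added before the corresponding detections.
  evaluator.add_single_ground_truth_image_info(
      image_id=image_id, groundtruth_dict=result_dict)
  evaluator.add_single_detected_image_info(
      image_id=image_id, detections_dict=result_dict)
metrics = evaluator.evaluate()
evaluator.clear()
```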
......@@ -56,54 +60,56 @@ def _extract_prediction_tensors(model,
prediction_dict = model.predict(preprocessed_image)
detections = model.postprocess(prediction_dict)
original_image_shape = tf.shape(original_image)
absolute_detection_boxlist = box_list_ops.to_absolute_coordinates(
box_list.BoxList(tf.squeeze(detections['detection_boxes'], axis=0)),
original_image_shape[1], original_image_shape[2])
label_id_offset = 1
tensor_dict = {
'original_image': original_image,
'image_id': input_dict[fields.InputDataFields.source_id],
'detection_boxes': absolute_detection_boxlist.get(),
'detection_scores': tf.squeeze(detections['detection_scores'], axis=0),
'detection_classes': (
tf.squeeze(detections['detection_classes'], axis=0) +
label_id_offset),
}
if 'detection_masks' in detections:
detection_masks = tf.squeeze(detections['detection_masks'],
axis=0)
detection_boxes = tf.squeeze(detections['detection_boxes'],
axis=0)
# TODO: This should be done in model's postprocess function ideally.
detection_masks_reframed = ops.reframe_box_masks_to_image_masks(
detection_masks,
detection_boxes,
original_image_shape[1], original_image_shape[2])
detection_masks_reframed = tf.to_float(tf.greater(detection_masks_reframed,
0.5))
tensor_dict['detection_masks'] = detection_masks_reframed
# load groundtruth fields into tensor_dict
groundtruth = None
if not ignore_groundtruth:
normalized_gt_boxlist = box_list.BoxList(
input_dict[fields.InputDataFields.groundtruth_boxes])
gt_boxlist = box_list_ops.scale(normalized_gt_boxlist,
tf.shape(original_image)[1],
tf.shape(original_image)[2])
groundtruth_boxes = gt_boxlist.get()
groundtruth_classes = input_dict[fields.InputDataFields.groundtruth_classes]
tensor_dict['groundtruth_boxes'] = groundtruth_boxes
tensor_dict['groundtruth_classes'] = groundtruth_classes
tensor_dict['area'] = input_dict[fields.InputDataFields.groundtruth_area]
tensor_dict['is_crowd'] = input_dict[
fields.InputDataFields.groundtruth_is_crowd]
tensor_dict['difficult'] = input_dict[
fields.InputDataFields.groundtruth_difficult]
if 'detection_masks' in tensor_dict:
tensor_dict['groundtruth_instance_masks'] = input_dict[
fields.InputDataFields.groundtruth_instance_masks]
return tensor_dict
groundtruth = {
fields.InputDataFields.groundtruth_boxes:
input_dict[fields.InputDataFields.groundtruth_boxes],
fields.InputDataFields.groundtruth_classes:
input_dict[fields.InputDataFields.groundtruth_classes],
fields.InputDataFields.groundtruth_area:
input_dict[fields.InputDataFields.groundtruth_area],
fields.InputDataFields.groundtruth_is_crowd:
input_dict[fields.InputDataFields.groundtruth_is_crowd],
fields.InputDataFields.groundtruth_difficult:
input_dict[fields.InputDataFields.groundtruth_difficult]
}
if fields.InputDataFields.groundtruth_group_of in input_dict:
groundtruth[fields.InputDataFields.groundtruth_group_of] = (
input_dict[fields.InputDataFields.groundtruth_group_of])
if fields.DetectionResultFields.detection_masks in detections:
groundtruth[fields.InputDataFields.groundtruth_instance_masks] = (
input_dict[fields.InputDataFields.groundtruth_instance_masks])
return eval_util.result_dict_for_single_example(
original_image,
input_dict[fields.InputDataFields.source_id],
detections,
groundtruth,
class_agnostic=(
fields.DetectionResultFields.detection_classes not in detections),
scale_to_absolute=True)
def get_evaluators(eval_config, categories):
"""Returns the evaluator class according to eval_config, valid for categories.
Args:
eval_config: evaluation configurations.
categories: a list of categories to evaluate.
Returns:
A list of instances of DetectionEvaluator.
Raises:
ValueError: if metric is not in the metric class dictionary.
"""
eval_metric_fn_key = eval_config.metrics_set
if eval_metric_fn_key not in EVAL_METRICS_CLASS_DICT:
raise ValueError('Metric not found: {}'.format(eval_metric_fn_key))
return [
EVAL_METRICS_CLASS_DICT[eval_metric_fn_key](
categories=categories)
]
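
Illustrative only: selecting evaluators from an eval config. This assumes `metrics_set` is a singular string field on eval_pb2.EvalConfig, as its use above suggests; the value shown is one of the keys of EVAL_METRICS_CLASS_DICT:

```python
from object_detection.protos import eval_pb2

eval_config = eval_pb2.EvalConfig()
eval_config.metrics_set = 'weighted_pascal_voc_metrics'
evaluators = get_evaluators(eval_config, categories)  # `categories` assumed to exist
```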
def evaluate(create_input_dict_fn, create_model_fn, eval_config, categories,
......@@ -118,6 +124,10 @@ def evaluate(create_input_dict_fn, create_model_fn, eval_config, categories,
have an integer 'id' field and string 'name' field.
checkpoint_dir: directory to load the checkpoints to evaluate from.
eval_dir: directory to write evaluation metrics summary to.
Returns:
metrics: A dictionary containing metric names and values from the latest
run.
"""
model = create_model_fn()
......@@ -131,7 +141,7 @@ def evaluate(create_input_dict_fn, create_model_fn, eval_config, categories,
create_input_dict_fn=create_input_dict_fn,
ignore_groundtruth=eval_config.ignore_groundtruth)
def _process_batch(tensor_dict, sess, batch_index, counters, update_op):
def _process_batch(tensor_dict, sess, batch_index, counters):
"""Evaluates tensors in tensor_dict, visualizing the first K examples.
This function calls sess.run on tensor_dict, evaluating the original_image
......@@ -146,66 +156,57 @@ def evaluate(create_input_dict_fn, create_model_fn, eval_config, categories,
be updated to keep track of number of successful and failed runs,
respectively. If these fields are not updated, then the success/skipped
counter values shown at the end of evaluation will be incorrect.
update_op: An update op that has to be run along with output tensors. For
example this could be an op to compute statistics for slim metrics.
Returns:
result_dict: a dictionary of numpy arrays
"""
if batch_index >= eval_config.num_visualizations:
if 'original_image' in tensor_dict:
tensor_dict = {k: v for (k, v) in tensor_dict.items()
if k != 'original_image'}
try:
(result_dict, _) = sess.run([tensor_dict, update_op])
result_dict = sess.run(tensor_dict)
counters['success'] += 1
except tf.errors.InvalidArgumentError:
logging.info('Skipping image')
counters['skipped'] += 1
return {}
global_step = tf.train.global_step(sess, slim.get_global_step())
global_step = tf.train.global_step(sess, tf.train.get_global_step())
if batch_index < eval_config.num_visualizations:
tag = 'image-{}'.format(batch_index)
eval_util.visualize_detection_results(
result_dict, tag, global_step, categories=categories,
result_dict,
tag,
global_step,
categories=categories,
summary_dir=eval_dir,
export_dir=eval_config.visualization_export_dir,
show_groundtruth=eval_config.visualization_export_dir)
return result_dict
def _process_aggregated_results(result_lists):
eval_metric_fn_key = eval_config.metrics_set
if eval_metric_fn_key not in EVAL_METRICS_FN_DICT:
raise ValueError('Metric not found: {}'.format(eval_metric_fn_key))
return EVAL_METRICS_FN_DICT[eval_metric_fn_key](result_lists,
categories=categories)
variables_to_restore = tf.global_variables()
global_step = slim.get_or_create_global_step()
global_step = tf.train.get_or_create_global_step()
variables_to_restore.append(global_step)
if eval_config.use_moving_averages:
variable_averages = tf.train.ExponentialMovingAverage(0.0)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
def _restore_latest_checkpoint(sess):
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
saver.restore(sess, latest_checkpoint)
eval_util.repeated_checkpoint_run(
metrics = eval_util.repeated_checkpoint_run(
tensor_dict=tensor_dict,
update_op=tf.no_op(),
summary_dir=eval_dir,
aggregated_result_processor=_process_aggregated_results,
evaluators=get_evaluators(eval_config, categories),
batch_processor=_process_batch,
checkpoint_dirs=[checkpoint_dir],
variables_to_restore=None,
restore_fn=_restore_latest_checkpoint,
num_batches=eval_config.num_examples,
eval_interval_secs=eval_config.eval_interval_secs,
max_number_of_evaluations=(
1 if eval_config.ignore_groundtruth else
eval_config.max_evals if eval_config.max_evals else
None),
max_number_of_evaluations=(1 if eval_config.ignore_groundtruth else
eval_config.max_evals
if eval_config.max_evals else None),
master=eval_config.eval_master,
save_graph=eval_config.save_graph,
save_graph_dir=(eval_dir if eval_config.save_graph else ''))
return metrics
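
With this change evaluate() now returns the metrics dictionary, so callers can capture it directly. A hedged sketch mirroring the call at the end of the eval.py diff above, where the arguments are the objects built in eval.py's main():

```python
# Positional arguments match eval.py's call: evaluator.evaluate(...).
metrics = evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                   FLAGS.checkpoint_dir, FLAGS.eval_dir)
# `metrics` maps metric names (e.g. mAP at the configured IoU) to scalar values.
```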