Unverified Commit 62ce5d2a authored by pkulzc's avatar pkulzc Committed by GitHub
Browse files

Object detection and slim changes (#5843)

Add more eval metrics to model_main and support group norm for mobilenet v1 based models.
parents 7d032ea3 a1337e01
...@@ -23,6 +23,44 @@ message Loss { ...@@ -23,6 +23,44 @@ message Loss {
// If not left to default, applies random example sampling. // If not left to default, applies random example sampling.
optional RandomExampleSampler random_example_sampler = 6; optional RandomExampleSampler random_example_sampler = 6;
// Configuration for the equalization loss term.
message EqualizationLoss {
// Strength (multiplier) of the equalization loss. The default of 0.0
// disables the loss entirely.
optional float weight = 1 [default=0.0];
// When computing the equalization loss, ops whose names start with any of
// these prefixes are ignored. Only consulted when `weight` > 0.
repeated string exclude_prefixes = 2;
}
// Equalization loss options; inactive at the default weight of 0.
optional EqualizationLoss equalization_loss = 7;
// Strategies for weighting the classification loss by expected sampling
// outcomes instead of explicitly sampling anchors.
// NOTE(review): values are not prefixed with the enum name (e.g.
// EXPECTED_LOSS_WEIGHTS_NONE); renaming them now would break generated
// code, so they are kept as-is.
enum ExpectedLossWeights {
// Do not compute expected loss weights; fall back to explicit sampling.
NONE = 0;
// Use expected_classification_loss_by_expected_sampling
// from third_party/tensorflow_models/object_detection/utils/ops.py
EXPECTED_SAMPLING = 1;
// Use expected_classification_loss_by_reweighting_unmatched_anchors
// from third_party/tensorflow_models/object_detection/utils/ops.py
REWEIGHTING_UNMATCHED_ANCHORS = 2;
}
// Method to compute expected loss weights with respect to balanced
// positive/negative sampling scheme. If NONE, use explicit sampling.
// TODO(birdbrain): Move under ExpectedLossWeights.
optional ExpectedLossWeights expected_loss_weights = 18 [default = NONE];
// Minimum number of effective negative samples.
// Only applies if expected_loss_weights is not NONE.
// TODO(birdbrain): Move under ExpectedLossWeights.
optional float min_num_negative_samples = 19 [default=0];
// Desired number of effective negative samples per positive sample.
// Only applies if expected_loss_weights is not NONE.
// TODO(birdbrain): Move under ExpectedLossWeights.
optional float desired_negative_sampling_ratio = 20 [default=3];
} }
// Configuration for bounding box localization loss function. // Configuration for bounding box localization loss function.
......
...@@ -166,13 +166,13 @@ message RandomCropImage { ...@@ -166,13 +166,13 @@ message RandomCropImage {
message RandomPadImage { message RandomPadImage {
// Minimum dimensions for padded image. If unset, will use original image // Minimum dimensions for padded image. If unset, will use original image
// dimension as a lower bound. // dimension as a lower bound.
optional float min_image_height = 1; optional int32 min_image_height = 1;
optional float min_image_width = 2; optional int32 min_image_width = 2;
// Maximum dimensions for padded image. If unset, will use double the original // Maximum dimensions for padded image. If unset, will use double the original
// image dimension as a lower bound. // image dimension as a lower bound.
optional float max_image_height = 3; optional int32 max_image_height = 3;
optional float max_image_width = 4; optional int32 max_image_width = 4;
// Color of the padding. If unset, will pad using average color of the input // Color of the padding. If unset, will pad using average color of the input
// image. // image.
......
...@@ -12,7 +12,7 @@ import "object_detection/protos/post_processing.proto"; ...@@ -12,7 +12,7 @@ import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto"; import "object_detection/protos/region_similarity_calculator.proto";
// Configuration for Single Shot Detection (SSD) models. // Configuration for Single Shot Detection (SSD) models.
// Next id: 22 // Next id: 26
message Ssd { message Ssd {
// Number of classes to predict. // Number of classes to predict.
...@@ -35,7 +35,7 @@ message Ssd { ...@@ -35,7 +35,7 @@ message Ssd {
// Whether background targets are to be encoded as an all // Whether background targets are to be encoded as an all
// zeros vector or a one-hot vector (where background is the 0th class). // zeros vector or a one-hot vector (where background is the 0th class).
optional bool encode_background_as_zeros = 12 [default=false]; optional bool encode_background_as_zeros = 12 [default = false];
// classification weight to be associated to negative // classification weight to be associated to negative
// anchors (default: 1.0). The weight must be in [0., 1.]. // anchors (default: 1.0). The weight must be in [0., 1.].
...@@ -52,11 +52,11 @@ message Ssd { ...@@ -52,11 +52,11 @@ message Ssd {
// Whether to normalize the loss by number of groundtruth boxes that match to // Whether to normalize the loss by number of groundtruth boxes that match to
// the anchors. // the anchors.
optional bool normalize_loss_by_num_matches = 10 [default=true]; optional bool normalize_loss_by_num_matches = 10 [default = true];
// Whether to normalize the localization loss by the code size of the box // Whether to normalize the localization loss by the code size of the box
// encodings. This is applied along with other normalization factors. // encodings. This is applied along with other normalization factors.
optional bool normalize_loc_loss_by_codesize = 14 [default=false]; optional bool normalize_loc_loss_by_codesize = 14 [default = false];
// Loss configuration for training. // Loss configuration for training.
optional Loss loss = 11; optional Loss loss = 11;
...@@ -82,29 +82,66 @@ message Ssd { ...@@ -82,29 +82,66 @@ message Ssd {
// to update the batch norm moving average parameters. // to update the batch norm moving average parameters.
optional bool inplace_batchnorm_update = 15 [default = false]; optional bool inplace_batchnorm_update = 15 [default = false];
// Whether to weight the regression loss by the score of the ground truth box // Whether to add an implicit background class to one-hot encodings of
// the anchor matches to. // groundtruth labels. Set to false if training a single
optional bool weight_regression_loss_by_score = 17 [default=false]; // class model or using an explicit background class.
optional bool add_background_class = 21 [default = true];
// Whether to compute expected loss with respect to balanced positive/negative // Whether to use an explicit background class. Set to true if using
// sampling scheme. If false, use explicit sampling. // groundtruth labels with an explicit background class, as in multiclass
optional bool use_expected_classification_loss_under_sampling = 18 [default=false]; // scores.
optional bool explicit_background_class = 24 [default = false];
// Minimum number of effective negative samples. optional bool use_confidences_as_targets = 22 [default = false];
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float min_num_negative_samples = 19 [default=0];
// Desired number of effective negative samples per positive sample. optional float implicit_example_weight = 23 [default = 1.0];
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float desired_negative_sampling_ratio = 20 [default=3];
// Whether to add an implicit background class to one-hot encodings of // Configuration proto for MaskHead.
// groundtruth labels. Set to false if using groundtruth labels with an // Next id: 11
// explicit background class, using multiclass scores, or if training a single message MaskHead {
// class model. // The height and the width of the predicted mask. Only used when
optional bool add_background_class = 21 [default = true]; // predict_instance_masks is true.
} optional int32 mask_height = 1 [default = 15];
optional int32 mask_width = 2 [default = 15];
// Whether to predict class agnostic masks. Only used when
// predict_instance_masks is true.
optional bool masks_are_class_agnostic = 3 [default = true];
// The depth for the first conv2d_transpose op applied to the
// image_features in the mask prediction branch. If set to 0, the value
// will be set automatically based on the number of channels in the image
// features and the number of classes.
optional int32 mask_prediction_conv_depth = 4 [default = 256];
// The number of convolutions applied to image_features in the mask prediction
// branch.
optional int32 mask_prediction_num_conv_layers = 5 [default = 2];
// Whether to apply convolutions on mask features before upsampling using
// nearest neighbor resizing.
// By default, mask features are resized to [`mask_height`, `mask_width`]
// before applying convolutions and predicting masks.
optional bool convolve_then_upsample_masks = 6 [default = false];
// Mask loss weight.
optional float mask_loss_weight = 7 [default=5.0];
// Number of boxes to be generated at training time for computing mask loss.
optional int32 mask_loss_sample_size = 8 [default=16];
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 9;
// Output size (width and height are set to be the same) of the initial
// bilinear interpolation based cropping during ROI pooling. Only used when
// we have second stage prediction head enabled (e.g. mask head).
optional int32 initial_crop_size = 10 [default = 15];
}
// Configs for mask head.
optional MaskHead mask_head_config = 25;
}
message SsdFeatureExtractor { message SsdFeatureExtractor {
reserved 6; reserved 6;
...@@ -113,10 +150,10 @@ message SsdFeatureExtractor { ...@@ -113,10 +150,10 @@ message SsdFeatureExtractor {
optional string type = 1; optional string type = 1;
// The factor to alter the depth of the channels in the feature extractor. // The factor to alter the depth of the channels in the feature extractor.
optional float depth_multiplier = 2 [default=1.0]; optional float depth_multiplier = 2 [default = 1.0];
// Minimum number of the channels in the feature extractor. // Minimum number of the channels in the feature extractor.
optional int32 min_depth = 3 [default=16]; optional int32 min_depth = 3 [default = 16];
// Hyperparameters that affect the layers of feature extractor added on top // Hyperparameters that affect the layers of feature extractor added on top
// of the base feature extractor. // of the base feature extractor.
...@@ -128,7 +165,8 @@ message SsdFeatureExtractor { ...@@ -128,7 +165,8 @@ message SsdFeatureExtractor {
// layers while base feature extractor uses its own default hyperparams. If // layers while base feature extractor uses its own default hyperparams. If
// this value is set to true, the base feature extractor's hyperparams will be // this value is set to true, the base feature extractor's hyperparams will be
// overridden with the `conv_hyperparams`. // overridden with the `conv_hyperparams`.
optional bool override_base_feature_extractor_hyperparams = 9 [default = false]; optional bool override_base_feature_extractor_hyperparams = 9
[default = false];
// The nearest multiple to zero-pad the input height and width dimensions to. // The nearest multiple to zero-pad the input height and width dimensions to.
// For example, if pad_to_multiple = 2, input dimensions are zero-padded // For example, if pad_to_multiple = 2, input dimensions are zero-padded
...@@ -138,11 +176,11 @@ message SsdFeatureExtractor { ...@@ -138,11 +176,11 @@ message SsdFeatureExtractor {
// Whether to use explicit padding when extracting SSD multiresolution // Whether to use explicit padding when extracting SSD multiresolution
// features. This will also apply to the base feature extractor if a MobileNet // features. This will also apply to the base feature extractor if a MobileNet
// architecture is used. // architecture is used.
optional bool use_explicit_padding = 7 [default=false]; optional bool use_explicit_padding = 7 [default = false];
// Whether to use depthwise separable convolutions for to extract additional // Whether to use depthwise separable convolutions for to extract additional
// feature maps added by SSD. // feature maps added by SSD.
optional bool use_depthwise = 8 [default=false]; optional bool use_depthwise = 8 [default = false];
// Feature Pyramid Networks config. // Feature Pyramid Networks config.
optional FeaturePyramidNetworks fpn = 10; optional FeaturePyramidNetworks fpn = 10;
...@@ -173,4 +211,3 @@ message FeaturePyramidNetworks { ...@@ -173,4 +211,3 @@ message FeaturePyramidNetworks {
// channel depth for additional coarse feature layers. // channel depth for additional coarse feature layers.
optional int32 additional_layer_depth = 3 [default = 256]; optional int32 additional_layer_depth = 3 [default = 256];
} }
...@@ -20,7 +20,7 @@ message TrainConfig { ...@@ -20,7 +20,7 @@ message TrainConfig {
optional bool sync_replicas = 3 [default=false]; optional bool sync_replicas = 3 [default=false];
// How frequently to keep checkpoints. // How frequently to keep checkpoints.
optional uint32 keep_checkpoint_every_n_hours = 4 [default=1000]; optional float keep_checkpoint_every_n_hours = 4 [default=10000.0];
// Optimizer used to train the DetectionModel. // Optimizer used to train the DetectionModel.
optional Optimizer optimizer = 5; optional Optimizer optimizer = 5;
......
...@@ -33,6 +33,7 @@ import collections ...@@ -33,6 +33,7 @@ import collections
import logging import logging
import unicodedata import unicodedata
import numpy as np import numpy as np
import tensorflow as tf
from object_detection.core import standard_fields from object_detection.core import standard_fields
from object_detection.utils import label_map_util from object_detection.utils import label_map_util
...@@ -126,6 +127,7 @@ class ObjectDetectionEvaluator(DetectionEvaluator): ...@@ -126,6 +127,7 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
categories, categories,
matching_iou_threshold=0.5, matching_iou_threshold=0.5,
evaluate_corlocs=False, evaluate_corlocs=False,
evaluate_precision_recall=False,
metric_prefix=None, metric_prefix=None,
use_weighted_mean_ap=False, use_weighted_mean_ap=False,
evaluate_masks=False, evaluate_masks=False,
...@@ -140,6 +142,8 @@ class ObjectDetectionEvaluator(DetectionEvaluator): ...@@ -140,6 +142,8 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
boxes to detection boxes. boxes to detection boxes.
evaluate_corlocs: (optional) boolean which determines if corloc scores evaluate_corlocs: (optional) boolean which determines if corloc scores
are to be returned or not. are to be returned or not.
evaluate_precision_recall: (optional) boolean which determines if
precision and recall values are to be returned or not.
metric_prefix: (optional) string prefix for metric name; if None, no metric_prefix: (optional) string prefix for metric name; if None, no
prefix is used. prefix is used.
use_weighted_mean_ap: (optional) boolean which determines if the mean use_weighted_mean_ap: (optional) boolean which determines if the mean
...@@ -174,7 +178,50 @@ class ObjectDetectionEvaluator(DetectionEvaluator): ...@@ -174,7 +178,50 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
group_of_weight=self._group_of_weight) group_of_weight=self._group_of_weight)
self._image_ids = set([]) self._image_ids = set([])
self._evaluate_corlocs = evaluate_corlocs self._evaluate_corlocs = evaluate_corlocs
self._evaluate_precision_recall = evaluate_precision_recall
self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''
self._expected_keys = set([
standard_fields.InputDataFields.key,
standard_fields.InputDataFields.groundtruth_boxes,
standard_fields.InputDataFields.groundtruth_classes,
standard_fields.InputDataFields.groundtruth_difficult,
standard_fields.InputDataFields.groundtruth_instance_masks,
standard_fields.DetectionResultFields.detection_boxes,
standard_fields.DetectionResultFields.detection_scores,
standard_fields.DetectionResultFields.detection_classes,
standard_fields.DetectionResultFields.detection_masks
])
self._build_metric_names()
  def _build_metric_names(self):
    """Builds the ordered list of metric display names.

    Populates `self._metric_names` with, in order:
      1. mAP at the configured IOU threshold. This is always index 0; other
         methods (e.g. `get_estimator_eval_metric_ops`) rely on
         `self._metric_names[0]` being the mAP key.
      2. mean CorLoc, if `self._evaluate_corlocs` is set.
      3. A per-category AP entry (and, optionally, a CorLoc entry) for each
         class id present in the category index.
    """
    self._metric_names = [
        self._metric_prefix + 'Precision/mAP@{}IOU'.format(
            self._matching_iou_threshold)
    ]
    if self._evaluate_corlocs:
      self._metric_names.append(
          self._metric_prefix +
          'Precision/meanCorLoc@{}IOU'.format(self._matching_iou_threshold))
    category_index = label_map_util.create_category_index(self._categories)
    for idx in range(self._num_classes):
      # Class indices are offset by the label id offset when looking up
      # category metadata; ids absent from the index get no metric name.
      if idx + self._label_id_offset in category_index:
        category_name = category_index[idx + self._label_id_offset]['name']
        try:
          # Python 2: decode byte strings to unicode. unicode() raises
          # TypeError when category_name is already a unicode object.
          category_name = unicode(category_name, 'utf-8')
        except TypeError:
          pass
        # Normalize to plain ASCII so the name is safe inside metric keys.
        category_name = unicodedata.normalize('NFKD', category_name).encode(
            'ascii', 'ignore')
        self._metric_names.append(
            self._metric_prefix + 'PerformanceByCategory/AP@{}IOU/{}'.format(
                self._matching_iou_threshold, category_name))
        if self._evaluate_corlocs:
          self._metric_names.append(
              self._metric_prefix + 'PerformanceByCategory/CorLoc@{}IOU/{}'
              .format(self._matching_iou_threshold, category_name))
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Adds groundtruth for a single image to be used for evaluation. """Adds groundtruth for a single image to be used for evaluation.
...@@ -283,22 +330,19 @@ class ObjectDetectionEvaluator(DetectionEvaluator): ...@@ -283,22 +330,19 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
A dictionary of metrics with the following fields - A dictionary of metrics with the following fields -
1. summary_metrics: 1. summary_metrics:
'Precision/mAP@<matching_iou_threshold>IOU': mean average precision at '<prefix if not empty>_Precision/mAP@<matching_iou_threshold>IOU': mean
the specified IOU threshold. average precision at the specified IOU threshold.
2. per_category_ap: category specific results with keys of the form 2. per_category_ap: category specific results with keys of the form
'PerformanceByCategory/mAP@<matching_iou_threshold>IOU/category'. '<prefix if not empty>_PerformanceByCategory/
mAP@<matching_iou_threshold>IOU/category'.
""" """
(per_class_ap, mean_ap, _, _, per_class_corloc, mean_corloc) = ( (per_class_ap, mean_ap, per_class_precision, per_class_recall,
self._evaluation.evaluate()) per_class_corloc, mean_corloc) = (
pascal_metrics = { self._evaluation.evaluate())
self._metric_prefix + pascal_metrics = {self._metric_names[0]: mean_ap}
'Precision/mAP@{}IOU'.format(self._matching_iou_threshold):
mean_ap
}
if self._evaluate_corlocs: if self._evaluate_corlocs:
pascal_metrics[self._metric_prefix + 'Precision/meanCorLoc@{}IOU'.format( pascal_metrics[self._metric_names[1]] = mean_corloc
self._matching_iou_threshold)] = mean_corloc
category_index = label_map_util.create_category_index(self._categories) category_index = label_map_util.create_category_index(self._categories)
for idx in range(per_class_ap.size): for idx in range(per_class_ap.size):
if idx + self._label_id_offset in category_index: if idx + self._label_id_offset in category_index:
...@@ -314,6 +358,19 @@ class ObjectDetectionEvaluator(DetectionEvaluator): ...@@ -314,6 +358,19 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
self._matching_iou_threshold, category_name)) self._matching_iou_threshold, category_name))
pascal_metrics[display_name] = per_class_ap[idx] pascal_metrics[display_name] = per_class_ap[idx]
# Optionally add precision and recall values
if self._evaluate_precision_recall:
display_name = (
self._metric_prefix +
'PerformanceByCategory/Precision@{}IOU/{}'.format(
self._matching_iou_threshold, category_name))
pascal_metrics[display_name] = per_class_precision[idx]
display_name = (
self._metric_prefix +
'PerformanceByCategory/Recall@{}IOU/{}'.format(
self._matching_iou_threshold, category_name))
pascal_metrics[display_name] = per_class_recall[idx]
# Optionally add CorLoc metrics.classes # Optionally add CorLoc metrics.classes
if self._evaluate_corlocs: if self._evaluate_corlocs:
display_name = ( display_name = (
...@@ -332,6 +389,74 @@ class ObjectDetectionEvaluator(DetectionEvaluator): ...@@ -332,6 +389,74 @@ class ObjectDetectionEvaluator(DetectionEvaluator):
label_id_offset=self._label_id_offset) label_id_offset=self._label_id_offset)
self._image_ids.clear() self._image_ids.clear()
  def get_estimator_eval_metric_ops(self, eval_dict):
    """Returns dict of metrics to use with `tf.estimator.EstimatorSpec`.

    Note that this must only be implemented if performing evaluation with a
    `tf.estimator.Estimator`.

    Args:
      eval_dict: A dictionary that holds tensors for evaluating an object
        detection model, returned from
        eval_util.result_dict_for_single_example(). It must contain
        standard_fields.InputDataFields.key.

    Returns:
      A dictionary of metric names to tuple of value_op and update_op that can
      be used as eval metric ops in `tf.estimator.EstimatorSpec`. The shared
      update_op feeds each batch into this evaluator via a py_func; the value
      ops read metric values cached by a single evaluate()/clear() cycle.
    """
    # remove unexpected fields: only tensors this evaluator knows how to
    # consume are forwarded to add_single_*_image_info.
    eval_dict_filtered = dict()
    for key, value in eval_dict.items():
      if key in self._expected_keys:
        eval_dict_filtered[key] = value

    # NOTE(review): keys() and values() are taken from the same dict without
    # intervening mutation, so their iteration orders correspond.
    eval_dict_keys = eval_dict_filtered.keys()

    def update_op(image_id, *eval_dict_batched_as_list):
      """Update operation that adds batch of images to ObjectDetectionEvaluator.

      Args:
        image_id: image id (single id or an array)
        *eval_dict_batched_as_list: the values of the dictionary of tensors.
      """
      if np.isscalar(image_id):
        # Unbatched case: feed the single example straight through.
        single_example_dict = dict(
            zip(eval_dict_keys, eval_dict_batched_as_list))
        self.add_single_ground_truth_image_info(image_id, single_example_dict)
        self.add_single_detected_image_info(image_id, single_example_dict)
      else:
        # Batched case: unzip along the batch dimension and feed examples one
        # at a time, recovering each example's id from its own key field.
        for unzipped_tuple in zip(*eval_dict_batched_as_list):
          single_example_dict = dict(zip(eval_dict_keys, unzipped_tuple))
          image_id = single_example_dict[standard_fields.InputDataFields.key]
          self.add_single_ground_truth_image_info(image_id, single_example_dict)
          self.add_single_detected_image_info(image_id, single_example_dict)

    # First positional arg is the key tensor (drives the scalar-vs-batched
    # check above); the rest mirror eval_dict_keys order.
    args = [eval_dict_filtered[standard_fields.InputDataFields.key]]
    args.extend(eval_dict_filtered.values())
    update_op = tf.py_func(update_op, args, [])

    def first_value_func():
      # Running the first value op performs the (expensive) evaluation once,
      # caches all metric values, and resets accumulated per-image state.
      self._metrics = self.evaluate()
      self.clear()
      return np.float32(self._metrics[self._metric_names[0]])

    def value_func_factory(metric_name):
      # Factory closure so each metric name is bound at definition time
      # rather than sharing the loop variable.
      def value_func():
        return np.float32(self._metrics[metric_name])
      return value_func

    # Ensure that the metrics are only evaluated once: every other value op
    # depends on first_value_op, which is what populates self._metrics.
    first_value_op = tf.py_func(first_value_func, [], tf.float32)
    eval_metric_ops = {self._metric_names[0]: (first_value_op, update_op)}
    with tf.control_dependencies([first_value_op]):
      for metric_name in self._metric_names[1:]:
        eval_metric_ops[metric_name] = (tf.py_func(
            value_func_factory(metric_name), [], np.float32), update_op)
    return eval_metric_ops
class PascalDetectionEvaluator(ObjectDetectionEvaluator): class PascalDetectionEvaluator(ObjectDetectionEvaluator):
"""A class to evaluate detections using PASCAL metrics.""" """A class to evaluate detections using PASCAL metrics."""
...@@ -442,6 +567,15 @@ class OpenImagesDetectionEvaluator(ObjectDetectionEvaluator): ...@@ -442,6 +567,15 @@ class OpenImagesDetectionEvaluator(ObjectDetectionEvaluator):
evaluate_corlocs, evaluate_corlocs,
metric_prefix=metric_prefix, metric_prefix=metric_prefix,
group_of_weight=group_of_weight) group_of_weight=group_of_weight)
self._expected_keys = set([
standard_fields.InputDataFields.key,
standard_fields.InputDataFields.groundtruth_boxes,
standard_fields.InputDataFields.groundtruth_classes,
standard_fields.InputDataFields.groundtruth_group_of,
standard_fields.DetectionResultFields.detection_boxes,
standard_fields.DetectionResultFields.detection_scores,
standard_fields.DetectionResultFields.detection_classes,
])
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Adds groundtruth for a single image to be used for evaluation. """Adds groundtruth for a single image to be used for evaluation.
...@@ -535,6 +669,16 @@ class OpenImagesDetectionChallengeEvaluator(OpenImagesDetectionEvaluator): ...@@ -535,6 +669,16 @@ class OpenImagesDetectionChallengeEvaluator(OpenImagesDetectionEvaluator):
group_of_weight=group_of_weight) group_of_weight=group_of_weight)
self._evaluatable_labels = {} self._evaluatable_labels = {}
self._expected_keys = set([
standard_fields.InputDataFields.key,
standard_fields.InputDataFields.groundtruth_boxes,
standard_fields.InputDataFields.groundtruth_classes,
standard_fields.InputDataFields.groundtruth_group_of,
standard_fields.InputDataFields.groundtruth_image_classes,
standard_fields.DetectionResultFields.detection_boxes,
standard_fields.DetectionResultFields.detection_scores,
standard_fields.DetectionResultFields.detection_classes,
])
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Adds groundtruth for a single image to be used for evaluation. """Adds groundtruth for a single image to be used for evaluation.
...@@ -890,15 +1034,14 @@ class ObjectDetectionEvaluation(object): ...@@ -890,15 +1034,14 @@ class ObjectDetectionEvaluation(object):
if self.use_weighted_mean_ap: if self.use_weighted_mean_ap:
all_scores = np.append(all_scores, scores) all_scores = np.append(all_scores, scores)
all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)
logging.info('Scores and tpfp per class label: %d', class_index)
logging.info(tp_fp_labels)
logging.info(scores)
precision, recall = metrics.compute_precision_recall( precision, recall = metrics.compute_precision_recall(
scores, tp_fp_labels, self.num_gt_instances_per_class[class_index]) scores, tp_fp_labels, self.num_gt_instances_per_class[class_index])
self.precisions_per_class[class_index] = precision self.precisions_per_class[class_index] = precision
self.recalls_per_class[class_index] = recall self.recalls_per_class[class_index] = recall
average_precision = metrics.compute_average_precision(precision, recall) average_precision = metrics.compute_average_precision(precision, recall)
self.average_precision_per_class[class_index] = average_precision self.average_precision_per_class[class_index] = average_precision
logging.info('average_precision: %f', average_precision)
self.corloc_per_class = metrics.compute_cor_loc( self.corloc_per_class = metrics.compute_cor_loc(
self.num_gt_imgs_per_class, self.num_gt_imgs_per_class,
......
...@@ -15,9 +15,10 @@ ...@@ -15,9 +15,10 @@
"""Tests for object_detection.utils.object_detection_evaluation.""" """Tests for object_detection.utils.object_detection_evaluation."""
from absl.testing import parameterized
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from object_detection import eval_util
from object_detection.core import standard_fields from object_detection.core import standard_fields
from object_detection.utils import object_detection_evaluation from object_detection.utils import object_detection_evaluation
...@@ -683,5 +684,141 @@ class ObjectDetectionEvaluationTest(tf.test.TestCase): ...@@ -683,5 +684,141 @@ class ObjectDetectionEvaluationTest(tf.test.TestCase):
self.assertAlmostEqual(expected_mean_corloc, mean_corloc) self.assertAlmostEqual(expected_mean_corloc, mean_corloc)
class ObjectDetectionEvaluatorTest(tf.test.TestCase, parameterized.TestCase):
  """Tests ObjectDetectionEvaluator.get_estimator_eval_metric_ops."""

  def setUp(self):
    # Three-category label map shared by every test case.
    self.categories = [{
        'id': 1,
        'name': 'person'
    }, {
        'id': 2,
        'name': 'dog'
    }, {
        'id': 3,
        'name': 'cat'
    }]
    self.od_eval = object_detection_evaluation.ObjectDetectionEvaluator(
        categories=self.categories)

  def _make_evaluation_dict(self,
                            resized_groundtruth_masks=False,
                            batch_size=1,
                            max_gt_boxes=None,
                            scale_to_absolute=False):
    """Builds a synthetic eval_dict of detection and groundtruth tensors.

    Args:
      resized_groundtruth_masks: if True, groundtruth masks are 10x10 instead
        of the 20x20 image size.
      batch_size: number of examples; 1 produces an unbatched result dict.
      max_gt_boxes: forwarded to eval_util.result_dict_for_batched_example
        (batched case only).
      scale_to_absolute: forwarded to the eval_util result-dict helpers.

    Returns:
      The dict produced by eval_util.result_dict_for_single_example (when
      batch_size == 1) or eval_util.result_dict_for_batched_example.
    """
    input_data_fields = standard_fields.InputDataFields
    detection_fields = standard_fields.DetectionResultFields
    image = tf.zeros(shape=[batch_size, 20, 20, 3], dtype=tf.uint8)
    if batch_size == 1:
      key = tf.constant('image1')
    else:
      key = tf.constant([str(i) for i in range(batch_size)])
    # All but the last example predict the full-image box with score 0.5; the
    # last example predicts a quarter-area box with score 0.8.
    detection_boxes = tf.concat([
        tf.tile(
            tf.constant([[[0., 0., 1., 1.]]]),
            multiples=[batch_size - 1, 1, 1]),
        tf.constant([[[0., 0., 0.5, 0.5]]])
    ],
                                axis=0)
    detection_scores = tf.concat([
        tf.tile(tf.constant([[0.5]]), multiples=[batch_size - 1, 1]),
        tf.constant([[0.8]])
    ],
                                 axis=0)
    detection_classes = tf.tile(tf.constant([[0]]), multiples=[batch_size, 1])
    detection_masks = tf.tile(
        tf.ones(shape=[1, 2, 20, 20], dtype=tf.float32),
        multiples=[batch_size, 1, 1, 1])
    # Every example shares the same single full-image groundtruth box of
    # class 1.
    groundtruth_boxes = tf.constant([[0., 0., 1., 1.]])
    groundtruth_classes = tf.constant([1])
    groundtruth_instance_masks = tf.ones(shape=[1, 20, 20], dtype=tf.uint8)
    num_detections = tf.ones([batch_size])
    if resized_groundtruth_masks:
      groundtruth_instance_masks = tf.ones(shape=[1, 10, 10], dtype=tf.uint8)
    if batch_size > 1:
      # Tile the single-example groundtruth along a new batch dimension.
      groundtruth_boxes = tf.tile(
          tf.expand_dims(groundtruth_boxes, 0), multiples=[batch_size, 1, 1])
      groundtruth_classes = tf.tile(
          tf.expand_dims(groundtruth_classes, 0), multiples=[batch_size, 1])
      groundtruth_instance_masks = tf.tile(
          tf.expand_dims(groundtruth_instance_masks, 0),
          multiples=[batch_size, 1, 1, 1])
    detections = {
        detection_fields.detection_boxes: detection_boxes,
        detection_fields.detection_scores: detection_scores,
        detection_fields.detection_classes: detection_classes,
        detection_fields.detection_masks: detection_masks,
        detection_fields.num_detections: num_detections
    }
    groundtruth = {
        input_data_fields.groundtruth_boxes:
            groundtruth_boxes,
        input_data_fields.groundtruth_classes:
            groundtruth_classes,
        input_data_fields.groundtruth_instance_masks:
            groundtruth_instance_masks,
    }
    if batch_size > 1:
      return eval_util.result_dict_for_batched_example(
          image,
          key,
          detections,
          groundtruth,
          scale_to_absolute=scale_to_absolute,
          max_gt_boxes=max_gt_boxes)
    else:
      return eval_util.result_dict_for_single_example(
          image,
          key,
          detections,
          groundtruth,
          scale_to_absolute=scale_to_absolute)

  @parameterized.parameters({
      'batch_size': 1,
      'expected_map': 0,
      'max_gt_boxes': None,
      'scale_to_absolute': True
  }, {
      'batch_size': 8,
      'expected_map': 0.765625,
      'max_gt_boxes': [1],
      'scale_to_absolute': True
  }, {
      'batch_size': 1,
      'expected_map': 0,
      'max_gt_boxes': None,
      'scale_to_absolute': False
  }, {
      'batch_size': 8,
      'expected_map': 0.765625,
      'max_gt_boxes': [1],
      'scale_to_absolute': False
  })
  def test_get_estimator_eval_metric_ops(self,
                                         batch_size=1,
                                         expected_map=1,
                                         max_gt_boxes=None,
                                         scale_to_absolute=False):
    # Build the eval dict, run the shared update_op once, then fetch every
    # value op and check the headline mAP metric against the expected value.
    eval_dict = self._make_evaluation_dict(
        batch_size=batch_size,
        max_gt_boxes=max_gt_boxes,
        scale_to_absolute=scale_to_absolute)
    tf.logging.info('eval_dict: {}'.format(eval_dict))
    metric_ops = self.od_eval.get_estimator_eval_metric_ops(eval_dict)
    _, update_op = metric_ops['Precision/mAP@0.5IOU']

    with self.test_session() as sess:
      metrics = {}
      # NOTE(review): iteritems() is Python 2 only, consistent with the rest
      # of this file.
      for key, (value_op, _) in metric_ops.iteritems():
        metrics[key] = value_op
      sess.run(update_op)
      metrics = sess.run(metrics)
      self.assertAlmostEqual(expected_map, metrics['Precision/mAP@0.5IOU'])
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# ============================================================================== # ==============================================================================
"""A module for helper tensorflow ops.""" """A module for helper tensorflow ops."""
import collections
import math import math
import numpy as np import numpy as np
import six import six
...@@ -1087,81 +1088,10 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None): ...@@ -1087,81 +1088,10 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None):
return tf.reshape(cropped_regions, final_shape) return tf.reshape(cropped_regions, final_shape)
def expected_classification_loss_under_sampling(
    batch_cls_targets, cls_losses, unmatched_cls_losses,
    desired_negative_sampling_ratio, min_num_negative_samples):
  """Computes classification loss by background/foreground weighting.

  The weighting is such that the effective background/foreground weight ratio
  is the desired_negative_sampling_ratio. If p_i is the foreground probability
  of anchor a_i, L(a_i) is the anchor's loss, N is the number of anchors, M is
  the sum of foreground probabilities across anchors, and K is the desired
  ratio between the number of negative and positive samples, then the total
  loss L is calculated as:

    beta = K*M/(N-M)
    L = sum_{i=1}^N [p_i * L_p(a_i) + beta * (1 - p_i) * L_n(a_i)]

  where L_p(a_i) is the loss against target assuming the anchor was matched,
  otherwise zero, and L_n(a_i) is the loss against the background target
  assuming the anchor was unmatched, otherwise zero.

  Args:
    batch_cls_targets: A tensor with shape [batch_size, num_anchors,
      num_classes + 1], where the 0'th index is the background class,
      containing the class distribution for the target assigned to a given
      anchor.
    cls_losses: Float tensor of shape [batch_size, num_anchors] representing
      anchorwise classification losses.
    unmatched_cls_losses: loss for each anchor against the unmatched class
      target.
    desired_negative_sampling_ratio: The desired background/foreground weight
      ratio.
    min_num_negative_samples: Minimum number of effective negative samples.
      Used only when there are no positive examples.

  Returns:
    The classification loss, a [batch_size] float tensor.
  """
  num_anchors = tf.cast(tf.shape(batch_cls_targets)[1], tf.float32)

  # Foreground probability p_i of each anchor: one minus the probability mass
  # assigned to the background (0'th) class.
  foreground_probabilities = 1 - batch_cls_targets[:, :, 0]
  foreground_sum = tf.reduce_sum(foreground_probabilities, axis=-1)

  # For each anchor, expected_j is the expected number of positive anchors
  # given that this anchor was sampled as negative.
  tiled_foreground_sum = tf.tile(
      tf.reshape(foreground_sum, [-1, 1]),
      [1, tf.cast(num_anchors, tf.int32)])
  expected_j = tiled_foreground_sum - foreground_probabilities
  k = desired_negative_sampling_ratio

  # Compute beta. num_anchors is already float32 (cast above), so the
  # deprecated tf.to_float() the original code applied here was a no-op.
  expected_negatives = num_anchors - expected_j
  desired_negatives = k * expected_j
  # Cap the desired number of negatives at the number actually available.
  desired_negatives = tf.where(
      tf.greater(desired_negatives, expected_negatives), expected_negatives,
      desired_negatives)

  # Probability that an anchor is sampled for the loss computation given that
  # it is negative.
  beta = desired_negatives / expected_negatives

  # Where the foreground sum is zero (no positive anchors at all), fall back
  # to a minimum negative weight so the loss is not identically zero.
  min_negative_weight = 1.0 * min_num_negative_samples / num_anchors
  beta = tf.where(
      tf.equal(tiled_foreground_sum, 0),
      min_negative_weight * tf.ones_like(beta), beta)

  foreground_weights = foreground_probabilities
  background_weights = (1 - foreground_weights) * beta
  weighted_foreground_losses = foreground_weights * cls_losses
  weighted_background_losses = background_weights * unmatched_cls_losses

  cls_losses = tf.reduce_sum(
      weighted_foreground_losses, axis=-1) + tf.reduce_sum(
          weighted_background_losses, axis=-1)
  return cls_losses
...@@ -21,6 +21,8 @@ from object_detection.core import standard_fields as fields ...@@ -21,6 +21,8 @@ from object_detection.core import standard_fields as fields
from object_detection.utils import ops from object_detection.utils import ops
from object_detection.utils import test_case from object_detection.utils import test_case
slim = tf.contrib.slim
class NormalizedToImageCoordinatesTest(tf.test.TestCase): class NormalizedToImageCoordinatesTest(tf.test.TestCase):
...@@ -1466,189 +1468,9 @@ class OpsTestCropAndResize(test_case.TestCase): ...@@ -1466,189 +1468,9 @@ class OpsTestCropAndResize(test_case.TestCase):
self.assertAllClose(crop_output, expected_output) self.assertAllClose(crop_output, expected_output)
class OpsTestExpectedClassificationLoss(test_case.TestCase):
  """Tests for ops.expected_classification_loss_under_sampling."""

  def testExpectedClassificationLossUnderSamplingWithHardLabels(self):
    """Hard one-hot targets: one positive and one negative anchor per image."""

    def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
                 negative_to_positive_ratio, min_num_negative_samples):
      return ops.expected_classification_loss_under_sampling(
          batch_cls_targets, cls_losses, unmatched_cls_losses,
          negative_to_positive_ratio, min_num_negative_samples)

    # Targets are one-hot over [background, class_1, class_2].
    batch_cls_targets = np.array(
        [[[1., 0, 0], [0, 1., 0]], [[1., 0, 0], [0, 1., 0]]], dtype=np.float32)
    cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
    unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
    negative_to_positive_ratio = np.array([2], dtype=np.float32)
    min_num_negative_samples = np.array([1], dtype=np.float32)

    classification_loss = self.execute(graph_fn, [
        batch_cls_targets, cls_losses, unmatched_cls_losses,
        negative_to_positive_ratio, min_num_negative_samples
    ])

    # Intermediate values implied by the inputs above:
    # expected_foreground_sum = [1,1]
    # expected_expected_j = [[1, 0], [1, 0]]
    # expected_expected_negatives = [[1, 2], [1, 2]]
    # expected_desired_negatives = [[2, 0], [2, 0]]
    # expected_beta = [[1, 0], [1, 0]]
    # expected_foreground_weights = [[0, 1], [0, 1]]
    # expected_background_weights = [[1, 0], [1, 0]]
    # expected_weighted_foreground_losses = [[0, 2], [0, 4]]
    # expected_weighted_background_losses = [[10, 0], [30, 0]]
    # expected_classification_loss_under_sampling = [6, 40]
    expected_classification_loss_under_sampling = [2 + 10, 4 + 30]

    self.assertAllClose(expected_classification_loss_under_sampling,
                        classification_loss)

  def testExpectedClassificationLossUnderSamplingWithHardLabelsMoreNegatives(
      self):
    """Hard labels with more negatives than the desired sampling ratio."""

    def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
                 negative_to_positive_ratio, min_num_negative_samples):
      return ops.expected_classification_loss_under_sampling(
          batch_cls_targets, cls_losses, unmatched_cls_losses,
          negative_to_positive_ratio, min_num_negative_samples)

    # One positive anchor (index 1) and four negative anchors.
    batch_cls_targets = np.array(
        [[[1., 0, 0], [0, 1., 0], [1., 0, 0], [1., 0, 0], [1., 0, 0]]],
        dtype=np.float32)
    cls_losses = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)
    unmatched_cls_losses = np.array([[10, 20, 30, 40, 50]], dtype=np.float32)
    negative_to_positive_ratio = np.array([2], dtype=np.float32)
    min_num_negative_samples = np.array([1], dtype=np.float32)

    classification_loss = self.execute(graph_fn, [
        batch_cls_targets, cls_losses, unmatched_cls_losses,
        negative_to_positive_ratio, min_num_negative_samples
    ])

    # expected_foreground_sum = [1]
    # expected_expected_j = [[1, 0, 1, 1, 1]]
    # expected_expected_negatives = [[4, 5, 4, 4, 4]]
    # expected_desired_negatives = [[2, 0, 2, 2, 2]]
    # expected_beta = [[.5, 0, .5, .5, .5]]
    # expected_foreground_weights = [[0, 1, 0, 0, 0]]
    # expected_background_weights = [[.5, 0, .5, .5, .5]]
    # expected_weighted_foreground_losses = [[0, 2, 0, 0, 0]]
    # expected_weighted_background_losses = [[10*.5, 0, 30*.5, 40*.5, 50*.5]]
    # expected_classification_loss_under_sampling = [5+2+15+20+25]
    expected_classification_loss_under_sampling = [5 + 2 + 15 + 20 + 25]

    self.assertAllClose(expected_classification_loss_under_sampling,
                        classification_loss)

  def testExpectedClassificationLossUnderSamplingWithAllNegative(self):
    """All anchors negative: minimum negative weight fallback is exercised."""

    # NOTE(review): negative_to_positive_ratio and min_num_negative_samples
    # are not parameters of graph_fn here; they are captured from the
    # enclosing scope (defined below, before execute() invokes graph_fn) and
    # therefore enter the graph as constants rather than fed inputs.
    def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses):
      return ops.expected_classification_loss_under_sampling(
          batch_cls_targets, cls_losses, unmatched_cls_losses,
          negative_to_positive_ratio, min_num_negative_samples)

    batch_cls_targets = np.array(
        [[[1, 0, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 0]]], dtype=np.float32)
    cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
    unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
    negative_to_positive_ratio = np.array([2], dtype=np.float32)
    min_num_negative_samples = np.array([1], dtype=np.float32)

    classification_loss = self.execute(
        graph_fn, [batch_cls_targets, cls_losses, unmatched_cls_losses])

    # expected_foreground_sum = [0,0]
    # expected_expected_j = [[0, 0], [0, 0]]
    # expected_expected_negatives = [[2, 2], [2, 2]]
    # expected_desired_negatives = [[0, 0], [0, 0]]
    # expected_beta = [[0, 0],[0, 0]]
    # expected_foreground_weights = [[0, 0], [0, 0]]
    # expected_background_weights = [[.5, .5], [.5, .5]]
    # expected_weighted_foreground_losses = [[0, 0], [0, 0]]
    # expected_weighted_background_losses = [[5, 10], [15, 20]]
    # expected_classification_loss_under_sampling = [15, 35]
    expected_classification_loss_under_sampling = [
        10 * .5 + 20 * .5, 30 * .5 + 40 * .5
    ]

    self.assertAllClose(expected_classification_loss_under_sampling,
                        classification_loss)

  def testExpectedClassificationLossUnderSamplingWithAllPositive(self):
    """All anchors positive: background weights vanish entirely."""

    # NOTE(review): as in the all-negative test, the sampling-ratio constants
    # are closure-captured rather than fed as graph inputs.
    def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses):
      return ops.expected_classification_loss_under_sampling(
          batch_cls_targets, cls_losses, unmatched_cls_losses,
          negative_to_positive_ratio, min_num_negative_samples)

    batch_cls_targets = np.array(
        [[[0, 1., 0], [0, 1., 0]], [[0, 1, 0], [0, 0, 1]]], dtype=np.float32)
    cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
    unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
    negative_to_positive_ratio = np.array([2], dtype=np.float32)
    min_num_negative_samples = np.array([1], dtype=np.float32)

    classification_loss = self.execute(
        graph_fn, [batch_cls_targets, cls_losses, unmatched_cls_losses])

    # expected_foreground_sum = [2,2]
    # expected_expected_j = [[1, 1], [1, 1]]
    # expected_expected_negatives = [[1, 1], [1, 1]]
    # expected_desired_negatives = [[1, 1], [1, 1]]
    # expected_beta = [[1, 1],[1, 1]]
    # expected_foreground_weights = [[1, 1], [1, 1]]
    # expected_background_weights = [[0, 0], [0, 0]]
    # expected_weighted_foreground_losses = [[1, 2], [3, 4]]
    # expected_weighted_background_losses = [[0, 0], [0, 0]]
    # expected_classification_loss_under_sampling = [15, 35]
    expected_classification_loss_under_sampling = [1 + 2, 3 + 4]

    self.assertAllClose(expected_classification_loss_under_sampling,
                        classification_loss)

  def testExpectedClassificationLossUnderSamplingWithSoftLabels(self):
    """Soft (fractional) foreground probabilities per anchor."""

    def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
                 negative_to_positive_ratio, min_num_negative_samples):
      return ops.expected_classification_loss_under_sampling(
          batch_cls_targets, cls_losses, unmatched_cls_losses,
          negative_to_positive_ratio, min_num_negative_samples)

    batch_cls_targets = np.array([[[.75, .25, 0], [0.25, .75, 0], [.75, .25, 0],
                                   [0.25, .75, 0], [1., 0, 0]]],
                                 dtype=np.float32)
    cls_losses = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)
    unmatched_cls_losses = np.array([[10, 20, 30, 40, 50]], dtype=np.float32)
    negative_to_positive_ratio = np.array([2], dtype=np.float32)
    min_num_negative_samples = np.array([1], dtype=np.float32)

    classification_loss = self.execute(graph_fn, [
        batch_cls_targets, cls_losses, unmatched_cls_losses,
        negative_to_positive_ratio, min_num_negative_samples
    ])

    # expected_foreground_sum = [2]
    # expected_expected_j = [[1.75, 1.25, 1.75, 1.25, 2]]
    # expected_expected_negatives = [[3.25, 3.75, 3.25, 3.75, 3]]
    # expected_desired_negatives = [[3.25, 2.5, 3.25, 2.5, 3]]
    # expected_beta = [[1, 2/3, 1, 2/3, 1]]
    # expected_foreground_weights = [[0.25, .75, .25, .75, 0]]
    # expected_background_weights = [[[.75, 1/6., .75, 1/6., 1]]]
    # expected_weighted_foreground_losses = [[.25*1, .75*2, .25*3, .75*4, 0*5]]
    # expected_weighted_background_losses = [[
    #     .75*10, 1/6.*20, .75*30, 1/6.*40, 1*50]]
    # expected_classification_loss_under_sampling = sum([
    #     .25*1, .75*2, .25*3, .75*4, 0, .75*10, 1/6.*20, .75*30,
    #     1/6.*40, 1*50])
    expected_classification_loss_under_sampling = [
        sum([
            .25 * 1, .75 * 2, .25 * 3, .75 * 4, 0, .75 * 10, 1 / 6. * 20,
            .75 * 30, 1 / 6. * 40, 1 * 50
        ])
    ]

    self.assertAllClose(expected_classification_loss_under_sampling,
                        classification_loss)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -42,14 +42,25 @@ class MockBoxCoder(box_coder.BoxCoder): ...@@ -42,14 +42,25 @@ class MockBoxCoder(box_coder.BoxCoder):
return box_list.BoxList(rel_codes + anchors.get()) return box_list.BoxList(rel_codes + anchors.get())
class MockMaskHead(object):
  """Mask head stub whose predictions are all-zero mask tensors."""

  def __init__(self, num_classes):
    # Number of classes determines the third dimension of the predicted masks.
    self._num_classes = num_classes

  def predict(self, features):
    """Returns zero masks shaped [batch, 1, num_classes, size, size]."""
    num_images = tf.shape(features)[0]
    mask_shape = (num_images, 1, self._num_classes, DEFAULT_MASK_SIZE,
                  DEFAULT_MASK_SIZE)
    return tf.zeros(mask_shape, dtype=tf.float32)
class MockBoxPredictor(box_predictor.BoxPredictor): class MockBoxPredictor(box_predictor.BoxPredictor):
"""Simple box predictor that ignores inputs and outputs all zeros.""" """Simple box predictor that ignores inputs and outputs all zeros."""
def __init__(self, is_training, num_classes, add_background_class=True, def __init__(self, is_training, num_classes, add_background_class=True):
predict_mask=False):
super(MockBoxPredictor, self).__init__(is_training, num_classes) super(MockBoxPredictor, self).__init__(is_training, num_classes)
self._add_background_class = add_background_class self._add_background_class = add_background_class
self._predict_mask = predict_mask
def _predict(self, image_features, num_predictions_per_location): def _predict(self, image_features, num_predictions_per_location):
image_feature = image_features[0] image_feature = image_features[0]
...@@ -66,31 +77,22 @@ class MockBoxPredictor(box_predictor.BoxPredictor): ...@@ -66,31 +77,22 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
(batch_size, num_anchors, 1, code_size), dtype=tf.float32) (batch_size, num_anchors, 1, code_size), dtype=tf.float32)
class_predictions_with_background = zero + tf.zeros( class_predictions_with_background = zero + tf.zeros(
(batch_size, num_anchors, num_class_slots), dtype=tf.float32) (batch_size, num_anchors, num_class_slots), dtype=tf.float32)
masks = zero + tf.zeros(
(batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
DEFAULT_MASK_SIZE),
dtype=tf.float32)
predictions_dict = { predictions_dict = {
box_predictor.BOX_ENCODINGS: box_predictor.BOX_ENCODINGS:
box_encodings, box_encodings,
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND: box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND:
class_predictions_with_background class_predictions_with_background
} }
if self._predict_mask:
predictions_dict[box_predictor.MASK_PREDICTIONS] = masks
return predictions_dict return predictions_dict
class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor): class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor):
"""Simple box predictor that ignores inputs and outputs all zeros.""" """Simple box predictor that ignores inputs and outputs all zeros."""
def __init__(self, is_training, num_classes, add_background_class=True, def __init__(self, is_training, num_classes, add_background_class=True):
predict_mask=False):
super(MockKerasBoxPredictor, self).__init__( super(MockKerasBoxPredictor, self).__init__(
is_training, num_classes, False, False) is_training, num_classes, False, False)
self._add_background_class = add_background_class self._add_background_class = add_background_class
self._predict_mask = predict_mask
def _predict(self, image_features, **kwargs): def _predict(self, image_features, **kwargs):
image_feature = image_features[0] image_feature = image_features[0]
...@@ -107,18 +109,12 @@ class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor): ...@@ -107,18 +109,12 @@ class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor):
(batch_size, num_anchors, 1, code_size), dtype=tf.float32) (batch_size, num_anchors, 1, code_size), dtype=tf.float32)
class_predictions_with_background = zero + tf.zeros( class_predictions_with_background = zero + tf.zeros(
(batch_size, num_anchors, num_class_slots), dtype=tf.float32) (batch_size, num_anchors, num_class_slots), dtype=tf.float32)
masks = zero + tf.zeros(
(batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
DEFAULT_MASK_SIZE),
dtype=tf.float32)
predictions_dict = { predictions_dict = {
box_predictor.BOX_ENCODINGS: box_predictor.BOX_ENCODINGS:
box_encodings, box_encodings,
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND: box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND:
class_predictions_with_background class_predictions_with_background
} }
if self._predict_mask:
predictions_dict[box_predictor.MASK_PREDICTIONS] = masks
return predictions_dict return predictions_dict
......
...@@ -19,8 +19,7 @@ These functions often receive an image, perform some visualization on the image. ...@@ -19,8 +19,7 @@ These functions often receive an image, perform some visualization on the image.
The functions do not return a value, instead they modify the image itself. The functions do not return a value, instead they modify the image itself.
""" """
from abc import ABCMeta import abc
from abc import abstractmethod
import collections import collections
import functools import functools
# Set headless-friendly backend. # Set headless-friendly backend.
...@@ -35,7 +34,7 @@ import six ...@@ -35,7 +34,7 @@ import six
import tensorflow as tf import tensorflow as tf
from object_detection.core import standard_fields as fields from object_detection.core import standard_fields as fields
from object_detection.utils import shape_utils
_TITLE_LEFT_MARGIN = 10 _TITLE_LEFT_MARGIN = 10
_TITLE_TOP_MARGIN = 10 _TITLE_TOP_MARGIN = 10
...@@ -309,11 +308,23 @@ def _visualize_boxes_and_masks_and_keypoints( ...@@ -309,11 +308,23 @@ def _visualize_boxes_and_masks_and_keypoints(
**kwargs) **kwargs)
def _resize_original_image(image, image_shape):
  """Resizes a single image tensor to image_shape via nearest neighbor.

  The image is temporarily given a batch dimension because
  tf.image.resize_images operates on batched input; the result is squeezed
  back to 3-D and cast to uint8.
  """
  batched = tf.expand_dims(image, 0)
  resized = tf.image.resize_images(
      batched,
      image_shape,
      method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
      align_corners=True)
  return tf.cast(tf.squeeze(resized, 0), tf.uint8)
def draw_bounding_boxes_on_image_tensors(images, def draw_bounding_boxes_on_image_tensors(images,
boxes, boxes,
classes, classes,
scores, scores,
category_index, category_index,
original_image_spatial_shape=None,
true_image_shape=None,
instance_masks=None, instance_masks=None,
keypoints=None, keypoints=None,
max_boxes_to_draw=20, max_boxes_to_draw=20,
...@@ -323,13 +334,18 @@ def draw_bounding_boxes_on_image_tensors(images, ...@@ -323,13 +334,18 @@ def draw_bounding_boxes_on_image_tensors(images,
Args: Args:
images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional
channels will be ignored. channels will be ignored. If C = 1, then we convert the images to RGB
images.
boxes: [N, max_detections, 4] float32 tensor of detection boxes. boxes: [N, max_detections, 4] float32 tensor of detection boxes.
classes: [N, max_detections] int tensor of detection classes. Note that classes: [N, max_detections] int tensor of detection classes. Note that
classes are 1-indexed. classes are 1-indexed.
scores: [N, max_detections] float32 tensor of detection scores. scores: [N, max_detections] float32 tensor of detection scores.
category_index: a dict that maps integer ids to category dicts. e.g. category_index: a dict that maps integer ids to category dicts. e.g.
{1: {1: 'dog'}, 2: {2: 'cat'}, ...} {1: {1: 'dog'}, 2: {2: 'cat'}, ...}
original_image_spatial_shape: [N, 2] tensor containing the spatial size of
the original image.
true_image_shape: [N, 3] tensor containing the spatial size of unpadded
original_image.
instance_masks: A 4D uint8 tensor of shape [N, max_detection, H, W] with instance_masks: A 4D uint8 tensor of shape [N, max_detection, H, W] with
instance masks. instance masks.
keypoints: A 4D float32 tensor of shape [N, max_detection, num_keypoints, 2] keypoints: A 4D float32 tensor of shape [N, max_detection, num_keypoints, 2]
...@@ -344,7 +360,10 @@ def draw_bounding_boxes_on_image_tensors(images, ...@@ -344,7 +360,10 @@ def draw_bounding_boxes_on_image_tensors(images,
4D image tensor of type uint8, with boxes drawn on top. 4D image tensor of type uint8, with boxes drawn on top.
""" """
# Additional channels are being ignored. # Additional channels are being ignored.
images = images[:, :, :, 0:3] if images.shape[3] > 3:
images = images[:, :, :, 0:3]
elif images.shape[3] == 1:
images = tf.image.grayscale_to_rgb(images)
visualization_keyword_args = { visualization_keyword_args = {
'use_normalized_coordinates': use_normalized_coordinates, 'use_normalized_coordinates': use_normalized_coordinates,
'max_boxes_to_draw': max_boxes_to_draw, 'max_boxes_to_draw': max_boxes_to_draw,
...@@ -352,35 +371,61 @@ def draw_bounding_boxes_on_image_tensors(images, ...@@ -352,35 +371,61 @@ def draw_bounding_boxes_on_image_tensors(images,
'agnostic_mode': False, 'agnostic_mode': False,
'line_thickness': 4 'line_thickness': 4
} }
if true_image_shape is None:
true_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 3])
else:
true_shapes = true_image_shape
if original_image_spatial_shape is None:
original_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 2])
else:
original_shapes = original_image_spatial_shape
if instance_masks is not None and keypoints is None: if instance_masks is not None and keypoints is None:
visualize_boxes_fn = functools.partial( visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks, _visualize_boxes_and_masks,
category_index=category_index, category_index=category_index,
**visualization_keyword_args) **visualization_keyword_args)
elems = [images, boxes, classes, scores, instance_masks] elems = [
true_shapes, original_shapes, images, boxes, classes, scores,
instance_masks
]
elif instance_masks is None and keypoints is not None: elif instance_masks is None and keypoints is not None:
visualize_boxes_fn = functools.partial( visualize_boxes_fn = functools.partial(
_visualize_boxes_and_keypoints, _visualize_boxes_and_keypoints,
category_index=category_index, category_index=category_index,
**visualization_keyword_args) **visualization_keyword_args)
elems = [images, boxes, classes, scores, keypoints] elems = [
true_shapes, original_shapes, images, boxes, classes, scores, keypoints
]
elif instance_masks is not None and keypoints is not None: elif instance_masks is not None and keypoints is not None:
visualize_boxes_fn = functools.partial( visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks_and_keypoints, _visualize_boxes_and_masks_and_keypoints,
category_index=category_index, category_index=category_index,
**visualization_keyword_args) **visualization_keyword_args)
elems = [images, boxes, classes, scores, instance_masks, keypoints] elems = [
true_shapes, original_shapes, images, boxes, classes, scores,
instance_masks, keypoints
]
else: else:
visualize_boxes_fn = functools.partial( visualize_boxes_fn = functools.partial(
_visualize_boxes, _visualize_boxes,
category_index=category_index, category_index=category_index,
**visualization_keyword_args) **visualization_keyword_args)
elems = [images, boxes, classes, scores] elems = [
true_shapes, original_shapes, images, boxes, classes, scores
]
def draw_boxes(image_and_detections): def draw_boxes(image_and_detections):
"""Draws boxes on image.""" """Draws boxes on image."""
image_with_boxes = tf.py_func(visualize_boxes_fn, image_and_detections, true_shape = image_and_detections[0]
original_shape = image_and_detections[1]
if true_image_shape is not None:
image = shape_utils.pad_or_clip_nd(image_and_detections[2],
[true_shape[0], true_shape[1], 3])
if original_image_spatial_shape is not None:
image_and_detections[2] = _resize_original_image(image, original_shape)
image_with_boxes = tf.py_func(visualize_boxes_fn, image_and_detections[2:],
tf.uint8) tf.uint8)
return image_with_boxes return image_with_boxes
...@@ -400,6 +445,7 @@ def draw_side_by_side_evaluation_image(eval_dict, ...@@ -400,6 +445,7 @@ def draw_side_by_side_evaluation_image(eval_dict,
Args: Args:
eval_dict: The evaluation dictionary returned by eval_dict: The evaluation dictionary returned by
eval_util.result_dict_for_batched_example() or
eval_util.result_dict_for_single_example(). eval_util.result_dict_for_single_example().
category_index: A category index (dictionary) produced from a labelmap. category_index: A category index (dictionary) produced from a labelmap.
max_boxes_to_draw: The maximum number of boxes to draw for detections. max_boxes_to_draw: The maximum number of boxes to draw for detections.
...@@ -409,53 +455,85 @@ def draw_side_by_side_evaluation_image(eval_dict, ...@@ -409,53 +455,85 @@ def draw_side_by_side_evaluation_image(eval_dict,
Default is True. Default is True.
Returns: Returns:
A [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to A list of [1, H, 2 * W, C] uint8 tensor. The subimage on the left
detections, while the subimage on the right corresponds to groundtruth. corresponds to detections, while the subimage on the right corresponds to
groundtruth.
""" """
detection_fields = fields.DetectionResultFields() detection_fields = fields.DetectionResultFields()
input_data_fields = fields.InputDataFields() input_data_fields = fields.InputDataFields()
instance_masks = None
if detection_fields.detection_masks in eval_dict: images_with_detections_list = []
instance_masks = tf.cast(
tf.expand_dims(eval_dict[detection_fields.detection_masks], axis=0), # Add the batch dimension if the eval_dict is for single example.
tf.uint8) if len(eval_dict[detection_fields.detection_classes].shape) == 1:
keypoints = None for key in eval_dict:
if detection_fields.detection_keypoints in eval_dict: if key != input_data_fields.original_image:
keypoints = tf.expand_dims( eval_dict[key] = tf.expand_dims(eval_dict[key], 0)
eval_dict[detection_fields.detection_keypoints], axis=0)
groundtruth_instance_masks = None for indx in range(eval_dict[input_data_fields.original_image].shape[0]):
if input_data_fields.groundtruth_instance_masks in eval_dict: instance_masks = None
groundtruth_instance_masks = tf.cast( if detection_fields.detection_masks in eval_dict:
instance_masks = tf.cast(
tf.expand_dims(
eval_dict[detection_fields.detection_masks][indx], axis=0),
tf.uint8)
keypoints = None
if detection_fields.detection_keypoints in eval_dict:
keypoints = tf.expand_dims(
eval_dict[detection_fields.detection_keypoints][indx], axis=0)
groundtruth_instance_masks = None
if input_data_fields.groundtruth_instance_masks in eval_dict:
groundtruth_instance_masks = tf.cast(
tf.expand_dims(
eval_dict[input_data_fields.groundtruth_instance_masks][indx],
axis=0), tf.uint8)
images_with_detections = draw_bounding_boxes_on_image_tensors(
tf.expand_dims( tf.expand_dims(
eval_dict[input_data_fields.groundtruth_instance_masks], axis=0), eval_dict[input_data_fields.original_image][indx], axis=0),
tf.uint8) tf.expand_dims(
images_with_detections = draw_bounding_boxes_on_image_tensors( eval_dict[detection_fields.detection_boxes][indx], axis=0),
eval_dict[input_data_fields.original_image], tf.expand_dims(
tf.expand_dims(eval_dict[detection_fields.detection_boxes], axis=0), eval_dict[detection_fields.detection_classes][indx], axis=0),
tf.expand_dims(eval_dict[detection_fields.detection_classes], axis=0), tf.expand_dims(
tf.expand_dims(eval_dict[detection_fields.detection_scores], axis=0), eval_dict[detection_fields.detection_scores][indx], axis=0),
category_index, category_index,
instance_masks=instance_masks, original_image_spatial_shape=tf.expand_dims(
keypoints=keypoints, eval_dict[input_data_fields.original_image_spatial_shape][indx],
max_boxes_to_draw=max_boxes_to_draw, axis=0),
min_score_thresh=min_score_thresh, true_image_shape=tf.expand_dims(
use_normalized_coordinates=use_normalized_coordinates) eval_dict[input_data_fields.true_image_shape][indx], axis=0),
images_with_groundtruth = draw_bounding_boxes_on_image_tensors( instance_masks=instance_masks,
eval_dict[input_data_fields.original_image], keypoints=keypoints,
tf.expand_dims(eval_dict[input_data_fields.groundtruth_boxes], axis=0), max_boxes_to_draw=max_boxes_to_draw,
tf.expand_dims(eval_dict[input_data_fields.groundtruth_classes], axis=0), min_score_thresh=min_score_thresh,
tf.expand_dims( use_normalized_coordinates=use_normalized_coordinates)
tf.ones_like( images_with_groundtruth = draw_bounding_boxes_on_image_tensors(
eval_dict[input_data_fields.groundtruth_classes], tf.expand_dims(
dtype=tf.float32), eval_dict[input_data_fields.original_image][indx], axis=0),
axis=0), tf.expand_dims(
category_index, eval_dict[input_data_fields.groundtruth_boxes][indx], axis=0),
instance_masks=groundtruth_instance_masks, tf.expand_dims(
keypoints=None, eval_dict[input_data_fields.groundtruth_classes][indx], axis=0),
max_boxes_to_draw=None, tf.expand_dims(
min_score_thresh=0.0, tf.ones_like(
use_normalized_coordinates=use_normalized_coordinates) eval_dict[input_data_fields.groundtruth_classes][indx],
return tf.concat([images_with_detections, images_with_groundtruth], axis=2) dtype=tf.float32),
axis=0),
category_index,
original_image_spatial_shape=tf.expand_dims(
eval_dict[input_data_fields.original_image_spatial_shape][indx],
axis=0),
true_image_shape=tf.expand_dims(
eval_dict[input_data_fields.true_image_shape][indx], axis=0),
instance_masks=groundtruth_instance_masks,
keypoints=None,
max_boxes_to_draw=None,
min_score_thresh=0.0,
use_normalized_coordinates=use_normalized_coordinates)
images_with_detections_list.append(
tf.concat([images_with_detections, images_with_groundtruth], axis=2))
return images_with_detections_list
def draw_keypoints_on_image_array(image, def draw_keypoints_on_image_array(image,
...@@ -744,7 +822,7 @@ class EvalMetricOpsVisualization(object): ...@@ -744,7 +822,7 @@ class EvalMetricOpsVisualization(object):
responsible for accruing images (with overlaid detections and groundtruth) responsible for accruing images (with overlaid detections and groundtruth)
and returning a dictionary that can be passed to `eval_metric_ops`. and returning a dictionary that can be passed to `eval_metric_ops`.
""" """
__metaclass__ = ABCMeta __metaclass__ = abc.ABCMeta
def __init__(self, def __init__(self,
category_index, category_index,
...@@ -792,26 +870,33 @@ class EvalMetricOpsVisualization(object): ...@@ -792,26 +870,33 @@ class EvalMetricOpsVisualization(object):
Args: Args:
eval_dict: A dictionary that holds an image, groundtruth, and detections eval_dict: A dictionary that holds an image, groundtruth, and detections
for a single example. See eval_util.result_dict_for_single_example() for for a batched example. Note that, we use only the first example for
a convenient method for constructing such a dictionary. The dictionary visualization. See eval_util.result_dict_for_batched_example() for a
convenient method for constructing such a dictionary. The dictionary
contains contains
fields.InputDataFields.original_image: [1, H, W, 3] image. fields.InputDataFields.original_image: [batch_size, H, W, 3] image.
fields.InputDataFields.groundtruth_boxes - [num_boxes, 4] float32 fields.InputDataFields.original_image_spatial_shape: [batch_size, 2]
tensor with groundtruth boxes in range [0.0, 1.0]. tensor containing the size of the original image.
fields.InputDataFields.groundtruth_classes - [num_boxes] int64 fields.InputDataFields.true_image_shape: [batch_size, 3]
tensor with 1-indexed groundtruth classes. tensor containing the spatial size of the upadded original image.
fields.InputDataFields.groundtruth_boxes - [batch_size, num_boxes, 4]
float32 tensor with groundtruth boxes in range [0.0, 1.0].
fields.InputDataFields.groundtruth_classes - [batch_size, num_boxes]
int64 tensor with 1-indexed groundtruth classes.
fields.InputDataFields.groundtruth_instance_masks - (optional) fields.InputDataFields.groundtruth_instance_masks - (optional)
[num_boxes, H, W] int64 tensor with instance masks. [batch_size, num_boxes, H, W] int64 tensor with instance masks.
fields.DetectionResultFields.detection_boxes - [max_num_boxes, 4] fields.DetectionResultFields.detection_boxes - [batch_size,
float32 tensor with detection boxes in range [0.0, 1.0]. max_num_boxes, 4] float32 tensor with detection boxes in range [0.0,
fields.DetectionResultFields.detection_classes - [max_num_boxes] 1.0].
int64 tensor with 1-indexed detection classes. fields.DetectionResultFields.detection_classes - [batch_size,
fields.DetectionResultFields.detection_scores - [max_num_boxes] max_num_boxes] int64 tensor with 1-indexed detection classes.
float32 tensor with detection scores. fields.DetectionResultFields.detection_scores - [batch_size,
fields.DetectionResultFields.detection_masks - (optional) max_num_boxes] float32 tensor with detection scores.
[max_num_boxes, H, W] float32 tensor of binarized masks. fields.DetectionResultFields.detection_masks - (optional) [batch_size,
max_num_boxes, H, W] float32 tensor of binarized masks.
fields.DetectionResultFields.detection_keypoints - (optional) fields.DetectionResultFields.detection_keypoints - (optional)
[max_num_boxes, num_keypoints, 2] float32 tensor with keypooints. [batch_size, max_num_boxes, num_keypoints, 2] float32 tensor with
keypoints.
Returns: Returns:
A dictionary of image summary names to tuple of (value_op, update_op). The A dictionary of image summary names to tuple of (value_op, update_op). The
...@@ -820,6 +905,8 @@ class EvalMetricOpsVisualization(object): ...@@ -820,6 +905,8 @@ class EvalMetricOpsVisualization(object):
groundtruth. Each `value_op` holds the tf.summary.image string for a given groundtruth. Each `value_op` holds the tf.summary.image string for a given
image. image.
""" """
if self._max_examples_to_draw == 0:
return {}
images = self.images_from_evaluation_dict(eval_dict) images = self.images_from_evaluation_dict(eval_dict)
def get_images(): def get_images():
...@@ -837,7 +924,7 @@ class EvalMetricOpsVisualization(object): ...@@ -837,7 +924,7 @@ class EvalMetricOpsVisualization(object):
lambda: tf.summary.image(summary_name, image), lambda: tf.summary.image(summary_name, image),
lambda: tf.constant('')) lambda: tf.constant(''))
update_op = tf.py_func(self.add_images, [images], []) update_op = tf.py_func(self.add_images, [[images[0]]], [])
image_tensors = tf.py_func( image_tensors = tf.py_func(
get_images, [], [tf.uint8] * self._max_examples_to_draw) get_images, [], [tf.uint8] * self._max_examples_to_draw)
eval_metric_ops = {} eval_metric_ops = {}
...@@ -847,7 +934,7 @@ class EvalMetricOpsVisualization(object): ...@@ -847,7 +934,7 @@ class EvalMetricOpsVisualization(object):
eval_metric_ops[summary_name] = (value_op, update_op) eval_metric_ops[summary_name] = (value_op, update_op)
return eval_metric_ops return eval_metric_ops
@abstractmethod @abc.abstractmethod
def images_from_evaluation_dict(self, eval_dict): def images_from_evaluation_dict(self, eval_dict):
"""Converts evaluation dictionary into a list of image tensors. """Converts evaluation dictionary into a list of image tensors.
...@@ -882,9 +969,6 @@ class VisualizeSingleFrameDetections(EvalMetricOpsVisualization): ...@@ -882,9 +969,6 @@ class VisualizeSingleFrameDetections(EvalMetricOpsVisualization):
summary_name_prefix=summary_name_prefix) summary_name_prefix=summary_name_prefix)
def images_from_evaluation_dict(self, eval_dict): def images_from_evaluation_dict(self, eval_dict):
return [draw_side_by_side_evaluation_image( return draw_side_by_side_evaluation_image(
eval_dict, eval_dict, self._category_index, self._max_boxes_to_draw,
self._category_index, self._min_score_thresh, self._use_normalized_coordinates)
self._max_boxes_to_draw,
self._min_score_thresh,
self._use_normalized_coordinates)]
...@@ -52,6 +52,9 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -52,6 +52,9 @@ class VisualizationUtilsTest(tf.test.TestCase):
def create_test_image_with_five_channels(self): def create_test_image_with_five_channels(self):
return np.full([100, 200, 5], 255, dtype=np.uint8) return np.full([100, 200, 5], 255, dtype=np.uint8)
def create_test_grayscale_image(self):
return np.full([100, 200, 1], 255, dtype=np.uint8)
def test_draw_bounding_box_on_image(self): def test_draw_bounding_box_on_image(self):
test_image = self.create_colorful_test_image() test_image = self.create_colorful_test_image()
test_image = Image.fromarray(test_image) test_image = Image.fromarray(test_image)
...@@ -119,9 +122,11 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -119,9 +122,11 @@ class VisualizationUtilsTest(tf.test.TestCase):
fname = os.path.join(_TESTDATA_PATH, 'image1.jpg') fname = os.path.join(_TESTDATA_PATH, 'image1.jpg')
image_np = np.array(Image.open(fname)) image_np = np.array(Image.open(fname))
images_np = np.stack((image_np, image_np), axis=0) images_np = np.stack((image_np, image_np), axis=0)
original_image_shape = [[636, 512], [636, 512]]
with tf.Graph().as_default(): with tf.Graph().as_default():
images_tensor = tf.constant(value=images_np, dtype=tf.uint8) images_tensor = tf.constant(value=images_np, dtype=tf.uint8)
image_shape = tf.constant(original_image_shape, dtype=tf.int32)
boxes = tf.constant([[[0.4, 0.25, 0.75, 0.75], [0.5, 0.3, 0.6, 0.9]], boxes = tf.constant([[[0.4, 0.25, 0.75, 0.75], [0.5, 0.3, 0.6, 0.9]],
[[0.25, 0.25, 0.75, 0.75], [0.1, 0.3, 0.6, 1.0]]]) [[0.25, 0.25, 0.75, 0.75], [0.1, 0.3, 0.6, 1.0]]])
classes = tf.constant([[1, 1], [1, 2]], dtype=tf.int64) classes = tf.constant([[1, 1], [1, 2]], dtype=tf.int64)
...@@ -133,6 +138,8 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -133,6 +138,8 @@ class VisualizationUtilsTest(tf.test.TestCase):
classes, classes,
scores, scores,
category_index, category_index,
original_image_spatial_shape=image_shape,
true_image_shape=image_shape,
min_score_thresh=0.2)) min_score_thresh=0.2))
with self.test_session() as sess: with self.test_session() as sess:
...@@ -140,7 +147,10 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -140,7 +147,10 @@ class VisualizationUtilsTest(tf.test.TestCase):
# Write output images for visualization. # Write output images for visualization.
images_with_boxes_np = sess.run(images_with_boxes) images_with_boxes_np = sess.run(images_with_boxes)
self.assertEqual(images_np.shape, images_with_boxes_np.shape) self.assertEqual(images_np.shape[0], images_with_boxes_np.shape[0])
self.assertEqual(images_np.shape[3], images_with_boxes_np.shape[3])
self.assertEqual(
tuple(original_image_shape[0]), images_with_boxes_np.shape[1:3])
for i in range(images_with_boxes_np.shape[0]): for i in range(images_with_boxes_np.shape[0]):
img_name = 'image_' + str(i) + '.png' img_name = 'image_' + str(i) + '.png'
output_file = os.path.join(self.get_temp_dir(), img_name) output_file = os.path.join(self.get_temp_dir(), img_name)
...@@ -174,6 +184,35 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -174,6 +184,35 @@ class VisualizationUtilsTest(tf.test.TestCase):
final_images_np = sess.run(images_with_boxes) final_images_np = sess.run(images_with_boxes)
self.assertEqual((2, 100, 200, 3), final_images_np.shape) self.assertEqual((2, 100, 200, 3), final_images_np.shape)
def test_draw_bounding_boxes_on_image_tensors_grayscale(self):
"""Tests the case where input image tensor has one channel."""
category_index = {1: {'id': 1, 'name': 'dog'}}
image_np = self.create_test_grayscale_image()
images_np = np.stack((image_np, image_np), axis=0)
with tf.Graph().as_default():
images_tensor = tf.constant(value=images_np, dtype=tf.uint8)
image_shape = tf.constant([[100, 200], [100, 200]], dtype=tf.int32)
boxes = tf.constant(0, dtype=tf.float32, shape=[2, 0, 4])
classes = tf.constant(0, dtype=tf.int64, shape=[2, 0])
scores = tf.constant(0, dtype=tf.float32, shape=[2, 0])
images_with_boxes = (
visualization_utils.draw_bounding_boxes_on_image_tensors(
images_tensor,
boxes,
classes,
scores,
category_index,
original_image_spatial_shape=image_shape,
true_image_shape=image_shape,
min_score_thresh=0.2))
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
final_images_np = sess.run(images_with_boxes)
self.assertEqual((2, 100, 200, 3), final_images_np.shape)
def test_draw_keypoints_on_image(self): def test_draw_keypoints_on_image(self):
test_image = self.create_colorful_test_image() test_image = self.create_colorful_test_image()
test_image = Image.fromarray(test_image) test_image = Image.fromarray(test_image)
...@@ -234,34 +273,46 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -234,34 +273,46 @@ class VisualizationUtilsTest(tf.test.TestCase):
category_index, category_index,
max_examples_to_draw=max_examples_to_draw, max_examples_to_draw=max_examples_to_draw,
summary_name_prefix=metric_op_base) summary_name_prefix=metric_op_base)
original_image = tf.placeholder(tf.uint8, [1, None, None, 3]) original_image = tf.placeholder(tf.uint8, [4, None, None, 3])
detection_boxes = tf.random_uniform([20, 4], original_image_spatial_shape = tf.placeholder(tf.int32, [4, 2])
true_image_shape = tf.placeholder(tf.int32, [4, 3])
detection_boxes = tf.random_uniform([4, 20, 4],
minval=0.0, minval=0.0,
maxval=1.0, maxval=1.0,
dtype=tf.float32) dtype=tf.float32)
detection_classes = tf.random_uniform([20], detection_classes = tf.random_uniform([4, 20],
minval=1, minval=1,
maxval=3, maxval=3,
dtype=tf.int64) dtype=tf.int64)
detection_scores = tf.random_uniform([20], detection_scores = tf.random_uniform([4, 20],
minval=0., minval=0.,
maxval=1., maxval=1.,
dtype=tf.float32) dtype=tf.float32)
groundtruth_boxes = tf.random_uniform([8, 4], groundtruth_boxes = tf.random_uniform([4, 8, 4],
minval=0.0, minval=0.0,
maxval=1.0, maxval=1.0,
dtype=tf.float32) dtype=tf.float32)
groundtruth_classes = tf.random_uniform([8], groundtruth_classes = tf.random_uniform([4, 8],
minval=1, minval=1,
maxval=3, maxval=3,
dtype=tf.int64) dtype=tf.int64)
eval_dict = { eval_dict = {
fields.DetectionResultFields.detection_boxes: detection_boxes, fields.DetectionResultFields.detection_boxes:
fields.DetectionResultFields.detection_classes: detection_classes, detection_boxes,
fields.DetectionResultFields.detection_scores: detection_scores, fields.DetectionResultFields.detection_classes:
fields.InputDataFields.original_image: original_image, detection_classes,
fields.InputDataFields.groundtruth_boxes: groundtruth_boxes, fields.DetectionResultFields.detection_scores:
fields.InputDataFields.groundtruth_classes: groundtruth_classes} detection_scores,
fields.InputDataFields.original_image:
original_image,
fields.InputDataFields.original_image_spatial_shape: (
original_image_spatial_shape),
fields.InputDataFields.true_image_shape: (true_image_shape),
fields.InputDataFields.groundtruth_boxes:
groundtruth_boxes,
fields.InputDataFields.groundtruth_classes:
groundtruth_classes
}
metric_ops = eval_metric_ops.get_estimator_eval_metric_ops(eval_dict) metric_ops = eval_metric_ops.get_estimator_eval_metric_ops(eval_dict)
_, update_op = metric_ops[metric_ops.keys()[0]] _, update_op = metric_ops[metric_ops.keys()[0]]
...@@ -274,12 +325,20 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -274,12 +325,20 @@ class VisualizationUtilsTest(tf.test.TestCase):
# First run enough update steps to surpass `max_examples_to_draw`. # First run enough update steps to surpass `max_examples_to_draw`.
for i in range(max_examples_to_draw): for i in range(max_examples_to_draw):
# Use a unique image shape on each eval image. # Use a unique image shape on each eval image.
sess.run(update_op, feed_dict={ sess.run(
original_image: np.random.randint(low=0, update_op,
high=256, feed_dict={
size=(1, 6 + i, 7 + i, 3), original_image:
dtype=np.uint8) np.random.randint(
}) low=0,
high=256,
size=(4, 6 + i, 7 + i, 3),
dtype=np.uint8),
original_image_spatial_shape: [[6 + i, 7 + i], [6 + i, 7 + i],
[6 + i, 7 + i], [6 + i, 7 + i]],
true_image_shape: [[6 + i, 7 + i, 3], [6 + i, 7 + i, 3],
[6 + i, 7 + i, 3], [6 + i, 7 + i, 3]]
})
value_ops_out = sess.run(value_ops) value_ops_out = sess.run(value_ops)
for key, value_op in value_ops_out.iteritems(): for key, value_op in value_ops_out.iteritems():
self.assertNotEqual('', value_op) self.assertNotEqual('', value_op)
...@@ -289,12 +348,20 @@ class VisualizationUtilsTest(tf.test.TestCase): ...@@ -289,12 +348,20 @@ class VisualizationUtilsTest(tf.test.TestCase):
# produced. # produced.
for i in range(max_examples_to_draw - 1): for i in range(max_examples_to_draw - 1):
# Use a unique image shape on each eval image. # Use a unique image shape on each eval image.
sess.run(update_op, feed_dict={ sess.run(
original_image: np.random.randint(low=0, update_op,
high=256, feed_dict={
size=(1, 6 + i, 7 + i, 3), original_image:
dtype=np.uint8) np.random.randint(
}) low=0,
high=256,
size=(4, 6 + i, 7 + i, 3),
dtype=np.uint8),
original_image_spatial_shape: [[6 + i, 7 + i], [6 + i, 7 + i],
[6 + i, 7 + i], [6 + i, 7 + i]],
true_image_shape: [[6 + i, 7 + i, 3], [6 + i, 7 + i, 3],
[6 + i, 7 + i, 3], [6 + i, 7 + i, 3]]
})
value_ops_out = sess.run(value_ops) value_ops_out = sess.run(value_ops)
self.assertEqual( self.assertEqual(
'', '',
......
...@@ -63,7 +63,8 @@ def cyclegan_arg_scope(instance_norm_center=True, ...@@ -63,7 +63,8 @@ def cyclegan_arg_scope(instance_norm_center=True,
return sc return sc
def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'): def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose',
pad_mode='REFLECT', align_corners=False):
"""Upsamples the given inputs. """Upsamples the given inputs.
Args: Args:
...@@ -75,6 +76,10 @@ def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'): ...@@ -75,6 +76,10 @@ def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'):
times the input size. times the input size.
method: The upsampling method: 'nn_upsample_conv', 'bilinear_upsample_conv', method: The upsampling method: 'nn_upsample_conv', 'bilinear_upsample_conv',
or 'conv2d_transpose'. or 'conv2d_transpose'.
pad_mode: mode for tf.pad, one of "CONSTANT", "REFLECT", or "SYMMETRIC".
align_corners: option for method, 'bilinear_upsample_conv'. If true, the
centers of the 4 corner pixels of the input and output tensors are
aligned, preserving the values at the corner pixels.
Returns: Returns:
A Tensor which was upsampled using the specified method. A Tensor which was upsampled using the specified method.
...@@ -95,12 +100,13 @@ def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'): ...@@ -95,12 +100,13 @@ def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'):
if method == 'nn_upsample_conv': if method == 'nn_upsample_conv':
net = tf.image.resize_nearest_neighbor( net = tf.image.resize_nearest_neighbor(
net, [stride[0] * height, stride[1] * width]) net, [stride[0] * height, stride[1] * width])
net = tf.pad(net, spatial_pad_1, 'REFLECT') net = tf.pad(net, spatial_pad_1, pad_mode)
net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid') net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid')
elif method == 'bilinear_upsample_conv': elif method == 'bilinear_upsample_conv':
net = tf.image.resize_bilinear( net = tf.image.resize_bilinear(
net, [stride[0] * height, stride[1] * width]) net, [stride[0] * height, stride[1] * width],
net = tf.pad(net, spatial_pad_1, 'REFLECT') align_corners=align_corners)
net = tf.pad(net, spatial_pad_1, pad_mode)
net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid') net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid')
elif method == 'conv2d_transpose': elif method == 'conv2d_transpose':
# This corrects 1 pixel offset for images with even width and height. # This corrects 1 pixel offset for images with even width and height.
...@@ -111,7 +117,7 @@ def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'): ...@@ -111,7 +117,7 @@ def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose'):
net, num_outputs, kernel_size=[3, 3], stride=stride, padding='valid') net, num_outputs, kernel_size=[3, 3], stride=stride, padding='valid')
net = net[:, 1:, 1:, :] net = net[:, 1:, 1:, :]
else: else:
raise ValueError('Unknown method: [%s]', method) raise ValueError('Unknown method: [%s]' % method)
return net return net
......
...@@ -370,7 +370,8 @@ def inception_resnet_v2_arg_scope( ...@@ -370,7 +370,8 @@ def inception_resnet_v2_arg_scope(
batch_norm_decay=0.9997, batch_norm_decay=0.9997,
batch_norm_epsilon=0.001, batch_norm_epsilon=0.001,
activation_fn=tf.nn.relu, activation_fn=tf.nn.relu,
batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS): batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS,
batch_norm_scale=False):
"""Returns the scope with the default parameters for inception_resnet_v2. """Returns the scope with the default parameters for inception_resnet_v2.
Args: Args:
...@@ -380,6 +381,8 @@ def inception_resnet_v2_arg_scope( ...@@ -380,6 +381,8 @@ def inception_resnet_v2_arg_scope(
activation_fn: Activation function for conv2d. activation_fn: Activation function for conv2d.
batch_norm_updates_collections: Collection for the update ops for batch_norm_updates_collections: Collection for the update ops for
batch norm. batch norm.
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
activations in the batch normalization layer.
Returns: Returns:
a arg_scope with the parameters needed for inception_resnet_v2. a arg_scope with the parameters needed for inception_resnet_v2.
...@@ -394,6 +397,7 @@ def inception_resnet_v2_arg_scope( ...@@ -394,6 +397,7 @@ def inception_resnet_v2_arg_scope(
'epsilon': batch_norm_epsilon, 'epsilon': batch_norm_epsilon,
'updates_collections': batch_norm_updates_collections, 'updates_collections': batch_norm_updates_collections,
'fused': None, # Use fused batch norm if possible. 'fused': None, # Use fused batch norm if possible.
'scale': batch_norm_scale,
} }
# Set activation_fn and parameters for batch_norm. # Set activation_fn and parameters for batch_norm.
with slim.arg_scope([slim.conv2d], activation_fn=activation_fn, with slim.arg_scope([slim.conv2d], activation_fn=activation_fn,
......
...@@ -306,6 +306,29 @@ class InceptionTest(tf.test.TestCase): ...@@ -306,6 +306,29 @@ class InceptionTest(tf.test.TestCase):
output = sess.run(predictions) output = sess.run(predictions)
self.assertEquals(output.shape, (eval_batch_size,)) self.assertEquals(output.shape, (eval_batch_size,))
def testNoBatchNormScaleByDefault(self):
height, width = 299, 299
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with tf.contrib.slim.arg_scope(inception.inception_resnet_v2_arg_scope()):
inception.inception_resnet_v2(inputs, num_classes, is_training=False)
self.assertEqual(tf.global_variables('.*/BatchNorm/gamma:0$'), [])
def testBatchNormScale(self):
height, width = 299, 299
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with tf.contrib.slim.arg_scope(
inception.inception_resnet_v2_arg_scope(batch_norm_scale=True)):
inception.inception_resnet_v2(inputs, num_classes, is_training=False)
gamma_names = set(
v.op.name for v in tf.global_variables('.*/BatchNorm/gamma:0$'))
self.assertGreater(len(gamma_names), 0)
for v in tf.global_variables('.*/BatchNorm/moving_mean:0$'):
self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -34,7 +34,8 @@ def inception_arg_scope(weight_decay=0.00004, ...@@ -34,7 +34,8 @@ def inception_arg_scope(weight_decay=0.00004,
batch_norm_decay=0.9997, batch_norm_decay=0.9997,
batch_norm_epsilon=0.001, batch_norm_epsilon=0.001,
activation_fn=tf.nn.relu, activation_fn=tf.nn.relu,
batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS): batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS,
batch_norm_scale=False):
"""Defines the default arg scope for inception models. """Defines the default arg scope for inception models.
Args: Args:
...@@ -46,6 +47,8 @@ def inception_arg_scope(weight_decay=0.00004, ...@@ -46,6 +47,8 @@ def inception_arg_scope(weight_decay=0.00004,
activation_fn: Activation function for conv2d. activation_fn: Activation function for conv2d.
batch_norm_updates_collections: Collection for the update ops for batch_norm_updates_collections: Collection for the update ops for
batch norm. batch norm.
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
activations in the batch normalization layer.
Returns: Returns:
An `arg_scope` to use for the inception models. An `arg_scope` to use for the inception models.
...@@ -59,6 +62,7 @@ def inception_arg_scope(weight_decay=0.00004, ...@@ -59,6 +62,7 @@ def inception_arg_scope(weight_decay=0.00004,
'updates_collections': batch_norm_updates_collections, 'updates_collections': batch_norm_updates_collections,
# use fused batch norm if possible. # use fused batch norm if possible.
'fused': None, 'fused': None,
'scale': batch_norm_scale,
} }
if use_batch_norm: if use_batch_norm:
normalizer_fn = slim.batch_norm normalizer_fn = slim.batch_norm
......
...@@ -237,6 +237,29 @@ class InceptionV1Test(tf.test.TestCase): ...@@ -237,6 +237,29 @@ class InceptionV1Test(tf.test.TestCase):
logits_out = sess.run(logits) logits_out = sess.run(logits)
self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
def testNoBatchNormScaleByDefault(self):
height, width = 224, 224
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with slim.arg_scope(inception.inception_v1_arg_scope()):
inception.inception_v1(inputs, num_classes, is_training=False)
self.assertEqual(tf.global_variables('.*/BatchNorm/gamma:0$'), [])
def testBatchNormScale(self):
height, width = 224, 224
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with slim.arg_scope(
inception.inception_v1_arg_scope(batch_norm_scale=True)):
inception.inception_v1(inputs, num_classes, is_training=False)
gamma_names = set(
v.op.name for v in tf.global_variables('.*/BatchNorm/gamma:0$'))
self.assertGreater(len(gamma_names), 0)
for v in tf.global_variables('.*/BatchNorm/moving_mean:0$'):
self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -351,6 +351,29 @@ class InceptionV2Test(tf.test.TestCase): ...@@ -351,6 +351,29 @@ class InceptionV2Test(tf.test.TestCase):
logits_out = sess.run(logits) logits_out = sess.run(logits)
self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
def testNoBatchNormScaleByDefault(self):
height, width = 224, 224
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with slim.arg_scope(inception.inception_v2_arg_scope()):
inception.inception_v2(inputs, num_classes, is_training=False)
self.assertEqual(tf.global_variables('.*/BatchNorm/gamma:0$'), [])
def testBatchNormScale(self):
height, width = 224, 224
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with slim.arg_scope(
inception.inception_v2_arg_scope(batch_norm_scale=True)):
inception.inception_v2(inputs, num_classes, is_training=False)
gamma_names = set(
v.op.name for v in tf.global_variables('.*/BatchNorm/gamma:0$'))
self.assertGreater(len(gamma_names), 0)
for v in tf.global_variables('.*/BatchNorm/moving_mean:0$'):
self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -318,6 +318,29 @@ class InceptionV3Test(tf.test.TestCase): ...@@ -318,6 +318,29 @@ class InceptionV3Test(tf.test.TestCase):
logits_out = sess.run(logits) logits_out = sess.run(logits)
self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
def testNoBatchNormScaleByDefault(self):
height, width = 299, 299
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with slim.arg_scope(inception.inception_v3_arg_scope()):
inception.inception_v3(inputs, num_classes, is_training=False)
self.assertEqual(tf.global_variables('.*/BatchNorm/gamma:0$'), [])
def testBatchNormScale(self):
height, width = 299, 299
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with slim.arg_scope(
inception.inception_v3_arg_scope(batch_norm_scale=True)):
inception.inception_v3(inputs, num_classes, is_training=False)
gamma_names = set(
v.op.name for v in tf.global_variables('.*/BatchNorm/gamma:0$'))
self.assertGreater(len(gamma_names), 0)
for v in tf.global_variables('.*/BatchNorm/moving_mean:0$'):
self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -255,6 +255,29 @@ class InceptionTest(tf.test.TestCase): ...@@ -255,6 +255,29 @@ class InceptionTest(tf.test.TestCase):
output = sess.run(predictions) output = sess.run(predictions)
self.assertEquals(output.shape, (eval_batch_size,)) self.assertEquals(output.shape, (eval_batch_size,))
def testNoBatchNormScaleByDefault(self):
height, width = 299, 299
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with tf.contrib.slim.arg_scope(inception.inception_v4_arg_scope()):
inception.inception_v4(inputs, num_classes, is_training=False)
self.assertEqual(tf.global_variables('.*/BatchNorm/gamma:0$'), [])
def testBatchNormScale(self):
height, width = 299, 299
num_classes = 1000
inputs = tf.placeholder(tf.float32, (1, height, width, 3))
with tf.contrib.slim.arg_scope(
inception.inception_v4_arg_scope(batch_norm_scale=True)):
inception.inception_v4(inputs, num_classes, is_training=False)
gamma_names = set(
v.op.name for v in tf.global_variables('.*/BatchNorm/gamma:0$'))
self.assertGreater(len(gamma_names), 0)
for v in tf.global_variables('.*/BatchNorm/moving_mean:0$'):
self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -263,7 +263,6 @@ def mobilenet_v1_base(inputs, ...@@ -263,7 +263,6 @@ def mobilenet_v1_base(inputs,
net = _fixed_padding(net, conv_def.kernel) net = _fixed_padding(net, conv_def.kernel)
net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel, net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
stride=conv_def.stride, stride=conv_def.stride,
normalizer_fn=slim.batch_norm,
scope=end_point) scope=end_point)
end_points[end_point] = net end_points[end_point] = net
if end_point == final_endpoint: if end_point == final_endpoint:
...@@ -280,7 +279,6 @@ def mobilenet_v1_base(inputs, ...@@ -280,7 +279,6 @@ def mobilenet_v1_base(inputs,
depth_multiplier=1, depth_multiplier=1,
stride=layer_stride, stride=layer_stride,
rate=layer_rate, rate=layer_rate,
normalizer_fn=slim.batch_norm,
scope=end_point) scope=end_point)
end_points[end_point] = net end_points[end_point] = net
...@@ -291,7 +289,6 @@ def mobilenet_v1_base(inputs, ...@@ -291,7 +289,6 @@ def mobilenet_v1_base(inputs,
net = slim.conv2d(net, depth(conv_def.depth), [1, 1], net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
stride=1, stride=1,
normalizer_fn=slim.batch_norm,
scope=end_point) scope=end_point)
end_points[end_point] = net end_points[end_point] = net
...@@ -432,7 +429,8 @@ def mobilenet_v1_arg_scope( ...@@ -432,7 +429,8 @@ def mobilenet_v1_arg_scope(
regularize_depthwise=False, regularize_depthwise=False,
batch_norm_decay=0.9997, batch_norm_decay=0.9997,
batch_norm_epsilon=0.001, batch_norm_epsilon=0.001,
batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS): batch_norm_updates_collections=tf.GraphKeys.UPDATE_OPS,
normalizer_fn=slim.batch_norm):
"""Defines the default MobilenetV1 arg scope. """Defines the default MobilenetV1 arg scope.
Args: Args:
...@@ -446,6 +444,7 @@ def mobilenet_v1_arg_scope( ...@@ -446,6 +444,7 @@ def mobilenet_v1_arg_scope(
in batch norm. in batch norm.
batch_norm_updates_collections: Collection for the update ops for batch_norm_updates_collections: Collection for the update ops for
batch norm. batch norm.
normalizer_fn: Normalization function to apply after convolution.
Returns: Returns:
An `arg_scope` to use for the mobilenet v1 model. An `arg_scope` to use for the mobilenet v1 model.
...@@ -469,7 +468,7 @@ def mobilenet_v1_arg_scope( ...@@ -469,7 +468,7 @@ def mobilenet_v1_arg_scope(
depthwise_regularizer = None depthwise_regularizer = None
with slim.arg_scope([slim.conv2d, slim.separable_conv2d], with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
weights_initializer=weights_init, weights_initializer=weights_init,
activation_fn=tf.nn.relu6, normalizer_fn=slim.batch_norm): activation_fn=tf.nn.relu6, normalizer_fn=normalizer_fn):
with slim.arg_scope([slim.batch_norm], **batch_norm_params): with slim.arg_scope([slim.batch_norm], **batch_norm_params):
with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer): with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer):
with slim.arg_scope([slim.separable_conv2d], with slim.arg_scope([slim.separable_conv2d],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment