Unverified Commit b9ca525f authored by Jonathan Huang, committed by GitHub

Merge pull request #4232 from pkulzc/master

Release SSDLite MobileNet v2 COCO trained model, add quantized training and minor fixes.
parents 0270cac7 324d6dc3
...@@ -139,7 +139,8 @@ class SSDMetaArch(model.DetectionModel): ...@@ -139,7 +139,8 @@ class SSDMetaArch(model.DetectionModel):
normalize_loc_loss_by_codesize=False, normalize_loc_loss_by_codesize=False,
freeze_batchnorm=False, freeze_batchnorm=False,
inplace_batchnorm_update=False, inplace_batchnorm_update=False,
add_background_class=True): add_background_class=True,
random_example_sampler=None):
"""SSDMetaArch Constructor. """SSDMetaArch Constructor.
TODO(rathodv,jonathanhuang): group NMS parameters + score converter into TODO(rathodv,jonathanhuang): group NMS parameters + score converter into
...@@ -198,6 +199,12 @@ class SSDMetaArch(model.DetectionModel): ...@@ -198,6 +199,12 @@ class SSDMetaArch(model.DetectionModel):
one-hot encodings of groundtruth labels. Set to false if using one-hot encodings of groundtruth labels. Set to false if using
groundtruth labels with an explicit background class or using multiclass groundtruth labels with an explicit background class or using multiclass
scores instead of truth in the case of distillation. scores instead of truth in the case of distillation.
random_example_sampler: a BalancedPositiveNegativeSampler object that can
perform random example sampling when computing loss. If None, the random
sampling process is skipped. Note that the random example sampler and the
hard example miner can both be applied to the model; in that case, the
random sampler takes effect first and the hard example miner can only
process the randomly sampled examples.
""" """
super(SSDMetaArch, self).__init__(num_classes=box_predictor.num_classes) super(SSDMetaArch, self).__init__(num_classes=box_predictor.num_classes)
self._is_training = is_training self._is_training = is_training
...@@ -240,6 +247,8 @@ class SSDMetaArch(model.DetectionModel): ...@@ -240,6 +247,8 @@ class SSDMetaArch(model.DetectionModel):
self._normalize_loss_by_num_matches = normalize_loss_by_num_matches self._normalize_loss_by_num_matches = normalize_loss_by_num_matches
self._normalize_loc_loss_by_codesize = normalize_loc_loss_by_codesize self._normalize_loc_loss_by_codesize = normalize_loc_loss_by_codesize
self._hard_example_miner = hard_example_miner self._hard_example_miner = hard_example_miner
self._random_example_sampler = random_example_sampler
self._parallel_iterations = 16
self._image_resizer_fn = image_resizer_fn self._image_resizer_fn = image_resizer_fn
self._non_max_suppression_fn = non_max_suppression_fn self._non_max_suppression_fn = non_max_suppression_fn
...@@ -543,6 +552,20 @@ class SSDMetaArch(model.DetectionModel): ...@@ -543,6 +552,20 @@ class SSDMetaArch(model.DetectionModel):
if self._add_summaries: if self._add_summaries:
self._summarize_target_assignment( self._summarize_target_assignment(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list) self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
if self._random_example_sampler:
batch_sampled_indicator = tf.to_float(
shape_utils.static_or_dynamic_map_fn(
self._minibatch_subsample_fn,
[batch_cls_targets, batch_cls_weights],
dtype=tf.bool,
parallel_iterations=self._parallel_iterations,
back_prop=True))
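# Zero out loss weights for anchors that the random sampler did not select.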
batch_reg_weights = tf.multiply(batch_sampled_indicator,
batch_reg_weights)
batch_cls_weights = tf.multiply(batch_sampled_indicator,
batch_cls_weights)
location_losses = self._localization_loss( location_losses = self._localization_loss(
prediction_dict['box_encodings'], prediction_dict['box_encodings'],
batch_reg_targets, batch_reg_targets,
...@@ -593,6 +616,32 @@ class SSDMetaArch(model.DetectionModel): ...@@ -593,6 +616,32 @@ class SSDMetaArch(model.DetectionModel):
} }
return loss_dict return loss_dict
def _minibatch_subsample_fn(self, inputs):
"""Randomly samples anchors for one image.
Args:
inputs: a list of 2 inputs. First one is a tensor of shape [num_anchors,
num_classes] indicating targets assigned to each anchor. Second one
is a tensor of shape [num_anchors] indicating the class weight of each
anchor.
Returns:
batch_sampled_indicator: bool tensor of shape [num_anchors] indicating
whether the anchor should be selected for loss computation.
"""
cls_targets, cls_weights = inputs
if self._add_background_class:
# Set background_class bits to 0 so that the positives_indicator
# computation does not consider the background class.
background_class = tf.zeros_like(tf.slice(cls_targets, [0, 0], [-1, 1]))
regular_class = tf.slice(cls_targets, [0, 1], [-1, -1])
cls_targets = tf.concat([background_class, regular_class], 1)
positives_indicator = tf.reduce_sum(cls_targets, axis=1)
return self._random_example_sampler.subsample(
tf.cast(cls_weights, tf.bool),
batch_size=None,
labels=tf.cast(positives_indicator, tf.bool))
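For context, a minimal sketch of the sampler backing this method, using illustrative anchor counts (the import mirrors the test file below; TF1 graph mode, so a session is needed to materialize values):

import tensorflow as tf
from object_detection.core import balanced_positive_negative_sampler as sampler

# One image with 4 anchors: anchor 0 is positive, the other 3 are negative.
random_sampler = sampler.BalancedPositiveNegativeSampler(positive_fraction=0.5)
indicator = tf.constant([True, True, True, True])  # anchors eligible for sampling
labels = tf.constant([True, False, False, False])  # which anchors are positive
# batch_size=None keeps all positives and samples just enough negatives to
# reach the configured positive fraction, as in _minibatch_subsample_fn above.
sampled = random_sampler.subsample(indicator, batch_size=None, labels=labels)
with tf.Session() as sess:
  print(sess.run(sampled))  # e.g. [ True False  True False]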
def _summarize_anchor_classification_loss(self, class_ids, cls_losses): def _summarize_anchor_classification_loss(self, class_ids, cls_losses):
positive_indices = tf.where(tf.greater(class_ids, 0)) positive_indices = tf.where(tf.greater(class_ids, 0))
positive_anchor_cls_loss = tf.squeeze( positive_anchor_cls_loss = tf.squeeze(
...@@ -790,8 +839,8 @@ class SSDMetaArch(model.DetectionModel): ...@@ -790,8 +839,8 @@ class SSDMetaArch(model.DetectionModel):
classification checkpoint for initialization prior to training. classification checkpoint for initialization prior to training.
Valid values: `detection`, `classification`. Default 'detection'. Valid values: `detection`, `classification`. Default 'detection'.
load_all_detection_checkpoint_vars: whether to load all variables (when load_all_detection_checkpoint_vars: whether to load all variables (when
`from_detection_checkpoint` is True). If False, only variables within `fine_tune_checkpoint_type='detection'`). If False, only variables
the appropriate scopes are included. Default False. within the appropriate scopes are included. Default False.
Returns: Returns:
A dict mapping variable names (to load from a checkpoint) to variables in A dict mapping variable names (to load from a checkpoint) to variables in
......
...@@ -19,6 +19,7 @@ import numpy as np ...@@ -19,6 +19,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from object_detection.core import anchor_generator from object_detection.core import anchor_generator
from object_detection.core import balanced_positive_negative_sampler as sampler
from object_detection.core import box_list from object_detection.core import box_list
from object_detection.core import losses from object_detection.core import losses
from object_detection.core import post_processing from object_detection.core import post_processing
...@@ -83,7 +84,8 @@ class SsdMetaArchTest(test_case.TestCase): ...@@ -83,7 +84,8 @@ class SsdMetaArchTest(test_case.TestCase):
def _create_model(self, def _create_model(self,
apply_hard_mining=True, apply_hard_mining=True,
normalize_loc_loss_by_codesize=False, normalize_loc_loss_by_codesize=False,
add_background_class=True): add_background_class=True,
random_example_sampling=False):
is_training = False is_training = False
num_classes = 1 num_classes = 1
mock_anchor_generator = MockAnchorGenerator2x2() mock_anchor_generator = MockAnchorGenerator2x2()
...@@ -117,6 +119,11 @@ class SsdMetaArchTest(test_case.TestCase): ...@@ -117,6 +119,11 @@ class SsdMetaArchTest(test_case.TestCase):
num_hard_examples=None, num_hard_examples=None,
iou_threshold=1.0) iou_threshold=1.0)
random_example_sampler = None
if random_example_sampling:
random_example_sampler = sampler.BalancedPositiveNegativeSampler(
positive_fraction=0.5)
code_size = 4 code_size = 4
model = ssd_meta_arch.SSDMetaArch( model = ssd_meta_arch.SSDMetaArch(
is_training, is_training,
...@@ -141,7 +148,8 @@ class SsdMetaArchTest(test_case.TestCase): ...@@ -141,7 +148,8 @@ class SsdMetaArchTest(test_case.TestCase):
normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize, normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize,
freeze_batchnorm=False, freeze_batchnorm=False,
inplace_batchnorm_update=False, inplace_batchnorm_update=False,
add_background_class=add_background_class) add_background_class=add_background_class,
random_example_sampler=random_example_sampler)
return model, num_classes, mock_anchor_generator.num_anchors(), code_size return model, num_classes, mock_anchor_generator.num_anchors(), code_size
def test_preprocess_preserves_shapes_with_dynamic_input_image(self): def test_preprocess_preserves_shapes_with_dynamic_input_image(self):
...@@ -493,6 +501,47 @@ class SsdMetaArchTest(test_case.TestCase): ...@@ -493,6 +501,47 @@ class SsdMetaArchTest(test_case.TestCase):
self.assertIsInstance(var_map, dict) self.assertIsInstance(var_map, dict)
self.assertIn('another_variable', var_map) self.assertIn('another_variable', var_map)
def test_loss_results_are_correct_with_random_example_sampling(self):
with tf.Graph().as_default():
_, num_classes, num_anchors, _ = self._create_model(
random_example_sampling=True)
def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2):
groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2]
groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2]
model, _, _, _ = self._create_model(random_example_sampling=True)
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
prediction_dict = model.predict(
preprocessed_tensor, true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return (_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'),
_get_value_for_matching_key(loss_dict,
'Loss/classification_loss'))
batch_size = 2
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_classes1 = np.array([[1]], dtype=np.float32)
groundtruth_classes2 = np.array([[1]], dtype=np.float32)
expected_localization_loss = 0.0
# Among the 4 anchors (1 positive, 3 negative) in this test, only 2 anchors
# are selected (1 positive, 1 negative) since the random sampler adjusts the
# number of negative examples to ensure the positive example fraction in the
# batch is 0.5.
expected_classification_loss = (
batch_size * 2 * (num_classes + 1) * np.log(2.0))
(localization_loss, classification_loss) = self.execute_cpu(
graph_fn, [
preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2
])
self.assertAllClose(localization_loss, expected_localization_loss)
self.assertAllClose(classification_loss, expected_classification_loss)
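As a quick sanity check on the expected value above, assuming the mock box predictor in these tests emits all-zero logits so every sigmoid cross-entropy term contributes log(2):

import numpy as np

batch_size = 2       # two images in the test batch
sampled_anchors = 2  # 1 positive + 1 negative kept per image by the sampler
num_classes = 1      # plus one background column when add_background_class=True
expected = batch_size * sampled_anchors * (num_classes + 1) * np.log(2.0)
print(expected)  # 5.545..., i.e. 8 * log(2)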
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -202,8 +202,10 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -202,8 +202,10 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
return box_metrics return box_metrics
def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes, def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes,
groundtruth_classes, detection_boxes, groundtruth_classes,
detection_boxes,
detection_scores, detection_classes, detection_scores, detection_classes,
groundtruth_is_crowd=None,
num_gt_boxes_per_image=None, num_gt_boxes_per_image=None,
num_det_boxes_per_image=None): num_det_boxes_per_image=None):
"""Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`. """Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
...@@ -230,6 +232,9 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -230,6 +232,9 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection scores for the boxes. detection scores for the boxes.
detection_classes: int32 tensor of shape [batch, num_boxes] containing detection_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed detection classes for the boxes. 1-indexed detection classes for the boxes.
groundtruth_is_crowd: bool tensor of shape [batch, num_boxes] containing
is_crowd annotations. This field is optional, and if not passed, then
all boxes are treated as *not* is_crowd.
num_gt_boxes_per_image: int32 tensor of shape [batch] containing the num_gt_boxes_per_image: int32 tensor of shape [batch] containing the
number of groundtruth boxes per image. If None, will assume no padding number of groundtruth boxes per image. If None, will assume no padding
in groundtruth tensors. in groundtruth tensors.
...@@ -247,6 +252,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -247,6 +252,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
image_id_batched, image_id_batched,
groundtruth_boxes_batched, groundtruth_boxes_batched,
groundtruth_classes_batched, groundtruth_classes_batched,
groundtruth_is_crowd_batched,
num_gt_boxes_per_image, num_gt_boxes_per_image,
detection_boxes_batched, detection_boxes_batched,
detection_scores_batched, detection_scores_batched,
...@@ -254,27 +260,32 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -254,27 +260,32 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_det_boxes_per_image): num_det_boxes_per_image):
"""Update operation for adding batch of images to Coco evaluator.""" """Update operation for adding batch of images to Coco evaluator."""
for (image_id, gt_box, gt_class, num_gt_box, det_box, det_score, for (image_id, gt_box, gt_class, gt_is_crowd, num_gt_box, det_box,
det_class, num_det_box) in zip( det_score, det_class, num_det_box) in zip(
image_id_batched, groundtruth_boxes_batched, image_id_batched, groundtruth_boxes_batched,
groundtruth_classes_batched, num_gt_boxes_per_image, groundtruth_classes_batched, groundtruth_is_crowd_batched,
num_gt_boxes_per_image,
detection_boxes_batched, detection_scores_batched, detection_boxes_batched, detection_scores_batched,
detection_classes_batched, num_det_boxes_per_image): detection_classes_batched, num_det_boxes_per_image):
self.add_single_ground_truth_image_info( self.add_single_ground_truth_image_info(
image_id, image_id,
{'groundtruth_boxes': gt_box[:num_gt_box], {'groundtruth_boxes': gt_box[:num_gt_box],
'groundtruth_classes': gt_class[:num_gt_box]}) 'groundtruth_classes': gt_class[:num_gt_box],
'groundtruth_is_crowd': gt_is_crowd[:num_gt_box]})
self.add_single_detected_image_info( self.add_single_detected_image_info(
image_id, image_id,
{'detection_boxes': det_box[:num_det_box], {'detection_boxes': det_box[:num_det_box],
'detection_scores': det_score[:num_det_box], 'detection_scores': det_score[:num_det_box],
'detection_classes': det_class[:num_det_box]}) 'detection_classes': det_class[:num_det_box]})
if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
if not image_id.shape.as_list(): if not image_id.shape.as_list():
# Apply a batch dimension to all tensors. # Apply a batch dimension to all tensors.
image_id = tf.expand_dims(image_id, 0) image_id = tf.expand_dims(image_id, 0)
groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0) groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0)
groundtruth_classes = tf.expand_dims(groundtruth_classes, 0) groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
groundtruth_is_crowd = tf.expand_dims(groundtruth_is_crowd, 0)
detection_boxes = tf.expand_dims(detection_boxes, 0) detection_boxes = tf.expand_dims(detection_boxes, 0)
detection_scores = tf.expand_dims(detection_scores, 0) detection_scores = tf.expand_dims(detection_scores, 0)
detection_classes = tf.expand_dims(detection_classes, 0) detection_classes = tf.expand_dims(detection_classes, 0)
...@@ -301,6 +312,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -301,6 +312,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
update_op = tf.py_func(update_op, [image_id, update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes, groundtruth_boxes,
groundtruth_classes, groundtruth_classes,
groundtruth_is_crowd,
num_gt_boxes_per_image, num_gt_boxes_per_image,
detection_boxes, detection_boxes,
detection_scores, detection_scores,
...@@ -545,7 +557,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -545,7 +557,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
groundtruth_classes, groundtruth_classes,
groundtruth_instance_masks, groundtruth_instance_masks,
detection_scores, detection_classes, detection_scores, detection_classes,
detection_masks): detection_masks, groundtruth_is_crowd=None):
"""Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`. """Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
Note that once value_op is called, the detections and groundtruth added via Note that once value_op is called, the detections and groundtruth added via
...@@ -568,6 +580,9 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -568,6 +580,9 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_masks: uint8 tensor array of shape detection_masks: uint8 tensor array of shape
[num_boxes, image_height, image_width] containing instance masks [num_boxes, image_height, image_width] containing instance masks
corresponding to the boxes. The elements of the array must be in {0, 1}. corresponding to the boxes. The elements of the array must be in {0, 1}.
groundtruth_is_crowd: bool tensor of shape [batch, num_boxes] containing
is_crowd annotations. This field is optional, and if not passed, then
all boxes are treated as *not* is_crowd.
Returns: Returns:
a dictionary of metric names to tuple of value_op and update_op that can a dictionary of metric names to tuple of value_op and update_op that can
...@@ -580,6 +595,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -580,6 +595,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
groundtruth_boxes, groundtruth_boxes,
groundtruth_classes, groundtruth_classes,
groundtruth_instance_masks, groundtruth_instance_masks,
groundtruth_is_crowd,
detection_scores, detection_scores,
detection_classes, detection_classes,
detection_masks): detection_masks):
...@@ -587,17 +603,21 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -587,17 +603,21 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
image_id, image_id,
{'groundtruth_boxes': groundtruth_boxes, {'groundtruth_boxes': groundtruth_boxes,
'groundtruth_classes': groundtruth_classes, 'groundtruth_classes': groundtruth_classes,
'groundtruth_instance_masks': groundtruth_instance_masks}) 'groundtruth_instance_masks': groundtruth_instance_masks,
'groundtruth_is_crowd': groundtruth_is_crowd})
self.add_single_detected_image_info( self.add_single_detected_image_info(
image_id, image_id,
{'detection_scores': detection_scores, {'detection_scores': detection_scores,
'detection_classes': detection_classes, 'detection_classes': detection_classes,
'detection_masks': detection_masks}) 'detection_masks': detection_masks})
if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
update_op = tf.py_func(update_op, [image_id, update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes, groundtruth_boxes,
groundtruth_classes, groundtruth_classes,
groundtruth_instance_masks, groundtruth_instance_masks,
groundtruth_is_crowd,
detection_scores, detection_scores,
detection_classes, detection_classes,
detection_masks], []) detection_masks], [])
......
...@@ -492,8 +492,8 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -492,8 +492,8 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
detection_boxes, detection_boxes,
detection_scores, detection_scores,
detection_classes, detection_classes,
num_gt_boxes_per_image, num_gt_boxes_per_image=num_gt_boxes_per_image,
num_det_boxes_per_image) num_det_boxes_per_image=num_det_boxes_per_image)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP'] _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
......
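A hedged sketch of the updated call with the new optional argument (category list and tensor values are illustrative, not taken from this diff); as the expand_dims logic above shows, unbatched single-image tensors are accepted and batched internally:

import tensorflow as tf
from object_detection.metrics import coco_evaluation

evaluator = coco_evaluation.CocoDetectionEvaluator(
    categories=[{'id': 1, 'name': 'object'}])
eval_metric_ops = evaluator.get_estimator_eval_metric_ops(
    image_id=tf.constant('image1'),
    groundtruth_boxes=tf.constant([[0., 0., .5, .5]]),
    groundtruth_classes=tf.constant([1]),
    detection_boxes=tf.constant([[0., 0., .5, .5]]),
    detection_scores=tf.constant([.9]),
    detection_classes=tf.constant([1]),
    # Optional; when omitted, every box is treated as *not* is_crowd.
    groundtruth_is_crowd=tf.constant([False]))
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']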
...@@ -48,8 +48,8 @@ MODEL_BUILD_UTIL_MAP = { ...@@ -48,8 +48,8 @@ MODEL_BUILD_UTIL_MAP = {
} }
def _get_groundtruth_data(detection_model, class_agnostic): def _prepare_groundtruth_for_eval(detection_model, class_agnostic):
"""Extracts groundtruth data from detection_model. """Extracts groundtruth data from detection_model and prepares it for eval.
Args: Args:
detection_model: A `DetectionModel` object. detection_model: A `DetectionModel` object.
...@@ -63,6 +63,8 @@ def _get_groundtruth_data(detection_model, class_agnostic): ...@@ -63,6 +63,8 @@ def _get_groundtruth_data(detection_model, class_agnostic):
'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes. 'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes.
'groundtruth_masks': 3D float32 tensor of instance masks (if provided in 'groundtruth_masks': 3D float32 tensor of instance masks (if provided in
groundtruth) groundtruth)
'groundtruth_is_crowd': [num_boxes] bool tensor indicating is_crowd
annotations (if provided in groundtruth).
class_agnostic: Boolean indicating whether detections are class agnostic. class_agnostic: Boolean indicating whether detections are class agnostic.
""" """
input_data_fields = fields.InputDataFields() input_data_fields = fields.InputDataFields()
...@@ -86,6 +88,9 @@ def _get_groundtruth_data(detection_model, class_agnostic): ...@@ -86,6 +88,9 @@ def _get_groundtruth_data(detection_model, class_agnostic):
if detection_model.groundtruth_has_field(fields.BoxListFields.masks): if detection_model.groundtruth_has_field(fields.BoxListFields.masks):
groundtruth[input_data_fields.groundtruth_instance_masks] = ( groundtruth[input_data_fields.groundtruth_instance_masks] = (
detection_model.groundtruth_lists(fields.BoxListFields.masks)[0]) detection_model.groundtruth_lists(fields.BoxListFields.masks)[0])
if detection_model.groundtruth_has_field(fields.BoxListFields.is_crowd):
groundtruth[input_data_fields.groundtruth_is_crowd] = (
detection_model.groundtruth_lists(fields.BoxListFields.is_crowd)[0])
return groundtruth return groundtruth
...@@ -224,13 +229,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -224,13 +229,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
gt_keypoints_list = None gt_keypoints_list = None
if fields.InputDataFields.groundtruth_keypoints in labels: if fields.InputDataFields.groundtruth_keypoints in labels:
gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
if fields.InputDataFields.groundtruth_is_crowd in labels:
gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
detection_model.provide_groundtruth( detection_model.provide_groundtruth(
groundtruth_boxes_list=gt_boxes_list, groundtruth_boxes_list=gt_boxes_list,
groundtruth_classes_list=gt_classes_list, groundtruth_classes_list=gt_classes_list,
groundtruth_masks_list=gt_masks_list, groundtruth_masks_list=gt_masks_list,
groundtruth_keypoints_list=gt_keypoints_list, groundtruth_keypoints_list=gt_keypoints_list,
groundtruth_weights_list=labels[ groundtruth_weights_list=labels[
fields.InputDataFields.groundtruth_weights]) fields.InputDataFields.groundtruth_weights],
groundtruth_is_crowd_list=gt_is_crowd_list)
preprocessed_images = features[fields.InputDataFields.image] preprocessed_images = features[fields.InputDataFields.image]
prediction_dict = detection_model.predict( prediction_dict = detection_model.predict(
...@@ -328,7 +336,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -328,7 +336,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
if mode == tf.estimator.ModeKeys.EVAL: if mode == tf.estimator.ModeKeys.EVAL:
class_agnostic = (fields.DetectionResultFields.detection_classes class_agnostic = (fields.DetectionResultFields.detection_classes
not in detections) not in detections)
groundtruth = _get_groundtruth_data(detection_model, class_agnostic) groundtruth = _prepare_groundtruth_for_eval(
detection_model, class_agnostic)
use_original_images = fields.InputDataFields.original_image in features use_original_images = fields.InputDataFields.original_image in features
eval_images = ( eval_images = (
features[fields.InputDataFields.original_image] if use_original_images features[fields.InputDataFields.original_image] if use_original_images
...@@ -339,7 +348,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -339,7 +348,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
detections, detections,
groundtruth, groundtruth,
class_agnostic=class_agnostic, class_agnostic=class_agnostic,
scale_to_absolute=False) scale_to_absolute=True)
if class_agnostic: if class_agnostic:
category_index = label_map_util.create_class_agnostic_category_index() category_index = label_map_util.create_class_agnostic_category_index()
...@@ -360,8 +369,10 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -360,8 +369,10 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
if not eval_metrics: if not eval_metrics:
eval_metrics = ['coco_detection_metrics'] eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict, eval_metrics,
include_metrics_per_category=False) category_index.values(),
eval_dict,
include_metrics_per_category=eval_config.include_metrics_per_category)
for loss_key, loss_tensor in iter(losses_dict.items()): for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars: for var in optimizer_summary_vars:
...@@ -528,6 +539,7 @@ def create_train_and_eval_specs(train_input_fn, ...@@ -528,6 +539,7 @@ def create_train_and_eval_specs(train_input_fn,
train_steps, train_steps,
eval_steps, eval_steps,
eval_on_train_data=False, eval_on_train_data=False,
eval_on_train_steps=None,
final_exporter_name='Servo', final_exporter_name='Servo',
eval_spec_name='eval'): eval_spec_name='eval'):
"""Creates a `TrainSpec` and `EvalSpec`s. """Creates a `TrainSpec` and `EvalSpec`s.
...@@ -542,6 +554,8 @@ def create_train_and_eval_specs(train_input_fn, ...@@ -542,6 +554,8 @@ def create_train_and_eval_specs(train_input_fn,
eval_steps: Number of eval steps. eval_steps: Number of eval steps.
eval_on_train_data: Whether to evaluate model on training data. Default is eval_on_train_data: Whether to evaluate model on training data. Default is
False. False.
eval_on_train_steps: Number of eval steps for training data. If not given,
uses eval_steps.
final_exporter_name: String name given to `FinalExporter`. final_exporter_name: String name given to `FinalExporter`.
eval_spec_name: String name given to main `EvalSpec`. eval_spec_name: String name given to main `EvalSpec`.
...@@ -569,7 +583,7 @@ def create_train_and_eval_specs(train_input_fn, ...@@ -569,7 +583,7 @@ def create_train_and_eval_specs(train_input_fn,
eval_specs.append( eval_specs.append(
tf.estimator.EvalSpec( tf.estimator.EvalSpec(
name='eval_on_train', input_fn=eval_on_train_input_fn, name='eval_on_train', input_fn=eval_on_train_input_fn,
steps=eval_steps)) steps=eval_on_train_steps or eval_steps))
return train_spec, eval_specs return train_spec, eval_specs
......
...@@ -253,6 +253,7 @@ class ModelLibTest(tf.test.TestCase): ...@@ -253,6 +253,7 @@ class ModelLibTest(tf.test.TestCase):
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
train_steps = 20 train_steps = 20
eval_steps = 10 eval_steps = 10
eval_on_train_steps = 15
train_and_eval_dict = model_lib.create_estimator_and_inputs( train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config, run_config,
hparams, hparams,
...@@ -274,6 +275,7 @@ class ModelLibTest(tf.test.TestCase): ...@@ -274,6 +275,7 @@ class ModelLibTest(tf.test.TestCase):
train_steps, train_steps,
eval_steps, eval_steps,
eval_on_train_data=True, eval_on_train_data=True,
eval_on_train_steps=eval_on_train_steps,
final_exporter_name='exporter', final_exporter_name='exporter',
eval_spec_name='holdout') eval_spec_name='holdout')
self.assertEqual(train_steps, train_spec.max_steps) self.assertEqual(train_steps, train_spec.max_steps)
...@@ -281,7 +283,7 @@ class ModelLibTest(tf.test.TestCase): ...@@ -281,7 +283,7 @@ class ModelLibTest(tf.test.TestCase):
self.assertEqual(eval_steps, eval_specs[0].steps) self.assertEqual(eval_steps, eval_specs[0].steps)
self.assertEqual('holdout', eval_specs[0].name) self.assertEqual('holdout', eval_specs[0].name)
self.assertEqual('exporter', eval_specs[0].exporters[0].name) self.assertEqual('exporter', eval_specs[0].exporters[0].name)
self.assertEqual(eval_steps, eval_specs[1].steps) self.assertEqual(eval_on_train_steps, eval_specs[1].steps)
self.assertEqual('eval_on_train', eval_specs[1].name) self.assertEqual('eval_on_train', eval_specs[1].name)
def test_experiment(self): def test_experiment(self):
......
...@@ -185,8 +185,9 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None): ...@@ -185,8 +185,9 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None):
See https://arxiv.org/abs/1612.03144 for details. See https://arxiv.org/abs/1612.03144 for details.
Args: Args:
image_features: list of image feature tensors. Spatial resolutions of image_features: list of tuples of (tensor_name, image_feature_tensor).
successive tensors must reduce exactly by a factor of 2. Spatial resolutions of successive tensors must reduce exactly by a factor
of 2.
depth: depth of output feature maps. depth: depth of output feature maps.
scope: A scope name to wrap this op under. scope: A scope name to wrap this op under.
...@@ -194,32 +195,31 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None): ...@@ -194,32 +195,31 @@ def fpn_top_down_feature_maps(image_features, depth, scope=None):
feature_maps: an OrderedDict mapping keys (feature map names) to feature_maps: an OrderedDict mapping keys (feature map names) to
tensors where each tensor has shape [batch, height_i, width_i, depth_i]. tensors where each tensor has shape [batch, height_i, width_i, depth_i].
""" """
with tf.variable_scope( with tf.name_scope(scope, 'top_down'):
scope, 'top_down', image_features):
num_levels = len(image_features) num_levels = len(image_features)
output_feature_maps_list = [] output_feature_maps_list = []
output_feature_map_keys = [] output_feature_map_keys = []
with slim.arg_scope( with slim.arg_scope(
[slim.conv2d], [slim.conv2d], padding='SAME', stride=1):
activation_fn=None, normalizer_fn=None, padding='SAME', stride=1):
top_down = slim.conv2d( top_down = slim.conv2d(
image_features[-1], image_features[-1][1],
depth, [1, 1], scope='projection_%d' % num_levels) depth, [1, 1], activation_fn=None, normalizer_fn=None,
scope='projection_%d' % num_levels)
output_feature_maps_list.append(top_down) output_feature_maps_list.append(top_down)
output_feature_map_keys.append( output_feature_map_keys.append(
'top_down_feature_map_%d' % (num_levels - 1)) 'top_down_%s' % image_features[-1][0])
for level in reversed(range(num_levels - 1)): for level in reversed(range(num_levels - 1)):
top_down = ops.nearest_neighbor_upsampling(top_down, 2) top_down = ops.nearest_neighbor_upsampling(top_down, 2)
residual = slim.conv2d( residual = slim.conv2d(
image_features[level], depth, [1, 1], image_features[level][1], depth, [1, 1],
activation_fn=None, normalizer_fn=None,
scope='projection_%d' % (level + 1)) scope='projection_%d' % (level + 1))
top_down = 0.5 * top_down + 0.5 * residual top_down += residual
output_feature_maps_list.append(slim.conv2d( output_feature_maps_list.append(slim.conv2d(
top_down, top_down,
depth, [3, 3], depth, [3, 3],
activation_fn=None,
scope='smoothing_%d' % (level + 1))) scope='smoothing_%d' % (level + 1)))
output_feature_map_keys.append('top_down_feature_map_%d' % level) output_feature_map_keys.append('top_down_%s' % image_features[level][0])
return collections.OrderedDict( return collections.OrderedDict(
reversed(zip(output_feature_map_keys, output_feature_maps_list))) reversed(zip(output_feature_map_keys, output_feature_maps_list)))
...@@ -138,19 +138,19 @@ class FPNFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -138,19 +138,19 @@ class FPNFeatureMapGeneratorTest(tf.test.TestCase):
def test_get_expected_feature_map_shapes(self): def test_get_expected_feature_map_shapes(self):
image_features = [ image_features = [
tf.random_uniform([4, 8, 8, 256], dtype=tf.float32), ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
tf.random_uniform([4, 4, 4, 256], dtype=tf.float32), ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
tf.random_uniform([4, 2, 2, 256], dtype=tf.float32), ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
tf.random_uniform([4, 1, 1, 256], dtype=tf.float32), ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
] ]
feature_maps = feature_map_generators.fpn_top_down_feature_maps( feature_maps = feature_map_generators.fpn_top_down_feature_maps(
image_features=image_features, depth=128) image_features=image_features, depth=128)
expected_feature_map_shapes = { expected_feature_map_shapes = {
'top_down_feature_map_0': (4, 8, 8, 128), 'top_down_block2': (4, 8, 8, 128),
'top_down_feature_map_1': (4, 4, 4, 128), 'top_down_block3': (4, 4, 4, 128),
'top_down_feature_map_2': (4, 2, 2, 128), 'top_down_block4': (4, 2, 2, 128),
'top_down_feature_map_3': (4, 1, 1, 128) 'top_down_block5': (4, 1, 1, 128)
} }
init_op = tf.global_variables_initializer() init_op = tf.global_variables_initializer()
......
...@@ -148,9 +148,15 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -148,9 +148,15 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
store_non_strided_activations=True, store_non_strided_activations=True,
scope=scope) scope=scope)
image_features = self._filter_features(image_features) image_features = self._filter_features(image_features)
last_feature_map = image_features['block4']
with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
with slim.arg_scope(self._conv_hyperparams_fn()): with slim.arg_scope(self._conv_hyperparams_fn()):
with tf.variable_scope(self._fpn_scope_name,
reuse=self._reuse_weights):
fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key])
for key in ['block2', 'block3', 'block4']],
depth=256)
last_feature_map = fpn_features['top_down_block4']
coarse_features = {}
for i in range(5, 7): for i in range(5, 7):
last_feature_map = slim.conv2d( last_feature_map = slim.conv2d(
last_feature_map, last_feature_map,
...@@ -158,16 +164,13 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -158,16 +164,13 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
kernel_size=[3, 3], kernel_size=[3, 3],
stride=2, stride=2,
padding='SAME', padding='SAME',
scope='block{}'.format(i)) scope='bottom_up_block{}'.format(i))
image_features['bottomup_{}'.format(i)] = last_feature_map coarse_features['bottom_up_block{}'.format(i)] = last_feature_map
feature_maps = feature_map_generators.fpn_top_down_feature_maps( return [fpn_features['top_down_block2'],
[ fpn_features['top_down_block3'],
image_features[key] for key in fpn_features['top_down_block4'],
['block2', 'block3', 'block4', 'bottomup_5', 'bottomup_6'] coarse_features['bottom_up_block5'],
], coarse_features['bottom_up_block6']]
depth=256,
scope='top_down_features')
return feature_maps.values()
class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
......
...@@ -72,4 +72,7 @@ message EvalConfig { ...@@ -72,4 +72,7 @@ message EvalConfig {
// Whether to retain original images (i.e. not pre-processed) in the tensor // Whether to retain original images (i.e. not pre-processed) in the tensor
// dictionary, so that they can be displayed in Tensorboard. // dictionary, so that they can be displayed in Tensorboard.
optional bool retain_original_images = 23 [default=true]; optional bool retain_original_images = 23 [default=true];
// If True, additionally include per-category metrics.
optional bool include_metrics_per_category = 24 [default=false];
} }
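For illustration, a sketch of setting the new flag from Python via text proto parsing (the eval_pb2 module name follows the standard protoc convention and is assumed here):

from google.protobuf import text_format
from object_detection.protos import eval_pb2

# Request per-category COCO metrics in addition to the aggregate ones.
eval_config = text_format.Parse(
    """
    num_examples: 8000
    include_metrics_per_category: true
    """, eval_pb2.EvalConfig())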
syntax = "proto2";
package object_detection.protos;
// Message to configure graph rewriter for the tf graph.
message GraphRewriter {
optional Quantization quantization = 1;
}
// Message for quantization options. See
// tensorflow/contrib/quantize/python/quantize.py for details.
message Quantization {
// Number of steps to delay before quantization takes effect during training.
optional int32 delay = 1 [default = 500000];
// Number of bits to use for quantizing weights.
// Only 8 bit is supported for now.
optional int32 weight_bits = 2 [default = 8];
// Number of bits to use for quantizing activations.
// Only 8 bit is supported for now.
optional int32 activation_bits = 3 [default = 8];
}
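To show how this message composes, a sketch of parsing a quantization rewriter config from Python (module name per the usual protoc convention; the delay value is illustrative, not a recommendation):

from google.protobuf import text_format
from object_detection.protos import graph_rewriter_pb2

# Equivalent to adding a graph_rewriter { quantization { ... } } block to a
# TrainEvalPipelineConfig (see the pipeline.proto change below).
rewriter_config = text_format.Parse(
    """
    quantization {
      delay: 48000
      weight_bits: 8
      activation_bits: 8
    }
    """, graph_rewriter_pb2.GraphRewriter())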
...@@ -38,6 +38,10 @@ message Hyperparams { ...@@ -38,6 +38,10 @@ message Hyperparams {
// BatchNorm hyperparameters. If this parameter is NOT set then BatchNorm is // BatchNorm hyperparameters. If this parameter is NOT set then BatchNorm is
// not applied! // not applied!
optional BatchNorm batch_norm = 5; optional BatchNorm batch_norm = 5;
// Whether depthwise convolutions should be regularized. If this parameter is
// NOT set then the conv hyperparams will default to the parent scope.
optional bool regularize_depthwise = 6 [default = false];
} }
// Proto with one-of field for regularizers. // Proto with one-of field for regularizers.
......
...@@ -20,6 +20,9 @@ message Loss { ...@@ -20,6 +20,9 @@ message Loss {
// Localization loss weight. // Localization loss weight.
optional float localization_weight = 5 [default=1.0]; optional float localization_weight = 5 [default=1.0];
// If set, applies random example sampling when computing the loss.
optional RandomExampleSampler random_example_sampler = 6;
} }
// Configuration for bounding box localization loss function. // Configuration for bounding box localization loss function.
...@@ -121,7 +124,7 @@ message BootstrappedSigmoidClassificationLoss { ...@@ -121,7 +124,7 @@ message BootstrappedSigmoidClassificationLoss {
optional bool anchorwise_output = 3 [default=false]; optional bool anchorwise_output = 3 [default=false];
} }
// Configuation for hard example miner. // Configuration for hard example miner.
message HardExampleMiner { message HardExampleMiner {
// Maximum number of hard examples to be selected per image (prior to // Maximum number of hard examples to be selected per image (prior to
// enforcing max negative to positive ratio constraint). If set to 0, // enforcing max negative to positive ratio constraint). If set to 0,
...@@ -152,3 +155,10 @@ message HardExampleMiner { ...@@ -152,3 +155,10 @@ message HardExampleMiner {
// detection per image. // detection per image.
optional int32 min_negatives_per_image = 5 [default=0]; optional int32 min_negatives_per_image = 5 [default=0];
} }
// Configuration for random example sampler.
message RandomExampleSampler {
// The desired fraction of positive samples in the batch when applying
// random example sampling.
optional float positive_sample_fraction = 1 [default = 0.01];
}
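A sketch of a loss config that enables random example sampling at the 0.5 fraction used in the unit test above (losses_pb2 module name assumed per protoc convention; the component losses mirror the SSDLite configs in this PR):

from google.protobuf import text_format
from object_detection.protos import losses_pb2

loss_config = text_format.Parse(
    """
    classification_loss { weighted_sigmoid {} }
    localization_loss { weighted_smooth_l1 {} }
    random_example_sampler { positive_sample_fraction: 0.5 }
    """, losses_pb2.Loss())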
...@@ -5,4 +5,6 @@ package object_detection.protos; ...@@ -5,4 +5,6 @@ package object_detection.protos;
// Configuration proto for MeanStddevBoxCoder. See // Configuration proto for MeanStddevBoxCoder. See
// box_coders/mean_stddev_box_coder.py for details. // box_coders/mean_stddev_box_coder.py for details.
message MeanStddevBoxCoder { message MeanStddevBoxCoder {
// The standard deviation used to encode and decode boxes.
optional float stddev = 1 [default=0.01];
} }
...@@ -3,6 +3,7 @@ syntax = "proto2"; ...@@ -3,6 +3,7 @@ syntax = "proto2";
package object_detection.protos; package object_detection.protos;
import "object_detection/protos/eval.proto"; import "object_detection/protos/eval.proto";
import "object_detection/protos/graph_rewriter.proto";
import "object_detection/protos/input_reader.proto"; import "object_detection/protos/input_reader.proto";
import "object_detection/protos/model.proto"; import "object_detection/protos/model.proto";
import "object_detection/protos/train.proto"; import "object_detection/protos/train.proto";
...@@ -15,5 +16,6 @@ message TrainEvalPipelineConfig { ...@@ -15,5 +16,6 @@ message TrainEvalPipelineConfig {
optional InputReader train_input_reader = 3; optional InputReader train_input_reader = 3;
optional EvalConfig eval_config = 4; optional EvalConfig eval_config = 4;
optional InputReader eval_input_reader = 5; optional InputReader eval_input_reader = 5;
optional GraphRewriter graph_rewriter = 6;
extensions 1000 to max; extensions 1000 to max;
} }
...@@ -53,8 +53,7 @@ model { ...@@ -53,8 +53,7 @@ model {
num_layers_before_predictor: 0 num_layers_before_predictor: 0
use_dropout: false use_dropout: false
dropout_keep_probability: 0.8 dropout_keep_probability: 0.8
kernel_size: 3 kernel_size: 1
use_depthwise: true
box_code_size: 4 box_code_size: 4
apply_sigmoid_to_scores: false apply_sigmoid_to_scores: false
conv_hyperparams { conv_hyperparams {
...@@ -84,7 +83,6 @@ model { ...@@ -84,7 +83,6 @@ model {
type: 'ssd_mobilenet_v2' type: 'ssd_mobilenet_v2'
min_depth: 16 min_depth: 16
depth_multiplier: 1.0 depth_multiplier: 1.0
use_depthwise: true
conv_hyperparams { conv_hyperparams {
activation: RELU_6, activation: RELU_6,
regularizer { regularizer {
......
# SSDLite with Mobilenet v1 configuration for MSCOCO Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 90
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 3
use_depthwise: true
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'ssd_mobilenet_v1'
min_depth: 16
depth_multiplier: 1.0
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 0
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 24
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
from_detection_checkpoint: true
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient to train on the COCO dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_train.record"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
}
eval_config: {
num_examples: 8000
# Note: The below line limits the evaluation process to 10 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 10
}
eval_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_val.record"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
shuffle: false
num_readers: 1
}
# SSDLite with Mobilenet v2 configuration for MSCOCO Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 90
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 3
use_depthwise: true
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'ssd_mobilenet_v2'
min_depth: 16
depth_multiplier: 1.0
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 3
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 24
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
fine_tune_checkpoint_type: "detection"
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient to train on the COCO dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_train.record"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
}
eval_config: {
num_examples: 8000
# Note: The below line limits the evaluation process to 10 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 10
}
eval_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_val.record"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
shuffle: false
num_readers: 1
}
\ No newline at end of file
...@@ -48,6 +48,7 @@ import tensorflow as tf ...@@ -48,6 +48,7 @@ import tensorflow as tf
from object_detection import trainer from object_detection import trainer
from object_detection.builders import dataset_builder from object_detection.builders import dataset_builder
from object_detection.builders import graph_rewriter_builder
from object_detection.builders import model_builder from object_detection.builders import model_builder
from object_detection.utils import config_util from object_detection.utils import config_util
from object_detection.utils import dataset_util from object_detection.utils import dataset_util
...@@ -158,9 +159,25 @@ def main(_): ...@@ -158,9 +159,25 @@ def main(_):
is_chief = (task_info.type == 'master') is_chief = (task_info.type == 'master')
master = server.target master = server.target
trainer.train(create_input_dict_fn, model_fn, train_config, master, task, graph_rewriter_fn = None
FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, if 'graph_rewriter_config' in configs:
worker_job_name, is_chief, FLAGS.train_dir) graph_rewriter_fn = graph_rewriter_builder.build(
configs['graph_rewriter_config'], is_training=True)
trainer.train(
create_input_dict_fn,
model_fn,
train_config,
master,
task,
FLAGS.num_clones,
worker_replicas,
FLAGS.clone_on_cpu,
ps_tasks,
worker_job_name,
is_chief,
FLAGS.train_dir,
graph_hook_fn=graph_rewriter_fn)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -231,10 +231,10 @@ def train(create_tensor_dict_fn, ...@@ -231,10 +231,10 @@ def train(create_tensor_dict_fn,
worker_job_name: Name of the worker job. worker_job_name: Name of the worker job.
is_chief: Whether this replica is the chief replica. is_chief: Whether this replica is the chief replica.
train_dir: Directory to write checkpoints and training summaries to. train_dir: Directory to write checkpoints and training summaries to.
graph_hook_fn: Optional function that is called after the training graph is graph_hook_fn: Optional function that is called after the inference graph is
completely built. This is helpful to perform additional changes to the built (before optimization). This is helpful to perform additional changes
training graph such as optimizing batchnorm. The function should modify to the training graph such as adding FakeQuant ops. The function should
the default graph. modify the default graph.
""" """
detection_model = create_model_fn() detection_model = create_model_fn()
...@@ -275,6 +275,10 @@ def train(create_tensor_dict_fn, ...@@ -275,6 +275,10 @@ def train(create_tensor_dict_fn,
clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue])
first_clone_scope = clones[0].scope first_clone_scope = clones[0].scope
if graph_hook_fn:
with tf.device(deploy_config.variables_device()):
graph_hook_fn()
# Gather update_ops from the first clone. These contain, for example, # Gather update_ops from the first clone. These contain, for example,
# the updates for the batch_norm variables created by model_fn. # the updates for the batch_norm variables created by model_fn.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)
...@@ -328,10 +332,6 @@ def train(create_tensor_dict_fn, ...@@ -328,10 +332,6 @@ def train(create_tensor_dict_fn,
with tf.control_dependencies([update_op]): with tf.control_dependencies([update_op]):
train_tensor = tf.identity(total_loss, name='train_op') train_tensor = tf.identity(total_loss, name='train_op')
if graph_hook_fn:
with tf.device(deploy_config.variables_device()):
graph_hook_fn()
# Add summaries. # Add summaries.
for model_var in slim.get_model_variables(): for model_var in slim.get_model_variables():
global_summaries.add(tf.summary.histogram('ModelVars/' + global_summaries.add(tf.summary.histogram('ModelVars/' +
......