Unverified Commit abd50423 authored by pkulzc, committed by GitHub

Merge pull request #3846 from pkulzc/master

Internal changes for object detection
parents c3b26603 143464d2
......@@ -29,6 +29,7 @@ https://scholar.googleusercontent.com/scholar.bib?q=info:l291WsrB-hQJ:scholar.go
* Jonathan Huang, github: [jch1](https://github.com/jch1)
* Vivek Rathod, github: [tombstone](https://github.com/tombstone)
* Ronny Votel, github: [ronnyvotel](https://github.com/ronnyvotel)
* Derek Chow, github: [derekjchow](https://github.com/derekjchow)
* Chen Sun, github: [jesu9](https://github.com/jesu9)
* Menglong Zhu, github: [dreamdragon](https://github.com/dreamdragon)
......@@ -89,6 +90,16 @@ reporting an issue.
## Release information
### April 2, 2018
Supercharge your mobile phones with the next generation mobile object detector!
We are adding support for MobileNet V2 with SSDLite presented in
[MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381).
This model is 35% faster than Mobilenet V1 SSD on a Google Pixel phone CPU (200ms vs. 270ms) at the same accuracy.
Along with the model definition, we are also releasing a model checkpoint trained on the COCO dataset.
<b>Thanks to contributors</b>: Menglong Zhu, Mark Sandler, Zhichao Lu, Vivek Rathod, Jonathan Huang
### February 9, 2018
We now support instance segmentation!! In this API update we support a number of instance segmentation models similar to those discussed in the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870). For further details refer to
......
......@@ -30,6 +30,7 @@ from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_pnas_feature_extractor as frcnn_pnas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
......@@ -55,6 +56,8 @@ SSD_FEATURE_EXTRACTOR_CLASS_MAP = {
FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
'faster_rcnn_nas':
frcnn_nas.FasterRCNNNASFeatureExtractor,
'faster_rcnn_pnas':
frcnn_pnas.FasterRCNNPNASFeatureExtractor,
'faster_rcnn_inception_resnet_v2':
frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor,
'faster_rcnn_inception_v2':
......@@ -95,13 +98,19 @@ def build(model_config, is_training, add_summaries=True):
def _build_ssd_feature_extractor(feature_extractor_config, is_training,
reuse_weights=None):
reuse_weights=None,
inplace_batchnorm_update=False):
"""Builds a ssd_meta_arch.SSDFeatureExtractor based on config.
Args:
feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto.
is_training: True if this feature extractor is being built for training.
reuse_weights: if the feature extractor should reuse weights.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs. When
this is false, the user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for the train/loss op in order to update the batch
norm moving average parameters.
Returns:
ssd_meta_arch.SSDFeatureExtractor based on config.
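# A minimal, self-contained sketch (illustration only, not part of this change)
# of the tf.GraphKeys.UPDATE_OPS pattern referenced in the docstring above:
# when inplace_batchnorm_update is False, the train op must take a control
# dependency on the batch norm update ops, otherwise the moving mean and
# variance are never refreshed. All names and shapes here are assumed.
import tensorflow as tf

images = tf.random_uniform([8, 32, 32, 3])
labels = tf.one_hot(tf.random_uniform([8], maxval=10, dtype=tf.int32), 10)
net = tf.layers.conv2d(images, 16, 3, padding='same')
net = tf.layers.batch_normalization(net, training=True)  # registers UPDATE_OPS
logits = tf.layers.dense(tf.reduce_mean(net, axis=[1, 2]), 10)
loss = tf.losses.softmax_cross_entropy(labels, logits)

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
  train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)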
......@@ -126,7 +135,8 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
return feature_extractor_class(is_training, depth_multiplier, min_depth,
pad_to_multiple, conv_hyperparams,
batch_norm_trainable, reuse_weights,
use_explicit_padding, use_depthwise)
use_explicit_padding, use_depthwise,
inplace_batchnorm_update)
def _build_ssd_model(ssd_config, is_training, add_summaries):
......@@ -140,6 +150,7 @@ def _build_ssd_model(ssd_config, is_training, add_summaries):
Returns:
SSDMetaArch based on the config.
Raises:
ValueError: If ssd_config.type is not recognized (i.e. not registered in
model_class_map).
......@@ -147,8 +158,10 @@ def _build_ssd_model(ssd_config, is_training, add_summaries):
num_classes = ssd_config.num_classes
# Feature extractor
feature_extractor = _build_ssd_feature_extractor(ssd_config.feature_extractor,
is_training)
feature_extractor = _build_ssd_feature_extractor(
feature_extractor_config=ssd_config.feature_extractor,
is_training=is_training,
inplace_batchnorm_update=ssd_config.inplace_batchnorm_update)
box_coder = box_coder_builder.build(ssd_config.box_coder)
matcher = matcher_builder.build(ssd_config.matcher)
......@@ -194,7 +207,8 @@ def _build_ssd_model(ssd_config, is_training, add_summaries):
def _build_faster_rcnn_feature_extractor(
feature_extractor_config, is_training, reuse_weights=None):
feature_extractor_config, is_training, reuse_weights=None,
inplace_batchnorm_update=False):
"""Builds a faster_rcnn_meta_arch.FasterRCNNFeatureExtractor based on config.
Args:
......@@ -202,6 +216,11 @@ def _build_faster_rcnn_feature_extractor(
faster_rcnn.proto.
is_training: True if this feature extractor is being built for training.
reuse_weights: if the feature extractor should reuse weights.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs. When
this is false, the user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for the train/loss op in order to update the batch
norm moving average parameters.
Returns:
faster_rcnn_meta_arch.FasterRCNNFeatureExtractor based on config.
......@@ -209,6 +228,8 @@ def _build_faster_rcnn_feature_extractor(
Raises:
ValueError: On invalid feature extractor type.
"""
if inplace_batchnorm_update:
raise ValueError('inplace batchnorm updates not supported.')
feature_type = feature_extractor_config.type
first_stage_features_stride = (
feature_extractor_config.first_stage_features_stride)
......@@ -238,6 +259,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
Returns:
FasterRCNNMetaArch based on the config.
Raises:
ValueError: If frcnn_config.type is not recognized (i.e. not registered in
model_class_map).
......@@ -246,7 +268,8 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
image_resizer_fn = image_resizer_builder.build(frcnn_config.image_resizer)
feature_extractor = _build_faster_rcnn_feature_extractor(
frcnn_config.feature_extractor, is_training)
frcnn_config.feature_extractor, is_training,
inplace_batchnorm_update=frcnn_config.inplace_batchnorm_update)
number_of_stages = frcnn_config.number_of_stages
first_stage_anchor_generator = anchor_generator_builder.build(
......
......@@ -25,6 +25,7 @@ from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_pnas_feature_extractor as frcnn_pnas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
......@@ -297,6 +298,7 @@ class ModelBuilderTest(tf.test.TestCase):
def test_create_ssd_mobilenet_v1_model_from_config(self):
model_text_proto = """
ssd {
inplace_batchnorm_update: true
feature_extractor {
type: 'ssd_mobilenet_v1'
conv_hyperparams {
......@@ -519,6 +521,7 @@ class ModelBuilderTest(tf.test.TestCase):
def test_create_faster_rcnn_resnet_v1_models_from_config(self):
model_text_proto = """
faster_rcnn {
inplace_batchnorm_update: true
num_classes: 3
image_resizer {
keep_aspect_ratio_resizer {
......@@ -726,6 +729,73 @@ class ModelBuilderTest(tf.test.TestCase):
model._feature_extractor,
frcnn_nas.FasterRCNNNASFeatureExtractor)
def test_create_faster_rcnn_pnas_model_from_config(self):
model_text_proto = """
faster_rcnn {
num_classes: 3
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 600
max_dimension: 1024
}
}
feature_extractor {
type: 'faster_rcnn_pnas'
}
first_stage_anchor_generator {
grid_anchor_generator {
scales: [0.25, 0.5, 1.0, 2.0]
aspect_ratios: [0.5, 1.0, 2.0]
height_stride: 16
width_stride: 16
}
}
first_stage_box_predictor_conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
initial_crop_size: 17
maxpool_kernel_size: 1
maxpool_stride: 1
second_stage_box_predictor {
mask_rcnn_box_predictor {
fc_hyperparams {
op: FC
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
second_stage_post_processing {
batch_non_max_suppression {
score_threshold: 0.01
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 300
}
score_converter: SOFTMAX
}
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch)
self.assertIsInstance(
model._feature_extractor,
frcnn_pnas.FasterRCNNPNASFeatureExtractor)
def test_create_faster_rcnn_inception_resnet_v2_model_from_config(self):
model_text_proto = """
faster_rcnn {
......
......@@ -17,6 +17,7 @@
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from object_detection.core import box_list
from object_detection.core import box_list_ops
......@@ -509,9 +510,13 @@ class BoxListOpsTest(tf.test.TestCase):
with self.assertRaises(ValueError):
box_list_ops.sort_by_field(boxes, 'misc')
with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
'Incorrect field size'):
sess.run(box_list_ops.sort_by_field(boxes, 'weights').get())
if ops._USE_C_API:
with self.assertRaises(ValueError):
box_list_ops.sort_by_field(boxes, 'weights')
else:
with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
'Incorrect field size'):
sess.run(box_list_ops.sort_by_field(boxes, 'weights').get())
def test_visualize_boxes_in_image(self):
image = tf.zeros((6, 4, 3))
......
......@@ -2279,7 +2279,11 @@ def resize_image(image,
return new_masks
def reshape_masks_branch():
new_masks = tf.reshape(masks, [0, new_size[0], new_size[1]])
# The shape function will be computed for both branches of the
# condition, regardless of which branch is actually taken. Make sure
# that we don't trigger an assertion in the shape function when trying
# to reshape a non empty tensor into an empty one.
new_masks = tf.reshape(masks, [-1, new_size[0], new_size[1]])
return new_masks
masks = tf.cond(num_instances > 0, resize_masks_branch,
......
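# A standalone sketch (assumed example, not part of this change) of the tf.cond
# behavior described in the comment above: the static shape function runs for
# both branches even though only one is taken, so the "empty" branch must also
# describe a valid reshape. A leading dimension of -1 works for both empty and
# non-empty mask tensors, whereas a hard-coded 0 does not.
import tensorflow as tf

masks = tf.placeholder(tf.float32, [None, 8, 8])
num_instances = tf.shape(masks)[0]

def non_empty_branch():
  return tf.identity(masks)  # stand-in for the real resize branch

def empty_branch():
  return tf.reshape(masks, [-1, 8, 8])  # only executed when num_instances == 0

new_masks = tf.cond(num_instances > 0, non_empty_branch, empty_branch)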
......@@ -64,7 +64,7 @@ cd ${SCRATCH_DIR}
# Download the images.
BASE_IMAGE_URL="http://images.cocodataset.org/zips"
# TRAIN_IMAGE_FILE="train2017.zip"
TRAIN_IMAGE_FILE="train2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE}
TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2017"
......@@ -91,7 +91,7 @@ download_and_unzip ${BASE_IMAGE_INFO_URL} ${IMAGE_INFO_FILE}
TESTDEV_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/image_info_test-dev2017.json"
# # Build TFRecords of the image data.
# Build TFRecords of the image data.
cd "${CURRENT_DIR}"
python object_detection/dataset_tools/create_coco_tf_record.py \
--logtostderr \
......
......@@ -79,7 +79,7 @@ def visualize_detection_results(result_dict,
data corresponding to each image being evaluated. The following keys
are required:
'original_image': a numpy array representing the image with shape
[1, height, width, 3]
[1, height, width, 3] or [1, height, width, 1]
'detection_boxes': a numpy array of shape [N, 4]
'detection_scores': a numpy array of shape [N]
'detection_classes': a numpy array of shape [N]
......@@ -133,6 +133,8 @@ def visualize_detection_results(result_dict,
category_index = label_map_util.create_category_index(categories)
image = np.squeeze(result_dict[input_fields.original_image], axis=0)
if image.shape[2] == 1: # If one channel image, repeat in RGB.
image = np.tile(image, [1, 1, 3])
detection_boxes = result_dict[detection_fields.detection_boxes]
detection_scores = result_dict[detection_fields.detection_scores]
detection_classes = np.int32((result_dict[
......
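# Tiny illustrative check (assumed values, not part of this change) of the
# single-channel handling above: a [H, W, 1] grayscale image is tiled across
# the channel axis so the visualization code can treat it as RGB.
import numpy as np

gray = np.zeros((4, 6, 1), dtype=np.uint8)
rgb = np.tile(gray, [1, 1, 3])
assert rgb.shape == (4, 6, 3)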
......@@ -94,14 +94,24 @@ def _extract_predictions_and_losses(model,
if fields.InputDataFields.groundtruth_group_of in input_dict:
groundtruth[fields.InputDataFields.groundtruth_group_of] = (
input_dict[fields.InputDataFields.groundtruth_group_of])
groundtruth_masks_list = None
if fields.DetectionResultFields.detection_masks in detections:
groundtruth[fields.InputDataFields.groundtruth_instance_masks] = (
input_dict[fields.InputDataFields.groundtruth_instance_masks])
groundtruth_masks_list = [
input_dict[fields.InputDataFields.groundtruth_instance_masks]]
groundtruth_keypoints_list = None
if fields.DetectionResultFields.detection_keypoints in detections:
groundtruth[fields.InputDataFields.groundtruth_keypoints] = (
input_dict[fields.InputDataFields.groundtruth_keypoints])
groundtruth_keypoints_list = [
input_dict[fields.InputDataFields.groundtruth_keypoints]]
label_id_offset = 1
model.provide_groundtruth(
[input_dict[fields.InputDataFields.groundtruth_boxes]],
[tf.one_hot(input_dict[fields.InputDataFields.groundtruth_classes]
- label_id_offset, depth=model.num_classes)])
- label_id_offset, depth=model.num_classes)],
groundtruth_masks_list, groundtruth_keypoints_list)
losses_dict.update(model.loss(prediction_dict, true_image_shapes))
result_dict = eval_util.result_dict_for_single_example(
......@@ -205,7 +215,7 @@ def evaluate(create_input_dict_fn, create_model_fn, eval_config, categories,
except tf.errors.InvalidArgumentError:
logging.info('Skipping image')
counters['skipped'] += 1
return {}
return {}, {}
global_step = tf.train.global_step(sess, tf.train.get_global_step())
if batch_index < eval_config.num_visualizations:
tag = 'image-{}'.format(batch_index)
......
......@@ -19,7 +19,9 @@ In the table below, we list each such pre-trained model including:
aware that these timings depend highly on one's specific hardware
configuration (these timings were performed using an Nvidia
GeForce GTX TITAN X card) and should be treated more as relative timings in
many cases.
many cases. Also note that desktop GPU timing does not always reflect mobile
run time. For example, MobileNet V2 is faster on mobile devices than
MobileNet V1, but is slightly slower on a desktop GPU.
* detector performance on a subset of the COCO validation set or Open Images test split, as measured by the dataset-specific mAP measure.
Here, higher is better, and we only report bounding box mAP rounded to the
nearest integer.
......@@ -68,6 +70,7 @@ Some remarks on frozen inference graphs:
| Model name | Speed (ms) | COCO mAP[^1] | Outputs |
| ------------ | :--------------: | :--------------: | :-------------: |
| [ssd_mobilenet_v1_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2017_11_17.tar.gz) | 30 | 21 | Boxes |
| [ssd_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz) | 31 | 22 | Boxes |
| [ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2017_11_17.tar.gz) | 42 | 24 | Boxes |
| [faster_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 58 | 28 | Boxes |
| [faster_rcnn_resnet50_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz) | 89 | 30 | Boxes |
......
......@@ -37,7 +37,7 @@ environment variable below:
export YOUR_GCS_BUCKET=${YOUR_GCS_BUCKET}
```
It is also possible to run locally by following
It is also possible to run locally by following
[the running locally instructions](running_locally.md).
## Installing Tensorflow and the Tensorflow Object Detection API
......
......@@ -58,7 +58,8 @@ def transform_input_data(tensor_dict,
Data transformation functions are applied in the following order.
1. data_augmentation_fn (optional): applied on tensor_dict.
2. model_preprocess_fn: applied only on image tensor in tensor_dict.
3. image_resizer_fn: applied only on instance mask tensor in tensor_dict.
3. image_resizer_fn: applied on original image and instance mask tensor in
tensor_dict.
4. one_hot_encoding: applied to classes tensor in tensor_dict.
5. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
same they can be merged into a single box with an associated k-hot class
......@@ -70,10 +71,11 @@ def transform_input_data(tensor_dict,
model_preprocess_fn: model's preprocess function to apply on image tensor.
This function must take in a 4-D float tensor and return a 4-D preprocess
float tensor and a tensor containing the true image shape.
image_resizer_fn: image resizer function to apply on groundtruth instance
masks. This function must take a 4-D float tensor of image and a 4-D
tensor of instances masks and return resized version of these along with
the true shapes.
image_resizer_fn: image resizer function to apply on original image (if
`retain_original_image` is True) and groundtruth instance masks. This
function must take a 3-D float tensor of an image and a 3-D tensor of
instance masks and return a resized version of these along with the true
shapes.
num_classes: number of max classes to one-hot (or k-hot) encode the class
labels.
data_augmentation_fn: (optional) data augmentation function to apply on
......@@ -88,17 +90,19 @@ def transform_input_data(tensor_dict,
after applying all the transformations.
"""
if retain_original_image:
tensor_dict[fields.InputDataFields.
original_image] = tensor_dict[fields.InputDataFields.image]
original_image_resized, _ = image_resizer_fn(
tensor_dict[fields.InputDataFields.image])
tensor_dict[fields.InputDataFields.original_image] = tf.cast(
original_image_resized, tf.uint8)
# Apply data augmentation ops.
if data_augmentation_fn is not None:
tensor_dict = data_augmentation_fn(tensor_dict)
# Apply model preprocessing ops and resize instance masks.
image = tf.expand_dims(
tf.to_float(tensor_dict[fields.InputDataFields.image]), axis=0)
preprocessed_resized_image, true_image_shape = model_preprocess_fn(image)
image = tensor_dict[fields.InputDataFields.image]
preprocessed_resized_image, true_image_shape = model_preprocess_fn(
tf.expand_dims(tf.to_float(image), axis=0))
tensor_dict[fields.InputDataFields.image] = tf.squeeze(
preprocessed_resized_image, axis=0)
tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
......@@ -156,6 +160,52 @@ def augment_input_data(tensor_dict, data_augmentation_options):
return tensor_dict
def _get_labels_dict(input_dict):
"""Extracts labels dict from input dict."""
required_label_keys = [
fields.InputDataFields.num_groundtruth_boxes,
fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_classes,
fields.InputDataFields.groundtruth_weights
]
labels_dict = {}
for key in required_label_keys:
labels_dict[key] = input_dict[key]
optional_label_keys = [
fields.InputDataFields.groundtruth_keypoints,
fields.InputDataFields.groundtruth_instance_masks,
fields.InputDataFields.groundtruth_area,
fields.InputDataFields.groundtruth_is_crowd,
fields.InputDataFields.groundtruth_difficult
]
for key in optional_label_keys:
if key in input_dict:
labels_dict[key] = input_dict[key]
if fields.InputDataFields.groundtruth_difficult in labels_dict:
labels_dict[fields.InputDataFields.groundtruth_difficult] = tf.cast(
labels_dict[fields.InputDataFields.groundtruth_difficult], tf.int32)
return labels_dict
def _get_features_dict(input_dict):
"""Extracts features dict from input dict."""
hash_from_source_id = tf.string_to_hash_bucket_fast(
input_dict[fields.InputDataFields.source_id], HASH_BINS)
features = {
fields.InputDataFields.image:
input_dict[fields.InputDataFields.image],
HASH_KEY: tf.cast(hash_from_source_id, tf.int32),
fields.InputDataFields.true_image_shape:
input_dict[fields.InputDataFields.true_image_shape]
}
if fields.InputDataFields.original_image in input_dict:
features[fields.InputDataFields.original_image] = input_dict[
fields.InputDataFields.original_image]
return features
def create_train_input_fn(train_config, train_input_config,
model_config):
"""Creates a train `input` function for `Estimator`.
......@@ -184,6 +234,8 @@ def create_train_input_fn(train_config, train_input_config,
features[fields.InputDataFields.true_image_shape] is a [batch_size, 3]
int32 tensor representing the true image shapes, as preprocessed
images could be padded.
features[fields.InputDataFields.original_image] (optional) is a
[batch_size, H, W, C] float32 tensor with original images.
labels: Dictionary of groundtruth tensors.
labels[fields.InputDataFields.num_groundtruth_boxes] is a [batch_size]
int32 tensor indicating the number of groundtruth boxes.
......@@ -233,7 +285,8 @@ def create_train_input_fn(train_config, train_input_config,
transform_input_data, model_preprocess_fn=model.preprocess,
image_resizer_fn=image_resizer_fn,
num_classes=config_util.get_number_of_classes(model_config),
data_augmentation_fn=data_augmentation_fn)
data_augmentation_fn=data_augmentation_fn,
retain_original_image=train_config.retain_original_images)
dataset = INPUT_BUILDER_UTIL_MAP['dataset_build'](
train_input_config,
transform_input_data_fn=transform_data_fn,
......@@ -242,35 +295,8 @@ def create_train_input_fn(train_config, train_input_config,
num_classes=config_util.get_number_of_classes(model_config),
spatial_image_shape=config_util.get_spatial_image_size(
image_resizer_config))
tensor_dict = dataset_util.make_initializable_iterator(dataset).get_next()
hash_from_source_id = tf.string_to_hash_bucket_fast(
tensor_dict[fields.InputDataFields.source_id], HASH_BINS)
features = {
fields.InputDataFields.image: tensor_dict[fields.InputDataFields.image],
HASH_KEY: tf.cast(hash_from_source_id, tf.int32),
fields.InputDataFields.true_image_shape: tensor_dict[
fields.InputDataFields.true_image_shape]
}
labels = {
fields.InputDataFields.num_groundtruth_boxes: tensor_dict[
fields.InputDataFields.num_groundtruth_boxes],
fields.InputDataFields.groundtruth_boxes: tensor_dict[
fields.InputDataFields.groundtruth_boxes],
fields.InputDataFields.groundtruth_classes: tensor_dict[
fields.InputDataFields.groundtruth_classes],
fields.InputDataFields.groundtruth_weights: tensor_dict[
fields.InputDataFields.groundtruth_weights]
}
if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
labels[fields.InputDataFields.groundtruth_keypoints] = tensor_dict[
fields.InputDataFields.groundtruth_keypoints]
if fields.InputDataFields.groundtruth_instance_masks in tensor_dict:
labels[fields.InputDataFields.groundtruth_instance_masks] = tensor_dict[
fields.InputDataFields.groundtruth_instance_masks]
return features, labels
input_dict = dataset_util.make_initializable_iterator(dataset).get_next()
return (_get_features_dict(input_dict), _get_labels_dict(input_dict))
return _train_input_fn
......@@ -345,7 +371,7 @@ def create_eval_input_fn(eval_config, eval_input_config, model_config):
image_resizer_fn=image_resizer_fn,
num_classes=num_classes,
data_augmentation_fn=None,
retain_original_image=True)
retain_original_image=eval_config.retain_original_images)
dataset = INPUT_BUILDER_UTIL_MAP['dataset_build'](
eval_input_config,
transform_input_data_fn=transform_data_fn,
......@@ -355,36 +381,7 @@ def create_eval_input_fn(eval_config, eval_input_config, model_config):
image_resizer_config))
input_dict = dataset_util.make_initializable_iterator(dataset).get_next()
hash_from_source_id = tf.string_to_hash_bucket_fast(
input_dict[fields.InputDataFields.source_id], HASH_BINS)
features = {
fields.InputDataFields.image:
input_dict[fields.InputDataFields.image],
fields.InputDataFields.original_image:
input_dict[fields.InputDataFields.original_image],
HASH_KEY: tf.cast(hash_from_source_id, tf.int32),
fields.InputDataFields.true_image_shape:
input_dict[fields.InputDataFields.true_image_shape]
}
labels = {
fields.InputDataFields.groundtruth_boxes:
input_dict[fields.InputDataFields.groundtruth_boxes],
fields.InputDataFields.groundtruth_classes:
input_dict[fields.InputDataFields.groundtruth_classes],
fields.InputDataFields.groundtruth_area:
input_dict[fields.InputDataFields.groundtruth_area],
fields.InputDataFields.groundtruth_is_crowd:
input_dict[fields.InputDataFields.groundtruth_is_crowd],
fields.InputDataFields.groundtruth_difficult:
tf.cast(input_dict[fields.InputDataFields.groundtruth_difficult],
tf.int32)
}
if fields.InputDataFields.groundtruth_instance_masks in input_dict:
labels[fields.InputDataFields.groundtruth_instance_masks] = input_dict[
fields.InputDataFields.groundtruth_instance_masks]
return features, labels
return (_get_features_dict(input_dict), _get_labels_dict(input_dict))
return _eval_input_fn
......
......@@ -34,16 +34,12 @@ FLAGS = tf.flags.FLAGS
def _get_configs_for_model(model_name):
"""Returns configurations for model."""
fname = os.path.join(
FLAGS.test_srcdir,
('google3/third_party/tensorflow_models/'
'object_detection/samples/configs/' + model_name + '.config'))
label_map_path = os.path.join(FLAGS.test_srcdir,
('google3/third_party/tensorflow_models/'
'object_detection/data/pet_label_map.pbtxt'))
data_path = os.path.join(FLAGS.test_srcdir,
('google3/third_party/tensorflow_models/'
'object_detection/test_data/pets_examples.record'))
fname = os.path.join(tf.resource_loader.get_data_files_path(),
'samples/configs/' + model_name + '.config')
label_map_path = os.path.join(tf.resource_loader.get_data_files_path(),
'data/pet_label_map.pbtxt')
data_path = os.path.join(tf.resource_loader.get_data_files_path(),
'test_data/pets_examples.record')
configs = config_util.get_configs_from_pipeline_file(fname)
return config_util.merge_external_params_with_configs(
configs,
......@@ -462,22 +458,31 @@ class DataTransformationFnTest(tf.test.TestCase):
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([3, 1], np.int32))
}
def fake_image_resizer_fn(image, masks):
def fake_image_resizer_fn(image, masks=None):
resized_image = tf.image.resize_images(image, [8, 8])
resized_masks = tf.transpose(
tf.image.resize_images(tf.transpose(masks, [1, 2, 0]), [8, 8]),
[2, 0, 1])
return resized_image, resized_masks, tf.shape(resized_image)
results = [resized_image]
if masks is not None:
resized_masks = tf.transpose(
tf.image.resize_images(tf.transpose(masks, [1, 2, 0]), [8, 8]),
[2, 0, 1])
results.append(resized_masks)
results.append(tf.shape(resized_image))
return results
num_classes = 3
input_transformation_fn = functools.partial(
inputs.transform_input_data,
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=fake_image_resizer_fn,
num_classes=num_classes)
num_classes=num_classes,
retain_original_image=True)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.original_image].dtype, tf.uint8)
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.original_image].shape, [8, 8, 3])
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.groundtruth_instance_masks].shape, [2, 8, 8])
......
......@@ -46,7 +46,8 @@ class SSDFeatureExtractor(object):
batch_norm_trainable=True,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False):
use_depthwise=False,
inplace_batchnorm_update=False):
"""Constructor.
Args:
......@@ -64,6 +65,10 @@ class SSDFeatureExtractor(object):
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
use_depthwise: Whether to use depthwise convolutions. Default is False.
inplace_batchnorm_update: Whether to update batch norm moving average
values inplace. When this is false, the train op must add a control
dependency on the tf.GraphKeys.UPDATE_OPS collection in order to update
the batch norm statistics.
"""
self._is_training = is_training
self._depth_multiplier = depth_multiplier
......@@ -71,6 +76,7 @@ class SSDFeatureExtractor(object):
self._pad_to_multiple = pad_to_multiple
self._conv_hyperparams = conv_hyperparams
self._batch_norm_trainable = batch_norm_trainable
self._inplace_batchnorm_update = inplace_batchnorm_update
self._reuse_weights = reuse_weights
self._use_explicit_padding = use_explicit_padding
self._use_depthwise = use_depthwise
......@@ -108,7 +114,29 @@ class SSDFeatureExtractor(object):
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
pass
batchnorm_updates_collections = (None if self._inplace_batchnorm_update
else tf.GraphKeys.UPDATE_OPS)
with slim.arg_scope([slim.batch_norm],
updates_collections=batchnorm_updates_collections):
return self._extract_features(preprocessed_inputs)
@abstractmethod
def _extract_features(self, preprocessed_inputs):
"""Extracts features from preprocessed inputs.
This function is responsible for extracting feature maps from preprocessed
images.
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
raise NotImplementedError
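# Hypothetical minimal subclass (illustration only, not part of this change)
# showing the new template-method split: concrete extractors now override
# _extract_features, while the base class's extract_features wraps the call in
# the batch norm updates_collections arg_scope above.
class _ToySSDFeatureExtractor(SSDFeatureExtractor):

  def preprocess(self, resized_inputs):
    return (2.0 / 255.0) * resized_inputs - 1.0

  def _extract_features(self, preprocessed_inputs):
    # A single conv feature map is enough to satisfy the interface.
    return [slim.conv2d(preprocessed_inputs, 32, [3, 3])]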
class SSDMetaArch(model.DetectionModel):
......
......@@ -49,8 +49,8 @@ tf.flags.DEFINE_string('model_dir', None, 'Path to output model directory '
'where event and checkpoint files will be written.')
tf.flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
'file.')
tf.flags.DEFINE_integer('num_train_steps', 500000, 'Number of train steps.')
tf.flags.DEFINE_integer('num_eval_steps', 10000, 'Number of train steps.')
tf.flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
tf.flags.DEFINE_integer('num_eval_steps', None, 'Number of eval steps.')
FLAGS = tf.flags.FLAGS
......@@ -225,7 +225,14 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
labels,
unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
elif mode == tf.estimator.ModeKeys.EVAL:
labels = unstack_batch(labels, unpad_groundtruth_tensors=False)
# When evaluating on training data, it is necessary to check whether the
# groundtruth must be unpadded.
boxes_shape = (
labels[fields.InputDataFields.groundtruth_boxes].get_shape()
.as_list())
unpad_groundtruth_tensors = True if boxes_shape[1] is not None else False
labels = unstack_batch(
labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
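# A small sketch (assumed shapes, not part of this change) of the static shape
# check above: padded groundtruth from the train input pipeline has a known
# second dimension, while eval-style input leaves it dynamic (None).
import tensorflow as tf

padded_boxes = tf.placeholder(tf.float32, [2, 100, 4])    # padded  -> unpad
dynamic_boxes = tf.placeholder(tf.float32, [2, None, 4])   # already unpadded
print(padded_boxes.get_shape().as_list()[1] is not None)   # True
print(dynamic_boxes.get_shape().as_list()[1] is not None)  # False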
......@@ -241,7 +248,9 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
groundtruth_boxes_list=gt_boxes_list,
groundtruth_classes_list=gt_classes_list,
groundtruth_masks_list=gt_masks_list,
groundtruth_keypoints_list=gt_keypoints_list)
groundtruth_keypoints_list=gt_keypoints_list,
groundtruth_weights_list=labels[
fields.InputDataFields.groundtruth_weights])
preprocessed_images = features[fields.InputDataFields.image]
prediction_dict = detection_model.predict(
......@@ -250,14 +259,6 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
prediction_dict, features[fields.InputDataFields.true_image_shape])
if mode == tf.estimator.ModeKeys.TRAIN:
if not train_config.fine_tune_checkpoint_type:
# train_config.from_detection_checkpoint field is deprecated. For
# backward compatibility, sets finetune_checkpoint_type based on
# from_detection_checkpoint.
if train_config.from_detection_checkpoint:
train_config.fine_tune_checkpoint_type = 'detection'
else:
train_config.fine_tune_checkpoint_type = 'classification'
if train_config.fine_tune_checkpoint and hparams.load_pretrained:
if not train_config.fine_tune_checkpoint_type:
# train_config.from_detection_checkpoint field is deprecated. For
......@@ -341,17 +342,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
}
eval_metric_ops = None
if mode == tf.estimator.ModeKeys.EVAL:
# Detection summaries during eval.
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
class_agnostic = (fields.DetectionResultFields.detection_classes
not in detections)
groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
use_original_images = fields.InputDataFields.original_image in features
eval_images = (
original_images = (
features[fields.InputDataFields.original_image] if use_original_images
else features[fields.InputDataFields.image])
eval_dict = eval_util.result_dict_for_single_example(
eval_images[0:1],
original_images[0:1],
features[inputs.HASH_KEY][0],
detections,
groundtruth,
......@@ -363,21 +363,26 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
else:
category_index = label_map_util.create_category_index_from_labelmap(
eval_input_config.label_map_path)
img_summary = None
if not use_tpu and use_original_images:
detection_and_groundtruth = (
vis_utils.draw_side_by_side_evaluation_image(
eval_dict, category_index, max_boxes_to_draw=20,
min_score_thresh=0.2))
tf.summary.image('Detections_Left_Groundtruth_Right',
detection_and_groundtruth)
# Eval metrics on a single image.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict,
include_metrics_per_category=False)
img_summary = tf.summary.image('Detections_Left_Groundtruth_Right',
detection_and_groundtruth)
if mode == tf.estimator.ModeKeys.EVAL:
# Eval metrics on a single example.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict,
include_metrics_per_category=False)
if img_summary is not None:
eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
img_summary, tf.no_op())
if use_tpu:
return tf.contrib.tpu.TPUEstimatorSpec(
......
......@@ -32,20 +32,19 @@ from object_detection.builders import model_builder
from object_detection.core import standard_fields as fields
from object_detection.utils import config_util
FLAGS = tf.flags.FLAGS
MODEL_NAME_FOR_TEST = model_test_util.SSD_INCEPTION_MODEL_NAME
def _get_data_path():
"""Returns an absolute path to TFRecord file."""
return os.path.join(FLAGS.test_srcdir, model_test_util.PATH_BASE, 'test_data',
return os.path.join(tf.resource_loader.get_data_files_path(), 'test_data',
'pets_examples.record')
def _get_labelmap_path():
"""Returns an absolute path to label map file."""
return os.path.join(FLAGS.test_srcdir, model_test_util.PATH_BASE, 'data',
return os.path.join(tf.resource_loader.get_data_files_path(), 'data',
'pet_label_map.pbtxt')
......
......@@ -28,13 +28,12 @@ FLAGS = tf.flags.FLAGS
FASTER_RCNN_MODEL_NAME = 'faster_rcnn_resnet50_pets'
SSD_INCEPTION_MODEL_NAME = 'ssd_inception_v2_pets'
PATH_BASE = 'google3/third_party/tensorflow_models/object_detection/'
def GetPipelineConfigPath(model_name):
"""Returns path to the local pipeline config file."""
return os.path.join(FLAGS.test_srcdir, PATH_BASE, 'samples', 'configs',
model_name + '.config')
return os.path.join(tf.resource_loader.get_data_files_path(), 'samples',
'configs', model_name + '.config')
def InitializeFlags(model_name_for_test):
......
......@@ -53,7 +53,8 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
batch_norm_trainable=True,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False):
use_depthwise=False,
inplace_batchnorm_update=False):
"""MobileNetV1 Feature Extractor for Embedded-friendly SSD Models.
Args:
......@@ -71,6 +72,11 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
use_depthwise: Whether to use depthwise convolutions. Default is False.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs.
When this is false, the user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for the train/loss op in order to update the
batch norm moving average parameters.
Raises:
ValueError: upon invalid `pad_to_multiple` values.
......@@ -82,9 +88,9 @@ class EmbeddedSSDMobileNetV1FeatureExtractor(
super(EmbeddedSSDMobileNetV1FeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable, reuse_weights,
use_explicit_padding, use_depthwise)
use_explicit_padding, use_depthwise, inplace_batchnorm_update)
def extract_features(self, preprocessed_inputs):
def _extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
......
......@@ -22,30 +22,6 @@ from nets import mobilenet_v1
slim = tf.contrib.slim
def _batch_norm_arg_scope(list_ops,
use_batch_norm=True,
batch_norm_decay=0.9997,
batch_norm_epsilon=0.001,
batch_norm_scale=False,
train_batch_norm=False):
"""Slim arg scope for Mobilenet V1 batch norm."""
if use_batch_norm:
batch_norm_params = {
'is_training': train_batch_norm,
'scale': batch_norm_scale,
'decay': batch_norm_decay,
'epsilon': batch_norm_epsilon
}
normalizer_fn = slim.batch_norm
else:
normalizer_fn = None
batch_norm_params = None
return slim.arg_scope(list_ops,
normalizer_fn=normalizer_fn,
normalizer_params=batch_norm_params)
class FasterRCNNMobilenetV1FeatureExtractor(
faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
"""Faster R-CNN Mobilenet V1 feature extractor implementation."""
......@@ -121,18 +97,19 @@ class FasterRCNNMobilenetV1FeatureExtractor(
['image size must at least be 33 in both height and width.'])
with tf.control_dependencies([shape_assert]):
with tf.variable_scope('MobilenetV1',
reuse=self._reuse_weights) as scope:
with _batch_norm_arg_scope([slim.conv2d, slim.separable_conv2d],
batch_norm_scale=True,
train_batch_norm=self._train_batch_norm):
with slim.arg_scope(
mobilenet_v1.mobilenet_v1_arg_scope(
is_training=self._train_batch_norm,
weight_decay=self._weight_decay)):
with tf.variable_scope('MobilenetV1',
reuse=self._reuse_weights) as scope:
_, activations = mobilenet_v1.mobilenet_v1_base(
preprocessed_inputs,
final_endpoint='Conv2d_13_pointwise',
final_endpoint='Conv2d_11_pointwise',
min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier,
scope=scope)
return activations['Conv2d_13_pointwise'], activations
return activations['Conv2d_11_pointwise'], activations
def _extract_box_classifier_features(self, proposal_feature_maps, scope):
"""Extracts second stage box classifier features.
......@@ -152,9 +129,10 @@ class FasterRCNNMobilenetV1FeatureExtractor(
depth = lambda d: max(int(d * 1.0), 16)
with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights):
with _batch_norm_arg_scope([slim.conv2d, slim.separable_conv2d],
batch_norm_scale=True,
train_batch_norm=self._train_batch_norm):
with slim.arg_scope(
mobilenet_v1.mobilenet_v1_arg_scope(
is_training=self._train_batch_norm,
weight_decay=self._weight_decay)):
with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d], padding='SAME'):
net = slim.separable_conv2d(
......
......@@ -44,7 +44,7 @@ class FasterRcnnMobilenetV1FeatureExtractorTest(tf.test.TestCase):
with self.test_session() as sess:
sess.run(init_op)
features_shape_out = sess.run(features_shape)
self.assertAllEqual(features_shape_out, [4, 7, 7, 1024])
self.assertAllEqual(features_shape_out, [4, 14, 14, 512])
def test_extract_proposal_features_stride_eight(self):
feature_extractor = self._build_feature_extractor(
......@@ -59,7 +59,7 @@ class FasterRcnnMobilenetV1FeatureExtractorTest(tf.test.TestCase):
with self.test_session() as sess:
sess.run(init_op)
features_shape_out = sess.run(features_shape)
self.assertAllEqual(features_shape_out, [4, 7, 7, 1024])
self.assertAllEqual(features_shape_out, [4, 14, 14, 512])
def test_extract_proposal_features_half_size_input(self):
feature_extractor = self._build_feature_extractor(
......@@ -74,7 +74,7 @@ class FasterRcnnMobilenetV1FeatureExtractorTest(tf.test.TestCase):
with self.test_session() as sess:
sess.run(init_op)
features_shape_out = sess.run(features_shape)
self.assertAllEqual(features_shape_out, [1, 4, 4, 1024])
self.assertAllEqual(features_shape_out, [1, 7, 7, 512])
def test_extract_proposal_features_dies_on_invalid_stride(self):
with self.assertRaises(ValueError):
......
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PNASNet Faster R-CNN implementation.
Based on PNASNet model: https://arxiv.org/abs/1712.00559
"""
import tensorflow as tf
from object_detection.meta_architectures import faster_rcnn_meta_arch
from nets.nasnet import nasnet_utils
from nets.nasnet import pnasnet
arg_scope = tf.contrib.framework.arg_scope
slim = tf.contrib.slim
def pnasnet_large_arg_scope_for_detection(is_batch_norm_training=False):
"""Defines the default arg scope for the PNASNet Large for object detection.
This provides a small edit to switch batch norm training on and off.
Args:
is_batch_norm_training: Boolean indicating whether to train with batch norm.
Returns:
An `arg_scope` to use for the PNASNet Large Model.
"""
imagenet_scope = pnasnet.pnasnet_large_arg_scope()
with arg_scope(imagenet_scope):
with arg_scope([slim.batch_norm], is_training=is_batch_norm_training) as sc:
return sc
def _filter_scaling(reduction_indices, start_cell_num):
"""Compute the expected filter scaling at given PNASNet cell start_cell_num.
In the pnasnet.py code, filter_scaling starts at 1.0. We instead
adapt filter scaling to depend on the starting cell.
For the first cells, before any reduction cell, filter_scaling is 1.0. After
passing each reduction cell, filter_scaling is multiplied by 2.
Args:
reduction_indices: list of int indices.
start_cell_num: int.
Returns:
filter_scaling: float.
"""
filter_scaling = 1.0
for ind in reduction_indices:
if ind < start_cell_num:
filter_scaling *= 2.0
return filter_scaling
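# Worked example (indices assumed for illustration): with reduction cells at
# [4, 8] and start_cell_num=8, exactly one reduction cell precedes the start
# cell, so _filter_scaling(reduction_indices=[4, 8], start_cell_num=8)
# returns 1.0 * 2.0 = 2.0.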
# Note: This is largely a copy of _build_pnasnet_base inside pnasnet.py but
# with special edits to remove instantiation of the stem and the special
# ability to receive as input a pair of hidden states. It constructs only
# a sub-network from the original PNASNet model, starting from the
# start_cell_num cell and with modified final layer.
def _build_pnasnet_base(
hidden_previous, hidden, normal_cell, hparams, true_cell_num,
start_cell_num):
"""Constructs a PNASNet image model for proposal classifier features."""
# Find where to place the reduction cells or stride normal cells
reduction_indices = nasnet_utils.calc_reduction_layers(
hparams.num_cells, hparams.num_reduction_layers)
filter_scaling = _filter_scaling(reduction_indices, start_cell_num)
# Note: The None is prepended to match the behavior of _imagenet_stem()
cell_outputs = [None, hidden_previous, hidden]
net = hidden
# Run the cells
for cell_num in range(start_cell_num, hparams.num_cells):
is_reduction = cell_num in reduction_indices
stride = 2 if is_reduction else 1
if is_reduction: filter_scaling *= hparams.filter_scaling_rate
prev_layer = cell_outputs[-2]
net = normal_cell(
net,
scope='cell_{}'.format(cell_num),
filter_scaling=filter_scaling,
stride=stride,
prev_layer=prev_layer,
cell_num=true_cell_num)
true_cell_num += 1
cell_outputs.append(net)
# Final nonlinearity.
# Note that we have dropped the final pooling, dropout and softmax layers
# from the default pnasnet version.
with tf.variable_scope('final_layer'):
net = tf.nn.relu(net)
return net
# TODO(shlens): Only fixed_shape_resizer is currently supported for PNASNet
# featurization. The reason for this is that pnasnet.py only supports
# inputs with fully known shapes. We need to update pnasnet.py to handle
# shapes not known at compile time.
class FasterRCNNPNASFeatureExtractor(
faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
"""Faster R-CNN with PNASNet feature extractor implementation."""
def __init__(self,
is_training,
first_stage_features_stride,
batch_norm_trainable=False,
reuse_weights=None,
weight_decay=0.0):
"""Constructor.
Args:
is_training: See base class.
first_stage_features_stride: See base class.
batch_norm_trainable: See base class.
reuse_weights: See base class.
weight_decay: See base class.
Raises:
ValueError: If `first_stage_features_stride` is not 16.
"""
if first_stage_features_stride != 16:
raise ValueError('`first_stage_features_stride` must be 16.')
super(FasterRCNNPNASFeatureExtractor, self).__init__(
is_training, first_stage_features_stride, batch_norm_trainable,
reuse_weights, weight_decay)
def preprocess(self, resized_inputs):
"""Faster R-CNN with PNAS preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: A [batch, height_in, width_in, channels] float32 tensor
representing a batch of images with values between 0 and 255.0.
Returns:
preprocessed_inputs: A [batch, height_out, width_out, channels] float32
tensor representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
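# Quick check of the mapping above: pixel value 0 maps to
# (2.0 / 255.0) * 0.0 - 1.0 = -1.0 and 255 maps to
# (2.0 / 255.0) * 255.0 - 1.0 = 1.0, so [0, 255] becomes [-1, 1].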
def _extract_proposal_features(self, preprocessed_inputs, scope):
"""Extracts first stage RPN features.
Extracts features using the first half of the PNASNet network.
We construct the network in `align_feature_maps=True` mode, which means
that all VALID paddings in the network are changed to SAME padding so that
the feature maps are aligned.
Args:
preprocessed_inputs: A [batch, height, width, channels] float32 tensor
representing a batch of images.
scope: A scope name.
Returns:
rpn_feature_map: A tensor with shape [batch, height, width, depth]
end_points: A dictionary mapping feature extractor tensor names to tensors
Raises:
ValueError: If the created network is missing the required activation.
"""
del scope
if len(preprocessed_inputs.get_shape().as_list()) != 4:
raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a '
'tensor of shape %s' % preprocessed_inputs.get_shape())
with slim.arg_scope(pnasnet_large_arg_scope_for_detection(
is_batch_norm_training=self._train_batch_norm)):
with arg_scope([slim.conv2d,
slim.batch_norm,
slim.separable_conv2d],
reuse=self._reuse_weights):
_, end_points = pnasnet.build_pnasnet_large(
preprocessed_inputs, num_classes=None,
is_training=self._is_training,
final_endpoint='Cell_7')
# Note that both 'Cell_6' and 'Cell_7' have equal depth = 2160.
# Cell_7 is the last cell before second reduction.
rpn_feature_map = tf.concat([end_points['Cell_6'],
end_points['Cell_7']], 3)
# pnasnet.py does not maintain the batch size in the first dimension.
# This workaround lets us restore the known batch size for use below.
batch = preprocessed_inputs.get_shape().as_list()[0]
shape_without_batch = rpn_feature_map.get_shape().as_list()[1:]
rpn_feature_map_shape = [batch] + shape_without_batch
rpn_feature_map.set_shape(rpn_feature_map_shape)
return rpn_feature_map, end_points
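# Standalone sketch (shapes assumed for illustration, not part of this change)
# of the set_shape workaround above: reattach a statically known batch size
# that an op has dropped from the inferred shape.
import tensorflow as tf

x = tf.placeholder(tf.float32, [8, 32, 32, 3])
y = tf.reshape(x, tf.stack([tf.shape(x)[0], 32, 32, 3]))
# Static shape is now partially unknown; restore the known batch dimension.
y.set_shape([x.get_shape().as_list()[0]] + y.get_shape().as_list()[1:])
print(y.get_shape().as_list())  # [8, 32, 32, 3]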
def _extract_box_classifier_features(self, proposal_feature_maps, scope):
"""Extracts second stage box classifier features.
This function reconstructs the "second half" of the PNASNet
network after the part defined in `_extract_proposal_features`.
Args:
proposal_feature_maps: A 4-D float tensor with shape
[batch_size * self.max_num_proposals, crop_height, crop_width, depth]
representing the feature map cropped to each proposal.
scope: A scope name.
Returns:
proposal_classifier_features: A 4-D float tensor with shape
[batch_size * self.max_num_proposals, height, width, depth]
representing box classifier features for each proposal.
"""
del scope
# Number of used stem cells.
num_stem_cells = 2
# Note that we always feed into 2 layers of equal depth
# where the first N channels correspond to the previous hidden layer
# and the second N channels correspond to the final hidden layer.
hidden_previous, hidden = tf.split(proposal_feature_maps, 2, axis=3)
# Note that what follows is largely a copy of build_pnasnet_large() within
# pnasnet.py. We are copying to minimize code pollution in slim.
# TODO(shlens,skornblith): Determine the appropriate drop path schedule.
# For now the schedule is the default (1.0->0.7 over 250,000 train steps).
hparams = pnasnet.large_imagenet_config()
if not self._is_training:
hparams.set_hparam('drop_path_keep_prob', 1.0)
# Calculate the total number of cells in the network
total_num_cells = hparams.num_cells + num_stem_cells
normal_cell = pnasnet.PNasNetNormalCell(
hparams.num_conv_filters, hparams.drop_path_keep_prob,
total_num_cells, hparams.total_training_steps)
with arg_scope([slim.dropout, nasnet_utils.drop_path],
is_training=self._is_training):
with arg_scope([slim.batch_norm], is_training=self._train_batch_norm):
with arg_scope([slim.avg_pool2d,
slim.max_pool2d,
slim.conv2d,
slim.batch_norm,
slim.separable_conv2d,
nasnet_utils.factorized_reduction,
nasnet_utils.global_avg_pool,
nasnet_utils.get_channel_index,
nasnet_utils.get_channel_dim],
data_format=hparams.data_format):
# This corresponds to the cell number just past 'Cell_7' used by
# _extract_proposal_features().
start_cell_num = 8
true_cell_num = start_cell_num + num_stem_cells
with slim.arg_scope(pnasnet.pnasnet_large_arg_scope()):
net = _build_pnasnet_base(
hidden_previous,
hidden,
normal_cell=normal_cell,
hparams=hparams,
true_cell_num=true_cell_num,
start_cell_num=start_cell_num)
proposal_classifier_features = net
return proposal_classifier_features
def restore_from_classification_checkpoint_fn(
self,
first_stage_feature_extractor_scope,
second_stage_feature_extractor_scope):
"""Returns a map of variables to load from a foreign checkpoint.
Note that this overrides the default implementation in
faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for
PNASNet checkpoints.
Args:
first_stage_feature_extractor_scope: A scope name for the first stage
feature extractor.
second_stage_feature_extractor_scope: A scope name for the second stage
feature extractor.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
"""
variables_to_restore = {}
for variable in tf.global_variables():
if variable.op.name.startswith(
first_stage_feature_extractor_scope):
var_name = variable.op.name.replace(
first_stage_feature_extractor_scope + '/', '')
var_name += '/ExponentialMovingAverage'
variables_to_restore[var_name] = variable
if variable.op.name.startswith(
second_stage_feature_extractor_scope):
var_name = variable.op.name.replace(
second_stage_feature_extractor_scope + '/', '')
var_name += '/ExponentialMovingAverage'
variables_to_restore[var_name] = variable
return variables_to_restore
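# Worked example (names assumed for illustration) of the mapping above: a model
# variable under the first stage scope is restored from the corresponding
# moving-average entry in the PNASNet classification checkpoint.
scope = 'FirstStageFeatureExtractor'
op_name = scope + '/cell_stem_0/1x1/weights'
ckpt_name = op_name.replace(scope + '/', '') + '/ExponentialMovingAverage'
print(ckpt_name)  # cell_stem_0/1x1/weights/ExponentialMovingAverage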