ModelZoo / ResNet50_tensorflow — Commit 74a03640 (unverified)

Merge pull request #2631 from tombstone/feature_extractors_update

feature extractor and model builder update.

Authored by vivek rathod, committed via GitHub, Oct 29, 2017.
Parents: ff88581a, 3237c080
Changes: 23 · Showing 20 changed files with 1714 additions and 77 deletions (+1714, −77)
research/object_detection/builders/BUILD (+7, −0)
research/object_detection/builders/model_builder.py (+29, −5)
research/object_detection/builders/model_builder_test.py (+287, −2)
research/object_detection/models/BUILD (+96, −1)
research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py (+124, −0)
research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py (+119, −0)
research/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py (+9, −3)
research/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor_test.py (+1, −0)
research/object_detection/models/faster_rcnn_inception_v2_feature_extractor.py (+251, −0)
research/object_detection/models/faster_rcnn_inception_v2_feature_extractor_test.py (+126, −0)
research/object_detection/models/faster_rcnn_nas_feature_extractor.py (+299, −0)
research/object_detection/models/faster_rcnn_nas_feature_extractor_test.py (+109, −0)
research/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py (+19, −6)
research/object_detection/models/faster_rcnn_resnet_v1_feature_extractor_test.py (+1, −0)
research/object_detection/models/feature_map_generators.py (+22, −31)
research/object_detection/models/feature_map_generators_test.py (+38, −1)
research/object_detection/models/ssd_feature_extractor_test.py (+13, −15)
research/object_detection/models/ssd_inception_v2_feature_extractor.py (+14, −2)
research/object_detection/models/ssd_inception_v2_feature_extractor_test.py (+39, −11)
research/object_detection/models/ssd_inception_v3_feature_extractor.py (+111, −0)
research/object_detection/builders/BUILD

@@ -24,9 +24,12 @@ py_library(
        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
        "//tensorflow_models/object_detection/meta_architectures:rfcn_meta_arch",
        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
        "//tensorflow_models/object_detection/models:embedded_ssd_mobilenet_v1_feature_extractor",
        "//tensorflow_models/object_detection/models:faster_rcnn_inception_resnet_v2_feature_extractor",
        "//tensorflow_models/object_detection/models:faster_rcnn_inception_v2_feature_extractor",
        "//tensorflow_models/object_detection/models:faster_rcnn_resnet_v1_feature_extractor",
        "//tensorflow_models/object_detection/models:ssd_inception_v2_feature_extractor",
        "//tensorflow_models/object_detection/models:ssd_inception_v3_feature_extractor",
        "//tensorflow_models/object_detection/models:ssd_mobilenet_v1_feature_extractor",
        "//tensorflow_models/object_detection/protos:model_py_pb2",
    ],

@@ -40,7 +43,11 @@ py_test(
        "//tensorflow",
        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
        "//tensorflow_models/object_detection/models:faster_rcnn_inception_resnet_v2_feature_extractor",
        "//tensorflow_models/object_detection/models:faster_rcnn_inception_v2_feature_extractor",
        "//tensorflow_models/object_detection/models:faster_rcnn_resnet_v1_feature_extractor",
        "//tensorflow_models/object_detection/models:ssd_inception_v2_feature_extractor",
        "//tensorflow_models/object_detection/models:ssd_inception_v3_feature_extractor",
        "//tensorflow_models/object_detection/models:ssd_mobilenet_v1_feature_extractor",
        "//tensorflow_models/object_detection/protos:model_py_pb2",
    ],
research/object_detection/builders/model_builder.py

@@ -28,27 +28,37 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.meta_architectures import rfcn_meta_arch
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor
from object_detection.protos import model_pb2

# A map of names to SSD feature extractors.
SSD_FEATURE_EXTRACTOR_CLASS_MAP = {
    'ssd_inception_v2': SSDInceptionV2FeatureExtractor,
    'ssd_inception_v3': SSDInceptionV3FeatureExtractor,
    'ssd_mobilenet_v1': SSDMobileNetV1FeatureExtractor,
    'embedded_ssd_mobilenet_v1': EmbeddedSSDMobileNetV1FeatureExtractor,
}

# A map of names to Faster R-CNN feature extractors.
FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
    'faster_rcnn_nas':
    frcnn_nas.FasterRCNNNASFeatureExtractor,
    'faster_rcnn_inception_resnet_v2':
    frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor,
    'faster_rcnn_inception_v2':
    frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor,
    'faster_rcnn_resnet50':
    frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor,
    'faster_rcnn_resnet101':
    frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor,
    'faster_rcnn_resnet152':
    frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor,
}

@@ -94,6 +104,8 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
   feature_type = feature_extractor_config.type
   depth_multiplier = feature_extractor_config.depth_multiplier
   min_depth = feature_extractor_config.min_depth
+  pad_to_multiple = feature_extractor_config.pad_to_multiple
+  batch_norm_trainable = feature_extractor_config.batch_norm_trainable
   conv_hyperparams = hyperparams_builder.build(
       feature_extractor_config.conv_hyperparams, is_training)

@@ -101,8 +113,9 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
     raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))
   feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
-  return feature_extractor_class(depth_multiplier, min_depth, conv_hyperparams,
-                                 reuse_weights)
+  return feature_extractor_class(is_training, depth_multiplier, min_depth,
+                                 pad_to_multiple, conv_hyperparams,
+                                 batch_norm_trainable, reuse_weights)

 def _build_ssd_model(ssd_config, is_training):

@@ -180,6 +193,7 @@ def _build_faster_rcnn_feature_extractor(
   feature_type = feature_extractor_config.type
   first_stage_features_stride = (
       feature_extractor_config.first_stage_features_stride)
+  batch_norm_trainable = feature_extractor_config.batch_norm_trainable

   if feature_type not in FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP:
     raise ValueError('Unknown Faster R-CNN feature_extractor: {}'.format(

@@ -187,7 +201,8 @@ def _build_faster_rcnn_feature_extractor(
   feature_extractor_class = FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
-  return feature_extractor_class(
-      is_training, first_stage_features_stride, reuse_weights)
+  return feature_extractor_class(
+      is_training, first_stage_features_stride,
+      batch_norm_trainable, reuse_weights)

 def _build_faster_rcnn_model(frcnn_config, is_training):

@@ -248,8 +263,13 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
   ) = post_processing_builder.build(frcnn_config.second_stage_post_processing)
   second_stage_localization_loss_weight = (
       frcnn_config.second_stage_localization_loss_weight)
+  second_stage_classification_loss = (
+      losses_builder.build_faster_rcnn_classification_loss(
+          frcnn_config.second_stage_classification_loss))
   second_stage_classification_loss_weight = (
       frcnn_config.second_stage_classification_loss_weight)
+  second_stage_mask_prediction_loss_weight = (
+      frcnn_config.second_stage_mask_prediction_loss_weight)

   hard_example_miner = None
   if frcnn_config.HasField('hard_example_miner'):

@@ -286,6 +306,8 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
       'second_stage_score_conversion_fn': second_stage_score_conversion_fn,
       'second_stage_localization_loss_weight':
           second_stage_localization_loss_weight,
+      'second_stage_classification_loss':
+          second_stage_classification_loss,
       'second_stage_classification_loss_weight':
           second_stage_classification_loss_weight,
       'hard_example_miner': hard_example_miner}

@@ -300,4 +322,6 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
       maxpool_kernel_size=maxpool_kernel_size,
       maxpool_stride=maxpool_stride,
       second_stage_mask_rcnn_box_predictor=second_stage_box_predictor,
+      second_stage_mask_prediction_loss_weight=(
+          second_stage_mask_prediction_loss_weight),
       **common_kwargs)
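For orientation, a minimal sketch of driving the updated builder directly. The config text mirrors the new test_create_ssd_inception_v3_model_from_config added below, so it exercises one of the newly registered SSD_FEATURE_EXTRACTOR_CLASS_MAP entries; this is an illustration of the dispatch path, not part of the diff.

```python
from google.protobuf import text_format

from object_detection.builders import model_builder
from object_detection.protos import model_pb2

# Same text proto shape as the ssd_inception_v3 test below.
model_text_proto = """
ssd {
  feature_extractor {
    type: 'ssd_inception_v3'
    conv_hyperparams {
      regularizer { l2_regularizer { } }
      initializer { truncated_normal_initializer { } }
    }
  }
  box_coder { faster_rcnn_box_coder { } }
  matcher { argmax_matcher { } }
  similarity_calculator { iou_similarity { } }
  anchor_generator { ssd_anchor_generator { aspect_ratios: 1.0 } }
  image_resizer { fixed_shape_resizer { height: 320 width: 320 } }
  box_predictor {
    convolutional_box_predictor {
      conv_hyperparams {
        regularizer { l2_regularizer { } }
        initializer { truncated_normal_initializer { } }
      }
    }
  }
  loss {
    classification_loss { weighted_softmax { } }
    localization_loss { weighted_smooth_l1 { } }
  }
}
"""

model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
# build() dispatches on the model oneof, then on feature_extractor.type
# through the class maps above, yielding an SSDMetaArch wrapping an
# SSDInceptionV3FeatureExtractor.
model = model_builder.build(model_proto, is_training=True)
```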
research/object_detection/builders/model_builder_test.py

@@ -23,8 +23,11 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.meta_architectures import rfcn_meta_arch
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor
from object_detection.protos import model_pb2

@@ -123,6 +126,77 @@ class ModelBuilderTest(tf.test.TestCase):
    self.assertIsInstance(model._feature_extractor,
                          SSDInceptionV2FeatureExtractor)

  def test_create_ssd_inception_v3_model_from_config(self):
    model_text_proto = """
      ssd {
        feature_extractor {
          type: 'ssd_inception_v3'
          conv_hyperparams {
            regularizer {
              l2_regularizer {
              }
            }
            initializer {
              truncated_normal_initializer {
              }
            }
          }
        }
        box_coder {
          faster_rcnn_box_coder {
          }
        }
        matcher {
          argmax_matcher {
          }
        }
        similarity_calculator {
          iou_similarity {
          }
        }
        anchor_generator {
          ssd_anchor_generator {
            aspect_ratios: 1.0
          }
        }
        image_resizer {
          fixed_shape_resizer {
            height: 320
            width: 320
          }
        }
        box_predictor {
          convolutional_box_predictor {
            conv_hyperparams {
              regularizer {
                l2_regularizer {
                }
              }
              initializer {
                truncated_normal_initializer {
                }
              }
            }
          }
        }
        loss {
          classification_loss {
            weighted_softmax {
            }
          }
          localization_loss {
            weighted_smooth_l1 {
            }
          }
        }
      }"""
    model_proto = model_pb2.DetectionModel()
    text_format.Merge(model_text_proto, model_proto)
    model = self.create_model(model_proto)
    self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
    self.assertIsInstance(model._feature_extractor,
                          SSDInceptionV3FeatureExtractor)

  def test_create_ssd_mobilenet_v1_model_from_config(self):
    model_text_proto = """
      ssd {

@@ -138,6 +212,7 @@ class ModelBuilderTest(tf.test.TestCase):
            }
          }
        }
+       batch_norm_trainable: true
      }
      box_coder {
        faster_rcnn_box_coder {

@@ -193,6 +268,7 @@ class ModelBuilderTest(tf.test.TestCase):
    self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
    self.assertIsInstance(model._feature_extractor,
                          SSDMobileNetV1FeatureExtractor)
+   self.assertTrue(model._feature_extractor._batch_norm_trainable)

  def test_create_faster_rcnn_resnet_v1_models_from_config(self):
    model_text_proto = """

@@ -255,12 +331,155 @@ class ModelBuilderTest(tf.test.TestCase):
      }"""
    model_proto = model_pb2.DetectionModel()
    text_format.Merge(model_text_proto, model_proto)
-   for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.items():
+   for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.iteritems():
      model_proto.faster_rcnn.feature_extractor.type = extractor_type
      model = model_builder.build(model_proto, is_training=True)
      self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch)
      self.assertIsInstance(model._feature_extractor, extractor_class)

  def test_create_faster_rcnn_resnet101_with_mask_prediction_enabled(self):
    model_text_proto = """
      faster_rcnn {
        num_classes: 3
        image_resizer {
          keep_aspect_ratio_resizer {
            min_dimension: 600
            max_dimension: 1024
          }
        }
        feature_extractor {
          type: 'faster_rcnn_resnet101'
        }
        first_stage_anchor_generator {
          grid_anchor_generator {
            scales: [0.25, 0.5, 1.0, 2.0]
            aspect_ratios: [0.5, 1.0, 2.0]
            height_stride: 16
            width_stride: 16
          }
        }
        first_stage_box_predictor_conv_hyperparams {
          regularizer {
            l2_regularizer {
            }
          }
          initializer {
            truncated_normal_initializer {
            }
          }
        }
        initial_crop_size: 14
        maxpool_kernel_size: 2
        maxpool_stride: 2
        second_stage_box_predictor {
          mask_rcnn_box_predictor {
            fc_hyperparams {
              op: FC
              regularizer {
                l2_regularizer {
                }
              }
              initializer {
                truncated_normal_initializer {
                }
              }
            }
            conv_hyperparams {
              regularizer {
                l2_regularizer {
                }
              }
              initializer {
                truncated_normal_initializer {
                }
              }
            }
            predict_instance_masks: true
          }
        }
        second_stage_mask_prediction_loss_weight: 3.0
        second_stage_post_processing {
          batch_non_max_suppression {
            score_threshold: 0.01
            iou_threshold: 0.6
            max_detections_per_class: 100
            max_total_detections: 300
          }
          score_converter: SOFTMAX
        }
      }"""
    model_proto = model_pb2.DetectionModel()
    text_format.Merge(model_text_proto, model_proto)
    model = model_builder.build(model_proto, is_training=True)
    self.assertAlmostEqual(model._second_stage_mask_loss_weight, 3.0)

  def test_create_faster_rcnn_nas_model_from_config(self):
    model_text_proto = """
      faster_rcnn {
        num_classes: 3
        image_resizer {
          keep_aspect_ratio_resizer {
            min_dimension: 600
            max_dimension: 1024
          }
        }
        feature_extractor {
          type: 'faster_rcnn_nas'
        }
        first_stage_anchor_generator {
          grid_anchor_generator {
            scales: [0.25, 0.5, 1.0, 2.0]
            aspect_ratios: [0.5, 1.0, 2.0]
            height_stride: 16
            width_stride: 16
          }
        }
        first_stage_box_predictor_conv_hyperparams {
          regularizer {
            l2_regularizer {
            }
          }
          initializer {
            truncated_normal_initializer {
            }
          }
        }
        initial_crop_size: 17
        maxpool_kernel_size: 1
        maxpool_stride: 1
        second_stage_box_predictor {
          mask_rcnn_box_predictor {
            fc_hyperparams {
              op: FC
              regularizer {
                l2_regularizer {
                }
              }
              initializer {
                truncated_normal_initializer {
                }
              }
            }
          }
        }
        second_stage_post_processing {
          batch_non_max_suppression {
            score_threshold: 0.01
            iou_threshold: 0.6
            max_detections_per_class: 100
            max_total_detections: 300
          }
          score_converter: SOFTMAX
        }
      }"""
    model_proto = model_pb2.DetectionModel()
    text_format.Merge(model_text_proto, model_proto)
    model = model_builder.build(model_proto, is_training=True)
    self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch)
    self.assertIsInstance(
        model._feature_extractor,
        frcnn_nas.FasterRCNNNASFeatureExtractor)

  def test_create_faster_rcnn_inception_resnet_v2_model_from_config(self):
    model_text_proto = """
      faster_rcnn {

@@ -328,6 +547,72 @@ class ModelBuilderTest(tf.test.TestCase):
        model._feature_extractor,
        frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor)

  def test_create_faster_rcnn_inception_v2_model_from_config(self):
    model_text_proto = """
      faster_rcnn {
        num_classes: 3
        image_resizer {
          keep_aspect_ratio_resizer {
            min_dimension: 600
            max_dimension: 1024
          }
        }
        feature_extractor {
          type: 'faster_rcnn_inception_v2'
        }
        first_stage_anchor_generator {
          grid_anchor_generator {
            scales: [0.25, 0.5, 1.0, 2.0]
            aspect_ratios: [0.5, 1.0, 2.0]
            height_stride: 16
            width_stride: 16
          }
        }
        first_stage_box_predictor_conv_hyperparams {
          regularizer {
            l2_regularizer {
            }
          }
          initializer {
            truncated_normal_initializer {
            }
          }
        }
        initial_crop_size: 14
        maxpool_kernel_size: 2
        maxpool_stride: 2
        second_stage_box_predictor {
          mask_rcnn_box_predictor {
            fc_hyperparams {
              op: FC
              regularizer {
                l2_regularizer {
                }
              }
              initializer {
                truncated_normal_initializer {
                }
              }
            }
          }
        }
        second_stage_post_processing {
          batch_non_max_suppression {
            score_threshold: 0.01
            iou_threshold: 0.6
            max_detections_per_class: 100
            max_total_detections: 300
          }
          score_converter: SOFTMAX
        }
      }"""
    model_proto = model_pb2.DetectionModel()
    text_format.Merge(model_text_proto, model_proto)
    model = model_builder.build(model_proto, is_training=True)
    self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch)
    self.assertIsInstance(
        model._feature_extractor,
        frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor)

  def test_create_faster_rcnn_model_from_config_with_example_miner(self):
    model_text_proto = """
      faster_rcnn {

@@ -445,7 +730,7 @@ class ModelBuilderTest(tf.test.TestCase):
      }"""
    model_proto = model_pb2.DetectionModel()
    text_format.Merge(model_text_proto, model_proto)
-   for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.items():
+   for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.iteritems():
      model_proto.faster_rcnn.feature_extractor.type = extractor_type
      model = model_builder.build(model_proto, is_training=True)
      self.assertIsInstance(model, rfcn_meta_arch.RFCNMetaArch)
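The loops above iterate a module-level FEATURE_EXTRACTOR_MAPS dict that sits outside the visible hunks (note the switch to .iteritems() pins these tests to Python 2). A hypothetical reconstruction of its shape, inferred from the classes the loops assert against; the real definition may differ:

```python
# Hypothetical: maps feature_extractor.type strings to the extractor classes
# asserted in the loops above. Not the verbatim source.
FEATURE_EXTRACTOR_MAPS = {
    'faster_rcnn_resnet50': frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor,
    'faster_rcnn_resnet101':
        frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor,
    'faster_rcnn_resnet152':
        frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor,
}
```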
research/object_detection/models/BUILD

@@ -15,7 +15,6 @@ py_library(
    ],
    deps = [
        "//tensorflow",
        "//tensorflow_models/object_detection/utils:ops",
    ],
)

@@ -49,10 +48,25 @@ py_library(
        ":feature_map_generators",
        "//tensorflow",
        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
        "//tensorflow_models/object_detection/utils:ops",
        "//tensorflow_models/slim:inception_v2",
    ],
)

py_library(
    name = "ssd_inception_v3_feature_extractor",
    srcs = ["ssd_inception_v3_feature_extractor.py"],
    deps = [
        ":feature_map_generators",
        "//tensorflow",
        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
        "//tensorflow_models/object_detection/utils:ops",
        "//tensorflow_models/slim:inception_v3",
    ],
)

py_library(
    name = "ssd_mobilenet_v1_feature_extractor",
    srcs = ["ssd_mobilenet_v1_feature_extractor.py"],

@@ -60,6 +74,19 @@ py_library(
        ":feature_map_generators",
        "//tensorflow",
        "//tensorflow_models/object_detection/meta_architectures:ssd_meta_arch",
        "//tensorflow_models/object_detection/utils:ops",
        "//tensorflow_models/slim:mobilenet_v1",
    ],
)

py_library(
    name = "embedded_ssd_mobilenet_v1_feature_extractor",
    srcs = ["embedded_ssd_mobilenet_v1_feature_extractor.py"],
    deps = [
        ":feature_map_generators",
        ":ssd_mobilenet_v1_feature_extractor",
        "//tensorflow",
        "//tensorflow_models/object_detection/utils:ops",
        "//tensorflow_models/slim:mobilenet_v1",
    ],
)

@@ -76,6 +103,18 @@ py_test(
    ],
)

py_test(
    name = "ssd_inception_v3_feature_extractor_test",
    srcs = ["ssd_inception_v3_feature_extractor_test.py"],
    deps = [
        ":ssd_feature_extractor_test",
        ":ssd_inception_v3_feature_extractor",
        "//tensorflow",
    ],
)

py_test(
    name = "ssd_mobilenet_v1_feature_extractor_test",
    srcs = ["ssd_mobilenet_v1_feature_extractor_test.py"],

@@ -86,6 +125,39 @@ py_test(
    ],
)

py_test(
    name = "embedded_ssd_mobilenet_v1_feature_extractor_test",
    srcs = ["embedded_ssd_mobilenet_v1_feature_extractor_test.py"],
    deps = [
        ":embedded_ssd_mobilenet_v1_feature_extractor",
        ":ssd_feature_extractor_test",
        "//tensorflow",
    ],
)

py_test(
    name = "faster_rcnn_nas_feature_extractor_test",
    srcs = ["faster_rcnn_nas_feature_extractor_test.py"],
    deps = [
        ":faster_rcnn_nas_feature_extractor",
        "//tensorflow",
    ],
)

py_library(
    name = "faster_rcnn_nas_feature_extractor",
    srcs = ["faster_rcnn_nas_feature_extractor.py"],
    deps = [
        "//tensorflow",
        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
        "//tensorflow_models/slim:nasnet",
    ],
)

py_library(
    name = "faster_rcnn_inception_resnet_v2_feature_extractor",
    srcs = [

@@ -109,6 +181,29 @@ py_test(
    ],
)

py_library(
    name = "faster_rcnn_inception_v2_feature_extractor",
    srcs = ["faster_rcnn_inception_v2_feature_extractor.py"],
    deps = [
        "//tensorflow",
        "//tensorflow_models/object_detection/meta_architectures:faster_rcnn_meta_arch",
        "//tensorflow_models/slim:inception_v2",
    ],
)

py_test(
    name = "faster_rcnn_inception_v2_feature_extractor_test",
    srcs = ["faster_rcnn_inception_v2_feature_extractor_test.py"],
    deps = [
        ":faster_rcnn_inception_v2_feature_extractor",
        "//tensorflow",
    ],
)

py_library(
    name = "faster_rcnn_resnet_v1_feature_extractor",
    srcs = [
research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py (new file, mode 100644)

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Embedded-friendly SSDFeatureExtractor for MobilenetV1 features."""

import tensorflow as tf

from object_detection.models import feature_map_generators
from object_detection.models import ssd_mobilenet_v1_feature_extractor
from object_detection.utils import ops
from nets import mobilenet_v1

slim = tf.contrib.slim


class EmbeddedSSDMobileNetV1FeatureExtractor(
    ssd_mobilenet_v1_feature_extractor.SSDMobileNetV1FeatureExtractor):
  """Embedded-friendly SSD Feature Extractor using MobilenetV1 features.

  This feature extractor is similar to SSD MobileNetV1 feature extractor, and
  it fixes input resolution to be 256x256, reduces the number of feature maps
  used for box prediction and ensures convolution kernel to be no larger
  than input tensor in spatial dimensions.

  This feature extractor requires support of the following ops if used in
  embedded devices:
  - Conv
  - DepthwiseConv
  - Relu6

  All conv/depthwiseconv use SAME padding, and no additional spatial padding is
  needed.
  """

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               batch_norm_trainable=True,
               reuse_weights=None):
    """MobileNetV1 Feature Extractor for Embedded-friendly SSD Models.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to. For EmbeddedSSD it must be set to 1.
      conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
      batch_norm_trainable: Whether to update batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to disable batch norm update and use pretrained batch norm
        params.
      reuse_weights: Whether to reuse variables. Default is None.

    Raises:
      ValueError: upon invalid `pad_to_multiple` values.
    """
    if pad_to_multiple != 1:
      raise ValueError('Embedded-specific SSD only supports `pad_to_multiple` '
                       'of 1.')

    super(EmbeddedSSDMobileNetV1FeatureExtractor, self).__init__(
        is_training, depth_multiplier, min_depth, pad_to_multiple,
        conv_hyperparams, batch_norm_trainable, reuse_weights)

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(tf.equal(tf.shape(preprocessed_inputs)[1], 256),
                       tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
        ['image size must be 256 in both height and width.'])

    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       ''],
        'layer_depth': [-1, -1, 512, 256, 256],
        'conv_kernel_size': [-1, -1, 3, 3, 2],
    }

    with tf.control_dependencies([shape_assert]):
      with slim.arg_scope(self._conv_hyperparams):
        with tf.variable_scope('MobilenetV1',
                               reuse=self._reuse_weights) as scope:
          _, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=scope)
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return feature_maps.values()
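A minimal usage sketch under the TF 1.x graph API this file targets. Passing an empty dict for conv_hyperparams is enough to construct the object, as the test below also does; the expected output shapes come from that test:

```python
import tensorflow as tf

from object_detection.models import embedded_ssd_mobilenet_v1_feature_extractor

# Empty conv_hyperparams (normally produced by hyperparams_builder.build()).
extractor = (embedded_ssd_mobilenet_v1_feature_extractor
             .EmbeddedSSDMobileNetV1FeatureExtractor(
                 is_training=False, depth_multiplier=1.0, min_depth=32,
                 pad_to_multiple=1, conv_hyperparams={}))

# The input resolution is fixed at 256x256; other sizes fail the tf.Assert.
images = tf.placeholder(tf.float32, [4, 256, 256, 3])
feature_maps = extractor.extract_features(extractor.preprocess(images))
# With depth_multiplier=1.0 the five maps come out as 16x16x512, 8x8x1024,
# 4x4x512, 2x2x256 and 1x1x256 (see the shape test below).
```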
research/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py (new file, mode 100644)

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for embedded_ssd_mobilenet_v1_feature_extractor."""

import numpy as np
import tensorflow as tf

from object_detection.models import embedded_ssd_mobilenet_v1_feature_extractor
from object_detection.models import ssd_feature_extractor_test


class EmbeddedSSDMobileNetV1FeatureExtractorTest(
    ssd_feature_extractor_test.SsdFeatureExtractorTestBase, tf.test.TestCase):

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, batch_norm_trainable=True):
    """Constructs a new feature extractor.

    Args:
      depth_multiplier: float depth multiplier for feature extractor
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      is_training: whether the network is in training mode.
      batch_norm_trainable: whether to update batch norm parameters during
        training.

    Returns:
      an ssd_meta_arch.SSDFeatureExtractor object.
    """
    min_depth = 32
    conv_hyperparams = {}
    return (embedded_ssd_mobilenet_v1_feature_extractor
            .EmbeddedSSDMobileNetV1FeatureExtractor(
                is_training, depth_multiplier, min_depth, pad_to_multiple,
                conv_hyperparams, batch_norm_trainable))

  def test_extract_features_returns_correct_shapes_256(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(4, 16, 16, 512), (4, 8, 8, 1024),
                                  (4, 4, 4, 512), (4, 2, 2, 256),
                                  (4, 1, 1, 256)]
    self.check_extract_features_returns_correct_shape(
        image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 0.5 ** 12
    pad_to_multiple = 1
    expected_feature_map_shape = [(4, 16, 16, 32), (4, 8, 8, 32),
                                  (4, 4, 4, 32), (4, 2, 2, 32),
                                  (4, 1, 1, 32)]
    self.check_extract_features_returns_correct_shape(
        image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple_of_1(
      self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(4, 16, 16, 512), (4, 8, 8, 1024),
                                  (4, 4, 4, 512), (4, 2, 2, 256),
                                  (4, 1, 1, 256)]
    self.check_extract_features_returns_correct_shape(
        image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape)

  def test_extract_features_raises_error_with_pad_to_multiple_not_1(self):
    depth_multiplier = 1.0
    pad_to_multiple = 2
    with self.assertRaises(ValueError):
      _ = self._create_feature_extractor(depth_multiplier, pad_to_multiple)

  def test_extract_features_raises_error_with_invalid_image_size(self):
    image_height = 128
    image_width = 128
    depth_multiplier = 1.0
    pad_to_multiple = 1
    self.check_extract_features_raises_error_with_invalid_image_size(
        image_height, image_width, depth_multiplier, pad_to_multiple)

  def test_preprocess_returns_correct_value_range(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    test_image = np.random.rand(4, image_height, image_width, 3)
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(test_image)
    self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))

  def test_variables_only_created_in_scope(self):
    depth_multiplier = 1
    pad_to_multiple = 1
    scope_name = 'MobilenetV1'
    self.check_feature_extractor_variables_under_scope(
        depth_multiplier, pad_to_multiple, scope_name)


if __name__ == '__main__':
  tf.test.main()
research/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py

@@ -37,6 +37,7 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
   def __init__(self,
                is_training,
                first_stage_features_stride,
+               batch_norm_trainable=False,
                reuse_weights=None,
                weight_decay=0.0):
     """Constructor.

@@ -44,6 +45,7 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
     Args:
       is_training: See base class.
       first_stage_features_stride: See base class.
+      batch_norm_trainable: See base class.
       reuse_weights: See base class.
       weight_decay: See base class.

@@ -53,7 +55,8 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
     if first_stage_features_stride != 8 and first_stage_features_stride != 16:
       raise ValueError('`first_stage_features_stride` must be 8 or 16.')
     super(FasterRCNNInceptionResnetV2FeatureExtractor, self).__init__(
-        is_training, first_stage_features_stride, reuse_weights, weight_decay)
+        is_training, first_stage_features_stride, batch_norm_trainable,
+        reuse_weights, weight_decay)

   def preprocess(self, resized_inputs):
     """Faster R-CNN with Inception Resnet v2 preprocessing.

@@ -98,7 +101,8 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
     with slim.arg_scope(inception_resnet_v2.inception_resnet_v2_arg_scope(
         weight_decay=self._weight_decay)):
-      # Forces is_training to False to disable batch norm update.
-      with slim.arg_scope([slim.batch_norm], is_training=False):
+      with slim.arg_scope([slim.batch_norm],
+                          is_training=self._train_batch_norm):
         with tf.variable_scope('InceptionResnetV2',
                                reuse=self._reuse_weights) as scope:
           rpn_feature_map, _ = (

@@ -129,7 +133,8 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
     with slim.arg_scope(inception_resnet_v2.inception_resnet_v2_arg_scope(
         weight_decay=self._weight_decay)):
-      # Forces is_training to False to disable batch norm update.
-      with slim.arg_scope([slim.batch_norm], is_training=False):
+      with slim.arg_scope([slim.batch_norm],
+                          is_training=self._train_batch_norm):
         with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                             stride=1, padding='SAME'):
           with tf.variable_scope('Mixed_7a'):

@@ -207,3 +212,4 @@ class FasterRCNNInceptionResnetV2FeatureExtractor(
             second_stage_feature_extractor_scope + '/', '')
         variables_to_restore[var_name] = variable
     return variables_to_restore
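The change above replaces the hardcoded is_training=False on slim.batch_norm with self._train_batch_norm. A short sketch of the resulting knob; the base-class wiring is not shown in this diff, so the assumption here is that _train_batch_norm folds together is_training and batch_norm_trainable, which matches how every extractor in this commit consumes it:

```python
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res

# batch_norm_trainable=False keeps slim.batch_norm in inference mode inside
# both extraction scopes even when the model itself is training, preserving
# pretrained batch-norm statistics (useful for very small batch sizes).
extractor = frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor(
    is_training=True,
    first_stage_features_stride=16,
    batch_norm_trainable=False,
    reuse_weights=None,
    weight_decay=0.0)
```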
research/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor_test.py

@@ -26,6 +26,7 @@ class FasterRcnnInceptionResnetV2FeatureExtractorTest(tf.test.TestCase):
     return frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor(
         is_training=False,
         first_stage_features_stride=first_stage_features_stride,
+        batch_norm_trainable=False,
         reuse_weights=None,
         weight_decay=0.0)
research/object_detection/models/faster_rcnn_inception_v2_feature_extractor.py (new file, mode 100644)

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Inception V2 Faster R-CNN implementation.

See "Rethinking the Inception Architecture for Computer Vision"
https://arxiv.org/abs/1512.00567
"""

import tensorflow as tf

from object_detection.meta_architectures import faster_rcnn_meta_arch
from nets import inception_v2

slim = tf.contrib.slim


def _batch_norm_arg_scope(list_ops,
                          use_batch_norm=True,
                          batch_norm_decay=0.9997,
                          batch_norm_epsilon=0.001,
                          batch_norm_scale=False,
                          train_batch_norm=False):
  """Slim arg scope for InceptionV2 batch norm."""
  if use_batch_norm:
    batch_norm_params = {
        'is_training': train_batch_norm,
        'scale': batch_norm_scale,
        'decay': batch_norm_decay,
        'epsilon': batch_norm_epsilon
    }
    normalizer_fn = slim.batch_norm
  else:
    normalizer_fn = None
    batch_norm_params = None

  return slim.arg_scope(list_ops,
                        normalizer_fn=normalizer_fn,
                        normalizer_params=batch_norm_params)


class FasterRCNNInceptionV2FeatureExtractor(
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
  """Faster R-CNN Inception V2 feature extractor implementation."""

  def __init__(self,
               is_training,
               first_stage_features_stride,
               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0,
               depth_multiplier=1.0,
               min_depth=16):
    """Constructor.

    Args:
      is_training: See base class.
      first_stage_features_stride: See base class.
      batch_norm_trainable: See base class.
      reuse_weights: See base class.
      weight_decay: See base class.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.

    Raises:
      ValueError: If `first_stage_features_stride` is not 8 or 16.
    """
    if first_stage_features_stride != 8 and first_stage_features_stride != 16:
      raise ValueError('`first_stage_features_stride` must be 8 or 16.')
    self._depth_multiplier = depth_multiplier
    self._min_depth = min_depth
    super(FasterRCNNInceptionV2FeatureExtractor, self).__init__(
        is_training, first_stage_features_stride, batch_norm_trainable,
        reuse_weights, weight_decay)

  def preprocess(self, resized_inputs):
    """Faster R-CNN Inception V2 preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    return (2.0 / 255.0) * resized_inputs - 1.0

  def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name.

    Returns:
      rpn_feature_map: A tensor with shape [batch, height, width, depth]

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33.
      ValueError: If the created network is missing the required activation.
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                       tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    with tf.control_dependencies([shape_assert]):
      with tf.variable_scope('InceptionV2',
                             reuse=self._reuse_weights) as scope:
        with _batch_norm_arg_scope([slim.conv2d, slim.separable_conv2d],
                                   batch_norm_scale=True,
                                   train_batch_norm=self._train_batch_norm):
          _, activations = inception_v2.inception_v2_base(
              preprocessed_inputs,
              final_endpoint='Mixed_4e',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=scope)

    return activations['Mixed_4e']

  def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Extracts second stage box classifier features.

    Args:
      proposal_feature_maps: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
        representing the feature map cropped to each proposal.
      scope: A scope name (unused).

    Returns:
      proposal_classifier_features: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, height, width, depth]
        representing box classifier features for each proposal.
    """
    net = proposal_feature_maps

    depth = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)

    data_format = 'NHWC'
    concat_dim = 3 if data_format == 'NHWC' else 1

    with tf.variable_scope('InceptionV2', reuse=self._reuse_weights):
      with slim.arg_scope(
          [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
          stride=1,
          padding='SAME',
          data_format=data_format):
        with _batch_norm_arg_scope([slim.conv2d, slim.separable_conv2d],
                                   batch_norm_scale=True,
                                   train_batch_norm=self._train_batch_norm):

          with tf.variable_scope('Mixed_5a'):
            with tf.variable_scope('Branch_0'):
              branch_0 = slim.conv2d(
                  net, depth(128), [1, 1],
                  weights_initializer=trunc_normal(0.09),
                  scope='Conv2d_0a_1x1')
              branch_0 = slim.conv2d(branch_0, depth(192), [3, 3], stride=2,
                                     scope='Conv2d_1a_3x3')
            with tf.variable_scope('Branch_1'):
              branch_1 = slim.conv2d(
                  net, depth(192), [1, 1],
                  weights_initializer=trunc_normal(0.09),
                  scope='Conv2d_0a_1x1')
              branch_1 = slim.conv2d(branch_1, depth(256), [3, 3],
                                     scope='Conv2d_0b_3x3')
              branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], stride=2,
                                     scope='Conv2d_1a_3x3')
            with tf.variable_scope('Branch_2'):
              branch_2 = slim.max_pool2d(net, [3, 3], stride=2,
                                         scope='MaxPool_1a_3x3')
            net = tf.concat([branch_0, branch_1, branch_2], concat_dim)

          with tf.variable_scope('Mixed_5b'):
            with tf.variable_scope('Branch_0'):
              branch_0 = slim.conv2d(net, depth(352), [1, 1],
                                     scope='Conv2d_0a_1x1')
            with tf.variable_scope('Branch_1'):
              branch_1 = slim.conv2d(
                  net, depth(192), [1, 1],
                  weights_initializer=trunc_normal(0.09),
                  scope='Conv2d_0a_1x1')
              branch_1 = slim.conv2d(branch_1, depth(320), [3, 3],
                                     scope='Conv2d_0b_3x3')
            with tf.variable_scope('Branch_2'):
              branch_2 = slim.conv2d(
                  net, depth(160), [1, 1],
                  weights_initializer=trunc_normal(0.09),
                  scope='Conv2d_0a_1x1')
              branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
                                     scope='Conv2d_0b_3x3')
              branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
                                     scope='Conv2d_0c_3x3')
            with tf.variable_scope('Branch_3'):
              branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
              branch_3 = slim.conv2d(
                  branch_3, depth(128), [1, 1],
                  weights_initializer=trunc_normal(0.1),
                  scope='Conv2d_0b_1x1')
            net = tf.concat([branch_0, branch_1, branch_2, branch_3],
                            concat_dim)

          with tf.variable_scope('Mixed_5c'):
            with tf.variable_scope('Branch_0'):
              branch_0 = slim.conv2d(net, depth(352), [1, 1],
                                     scope='Conv2d_0a_1x1')
            with tf.variable_scope('Branch_1'):
              branch_1 = slim.conv2d(
                  net, depth(192), [1, 1],
                  weights_initializer=trunc_normal(0.09),
                  scope='Conv2d_0a_1x1')
              branch_1 = slim.conv2d(branch_1, depth(320), [3, 3],
                                     scope='Conv2d_0b_3x3')
            with tf.variable_scope('Branch_2'):
              branch_2 = slim.conv2d(
                  net, depth(192), [1, 1],
                  weights_initializer=trunc_normal(0.09),
                  scope='Conv2d_0a_1x1')
              branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
                                     scope='Conv2d_0b_3x3')
              branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
                                     scope='Conv2d_0c_3x3')
            with tf.variable_scope('Branch_3'):
              branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
              branch_3 = slim.conv2d(
                  branch_3, depth(128), [1, 1],
                  weights_initializer=trunc_normal(0.1),
                  scope='Conv2d_0b_1x1')
            proposal_classifier_features = tf.concat(
                [branch_0, branch_1, branch_2, branch_3], concat_dim)

    return proposal_classifier_features
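The preprocess() method above is a plain affine rescale of pixel values. A quick numeric check (standalone NumPy sketch of the same formula, outside the TF graph):

```python
import numpy as np

# preprocess() maps [0, 255] pixel values onto [-1, 1]:
#   preprocessed = (2.0 / 255.0) * x - 1.0
pixels = np.array([0.0, 127.5, 255.0])
print((2.0 / 255.0) * pixels - 1.0)  # -> [-1.  0.  1.]
```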
research/object_detection/models/faster_rcnn_inception_v2_feature_extractor_test.py (new file, mode 100644)

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for faster_rcnn_inception_v2_feature_extractor."""

import numpy as np
import tensorflow as tf

from object_detection.models import faster_rcnn_inception_v2_feature_extractor as faster_rcnn_inception_v2


class FasterRcnnInceptionV2FeatureExtractorTest(tf.test.TestCase):

  def _build_feature_extractor(self, first_stage_features_stride):
    return faster_rcnn_inception_v2.FasterRCNNInceptionV2FeatureExtractor(
        is_training=False,
        first_stage_features_stride=first_stage_features_stride,
        batch_norm_trainable=False,
        reuse_weights=None,
        weight_decay=0.0)

  def test_extract_proposal_features_returns_expected_size(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [4, 224, 224, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [4, 14, 14, 576])

  def test_extract_proposal_features_stride_eight(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=8)
    preprocessed_inputs = tf.random_uniform(
        [4, 224, 224, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [4, 14, 14, 576])

  def test_extract_proposal_features_half_size_input(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [1, 112, 112, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [1, 7, 7, 576])

  def test_extract_proposal_features_dies_on_invalid_stride(self):
    with self.assertRaises(ValueError):
      self._build_feature_extractor(first_stage_features_stride=99)

  def test_extract_proposal_features_dies_on_very_small_images(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      with self.assertRaises(tf.errors.InvalidArgumentError):
        sess.run(features_shape,
                 feed_dict={preprocessed_inputs: np.random.rand(4, 32, 32, 3)})

  def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [224, 224, 3], maxval=255, dtype=tf.float32)
    with self.assertRaises(ValueError):
      feature_extractor.extract_proposal_features(
          preprocessed_inputs, scope='TestScope')

  def test_extract_box_classifier_features_returns_expected_size(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    proposal_feature_maps = tf.random_uniform(
        [3, 14, 14, 576], maxval=255, dtype=tf.float32)
    proposal_classifier_features = (
        feature_extractor.extract_box_classifier_features(
            proposal_feature_maps, scope='TestScope'))
    features_shape = tf.shape(proposal_classifier_features)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [3, 7, 7, 1024])


if __name__ == '__main__':
  tf.test.main()
research/object_detection/models/faster_rcnn_nas_feature_extractor.py
0 → 100644
View file @
74a03640
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""NASNet Faster R-CNN implementation.
Learning Transferable Architectures for Scalable Image Recognition
Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. Le
https://arxiv.org/abs/1707.07012
"""
import
tensorflow
as
tf
from
object_detection.meta_architectures
import
faster_rcnn_meta_arch
from
nets.nasnet
import
nasnet
from
nets.nasnet
import
nasnet_utils
arg_scope
=
tf
.
contrib
.
framework
.
arg_scope
slim
=
tf
.
contrib
.
slim
# Note: This is largely a copy of _build_nasnet_base inside nasnet.py but
# with special edits to remove instantiation of the stem and the special
# ability to receive as input a pair of hidden states.
def
_build_nasnet_base
(
hidden_previous
,
hidden
,
normal_cell
,
reduction_cell
,
hparams
,
true_cell_num
,
start_cell_num
):
"""Constructs a NASNet image model."""
# Find where to place the reduction cells or stride normal cells
reduction_indices
=
nasnet_utils
.
calc_reduction_layers
(
hparams
.
num_cells
,
hparams
.
num_reduction_layers
)
# Note: The None is prepended to match the behavior of _imagenet_stem()
cell_outputs
=
[
None
,
hidden_previous
,
hidden
]
net
=
hidden
# NOTE: In the nasnet.py code, filter_scaling starts at 1.0. We instead
# start at 2.0 because 1 reduction cell has been created which would
# update the filter_scaling to 2.0.
filter_scaling
=
2.0
# Run the cells
for
cell_num
in
range
(
start_cell_num
,
hparams
.
num_cells
):
stride
=
1
if
hparams
.
skip_reduction_layer_input
:
prev_layer
=
cell_outputs
[
-
2
]
if
cell_num
in
reduction_indices
:
filter_scaling
*=
hparams
.
filter_scaling_rate
net
=
reduction_cell
(
net
,
scope
=
'reduction_cell_{}'
.
format
(
reduction_indices
.
index
(
cell_num
)),
filter_scaling
=
filter_scaling
,
stride
=
2
,
prev_layer
=
cell_outputs
[
-
2
],
cell_num
=
true_cell_num
)
true_cell_num
+=
1
cell_outputs
.
append
(
net
)
if
not
hparams
.
skip_reduction_layer_input
:
prev_layer
=
cell_outputs
[
-
2
]
net
=
normal_cell
(
net
,
scope
=
'cell_{}'
.
format
(
cell_num
),
filter_scaling
=
filter_scaling
,
stride
=
stride
,
prev_layer
=
prev_layer
,
cell_num
=
true_cell_num
)
true_cell_num
+=
1
cell_outputs
.
append
(
net
)
# Final nonlinearity.
# Note that we have dropped the final pooling, dropout and softmax layers
# from the default nasnet version.
with
tf
.
variable_scope
(
'final_layer'
):
net
=
tf
.
nn
.
relu
(
net
)
return
net
# TODO: Only fixed_shape_resizer is currently supported for NASNet
# featurization. The reason for this is that nasnet.py only supports
# inputs with fully known shapes. We need to update nasnet.py to handle
# shapes not known at compile time.
class
FasterRCNNNASFeatureExtractor
(
faster_rcnn_meta_arch
.
FasterRCNNFeatureExtractor
):
"""Faster R-CNN with NASNet-A feature extractor implementation."""
def
__init__
(
self
,
is_training
,
first_stage_features_stride
,
batch_norm_trainable
=
False
,
reuse_weights
=
None
,
weight_decay
=
0.0
):
"""Constructor.
Args:
is_training: See base class.
first_stage_features_stride: See base class.
batch_norm_trainable: See base class.
reuse_weights: See base class.
weight_decay: See base class.
Raises:
ValueError: If `first_stage_features_stride` is not 16.
"""
if
first_stage_features_stride
!=
16
:
raise
ValueError
(
'`first_stage_features_stride` must be 16.'
)
super
(
FasterRCNNNASFeatureExtractor
,
self
).
__init__
(
is_training
,
first_stage_features_stride
,
batch_norm_trainable
,
reuse_weights
,
weight_decay
)
def
preprocess
(
self
,
resized_inputs
):
"""Faster R-CNN with NAS preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: A [batch, height_in, width_in, channels] float32 tensor
representing a batch of images with values between 0 and 255.0.
Returns:
preprocessed_inputs: A [batch, height_out, width_out, channels] float32
tensor representing a batch of images.
"""
return
(
2.0
/
255.0
)
*
resized_inputs
-
1.0
def
_extract_proposal_features
(
self
,
preprocessed_inputs
,
scope
):
"""Extracts first stage RPN features.
Extracts features using the first half of the NASNet network.
We construct the network in `align_feature_maps=True` mode, which means
that all VALID paddings in the network are changed to SAME padding so that
the feature maps are aligned.
Args:
preprocessed_inputs: A [batch, height, width, channels] float32 tensor
representing a batch of images.
scope: A scope name.
Returns:
rpn_feature_map: A tensor with shape [batch, height, width, depth]
Raises:
ValueError: If the created network is missing the required activation.
"""
del
scope
if
len
(
preprocessed_inputs
.
get_shape
().
as_list
())
!=
4
:
raise
ValueError
(
'`preprocessed_inputs` must be 4 dimensional, got a '
'tensor of shape %s'
%
preprocessed_inputs
.
get_shape
())
with
slim
.
arg_scope
(
nasnet
.
nasnet_large_arg_scope
()):
_
,
end_points
=
nasnet
.
build_nasnet_large
(
preprocessed_inputs
,
num_classes
=
None
,
is_training
=
self
.
_is_training
,
is_batchnorm_training
=
self
.
_train_batch_norm
,
final_endpoint
=
'Cell_11'
)
# Note that both 'Cell_10' and 'Cell_11' have equal depth = 2016.
rpn_feature_map
=
tf
.
concat
([
end_points
[
'Cell_10'
],
end_points
[
'Cell_11'
]],
3
)
# nasnet.py does not maintain the batch size in the first dimension.
# This work around permits us retaining the batch for below.
batch
=
preprocessed_inputs
.
get_shape
().
as_list
()[
0
]
shape_without_batch
=
rpn_feature_map
.
get_shape
().
as_list
()[
1
:]
rpn_feature_map_shape
=
[
batch
]
+
shape_without_batch
rpn_feature_map
.
set_shape
(
rpn_feature_map_shape
)
return
rpn_feature_map
  def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Extracts second stage box classifier features.

    This function reconstructs the "second half" of the NASNet-A
    network after the part defined in `_extract_proposal_features`.

    Args:
      proposal_feature_maps: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
        representing the feature map cropped to each proposal.
      scope: A scope name.

    Returns:
      proposal_classifier_features: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, height, width, depth]
        representing box classifier features for each proposal.
    """
    del scope

    # Note that we always feed into 2 layers of equal depth, where the first
    # N channels correspond to the previous hidden layer and the second N
    # channels correspond to the final hidden layer.
    hidden_previous, hidden = tf.split(proposal_feature_maps, 2, axis=3)

    # Note that what follows is largely a copy of build_nasnet_large() within
    # nasnet.py. We are copying to minimize code pollution in slim.

    # pylint: disable=protected-access
    hparams = nasnet._large_imagenet_config(is_training=self._is_training)
    # pylint: enable=protected-access

    # Calculate the total number of cells in the network.
    # -- Add 2 for the reduction cells.
    total_num_cells = hparams.num_cells + 2
    # -- And add 2 for the stem cells for ImageNet training.
    total_num_cells += 2

    normal_cell = nasnet_utils.NasNetANormalCell(
        hparams.num_conv_filters, hparams.drop_path_keep_prob,
        total_num_cells, hparams.total_training_steps)
    reduction_cell = nasnet_utils.NasNetAReductionCell(
        hparams.num_conv_filters, hparams.drop_path_keep_prob,
        total_num_cells, hparams.total_training_steps)
    with arg_scope([slim.dropout, nasnet_utils.drop_path],
                   is_training=self._is_training):
      with arg_scope([slim.batch_norm], is_training=self._train_batch_norm):
        with arg_scope([slim.avg_pool2d,
                        slim.max_pool2d,
                        slim.conv2d,
                        slim.batch_norm,
                        slim.separable_conv2d,
                        nasnet_utils.factorized_reduction,
                        nasnet_utils.global_avg_pool,
                        nasnet_utils.get_channel_index,
                        nasnet_utils.get_channel_dim],
                       data_format=hparams.data_format):
          # This corresponds to the cell number just past 'Cell_11' used by
          # _extract_proposal_features().
          start_cell_num = 12
          # Note that this number equals:
          #   start_cell_num + 2 stem cells + 1 reduction cell
          true_cell_num = 15

          with slim.arg_scope(nasnet.nasnet_large_arg_scope()):
            net = _build_nasnet_base(hidden_previous,
                                     hidden,
                                     normal_cell=normal_cell,
                                     reduction_cell=reduction_cell,
                                     hparams=hparams,
                                     true_cell_num=true_cell_num,
                                     start_cell_num=start_cell_num)

    proposal_classifier_features = net
    return proposal_classifier_features
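  # Sanity check of the cell bookkeeping above, assuming the public NASNet-A
  # Large ImageNet config where hparams.num_cells == 18 (an assumption; other
  # configs differ):
  #
  #   total_num_cells = 18 + 2 + 2   # normal + 2 reduction + 2 stem = 22
  #   true_cell_num   = 12 + 2 + 1   # start_cell_num + 2 stems + 1 reduction = 15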
  def restore_from_classification_checkpoint_fn(
      self,
      first_stage_feature_extractor_scope,
      second_stage_feature_extractor_scope):
    """Returns a map of variables to load from a foreign checkpoint.

    Note that this overrides the default implementation in
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for
    NASNet-A checkpoints.

    Args:
      first_stage_feature_extractor_scope: A scope name for the first stage
        feature extractor.
      second_stage_feature_extractor_scope: A scope name for the second stage
        feature extractor.

    Returns:
      A dict mapping variable names (to load from a checkpoint) to variables in
      the model graph.
    """
    # Note that the NAS checkpoint only contains the moving average version of
    # the variables so we need to generate an appropriate dictionary mapping.
    variables_to_restore = {}
    for variable in tf.global_variables():
      if variable.op.name.startswith(
          first_stage_feature_extractor_scope):
        var_name = variable.op.name.replace(
            first_stage_feature_extractor_scope + '/', '')
        var_name += '/ExponentialMovingAverage'
        variables_to_restore[var_name] = variable
      if variable.op.name.startswith(
          second_stage_feature_extractor_scope):
        var_name = variable.op.name.replace(
            second_stage_feature_extractor_scope + '/', '')
        var_name += '/ExponentialMovingAverage'
        variables_to_restore[var_name] = variable
    return variables_to_restore
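The checkpoint mapping above is pure string manipulation, so it can be checked without building a graph. A minimal sketch with hypothetical scope and variable names:

# Hypothetical variable names as they might appear in a detection graph.
graph_var_names = [
    'FirstStageFeatureExtractor/cell_0/conv/weights',
    'SecondStageFeatureExtractor/cell_12/conv/weights',
]
variables_to_restore = {}
for name in graph_var_names:
  for scope in ('FirstStageFeatureExtractor', 'SecondStageFeatureExtractor'):
    if name.startswith(scope):
      ckpt_name = name.replace(scope + '/', '') + '/ExponentialMovingAverage'
      variables_to_restore[ckpt_name] = name

# Keys now match the moving-average names stored in the NAS checkpoint, e.g.
# 'cell_0/conv/weights/ExponentialMovingAverage'.
assert 'cell_0/conv/weights/ExponentialMovingAverage' in variables_to_restore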
research/object_detection/models/faster_rcnn_nas_feature_extractor_test.py
0 → 100644
View file @
74a03640
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for models.faster_rcnn_nas_feature_extractor."""
import tensorflow as tf

from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas


class FasterRcnnNASFeatureExtractorTest(tf.test.TestCase):

  def _build_feature_extractor(self, first_stage_features_stride):
    return frcnn_nas.FasterRCNNNASFeatureExtractor(
        is_training=False,
        first_stage_features_stride=first_stage_features_stride,
        batch_norm_trainable=False,
        reuse_weights=None,
        weight_decay=0.0)

  def test_extract_proposal_features_returns_expected_size(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [1, 299, 299, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [1, 19, 19, 4032])

  def test_extract_proposal_features_input_size_224(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [1, 224, 224, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [1, 14, 14, 4032])

  def test_extract_proposal_features_input_size_112(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [1, 112, 112, 3], maxval=255, dtype=tf.float32)
    rpn_feature_map = feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope='TestScope')
    features_shape = tf.shape(rpn_feature_map)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [1, 7, 7, 4032])

  def test_extract_proposal_features_dies_on_invalid_stride(self):
    with self.assertRaises(ValueError):
      self._build_feature_extractor(first_stage_features_stride=99)

  def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    preprocessed_inputs = tf.random_uniform(
        [224, 224, 3], maxval=255, dtype=tf.float32)
    with self.assertRaises(ValueError):
      feature_extractor.extract_proposal_features(
          preprocessed_inputs, scope='TestScope')

  def test_extract_box_classifier_features_returns_expected_size(self):
    feature_extractor = self._build_feature_extractor(
        first_stage_features_stride=16)
    proposal_feature_maps = tf.random_uniform(
        [2, 17, 17, 1088], maxval=255, dtype=tf.float32)
    proposal_classifier_features = (
        feature_extractor.extract_box_classifier_features(
            proposal_feature_maps, scope='TestScope'))
    features_shape = tf.shape(proposal_classifier_features)

    init_op = tf.global_variables_initializer()
    with self.test_session() as sess:
      sess.run(init_op)
      features_shape_out = sess.run(features_shape)
      self.assertAllEqual(features_shape_out, [2, 9, 9, 4032])


if __name__ == '__main__':
  tf.test.main()
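The expected spatial sizes in the proposal-feature tests follow from the stride-16 feature stride with SAME padding: each dimension shrinks to the ceiling of input_size / 16. A quick check in plain Python:

import math

def rpn_spatial_size(input_size, stride=16):
  # SAME padding at total stride 16 gives ceil(size / stride).
  return int(math.ceil(input_size / float(stride)))

assert rpn_spatial_size(299) == 19
assert rpn_spatial_size(224) == 14
assert rpn_spatial_size(112) == 7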
research/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py
View file @
74a03640
...
...
@@ -42,6 +42,7 @@ class FasterRCNNResnetV1FeatureExtractor(
                resnet_model,
                is_training,
                first_stage_features_stride,
+               batch_norm_trainable=False,
                reuse_weights=None,
                weight_decay=0.0):
   """Constructor.
...
@@ -51,6 +52,7 @@ class FasterRCNNResnetV1FeatureExtractor(
       resnet_model: Definition of the Resnet V1 model.
       is_training: See base class.
       first_stage_features_stride: See base class.
+      batch_norm_trainable: See base class.
       reuse_weights: See base class.
       weight_decay: See base class.
...
@@ -62,7 +64,8 @@ class FasterRCNNResnetV1FeatureExtractor(
     self._architecture = architecture
     self._resnet_model = resnet_model
     super(FasterRCNNResnetV1FeatureExtractor, self).__init__(
-        is_training, first_stage_features_stride, reuse_weights, weight_decay)
+        is_training, first_stage_features_stride, batch_norm_trainable,
+        reuse_weights, weight_decay)

   def preprocess(self, resized_inputs):
     """Faster R-CNN Resnet V1 preprocessing.
...
@@ -119,7 +122,7 @@ class FasterRCNNResnetV1FeatureExtractor(
           _, activations = self._resnet_model(
               preprocessed_inputs,
               num_classes=None,
-              is_training=False,
+              is_training=self._train_batch_norm,
               global_pool=False,
               output_stride=self._first_stage_features_stride,
               spatial_squeeze=False,
...
@@ -148,7 +151,8 @@ class FasterRCNNResnetV1FeatureExtractor(
               batch_norm_epsilon=1e-5,
               batch_norm_scale=True,
               weight_decay=self._weight_decay)):
-        with slim.arg_scope([slim.batch_norm], is_training=False):
+        with slim.arg_scope([slim.batch_norm],
+                            is_training=self._train_batch_norm):
           blocks = [
               resnet_utils.Block('block4', resnet_v1.bottleneck, [{
                   'depth': 2048,
...
@@ -167,6 +171,7 @@ class FasterRCNNResnet50FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
   def __init__(self,
                is_training,
                first_stage_features_stride,
+               batch_norm_trainable=False,
                reuse_weights=None,
                weight_decay=0.0):
     """Constructor.
...
@@ -174,6 +179,7 @@ class FasterRCNNResnet50FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
     Args:
       is_training: See base class.
       first_stage_features_stride: See base class.
+      batch_norm_trainable: See base class.
       reuse_weights: See base class.
       weight_decay: See base class.
...
@@ -183,7 +189,8 @@ class FasterRCNNResnet50FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
     """
     super(FasterRCNNResnet50FeatureExtractor, self).__init__(
         'resnet_v1_50', resnet_v1.resnet_v1_50, is_training,
-        first_stage_features_stride, reuse_weights, weight_decay)
+        first_stage_features_stride, batch_norm_trainable,
+        reuse_weights, weight_decay)


 class FasterRCNNResnet101FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
...
@@ -192,6 +199,7 @@ class FasterRCNNResnet101FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
   def __init__(self,
                is_training,
                first_stage_features_stride,
+               batch_norm_trainable=False,
                reuse_weights=None,
                weight_decay=0.0):
     """Constructor.
...
@@ -199,6 +207,7 @@ class FasterRCNNResnet101FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
     Args:
       is_training: See base class.
       first_stage_features_stride: See base class.
+      batch_norm_trainable: See base class.
       reuse_weights: See base class.
       weight_decay: See base class.
...
@@ -208,7 +217,8 @@ class FasterRCNNResnet101FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
     """
     super(FasterRCNNResnet101FeatureExtractor, self).__init__(
         'resnet_v1_101', resnet_v1.resnet_v1_101, is_training,
-        first_stage_features_stride, reuse_weights, weight_decay)
+        first_stage_features_stride, batch_norm_trainable,
+        reuse_weights, weight_decay)


 class FasterRCNNResnet152FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
...
@@ -217,6 +227,7 @@ class FasterRCNNResnet152FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
   def __init__(self,
                is_training,
                first_stage_features_stride,
+               batch_norm_trainable=False,
                reuse_weights=None,
                weight_decay=0.0):
     """Constructor.
...
@@ -224,6 +235,7 @@ class FasterRCNNResnet152FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
     Args:
       is_training: See base class.
       first_stage_features_stride: See base class.
+      batch_norm_trainable: See base class.
       reuse_weights: See base class.
       weight_decay: See base class.
...
@@ -233,4 +245,5 @@ class FasterRCNNResnet152FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
     """
     super(FasterRCNNResnet152FeatureExtractor, self).__init__(
         'resnet_v1_152', resnet_v1.resnet_v1_152, is_training,
-        first_stage_features_stride, reuse_weights, weight_decay)
+        first_stage_features_stride, batch_norm_trainable,
+        reuse_weights, weight_decay)
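With batch_norm_trainable plumbed through, callers can freeze batch norm statistics when fine-tuning with small batches. A minimal construction sketch (argument values here are illustrative, not prescribed by the commit):

from object_detection.models import faster_rcnn_resnet_v1_feature_extractor \
    as frcnn_resnet

feature_extractor = frcnn_resnet.FasterRCNNResnet101FeatureExtractor(
    is_training=True,
    first_stage_features_stride=16,
    batch_norm_trainable=False,  # keep pretrained moving averages fixed
    reuse_weights=None,
    weight_decay=0.0)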
research/object_detection/models/faster_rcnn_resnet_v1_feature_extractor_test.py
View file @
74a03640
...
...
@@ -37,6 +37,7 @@ class FasterRcnnResnetV1FeatureExtractorTest(tf.test.TestCase):
     return feature_extractor_map[architecture](
         is_training=False,
         first_stage_features_stride=first_stage_features_stride,
+        batch_norm_trainable=False,
         reuse_weights=None,
         weight_decay=0.0)
...
...
research/object_detection/models/feature_map_generators.py
View file @
74a03640
...
...
@@ -25,7 +25,6 @@ of final feature maps.
 """
 import collections
 import tensorflow as tf
-from object_detection.utils import ops

 slim = tf.contrib.slim
...
@@ -59,12 +58,13 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
   based on the spatial shape and depth configuration. Note that the current
   implementation only supports generating new layers using convolution of
   stride 2 resulting in a spatial resolution reduction by a factor of 2.
+  By default convolution kernel size is set to 3, and it can be customized
+  by the caller.

   An example of the configuration for Inception V3:
   {
     'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
-    'layer_depth': [-1, -1, -1, 512, 256, 128],
-    'anchor_strides': [16, 32, 64, -1, -1, -1]
+    'layer_depth': [-1, -1, -1, 512, 256, 128]
   }
...
@@ -72,14 +72,12 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
       layouts in the following format (Inception V2/V3 respectively):
       {
         'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''],
-        'layer_depth': [-1, -1, -1, 512, 256, 128],
-        'anchor_strides': [16, 32, 64, -1, -1, -1]
+        'layer_depth': [-1, -1, -1, 512, 256, 128]
       }
       or
       {
         'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', '', ''],
-        'layer_depth': [-1, -1, -1, 512, 256, 128],
-        'anchor_strides': [16, 32, 64, -1, -1, -1]
+        'layer_depth': [-1, -1, -1, 512, 256, 128]
       }
       If 'from_layer' is specified, the specified feature map is directly used
       as a box predictor layer, and the layer_depth is directly inferred from
       the
...
@@ -90,14 +88,11 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
       Note that the current implementation only supports generating new layers
       using convolutions of stride 2 (resulting in a spatial resolution
       reduction by a factor of 2), and will be extended to a more flexible
-      design. Finally, the optional 'anchor_strides' can be used to specify
-      the anchor stride at each layer where 'from_layer' is specified. Our
-      convention is to set 'anchor_strides' to -1 at the positions where
-      'from_layer' is an empty string, and anchor strides at these layers will
-      be inferred from the previous layer's anchor strides and the current
-      layer's stride length. In the case where 'anchor_strides' is not
-      specified, the anchor strides will default to the image width and height
-      divided by the number of anchors.
+      design. Convolution kernel size is set to 3 by default, and can be
+      customized by the 'conv_kernel_size' parameter (similarly,
+      'conv_kernel_size' should be set to -1 if 'from_layer' is specified).
+      The created convolution operation will be a normal 2D convolution by
+      default, and a depthwise convolution followed by 1x1 convolution if
+      'use_depthwise' is set to True.
     depth_multiplier: Depth multiplier for convolutional layers.
     min_depth: Minimum depth for convolutional layers.
     insert_1x1_conv: A boolean indicating whether an additional 1x1 convolution
...
@@ -120,14 +115,14 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
   feature_map_keys = []
   feature_maps = []
   base_from_layer = ''
-  feature_map_strides = None
   use_depthwise = False
-  if 'anchor_strides' in feature_map_layout:
-    feature_map_strides = (feature_map_layout['anchor_strides'])
   if 'use_depthwise' in feature_map_layout:
     use_depthwise = feature_map_layout['use_depthwise']
-  for index, (from_layer, layer_depth) in enumerate(zip(
-      feature_map_layout['from_layer'], feature_map_layout['layer_depth'])):
+  for index, from_layer in enumerate(feature_map_layout['from_layer']):
+    layer_depth = feature_map_layout['layer_depth'][index]
+    conv_kernel_size = 3
+    if 'conv_kernel_size' in feature_map_layout:
+      conv_kernel_size = feature_map_layout['conv_kernel_size'][index]
     if from_layer:
       feature_map = image_features[from_layer]
       base_from_layer = from_layer
...
@@ -145,12 +140,13 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
           stride=1,
           scope=layer_name)
       stride = 2
-      layer_name = '{}_2_Conv2d_{}_3x3_s2_{}'.format(
-          base_from_layer, index, depth_fn(layer_depth))
+      layer_name = '{}_2_Conv2d_{}_{}x{}_s2_{}'.format(
+          base_from_layer, index, conv_kernel_size, conv_kernel_size,
+          depth_fn(layer_depth))
       if use_depthwise:
         feature_map = slim.separable_conv2d(
-            ops.pad_to_multiple(intermediate_layer, stride),
-            None, [3, 3],
+            intermediate_layer,
+            None, [conv_kernel_size, conv_kernel_size],
             depth_multiplier=1,
             padding='SAME',
             stride=stride,
...
@@ -163,16 +159,11 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
             scope=layer_name)
       else:
         feature_map = slim.conv2d(
-            ops.pad_to_multiple(intermediate_layer, stride),
-            depth_fn(layer_depth), [3, 3],
+            intermediate_layer,
+            depth_fn(layer_depth), [conv_kernel_size, conv_kernel_size],
             padding='SAME',
             stride=stride,
             scope=layer_name)
-      if (index > 0 and feature_map_strides and
-          feature_map_strides[index - 1] > 0):
-        feature_map_strides[index] = (
-            stride * feature_map_strides[index - 1])
     feature_map_keys.append(layer_name)
     feature_maps.append(feature_map)
   return collections.OrderedDict(
...
...
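The new naming scheme bakes the kernel size into each generated layer's name. A quick illustration in plain Python, with values chosen to match the embedded SSD MobileNet V1 test below:

base_from_layer = 'Conv2d_13_pointwise'
index, conv_kernel_size, depth = 4, 2, 256

layer_name = '{}_2_Conv2d_{}_{}x{}_s2_{}'.format(
    base_from_layer, index, conv_kernel_size, conv_kernel_size, depth)
assert layer_name == 'Conv2d_13_pointwise_2_Conv2d_4_2x2_s2_256'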
research/object_detection/models/feature_map_generators_test.py
View file @
74a03640
...
...
@@ -33,8 +33,14 @@ INCEPTION_V3_LAYOUT = {
     'aspect_ratios': [1.0, 2.0, 1.0 / 2, 3.0, 1.0 / 3]
 }

+EMBEDDED_SSD_MOBILENET_V1_LAYOUT = {
+    'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', ''],
+    'layer_depth': [-1, -1, 512, 256, 256],
+    'conv_kernel_size': [-1, -1, 3, 3, 2],
+}
+
-# TODO: add tests with different anchor strides.
+# TODO(rathodv): add tests with different anchor strides.


 class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):

   def test_get_expected_feature_map_shapes_with_inception_v2(self):
...
@@ -96,6 +102,37 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
       out_feature_map_shapes = dict(
           (key, value.shape) for key, value in out_feature_maps.items())
       self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)

+  def test_get_expected_feature_map_shapes_with_embedded_ssd_mobilenet_v1(
+      self):
+    image_features = {
+        'Conv2d_11_pointwise': tf.random_uniform([4, 16, 16, 512],
+                                                 dtype=tf.float32),
+        'Conv2d_13_pointwise': tf.random_uniform([4, 8, 8, 1024],
+                                                 dtype=tf.float32),
+    }
+    feature_maps = feature_map_generators.multi_resolution_feature_maps(
+        feature_map_layout=EMBEDDED_SSD_MOBILENET_V1_LAYOUT,
+        depth_multiplier=1,
+        min_depth=32,
+        insert_1x1_conv=True,
+        image_features=image_features)
+
+    expected_feature_map_shapes = {
+        'Conv2d_11_pointwise': (4, 16, 16, 512),
+        'Conv2d_13_pointwise': (4, 8, 8, 1024),
+        'Conv2d_13_pointwise_2_Conv2d_2_3x3_s2_512': (4, 4, 4, 512),
+        'Conv2d_13_pointwise_2_Conv2d_3_3x3_s2_256': (4, 2, 2, 256),
+        'Conv2d_13_pointwise_2_Conv2d_4_2x2_s2_256': (4, 1, 1, 256)}
+
+    init_op = tf.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      out_feature_maps = sess.run(feature_maps)
+      out_feature_map_shapes = dict(
+          (key, value.shape) for key, value in out_feature_maps.items())
+      self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)


 class GetDepthFunctionTest(tf.test.TestCase):
...
...
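The expected shapes in the new embedded-SSD test follow from three stride-2 convolutions applied to the 8x8 'Conv2d_13_pointwise' map; with SAME padding each step halves the spatial size, rounding up:

import math

size = 8  # spatial size of 'Conv2d_13_pointwise'
sizes = []
for _ in range(3):  # the three generated stride-2 layers
  size = int(math.ceil(size / 2.0))
  sizes.append(size)
assert sizes == [4, 2, 1]  # the 4x4, 2x2 and 1x1 maps in the test above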
research/object_detection/models/ssd_feature_extractor_test.py
View file @
74a03640
...
...
@@ -46,34 +46,32 @@ class SsdFeatureExtractorTestBase(object):
       self.assertAllEqual(shape_out, exp_shape_out)

   @abstractmethod
-  def _create_feature_extractor(self, depth_multiplier):
+  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple):
     """Constructs a new feature extractor.

     Args:
       depth_multiplier: float depth multiplier for feature extractor
+      pad_to_multiple: the nearest multiple to zero pad the input height and
+        width dimensions to.
     Returns:
       an ssd_meta_arch.SSDFeatureExtractor object.
     """
     pass

   def check_extract_features_returns_correct_shape(
-      self, image_height, image_width, depth_multiplier,
+      self, image_height, image_width, depth_multiplier, pad_to_multiple,
       expected_feature_map_shapes_out):
-    feature_extractor = self._create_feature_extractor(depth_multiplier)
+    feature_extractor = self._create_feature_extractor(depth_multiplier,
+                                                       pad_to_multiple)
     preprocessed_inputs = tf.random_uniform(
         [4, image_height, image_width, 3], dtype=tf.float32)
     self._validate_features_shape(
         feature_extractor, preprocessed_inputs,
         expected_feature_map_shapes_out)

   def check_extract_features_raises_error_with_invalid_image_size(
-      self, image_height, image_width, depth_multiplier):
-    feature_extractor = self._create_feature_extractor(depth_multiplier)
+      self, image_height, image_width, depth_multiplier, pad_to_multiple):
+    feature_extractor = self._create_feature_extractor(depth_multiplier,
+                                                       pad_to_multiple)
     preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
     feature_maps = feature_extractor.extract_features(preprocessed_inputs)
     test_preprocessed_image = np.random.rand(4, image_height, image_width, 3)
...
@@ -83,12 +81,12 @@ class SsdFeatureExtractorTestBase(object):
       sess.run(feature_maps,
                feed_dict={preprocessed_inputs: test_preprocessed_image})

-  def check_feature_extractor_variables_under_scope(
-      self, depth_multiplier, scope_name):
+  def check_feature_extractor_variables_under_scope(
+      self, depth_multiplier, pad_to_multiple, scope_name):
     g = tf.Graph()
     with g.as_default():
-      feature_extractor = self._create_feature_extractor(depth_multiplier)
+      feature_extractor = self._create_feature_extractor(depth_multiplier,
+                                                         pad_to_multiple)
       preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
       feature_extractor.extract_features(preprocessed_inputs)
       variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
...
...
research/object_detection/models/ssd_inception_v2_feature_extractor.py
View file @
74a03640
...
...
@@ -18,6 +18,7 @@ import tensorflow as tf
 from object_detection.meta_architectures import ssd_meta_arch
 from object_detection.models import feature_map_generators
+from object_detection.utils import ops
 from nets import inception_v2

 slim = tf.contrib.slim
...
@@ -27,20 +28,31 @@ class SSDInceptionV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
   """SSD Feature Extractor using InceptionV2 features."""

   def __init__(self,
+               is_training,
                depth_multiplier,
                min_depth,
+               pad_to_multiple,
                conv_hyperparams,
+               batch_norm_trainable=True,
                reuse_weights=None):
     """InceptionV2 Feature Extractor for SSD Models.

     Args:
+      is_training: whether the network is in training mode.
       depth_multiplier: float depth multiplier for feature extractor.
       min_depth: minimum feature extractor depth.
+      pad_to_multiple: the nearest multiple to zero pad the input height and
+        width dimensions to.
       conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
+      batch_norm_trainable: Whether to update batch norm parameters during
+        training or not. When training with a small batch size
+        (e.g. 1), it is desirable to disable batch norm update and use
+        pretrained batch norm params.
       reuse_weights: Whether to reuse variables. Default is None.
     """
     super(SSDInceptionV2FeatureExtractor, self).__init__(
-        depth_multiplier, min_depth, conv_hyperparams, reuse_weights)
+        is_training, depth_multiplier, min_depth, pad_to_multiple,
+        conv_hyperparams, batch_norm_trainable, reuse_weights)

   def preprocess(self, resized_inputs):
     """SSD preprocessing.
...
@@ -84,7 +96,7 @@ class SSDInceptionV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       with tf.variable_scope('InceptionV2',
                              reuse=self._reuse_weights) as scope:
         _, image_features = inception_v2.inception_v2_base(
-            preprocessed_inputs,
+            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
             final_endpoint='Mixed_5c',
             min_depth=self._min_depth,
             depth_multiplier=self._depth_multiplier,
...
...
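For intuition, ops.pad_to_multiple zero-pads the height and width of an NHWC batch up to the next multiple of the given value before the base network runs. A minimal sketch of equivalent padding logic for statically-shaped inputs (my own illustration under that assumption, not the library's implementation, which also handles dynamic shapes):

import math
import tensorflow as tf  # assumes a TF 1.x environment

def pad_to_multiple_sketch(tensor, multiple):
  """Zero-pads height/width of a statically-shaped NHWC tensor."""
  _, height, width, _ = tensor.get_shape().as_list()
  padded_h = int(math.ceil(height / float(multiple))) * multiple
  padded_w = int(math.ceil(width / float(multiple))) * multiple
  return tf.pad(tensor, [[0, 0],
                         [0, padded_h - height],
                         [0, padded_w - width],
                         [0, 0]])

images = tf.zeros([4, 299, 299, 3])
padded = pad_to_multiple_sketch(images, 32)
print(padded.get_shape().as_list())  # [4, 320, 320, 3]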
research/object_detection/models/ssd_inception_v2_feature_extractor_test.py
View file @
74a03640
...
...
@@ -22,73 +22,101 @@ from object_detection.models import ssd_inception_v2_feature_extractor

 class SsdInceptionV2FeatureExtractorTest(
     ssd_feature_extractor_test.SsdFeatureExtractorTestBase, tf.test.TestCase):

-  def _create_feature_extractor(self, depth_multiplier):
+  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
+                                is_training=True, batch_norm_trainable=True):
     """Constructs a SsdInceptionV2FeatureExtractor.

     Args:
       depth_multiplier: float depth multiplier for feature extractor
+      pad_to_multiple: the nearest multiple to zero pad the input height and
+        width dimensions to.
+      is_training: whether the network is in training mode.
+      batch_norm_trainable: Whether to update batch norm parameters during
+        training or not

     Returns:
       an ssd_inception_v2_feature_extractor.SsdInceptionV2FeatureExtractor.
     """
     min_depth = 32
     conv_hyperparams = {}
     return ssd_inception_v2_feature_extractor.SSDInceptionV2FeatureExtractor(
-        depth_multiplier, min_depth, conv_hyperparams)
+        is_training, depth_multiplier, min_depth, pad_to_multiple,
+        conv_hyperparams, batch_norm_trainable)

   def test_extract_features_returns_correct_shapes_128(self):
     image_height = 128
     image_width = 128
     depth_multiplier = 1.0
+    pad_to_multiple = 1
     expected_feature_map_shape = [(4, 8, 8, 576), (4, 4, 4, 1024),
                                   (4, 2, 2, 512), (4, 1, 1, 256),
                                   (4, 1, 1, 256), (4, 1, 1, 128)]
     self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier,
-        expected_feature_map_shape)
+        image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)

   def test_extract_features_returns_correct_shapes_299(self):
     image_height = 299
     image_width = 299
     depth_multiplier = 1.0
+    pad_to_multiple = 1
     expected_feature_map_shape = [(4, 19, 19, 576), (4, 10, 10, 1024),
                                   (4, 5, 5, 512), (4, 3, 3, 256),
                                   (4, 2, 2, 256), (4, 1, 1, 128)]
     self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier,
-        expected_feature_map_shape)
+        image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)

   def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
     image_height = 299
     image_width = 299
     depth_multiplier = 0.5**12
+    pad_to_multiple = 1
     expected_feature_map_shape = [(4, 19, 19, 128), (4, 10, 10, 128),
                                   (4, 5, 5, 32), (4, 3, 3, 32),
                                   (4, 2, 2, 32), (4, 1, 1, 32)]
     self.check_extract_features_returns_correct_shape(
-        image_height, image_width, depth_multiplier,
-        expected_feature_map_shape)
+        image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)

+  def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
+    image_height = 299
+    image_width = 299
+    depth_multiplier = 1.0
+    pad_to_multiple = 32
+    expected_feature_map_shape = [(4, 20, 20, 576), (4, 10, 10, 1024),
+                                  (4, 5, 5, 512), (4, 3, 3, 256),
+                                  (4, 2, 2, 256), (4, 1, 1, 128)]
+    self.check_extract_features_returns_correct_shape(
+        image_height, image_width, depth_multiplier, pad_to_multiple,
+        expected_feature_map_shape)

   def test_extract_features_raises_error_with_invalid_image_size(self):
     image_height = 32
     image_width = 32
     depth_multiplier = 1.0
+    pad_to_multiple = 1
     self.check_extract_features_raises_error_with_invalid_image_size(
-        image_height, image_width, depth_multiplier)
+        image_height, image_width, depth_multiplier, pad_to_multiple)

   def test_preprocess_returns_correct_value_range(self):
     image_height = 128
     image_width = 128
     depth_multiplier = 1
+    pad_to_multiple = 1
     test_image = np.random.rand(4, image_height, image_width, 3)
-    feature_extractor = self._create_feature_extractor(depth_multiplier)
+    feature_extractor = self._create_feature_extractor(depth_multiplier,
+                                                       pad_to_multiple)
     preprocessed_image = feature_extractor.preprocess(test_image)
     self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))

   def test_variables_only_created_in_scope(self):
     depth_multiplier = 1
+    pad_to_multiple = 1
     scope_name = 'InceptionV2'
-    self.check_feature_extractor_variables_under_scope(
-        depth_multiplier, scope_name)
+    self.check_feature_extractor_variables_under_scope(
+        depth_multiplier, pad_to_multiple, scope_name)


 if __name__ == '__main__':
...
...
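In the new pad_to_multiple test, only the first feature map changes shape: 299 is padded up to 320, and 320 / 16 = 20, so the first map becomes (4, 20, 20, 576) instead of (4, 19, 19, 576) while the deeper maps are unaffected. In plain Python:

import math

image_size, multiple, stride = 299, 32, 16
padded = int(math.ceil(image_size / float(multiple))) * multiple
assert padded == 320
assert padded // stride == 20  # first feature map becomes 20x20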
research/object_detection/models/ssd_inception_v3_feature_extractor.py
0 → 100644
View file @
74a03640
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for InceptionV3 features."""
import tensorflow as tf

from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import ops
from nets import inception_v3

slim = tf.contrib.slim


class SSDInceptionV3FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  """SSD Feature Extractor using InceptionV3 features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               batch_norm_trainable=True,
               reuse_weights=None):
    """InceptionV3 Feature Extractor for SSD Models.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
      batch_norm_trainable: Whether to update batch norm parameters during
        training or not. When training with a small batch size
        (e.g. 1), it is desirable to disable batch norm update and use
        pretrained batch norm params.
      reuse_weights: Whether to reuse variables. Default is None.
    """
    super(SSDInceptionV3FeatureExtractor, self).__init__(
        is_training, depth_multiplier, min_depth, pad_to_multiple,
        conv_hyperparams, batch_norm_trainable, reuse_weights)

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    return (2.0 / 255.0) * resized_inputs - 1.0

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                       tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    feature_map_layout = {
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128],
    }

    with tf.control_dependencies([shape_assert]):
      with slim.arg_scope(self._conv_hyperparams):
        with tf.variable_scope('InceptionV3',
                               reuse=self._reuse_weights) as scope:
          _, image_features = inception_v3.inception_v3_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Mixed_7c',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=scope)
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return feature_maps.values()
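A minimal construction sketch for the new extractor (the arg_scope here is a stand-in for the hyperparameters normally built from the model config, and all values are illustrative):

import tensorflow as tf  # assumes a TF 1.x environment

slim = tf.contrib.slim

# Stand-in conv hyperparameters; real configs come from the model proto.
with slim.arg_scope([slim.conv2d], padding='SAME') as conv_hyperparams:
  pass

feature_extractor = SSDInceptionV3FeatureExtractor(
    is_training=False,
    depth_multiplier=1.0,
    min_depth=32,
    pad_to_multiple=1,
    conv_hyperparams=conv_hyperparams,
    batch_norm_trainable=False)

images = tf.random_uniform([4, 299, 299, 3], maxval=255, dtype=tf.float32)
feature_maps = feature_extractor.extract_features(
    feature_extractor.preprocess(images))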