ModelZoo / ResNet50_tensorflow / Commits

Commit a703fc0c authored Jun 05, 2018 by pkulzc

    Add model_builder and feature_map_extractor back.

Parent: 9fce9c64

Showing 3 changed files with 1658 additions and 0 deletions (+1658, -0):
  research/object_detection/builders/model_builder.py          +377   -0
  research/object_detection/builders/model_builder_test.py     +1056  -0
  research/object_detection/models/feature_map_generators.py   +225   -0
research/object_detection/builders/model_builder.py (new file, mode 100644)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A function to build a DetectionModel from configuration."""
from object_detection.builders import anchor_generator_builder
from object_detection.builders import box_coder_builder
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
from object_detection.builders import image_resizer_builder
from object_detection.builders import losses_builder
from object_detection.builders import matcher_builder
from object_detection.builders import post_processing_builder
from object_detection.builders import region_similarity_calculator_builder as sim_calc
from object_detection.core import box_predictor
from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.meta_architectures import rfcn_meta_arch
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_pnas_feature_extractor as frcnn_pnas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_mobilenet_v2_feature_extractor import SSDMobileNetV2FeatureExtractor
from object_detection.protos import model_pb2

# A map of names to SSD feature extractors.
SSD_FEATURE_EXTRACTOR_CLASS_MAP = {
    'ssd_inception_v2': SSDInceptionV2FeatureExtractor,
    'ssd_inception_v3': SSDInceptionV3FeatureExtractor,
    'ssd_mobilenet_v1': SSDMobileNetV1FeatureExtractor,
    'ssd_mobilenet_v2': SSDMobileNetV2FeatureExtractor,
    'ssd_resnet50_v1_fpn': ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor,
    'ssd_resnet101_v1_fpn': ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor,
    'ssd_resnet152_v1_fpn': ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor,
    'embedded_ssd_mobilenet_v1': EmbeddedSSDMobileNetV1FeatureExtractor,
}

# A map of names to Faster R-CNN feature extractors.
FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
    'faster_rcnn_nas': frcnn_nas.FasterRCNNNASFeatureExtractor,
    'faster_rcnn_pnas': frcnn_pnas.FasterRCNNPNASFeatureExtractor,
    'faster_rcnn_inception_resnet_v2':
        frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor,
    'faster_rcnn_inception_v2':
        frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor,
    'faster_rcnn_resnet50': frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor,
    'faster_rcnn_resnet101': frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor,
    'faster_rcnn_resnet152': frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor,
}

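# Hedged illustration (not from the diff) of how the registries above are
# used below: the 'type' string from the config selects a feature extractor
# class by name, e.g.
#   SSD_FEATURE_EXTRACTOR_CLASS_MAP['ssd_mobilenet_v2']
#       -> SSDMobileNetV2FeatureExtractor
#   FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP['faster_rcnn_resnet101']
#       -> frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor
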
def build(model_config, is_training, add_summaries=True,
          add_background_class=True):
  """Builds a DetectionModel based on the model config.

  Args:
    model_config: A model.proto object containing the config for the desired
      DetectionModel.
    is_training: True if this model is being built for training purposes.
    add_summaries: Whether to add tensorflow summaries in the model graph.
    add_background_class: Whether to add an implicit background class to
      one-hot encodings of groundtruth labels. Set to false if using
      groundtruth labels with an explicit background class or using multiclass
      scores instead of truth in the case of distillation. Ignored in the case
      of faster_rcnn.

  Returns:
    DetectionModel based on the config.

  Raises:
    ValueError: On invalid meta architecture or model.
  """
  if not isinstance(model_config, model_pb2.DetectionModel):
    raise ValueError('model_config not of type model_pb2.DetectionModel.')
  meta_architecture = model_config.WhichOneof('model')
  if meta_architecture == 'ssd':
    return _build_ssd_model(model_config.ssd, is_training, add_summaries,
                            add_background_class)
  if meta_architecture == 'faster_rcnn':
    return _build_faster_rcnn_model(model_config.faster_rcnn, is_training,
                                    add_summaries)
  raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))

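# Hedged usage sketch: a typical caller parses a text-format DetectionModel
# proto and hands it to build(). The helper name and config path below are
# hypothetical; text_format.Merge fills a message in place.
def _example_build_from_text_config(config_path='path/to/model.config'):
  from google.protobuf import text_format
  model_config = model_pb2.DetectionModel()
  with open(config_path) as f:
    text_format.Merge(f.read(), model_config)
  return build(model_config, is_training=True)
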
def _build_ssd_feature_extractor(feature_extractor_config, is_training,
                                 reuse_weights=None):
  """Builds a ssd_meta_arch.SSDFeatureExtractor based on config.

  Args:
    feature_extractor_config: A SSDFeatureExtractor proto config from
      ssd.proto.
    is_training: True if this feature extractor is being built for training.
    reuse_weights: if the feature extractor should reuse weights.

  Returns:
    ssd_meta_arch.SSDFeatureExtractor based on config.

  Raises:
    ValueError: On invalid feature extractor type.
  """
  feature_type = feature_extractor_config.type
  depth_multiplier = feature_extractor_config.depth_multiplier
  min_depth = feature_extractor_config.min_depth
  pad_to_multiple = feature_extractor_config.pad_to_multiple
  use_explicit_padding = feature_extractor_config.use_explicit_padding
  use_depthwise = feature_extractor_config.use_depthwise
  conv_hyperparams = hyperparams_builder.build(
      feature_extractor_config.conv_hyperparams, is_training)
  override_base_feature_extractor_hyperparams = (
      feature_extractor_config.override_base_feature_extractor_hyperparams)

  if feature_type not in SSD_FEATURE_EXTRACTOR_CLASS_MAP:
    raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))

  feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
  return feature_extractor_class(is_training, depth_multiplier, min_depth,
                                 pad_to_multiple, conv_hyperparams,
                                 reuse_weights, use_explicit_padding,
                                 use_depthwise,
                                 override_base_feature_extractor_hyperparams)

def _build_ssd_model(ssd_config, is_training, add_summaries,
                     add_background_class=True):
  """Builds an SSD detection model based on the model config.

  Args:
    ssd_config: A ssd.proto object containing the config for the desired
      SSDMetaArch.
    is_training: True if this model is being built for training purposes.
    add_summaries: Whether to add tf summaries in the model.
    add_background_class: Whether to add an implicit background class to
      one-hot encodings of groundtruth labels. Set to false if using
      groundtruth labels with an explicit background class or using multiclass
      scores instead of truth in the case of distillation.

  Returns:
    SSDMetaArch based on the config.

  Raises:
    ValueError: If ssd_config.type is not recognized (i.e. not registered in
      model_class_map).
  """
  num_classes = ssd_config.num_classes

  # Feature extractor
  feature_extractor = _build_ssd_feature_extractor(
      feature_extractor_config=ssd_config.feature_extractor,
      is_training=is_training)

  box_coder = box_coder_builder.build(ssd_config.box_coder)
  matcher = matcher_builder.build(ssd_config.matcher)
  region_similarity_calculator = sim_calc.build(
      ssd_config.similarity_calculator)
  encode_background_as_zeros = ssd_config.encode_background_as_zeros
  negative_class_weight = ssd_config.negative_class_weight
  ssd_box_predictor = box_predictor_builder.build(hyperparams_builder.build,
                                                  ssd_config.box_predictor,
                                                  is_training, num_classes)
  anchor_generator = anchor_generator_builder.build(
      ssd_config.anchor_generator)
  image_resizer_fn = image_resizer_builder.build(ssd_config.image_resizer)
  non_max_suppression_fn, score_conversion_fn = post_processing_builder.build(
      ssd_config.post_processing)
  (classification_loss, localization_loss, classification_weight,
   localization_weight, hard_example_miner,
   random_example_sampler) = losses_builder.build(ssd_config.loss)
  normalize_loss_by_num_matches = ssd_config.normalize_loss_by_num_matches
  normalize_loc_loss_by_codesize = ssd_config.normalize_loc_loss_by_codesize

  return ssd_meta_arch.SSDMetaArch(
      is_training,
      anchor_generator,
      ssd_box_predictor,
      box_coder,
      feature_extractor,
      matcher,
      region_similarity_calculator,
      encode_background_as_zeros,
      negative_class_weight,
      image_resizer_fn,
      non_max_suppression_fn,
      score_conversion_fn,
      classification_loss,
      localization_loss,
      classification_weight,
      localization_weight,
      normalize_loss_by_num_matches,
      hard_example_miner,
      add_summaries=add_summaries,
      normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize,
      freeze_batchnorm=ssd_config.freeze_batchnorm,
      inplace_batchnorm_update=ssd_config.inplace_batchnorm_update,
      add_background_class=add_background_class,
      random_example_sampler=random_example_sampler)

def _build_faster_rcnn_feature_extractor(
    feature_extractor_config, is_training, reuse_weights=None,
    inplace_batchnorm_update=False):
  """Builds a faster_rcnn_meta_arch.FasterRCNNFeatureExtractor based on config.

  Args:
    feature_extractor_config: A FasterRcnnFeatureExtractor proto config from
      faster_rcnn.proto.
    is_training: True if this feature extractor is being built for training.
    reuse_weights: if the feature extractor should reuse weights.
    inplace_batchnorm_update: Whether to update batch_norm inplace during
      training. This is required for batch norm to work correctly on TPUs.
      When this is false, user must add a control dependency on
      tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
      norm moving average parameters.

  Returns:
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor based on config.

  Raises:
    ValueError: On invalid feature extractor type.
  """
  if inplace_batchnorm_update:
    raise ValueError('inplace batchnorm updates not supported.')
  feature_type = feature_extractor_config.type
  first_stage_features_stride = (
      feature_extractor_config.first_stage_features_stride)
  batch_norm_trainable = feature_extractor_config.batch_norm_trainable

  if feature_type not in FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP:
    raise ValueError('Unknown Faster R-CNN feature_extractor: {}'.format(
        feature_type))
  feature_extractor_class = FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP[
      feature_type]
  return feature_extractor_class(
      is_training, first_stage_features_stride,
      batch_norm_trainable, reuse_weights)

def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
  """Builds a Faster R-CNN or R-FCN detection model based on the model config.

  Builds R-FCN model if the second_stage_box_predictor in the config is of
  type `rfcn_box_predictor`; otherwise builds a Faster R-CNN model.

  Args:
    frcnn_config: A faster_rcnn.proto object containing the config for the
      desired FasterRCNNMetaArch or RFCNMetaArch.
    is_training: True if this model is being built for training purposes.
    add_summaries: Whether to add tf summaries in the model.

  Returns:
    FasterRCNNMetaArch based on the config.

  Raises:
    ValueError: If frcnn_config.type is not recognized (i.e. not registered in
      model_class_map).
  """
  num_classes = frcnn_config.num_classes
  image_resizer_fn = image_resizer_builder.build(frcnn_config.image_resizer)

  # Passed by keyword: the third positional parameter of
  # _build_faster_rcnn_feature_extractor is reuse_weights, not
  # inplace_batchnorm_update.
  feature_extractor = _build_faster_rcnn_feature_extractor(
      frcnn_config.feature_extractor, is_training,
      inplace_batchnorm_update=frcnn_config.inplace_batchnorm_update)

  number_of_stages = frcnn_config.number_of_stages
  first_stage_anchor_generator = anchor_generator_builder.build(
      frcnn_config.first_stage_anchor_generator)

  first_stage_atrous_rate = frcnn_config.first_stage_atrous_rate
  first_stage_box_predictor_arg_scope_fn = hyperparams_builder.build(
      frcnn_config.first_stage_box_predictor_conv_hyperparams, is_training)
  first_stage_box_predictor_kernel_size = (
      frcnn_config.first_stage_box_predictor_kernel_size)
  first_stage_box_predictor_depth = (
      frcnn_config.first_stage_box_predictor_depth)
  first_stage_minibatch_size = frcnn_config.first_stage_minibatch_size
  first_stage_positive_balance_fraction = (
      frcnn_config.first_stage_positive_balance_fraction)
  first_stage_nms_score_threshold = (
      frcnn_config.first_stage_nms_score_threshold)
  first_stage_nms_iou_threshold = frcnn_config.first_stage_nms_iou_threshold
  first_stage_max_proposals = frcnn_config.first_stage_max_proposals
  first_stage_loc_loss_weight = (
      frcnn_config.first_stage_localization_loss_weight)
  first_stage_obj_loss_weight = frcnn_config.first_stage_objectness_loss_weight

  initial_crop_size = frcnn_config.initial_crop_size
  maxpool_kernel_size = frcnn_config.maxpool_kernel_size
  maxpool_stride = frcnn_config.maxpool_stride

  second_stage_box_predictor = box_predictor_builder.build(
      hyperparams_builder.build,
      frcnn_config.second_stage_box_predictor,
      is_training=is_training,
      num_classes=num_classes)
  second_stage_batch_size = frcnn_config.second_stage_batch_size
  second_stage_balance_fraction = frcnn_config.second_stage_balance_fraction
  (second_stage_non_max_suppression_fn,
   second_stage_score_conversion_fn) = post_processing_builder.build(
       frcnn_config.second_stage_post_processing)
  second_stage_localization_loss_weight = (
      frcnn_config.second_stage_localization_loss_weight)
  second_stage_classification_loss = (
      losses_builder.build_faster_rcnn_classification_loss(
          frcnn_config.second_stage_classification_loss))
  second_stage_classification_loss_weight = (
      frcnn_config.second_stage_classification_loss_weight)
  second_stage_mask_prediction_loss_weight = (
      frcnn_config.second_stage_mask_prediction_loss_weight)

  hard_example_miner = None
  if frcnn_config.HasField('hard_example_miner'):
    hard_example_miner = losses_builder.build_hard_example_miner(
        frcnn_config.hard_example_miner,
        second_stage_classification_loss_weight,
        second_stage_localization_loss_weight)

  common_kwargs = {
      'is_training': is_training,
      'num_classes': num_classes,
      'image_resizer_fn': image_resizer_fn,
      'feature_extractor': feature_extractor,
      'number_of_stages': number_of_stages,
      'first_stage_anchor_generator': first_stage_anchor_generator,
      'first_stage_atrous_rate': first_stage_atrous_rate,
      'first_stage_box_predictor_arg_scope_fn':
          first_stage_box_predictor_arg_scope_fn,
      'first_stage_box_predictor_kernel_size':
          first_stage_box_predictor_kernel_size,
      'first_stage_box_predictor_depth': first_stage_box_predictor_depth,
      'first_stage_minibatch_size': first_stage_minibatch_size,
      'first_stage_positive_balance_fraction':
          first_stage_positive_balance_fraction,
      'first_stage_nms_score_threshold': first_stage_nms_score_threshold,
      'first_stage_nms_iou_threshold': first_stage_nms_iou_threshold,
      'first_stage_max_proposals': first_stage_max_proposals,
      'first_stage_localization_loss_weight': first_stage_loc_loss_weight,
      'first_stage_objectness_loss_weight': first_stage_obj_loss_weight,
      'second_stage_batch_size': second_stage_batch_size,
      'second_stage_balance_fraction': second_stage_balance_fraction,
      'second_stage_non_max_suppression_fn':
          second_stage_non_max_suppression_fn,
      'second_stage_score_conversion_fn': second_stage_score_conversion_fn,
      'second_stage_localization_loss_weight':
          second_stage_localization_loss_weight,
      'second_stage_classification_loss': second_stage_classification_loss,
      'second_stage_classification_loss_weight':
          second_stage_classification_loss_weight,
      'hard_example_miner': hard_example_miner,
      'add_summaries': add_summaries
  }

  if isinstance(second_stage_box_predictor, box_predictor.RfcnBoxPredictor):
    return rfcn_meta_arch.RFCNMetaArch(
        second_stage_rfcn_box_predictor=second_stage_box_predictor,
        **common_kwargs)
  else:
    return faster_rcnn_meta_arch.FasterRCNNMetaArch(
        initial_crop_size=initial_crop_size,
        maxpool_kernel_size=maxpool_kernel_size,
        maxpool_stride=maxpool_stride,
        second_stage_mask_rcnn_box_predictor=second_stage_box_predictor,
        second_stage_mask_prediction_loss_weight=(
            second_stage_mask_prediction_loss_weight),
        **common_kwargs)

research/object_detection/builders/model_builder_test.py (new file, mode 100644)

(Diff collapsed: +1056 lines not shown.)
research/object_detection/models/feature_map_generators.py (new file, mode 100644)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to generate a list of feature maps based on image features.
Provides several feature map generators that can be used to build object
detection feature extractors.
Object detection feature extractors usually are built by stacking two components
- A base feature extractor such as Inception V3 and a feature map generator.
Feature map generators build on the base feature extractors and produce a list
of final feature maps.
"""
import collections
import tensorflow as tf
from object_detection.utils import ops

slim = tf.contrib.slim


def get_depth_fn(depth_multiplier, min_depth):
  """Builds a callable to compute depth (output channels) of conv filters.

  Args:
    depth_multiplier: a multiplier for the nominal depth.
    min_depth: a lower bound on the depth of filters.

  Returns:
    A callable that takes in a nominal depth and returns the depth to use.
  """
  def multiply_depth(depth):
    new_depth = int(depth * depth_multiplier)
    return max(new_depth, min_depth)
  return multiply_depth

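# Hedged illustration of get_depth_fn: with depth_multiplier=0.5 and
# min_depth=16,
#   get_depth_fn(0.5, 16)(64) -> 32
#   get_depth_fn(0.5, 16)(24) -> 16  (int(24 * 0.5) = 12 is clamped up to
#                                     min_depth)
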
def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
                                  min_depth, insert_1x1_conv, image_features):
  """Generates multi resolution feature maps from input image features.

  Generates multi-scale feature maps for detection as in the SSD papers by
  Liu et al: https://arxiv.org/pdf/1512.02325v2.pdf, See Sec 2.1.

  More specifically, it performs the following two tasks:
  1) If a layer name is provided in the configuration, returns that layer as a
     feature map.
  2) If a layer name is left as an empty string, constructs a new feature map
     based on the spatial shape and depth configuration. Note that the current
     implementation only supports generating new layers using convolution of
     stride 2 resulting in a spatial resolution reduction by a factor of 2.
     By default convolution kernel size is set to 3, and it can be customized
     by the caller.

  An example of the configuration for Inception V3:
  {
    'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
    'layer_depth': [-1, -1, -1, 512, 256, 128]
  }

  Args:
    feature_map_layout: Dictionary of specifications for the feature map
      layouts in the following format (Inception V2/V3 respectively):
      {
        'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128]
      }
      or
      {
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128]
      }
      If 'from_layer' is specified, the specified feature map is directly used
      as a box predictor layer, and the layer_depth is directly inferred from
      the feature map (instead of using the provided 'layer_depth' parameter).
      In this case, our convention is to set 'layer_depth' to -1 for clarity.
      Otherwise, if 'from_layer' is an empty string, then the box predictor
      layer will be built from the previous layer using convolution
      operations. Note that the current implementation only supports
      generating new layers using convolutions of stride 2 (resulting in a
      spatial resolution reduction by a factor of 2), and will be extended to
      a more flexible design. Convolution kernel size is set to 3 by default,
      and can be customized by the 'conv_kernel_size' parameter (similarly,
      'conv_kernel_size' should be set to -1 if 'from_layer' is specified).
      The created convolution operation will be a normal 2D convolution by
      default, and a depthwise convolution followed by 1x1 convolution if
      'use_depthwise' is set to True.
    depth_multiplier: Depth multiplier for convolutional layers.
    min_depth: Minimum depth for convolutional layers.
    insert_1x1_conv: A boolean indicating whether an additional 1x1 convolution
      should be inserted before shrinking the feature map.
    image_features: A dictionary of handles to activation tensors from the
      base feature extractor.

  Returns:
    feature_maps: an OrderedDict mapping keys (feature map names) to
      tensors where each tensor has shape [batch, height_i, width_i, depth_i].

  Raises:
    ValueError: if the number of entries in 'from_layer' and 'layer_depth' do
      not match.
    ValueError: if the generated layer does not have the same resolution as
      specified.
  """
  depth_fn = get_depth_fn(depth_multiplier, min_depth)

  feature_map_keys = []
  feature_maps = []
  base_from_layer = ''
  use_explicit_padding = False
  if 'use_explicit_padding' in feature_map_layout:
    use_explicit_padding = feature_map_layout['use_explicit_padding']
  use_depthwise = False
  if 'use_depthwise' in feature_map_layout:
    use_depthwise = feature_map_layout['use_depthwise']
  for index, from_layer in enumerate(feature_map_layout['from_layer']):
    layer_depth = feature_map_layout['layer_depth'][index]
    conv_kernel_size = 3
    if 'conv_kernel_size' in feature_map_layout:
      conv_kernel_size = feature_map_layout['conv_kernel_size'][index]
    if from_layer:
      feature_map = image_features[from_layer]
      base_from_layer = from_layer
      feature_map_keys.append(from_layer)
    else:
      pre_layer = feature_maps[-1]
      intermediate_layer = pre_layer
      if insert_1x1_conv:
        layer_name = '{}_1_Conv2d_{}_1x1_{}'.format(
            base_from_layer, index, depth_fn(layer_depth / 2))
        intermediate_layer = slim.conv2d(
            pre_layer,
            depth_fn(layer_depth / 2), [1, 1],
            padding='SAME',
            stride=1,
            scope=layer_name)
      layer_name = '{}_2_Conv2d_{}_{}x{}_s2_{}'.format(
          base_from_layer, index, conv_kernel_size, conv_kernel_size,
          depth_fn(layer_depth))
      stride = 2
      padding = 'SAME'
      if use_explicit_padding:
        padding = 'VALID'
        intermediate_layer = ops.fixed_padding(
            intermediate_layer, conv_kernel_size)
      if use_depthwise:
        feature_map = slim.separable_conv2d(
            intermediate_layer,
            None, [conv_kernel_size, conv_kernel_size],
            depth_multiplier=1,
            padding=padding,
            stride=stride,
            scope=layer_name + '_depthwise')
        feature_map = slim.conv2d(
            feature_map,
            depth_fn(layer_depth), [1, 1],
            padding='SAME',
            stride=1,
            scope=layer_name)
      else:
        feature_map = slim.conv2d(
            intermediate_layer,
            depth_fn(layer_depth), [conv_kernel_size, conv_kernel_size],
            padding=padding,
            stride=stride,
            scope=layer_name)
      feature_map_keys.append(layer_name)
    feature_maps.append(feature_map)
  return collections.OrderedDict(
      [(x, y) for (x, y) in zip(feature_map_keys, feature_maps)])

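# Hedged usage sketch (illustrative endpoint names and shapes; assumes
# TF 1.x with tf.contrib.slim, inside a graph): two layers are taken from a
# base network and two extra stride-2 layers are generated on top of them.
def _example_multi_resolution_usage():
  feature_map_layout = {
      'from_layer': ['Mixed_4c', 'Mixed_5c', '', ''],
      'layer_depth': [-1, -1, 512, 256],
  }
  image_features = {
      'Mixed_4c': tf.placeholder(tf.float32, [1, 19, 19, 576]),
      'Mixed_5c': tf.placeholder(tf.float32, [1, 10, 10, 1024]),
  }
  # Returns an OrderedDict with four entries: the two backbone maps, then
  # 10x10 -> 5x5 -> 3x3 stride-2 maps of depth 512 and 256.
  return multi_resolution_feature_maps(
      feature_map_layout,
      depth_multiplier=1.0,
      min_depth=16,
      insert_1x1_conv=True,
      image_features=image_features)
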
def fpn_top_down_feature_maps(image_features, depth, scope=None):
  """Generates `top-down` feature maps for Feature Pyramid Networks.

  See https://arxiv.org/abs/1612.03144 for details.

  Args:
    image_features: list of tuples of (tensor_name, image_feature_tensor).
      Spatial resolutions of successive tensors must reduce exactly by a
      factor of 2.
    depth: depth of output feature maps.
    scope: A scope name to wrap this op under.

  Returns:
    feature_maps: an OrderedDict mapping keys (feature map names) to
      tensors where each tensor has shape [batch, height_i, width_i, depth_i].
  """
  with tf.name_scope(scope, 'top_down'):
    num_levels = len(image_features)
    output_feature_maps_list = []
    output_feature_map_keys = []
    with slim.arg_scope([slim.conv2d], padding='SAME', stride=1):
      top_down = slim.conv2d(
          image_features[-1][1],
          depth, [1, 1], activation_fn=None, normalizer_fn=None,
          scope='projection_%d' % num_levels)
      output_feature_maps_list.append(top_down)
      output_feature_map_keys.append(
          'top_down_%s' % image_features[-1][0])

      for level in reversed(range(num_levels - 1)):
        top_down = ops.nearest_neighbor_upsampling(top_down, 2)
        residual = slim.conv2d(
            image_features[level][1], depth, [1, 1],
            activation_fn=None, normalizer_fn=None,
            scope='projection_%d' % (level + 1))
        top_down += residual
        output_feature_maps_list.append(slim.conv2d(
            top_down,
            depth, [3, 3],
            scope='smoothing_%d' % (level + 1)))
        output_feature_map_keys.append(
            'top_down_%s' % image_features[level][0])
      # list() keeps this compatible with Python 3, where zip() returns an
      # iterator that reversed() cannot consume directly.
      return collections.OrderedDict(reversed(
          list(zip(output_feature_map_keys, output_feature_maps_list))))

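# Hedged usage sketch (illustrative level names and shapes): three backbone
# levels whose spatial resolutions halve exactly at each step, as the
# function requires.
def _example_fpn_top_down_usage():
  image_features = [
      ('block2', tf.placeholder(tf.float32, [1, 32, 32, 256])),
      ('block3', tf.placeholder(tf.float32, [1, 16, 16, 512])),
      ('block4', tf.placeholder(tf.float32, [1, 8, 8, 1024])),
  ]
  # Returns an OrderedDict keyed 'top_down_block2' through 'top_down_block4',
  # each map with depth 256.
  return fpn_top_down_feature_maps(image_features, depth=256)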