Unverified Commit fd7b6887 authored by Jonathan Huang's avatar Jonathan Huang Committed by GitHub
Browse files

Merge pull request #3293 from pkulzc/master

Internal changes of object_detection 
parents f98ec55e 1efe98bb
...@@ -116,18 +116,17 @@ def build_faster_rcnn_classification_loss(loss_config): ...@@ -116,18 +116,17 @@ def build_faster_rcnn_classification_loss(loss_config):
loss_type = loss_config.WhichOneof('classification_loss') loss_type = loss_config.WhichOneof('classification_loss')
if loss_type == 'weighted_sigmoid': if loss_type == 'weighted_sigmoid':
config = loss_config.weighted_sigmoid return losses.WeightedSigmoidClassificationLoss()
return losses.WeightedSigmoidClassificationLoss(
anchorwise_output=config.anchorwise_output)
if loss_type == 'weighted_softmax': if loss_type == 'weighted_softmax':
config = loss_config.weighted_softmax config = loss_config.weighted_softmax
return losses.WeightedSoftmaxClassificationLoss( return losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=config.anchorwise_output) logit_scale=config.logit_scale)
# By default, Faster RCNN second stage classifier uses Softmax loss # By default, Faster RCNN second stage classifier uses Softmax loss
# with anchor-wise outputs. # with anchor-wise outputs.
config = loss_config.weighted_softmax
return losses.WeightedSoftmaxClassificationLoss( return losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=True) logit_scale=config.logit_scale)
def _build_localization_loss(loss_config): def _build_localization_loss(loss_config):
...@@ -148,14 +147,10 @@ def _build_localization_loss(loss_config): ...@@ -148,14 +147,10 @@ def _build_localization_loss(loss_config):
loss_type = loss_config.WhichOneof('localization_loss') loss_type = loss_config.WhichOneof('localization_loss')
if loss_type == 'weighted_l2': if loss_type == 'weighted_l2':
config = loss_config.weighted_l2 return losses.WeightedL2LocalizationLoss()
return losses.WeightedL2LocalizationLoss(
anchorwise_output=config.anchorwise_output)
if loss_type == 'weighted_smooth_l1': if loss_type == 'weighted_smooth_l1':
config = loss_config.weighted_smooth_l1 return losses.WeightedSmoothL1LocalizationLoss()
return losses.WeightedSmoothL1LocalizationLoss(
anchorwise_output=config.anchorwise_output)
if loss_type == 'weighted_iou': if loss_type == 'weighted_iou':
return losses.WeightedIOULocalizationLoss() return losses.WeightedIOULocalizationLoss()
...@@ -181,9 +176,7 @@ def _build_classification_loss(loss_config): ...@@ -181,9 +176,7 @@ def _build_classification_loss(loss_config):
loss_type = loss_config.WhichOneof('classification_loss') loss_type = loss_config.WhichOneof('classification_loss')
if loss_type == 'weighted_sigmoid': if loss_type == 'weighted_sigmoid':
config = loss_config.weighted_sigmoid return losses.WeightedSigmoidClassificationLoss()
return losses.WeightedSigmoidClassificationLoss(
anchorwise_output=config.anchorwise_output)
if loss_type == 'weighted_sigmoid_focal': if loss_type == 'weighted_sigmoid_focal':
config = loss_config.weighted_sigmoid_focal config = loss_config.weighted_sigmoid_focal
...@@ -191,21 +184,18 @@ def _build_classification_loss(loss_config): ...@@ -191,21 +184,18 @@ def _build_classification_loss(loss_config):
if config.HasField('alpha'): if config.HasField('alpha'):
alpha = config.alpha alpha = config.alpha
return losses.SigmoidFocalClassificationLoss( return losses.SigmoidFocalClassificationLoss(
anchorwise_output=config.anchorwise_output,
gamma=config.gamma, gamma=config.gamma,
alpha=alpha) alpha=alpha)
if loss_type == 'weighted_softmax': if loss_type == 'weighted_softmax':
config = loss_config.weighted_softmax config = loss_config.weighted_softmax
return losses.WeightedSoftmaxClassificationLoss( return losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=config.anchorwise_output,
logit_scale=config.logit_scale) logit_scale=config.logit_scale)
if loss_type == 'bootstrapped_sigmoid': if loss_type == 'bootstrapped_sigmoid':
config = loss_config.bootstrapped_sigmoid config = loss_config.bootstrapped_sigmoid
return losses.BootstrappedSigmoidClassificationLoss( return losses.BootstrappedSigmoidClassificationLoss(
alpha=config.alpha, alpha=config.alpha,
bootstrap_type=('hard' if config.hard_bootstrap else 'soft'), bootstrap_type=('hard' if config.hard_bootstrap else 'soft'))
anchorwise_output=config.anchorwise_output)
raise ValueError('Empty loss config.') raise ValueError('Empty loss config.')
...@@ -80,7 +80,6 @@ class LocalizationLossBuilderTest(tf.test.TestCase): ...@@ -80,7 +80,6 @@ class LocalizationLossBuilderTest(tf.test.TestCase):
losses_text_proto = """ losses_text_proto = """
localization_loss { localization_loss {
weighted_smooth_l1 { weighted_smooth_l1 {
anchorwise_output: true
} }
} }
classification_loss { classification_loss {
...@@ -245,7 +244,7 @@ class ClassificationLossBuilderTest(tf.test.TestCase): ...@@ -245,7 +244,7 @@ class ClassificationLossBuilderTest(tf.test.TestCase):
targets = tf.constant([[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]]) targets = tf.constant([[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]])
weights = tf.constant([[1.0, 1.0]]) weights = tf.constant([[1.0, 1.0]])
loss = classification_loss(predictions, targets, weights=weights) loss = classification_loss(predictions, targets, weights=weights)
self.assertEqual(loss.shape, [1, 2]) self.assertEqual(loss.shape, [1, 2, 3])
def test_raise_error_on_empty_config(self): def test_raise_error_on_empty_config(self):
losses_text_proto = """ losses_text_proto = """
......
...@@ -45,7 +45,9 @@ def build(matcher_config): ...@@ -45,7 +45,9 @@ def build(matcher_config):
matched_threshold=matched_threshold, matched_threshold=matched_threshold,
unmatched_threshold=unmatched_threshold, unmatched_threshold=unmatched_threshold,
negatives_lower_than_unmatched=matcher.negatives_lower_than_unmatched, negatives_lower_than_unmatched=matcher.negatives_lower_than_unmatched,
force_match_for_each_row=matcher.force_match_for_each_row) force_match_for_each_row=matcher.force_match_for_each_row,
use_matmul_gather=matcher.use_matmul_gather)
if matcher_config.WhichOneof('matcher_oneof') == 'bipartite_matcher': if matcher_config.WhichOneof('matcher_oneof') == 'bipartite_matcher':
return bipartite_matcher.GreedyBipartiteMatcher() matcher = matcher_config.bipartite_matcher
return bipartite_matcher.GreedyBipartiteMatcher(matcher.use_matmul_gather)
raise ValueError('Empty matcher.') raise ValueError('Empty matcher.')
...@@ -62,6 +62,7 @@ class MatcherBuilderTest(tf.test.TestCase): ...@@ -62,6 +62,7 @@ class MatcherBuilderTest(tf.test.TestCase):
unmatched_threshold: 0.3 unmatched_threshold: 0.3
negatives_lower_than_unmatched: false negatives_lower_than_unmatched: false
force_match_for_each_row: true force_match_for_each_row: true
use_matmul_gather: true
} }
""" """
matcher_proto = matcher_pb2.Matcher() matcher_proto = matcher_pb2.Matcher()
...@@ -72,6 +73,7 @@ class MatcherBuilderTest(tf.test.TestCase): ...@@ -72,6 +73,7 @@ class MatcherBuilderTest(tf.test.TestCase):
self.assertAlmostEqual(matcher_object._unmatched_threshold, 0.3) self.assertAlmostEqual(matcher_object._unmatched_threshold, 0.3)
self.assertFalse(matcher_object._negatives_lower_than_unmatched) self.assertFalse(matcher_object._negatives_lower_than_unmatched)
self.assertTrue(matcher_object._force_match_for_each_row) self.assertTrue(matcher_object._force_match_for_each_row)
self.assertTrue(matcher_object._use_matmul_gather)
def test_build_bipartite_matcher(self): def test_build_bipartite_matcher(self):
matcher_text_proto = """ matcher_text_proto = """
......
...@@ -31,6 +31,7 @@ from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extr ...@@ -31,6 +31,7 @@ from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extr
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2 from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1 from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
...@@ -42,6 +43,9 @@ SSD_FEATURE_EXTRACTOR_CLASS_MAP = { ...@@ -42,6 +43,9 @@ SSD_FEATURE_EXTRACTOR_CLASS_MAP = {
'ssd_inception_v2': SSDInceptionV2FeatureExtractor, 'ssd_inception_v2': SSDInceptionV2FeatureExtractor,
'ssd_inception_v3': SSDInceptionV3FeatureExtractor, 'ssd_inception_v3': SSDInceptionV3FeatureExtractor,
'ssd_mobilenet_v1': SSDMobileNetV1FeatureExtractor, 'ssd_mobilenet_v1': SSDMobileNetV1FeatureExtractor,
'ssd_resnet50_v1_fpn': ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor,
'ssd_resnet101_v1_fpn': ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor,
'ssd_resnet152_v1_fpn': ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor,
'embedded_ssd_mobilenet_v1': EmbeddedSSDMobileNetV1FeatureExtractor, 'embedded_ssd_mobilenet_v1': EmbeddedSSDMobileNetV1FeatureExtractor,
} }
...@@ -62,13 +66,14 @@ FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = { ...@@ -62,13 +66,14 @@ FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
} }
def build(model_config, is_training): def build(model_config, is_training, add_summaries=True):
"""Builds a DetectionModel based on the model config. """Builds a DetectionModel based on the model config.
Args: Args:
model_config: A model.proto object containing the config for the desired model_config: A model.proto object containing the config for the desired
DetectionModel. DetectionModel.
is_training: True if this model is being built for training purposes. is_training: True if this model is being built for training purposes.
add_summaries: Whether to add tensorflow summaries in the model graph.
Returns: Returns:
DetectionModel based on the config. DetectionModel based on the config.
...@@ -80,9 +85,10 @@ def build(model_config, is_training): ...@@ -80,9 +85,10 @@ def build(model_config, is_training):
raise ValueError('model_config not of type model_pb2.DetectionModel.') raise ValueError('model_config not of type model_pb2.DetectionModel.')
meta_architecture = model_config.WhichOneof('model') meta_architecture = model_config.WhichOneof('model')
if meta_architecture == 'ssd': if meta_architecture == 'ssd':
return _build_ssd_model(model_config.ssd, is_training) return _build_ssd_model(model_config.ssd, is_training, add_summaries)
if meta_architecture == 'faster_rcnn': if meta_architecture == 'faster_rcnn':
return _build_faster_rcnn_model(model_config.faster_rcnn, is_training) return _build_faster_rcnn_model(model_config.faster_rcnn, is_training,
add_summaries)
raise ValueError('Unknown meta architecture: {}'.format(meta_architecture)) raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
...@@ -106,6 +112,8 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training, ...@@ -106,6 +112,8 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
min_depth = feature_extractor_config.min_depth min_depth = feature_extractor_config.min_depth
pad_to_multiple = feature_extractor_config.pad_to_multiple pad_to_multiple = feature_extractor_config.pad_to_multiple
batch_norm_trainable = feature_extractor_config.batch_norm_trainable batch_norm_trainable = feature_extractor_config.batch_norm_trainable
use_explicit_padding = feature_extractor_config.use_explicit_padding
use_depthwise = feature_extractor_config.use_depthwise
conv_hyperparams = hyperparams_builder.build( conv_hyperparams = hyperparams_builder.build(
feature_extractor_config.conv_hyperparams, is_training) feature_extractor_config.conv_hyperparams, is_training)
...@@ -115,16 +123,18 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training, ...@@ -115,16 +123,18 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type] feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
return feature_extractor_class(is_training, depth_multiplier, min_depth, return feature_extractor_class(is_training, depth_multiplier, min_depth,
pad_to_multiple, conv_hyperparams, pad_to_multiple, conv_hyperparams,
batch_norm_trainable, reuse_weights) batch_norm_trainable, reuse_weights,
use_explicit_padding, use_depthwise)
def _build_ssd_model(ssd_config, is_training): def _build_ssd_model(ssd_config, is_training, add_summaries):
"""Builds an SSD detection model based on the model config. """Builds an SSD detection model based on the model config.
Args: Args:
ssd_config: A ssd.proto object containing the config for the desired ssd_config: A ssd.proto object containing the config for the desired
SSDMetaArch. SSDMetaArch.
is_training: True if this model is being built for training purposes. is_training: True if this model is being built for training purposes.
add_summaries: Whether to add tf summaries in the model.
Returns: Returns:
SSDMetaArch based on the config. SSDMetaArch based on the config.
...@@ -171,7 +181,8 @@ def _build_ssd_model(ssd_config, is_training): ...@@ -171,7 +181,8 @@ def _build_ssd_model(ssd_config, is_training):
classification_weight, classification_weight,
localization_weight, localization_weight,
normalize_loss_by_num_matches, normalize_loss_by_num_matches,
hard_example_miner) hard_example_miner,
add_summaries=add_summaries)
def _build_faster_rcnn_feature_extractor( def _build_faster_rcnn_feature_extractor(
...@@ -205,7 +216,7 @@ def _build_faster_rcnn_feature_extractor( ...@@ -205,7 +216,7 @@ def _build_faster_rcnn_feature_extractor(
batch_norm_trainable, reuse_weights) batch_norm_trainable, reuse_weights)
def _build_faster_rcnn_model(frcnn_config, is_training): def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
"""Builds a Faster R-CNN or R-FCN detection model based on the model config. """Builds a Faster R-CNN or R-FCN detection model based on the model config.
Builds R-FCN model if the second_stage_box_predictor in the config is of type Builds R-FCN model if the second_stage_box_predictor in the config is of type
...@@ -213,8 +224,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training): ...@@ -213,8 +224,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
Args: Args:
frcnn_config: A faster_rcnn.proto object containing the config for the frcnn_config: A faster_rcnn.proto object containing the config for the
desired FasterRCNNMetaArch or RFCNMetaArch. desired FasterRCNNMetaArch or RFCNMetaArch.
is_training: True if this model is being built for training purposes. is_training: True if this model is being built for training purposes.
add_summaries: Whether to add tf summaries in the model.
Returns: Returns:
FasterRCNNMetaArch based on the config. FasterRCNNMetaArch based on the config.
...@@ -228,7 +240,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training): ...@@ -228,7 +240,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
feature_extractor = _build_faster_rcnn_feature_extractor( feature_extractor = _build_faster_rcnn_feature_extractor(
frcnn_config.feature_extractor, is_training) frcnn_config.feature_extractor, is_training)
first_stage_only = frcnn_config.first_stage_only number_of_stages = frcnn_config.number_of_stages
first_stage_anchor_generator = anchor_generator_builder.build( first_stage_anchor_generator = anchor_generator_builder.build(
frcnn_config.first_stage_anchor_generator) frcnn_config.first_stage_anchor_generator)
...@@ -283,7 +295,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training): ...@@ -283,7 +295,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
'num_classes': num_classes, 'num_classes': num_classes,
'image_resizer_fn': image_resizer_fn, 'image_resizer_fn': image_resizer_fn,
'feature_extractor': feature_extractor, 'feature_extractor': feature_extractor,
'first_stage_only': first_stage_only, 'number_of_stages': number_of_stages,
'first_stage_anchor_generator': first_stage_anchor_generator, 'first_stage_anchor_generator': first_stage_anchor_generator,
'first_stage_atrous_rate': first_stage_atrous_rate, 'first_stage_atrous_rate': first_stage_atrous_rate,
'first_stage_box_predictor_arg_scope': 'first_stage_box_predictor_arg_scope':
...@@ -310,7 +322,8 @@ def _build_faster_rcnn_model(frcnn_config, is_training): ...@@ -310,7 +322,8 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
second_stage_classification_loss, second_stage_classification_loss,
'second_stage_classification_loss_weight': 'second_stage_classification_loss_weight':
second_stage_classification_loss_weight, second_stage_classification_loss_weight,
'hard_example_miner': hard_example_miner} 'hard_example_miner': hard_example_miner,
'add_summaries': add_summaries}
if isinstance(second_stage_box_predictor, box_predictor.RfcnBoxPredictor): if isinstance(second_stage_box_predictor, box_predictor.RfcnBoxPredictor):
return rfcn_meta_arch.RFCNMetaArch( return rfcn_meta_arch.RFCNMetaArch(
......
...@@ -26,12 +26,14 @@ from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extr ...@@ -26,12 +26,14 @@ from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extr
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2 from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1 from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor
from object_detection.protos import model_pb2 from object_detection.protos import model_pb2
FEATURE_EXTRACTOR_MAPS = { FRCNN_RESNET_FEAT_MAPS = {
'faster_rcnn_resnet50': 'faster_rcnn_resnet50':
frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor, frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor,
'faster_rcnn_resnet101': 'faster_rcnn_resnet101':
...@@ -40,6 +42,15 @@ FEATURE_EXTRACTOR_MAPS = { ...@@ -40,6 +42,15 @@ FEATURE_EXTRACTOR_MAPS = {
frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor
} }
SSD_RESNET_V1_FPN_FEAT_MAPS = {
'ssd_resnet50_v1_fpn':
ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor,
'ssd_resnet101_v1_fpn':
ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor,
'ssd_resnet152_v1_fpn':
ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor
}
class ModelBuilderTest(tf.test.TestCase): class ModelBuilderTest(tf.test.TestCase):
...@@ -197,6 +208,87 @@ class ModelBuilderTest(tf.test.TestCase): ...@@ -197,6 +208,87 @@ class ModelBuilderTest(tf.test.TestCase):
self.assertIsInstance(model._feature_extractor, self.assertIsInstance(model._feature_extractor,
SSDInceptionV3FeatureExtractor) SSDInceptionV3FeatureExtractor)
def test_create_ssd_resnet_v1_fpn_model_from_config(self):
model_text_proto = """
ssd {
feature_extractor {
type: 'ssd_resnet50_v1_fpn'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
batch_norm_trainable: true
}
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
multiscale_anchor_generator {
aspect_ratios: [1.0, 2.0, 0.5]
scales_per_octave: 2
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
depth: 32
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
num_layers_before_predictor: 1
}
}
loss {
classification_loss {
weighted_sigmoid_focal {
alpha: 0.25
gamma: 2.0
}
}
localization_loss {
weighted_smooth_l1 {
}
}
classification_weight: 1.0
localization_weight: 1.0
}
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
for extractor_type, extractor_class in SSD_RESNET_V1_FPN_FEAT_MAPS.items():
model_proto.ssd.feature_extractor.type = extractor_type
model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
self.assertIsInstance(model._feature_extractor, extractor_class)
def test_create_ssd_mobilenet_v1_model_from_config(self): def test_create_ssd_mobilenet_v1_model_from_config(self):
model_text_proto = """ model_text_proto = """
ssd { ssd {
...@@ -270,6 +362,78 @@ class ModelBuilderTest(tf.test.TestCase): ...@@ -270,6 +362,78 @@ class ModelBuilderTest(tf.test.TestCase):
SSDMobileNetV1FeatureExtractor) SSDMobileNetV1FeatureExtractor)
self.assertTrue(model._feature_extractor._batch_norm_trainable) self.assertTrue(model._feature_extractor._batch_norm_trainable)
def test_create_embedded_ssd_mobilenet_v1_model_from_config(self):
model_text_proto = """
ssd {
feature_extractor {
type: 'embedded_ssd_mobilenet_v1'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
batch_norm_trainable: true
}
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
aspect_ratios: 1.0
}
}
image_resizer {
fixed_shape_resizer {
height: 256
width: 256
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
loss {
classification_loss {
weighted_softmax {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
}
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
model = self.create_model(model_proto)
self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
self.assertIsInstance(model._feature_extractor,
EmbeddedSSDMobileNetV1FeatureExtractor)
def test_create_faster_rcnn_resnet_v1_models_from_config(self): def test_create_faster_rcnn_resnet_v1_models_from_config(self):
model_text_proto = """ model_text_proto = """
faster_rcnn { faster_rcnn {
...@@ -331,7 +495,7 @@ class ModelBuilderTest(tf.test.TestCase): ...@@ -331,7 +495,7 @@ class ModelBuilderTest(tf.test.TestCase):
}""" }"""
model_proto = model_pb2.DetectionModel() model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto) text_format.Merge(model_text_proto, model_proto)
for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.items(): for extractor_type, extractor_class in FRCNN_RESNET_FEAT_MAPS.items():
model_proto.faster_rcnn.feature_extractor.type = extractor_type model_proto.faster_rcnn.feature_extractor.type = extractor_type
model = model_builder.build(model_proto, is_training=True) model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch) self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch)
...@@ -730,7 +894,7 @@ class ModelBuilderTest(tf.test.TestCase): ...@@ -730,7 +894,7 @@ class ModelBuilderTest(tf.test.TestCase):
}""" }"""
model_proto = model_pb2.DetectionModel() model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto) text_format.Merge(model_text_proto, model_proto)
for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.items(): for extractor_type, extractor_class in FRCNN_RESNET_FEAT_MAPS.items():
model_proto.faster_rcnn.feature_extractor.type = extractor_type model_proto.faster_rcnn.feature_extractor.type = extractor_type
model = model_builder.build(model_proto, is_training=True) model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, rfcn_meta_arch.RFCNMetaArch) self.assertIsInstance(model, rfcn_meta_arch.RFCNMetaArch)
......
...@@ -19,15 +19,14 @@ import tensorflow as tf ...@@ -19,15 +19,14 @@ import tensorflow as tf
from object_detection.utils import learning_schedules from object_detection.utils import learning_schedules
def build(optimizer_config, global_summaries): def build(optimizer_config):
"""Create optimizer based on config. """Create optimizer based on config.
Args: Args:
optimizer_config: A Optimizer proto message. optimizer_config: A Optimizer proto message.
global_summaries: A set to attach learning rate summary to.
Returns: Returns:
An optimizer. An optimizer and a list of variables for summary.
Raises: Raises:
ValueError: when using an unsupported input data type. ValueError: when using an unsupported input data type.
...@@ -35,24 +34,30 @@ def build(optimizer_config, global_summaries): ...@@ -35,24 +34,30 @@ def build(optimizer_config, global_summaries):
optimizer_type = optimizer_config.WhichOneof('optimizer') optimizer_type = optimizer_config.WhichOneof('optimizer')
optimizer = None optimizer = None
summary_vars = []
if optimizer_type == 'rms_prop_optimizer': if optimizer_type == 'rms_prop_optimizer':
config = optimizer_config.rms_prop_optimizer config = optimizer_config.rms_prop_optimizer
learning_rate = _create_learning_rate(config.learning_rate)
summary_vars.append(learning_rate)
optimizer = tf.train.RMSPropOptimizer( optimizer = tf.train.RMSPropOptimizer(
_create_learning_rate(config.learning_rate, global_summaries), learning_rate,
decay=config.decay, decay=config.decay,
momentum=config.momentum_optimizer_value, momentum=config.momentum_optimizer_value,
epsilon=config.epsilon) epsilon=config.epsilon)
if optimizer_type == 'momentum_optimizer': if optimizer_type == 'momentum_optimizer':
config = optimizer_config.momentum_optimizer config = optimizer_config.momentum_optimizer
learning_rate = _create_learning_rate(config.learning_rate)
summary_vars.append(learning_rate)
optimizer = tf.train.MomentumOptimizer( optimizer = tf.train.MomentumOptimizer(
_create_learning_rate(config.learning_rate, global_summaries), learning_rate,
momentum=config.momentum_optimizer_value) momentum=config.momentum_optimizer_value)
if optimizer_type == 'adam_optimizer': if optimizer_type == 'adam_optimizer':
config = optimizer_config.adam_optimizer config = optimizer_config.adam_optimizer
optimizer = tf.train.AdamOptimizer( learning_rate = _create_learning_rate(config.learning_rate)
_create_learning_rate(config.learning_rate, global_summaries)) summary_vars.append(learning_rate)
optimizer = tf.train.AdamOptimizer(learning_rate)
if optimizer is None: if optimizer is None:
raise ValueError('Optimizer %s not supported.' % optimizer_type) raise ValueError('Optimizer %s not supported.' % optimizer_type)
...@@ -61,15 +66,14 @@ def build(optimizer_config, global_summaries): ...@@ -61,15 +66,14 @@ def build(optimizer_config, global_summaries):
optimizer = tf.contrib.opt.MovingAverageOptimizer( optimizer = tf.contrib.opt.MovingAverageOptimizer(
optimizer, average_decay=optimizer_config.moving_average_decay) optimizer, average_decay=optimizer_config.moving_average_decay)
return optimizer return optimizer, summary_vars
def _create_learning_rate(learning_rate_config, global_summaries): def _create_learning_rate(learning_rate_config):
"""Create optimizer learning rate based on config. """Create optimizer learning rate based on config.
Args: Args:
learning_rate_config: A LearningRate proto message. learning_rate_config: A LearningRate proto message.
global_summaries: A set to attach learning rate summary to.
Returns: Returns:
A learning rate. A learning rate.
...@@ -81,7 +85,7 @@ def _create_learning_rate(learning_rate_config, global_summaries): ...@@ -81,7 +85,7 @@ def _create_learning_rate(learning_rate_config, global_summaries):
learning_rate_type = learning_rate_config.WhichOneof('learning_rate') learning_rate_type = learning_rate_config.WhichOneof('learning_rate')
if learning_rate_type == 'constant_learning_rate': if learning_rate_type == 'constant_learning_rate':
config = learning_rate_config.constant_learning_rate config = learning_rate_config.constant_learning_rate
learning_rate = config.learning_rate learning_rate = tf.constant(config.learning_rate, dtype=tf.float32)
if learning_rate_type == 'exponential_decay_learning_rate': if learning_rate_type == 'exponential_decay_learning_rate':
config = learning_rate_config.exponential_decay_learning_rate config = learning_rate_config.exponential_decay_learning_rate
...@@ -115,5 +119,4 @@ def _create_learning_rate(learning_rate_config, global_summaries): ...@@ -115,5 +119,4 @@ def _create_learning_rate(learning_rate_config, global_summaries):
if learning_rate is None: if learning_rate is None:
raise ValueError('Learning_rate %s not supported.' % learning_rate_type) raise ValueError('Learning_rate %s not supported.' % learning_rate_type)
global_summaries.add(tf.summary.scalar('Learning_Rate', learning_rate))
return learning_rate return learning_rate
...@@ -31,12 +31,13 @@ class LearningRateBuilderTest(tf.test.TestCase): ...@@ -31,12 +31,13 @@ class LearningRateBuilderTest(tf.test.TestCase):
learning_rate: 0.004 learning_rate: 0.004
} }
""" """
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate() learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto) text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate( learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries) learning_rate_proto)
self.assertAlmostEqual(learning_rate, 0.004) with self.test_session():
learning_rate_out = learning_rate.eval()
self.assertAlmostEqual(learning_rate_out, 0.004)
def testBuildExponentialDecayLearningRate(self): def testBuildExponentialDecayLearningRate(self):
learning_rate_text_proto = """ learning_rate_text_proto = """
...@@ -47,11 +48,10 @@ class LearningRateBuilderTest(tf.test.TestCase): ...@@ -47,11 +48,10 @@ class LearningRateBuilderTest(tf.test.TestCase):
staircase: false staircase: false
} }
""" """
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate() learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto) text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate( learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries) learning_rate_proto)
self.assertTrue(isinstance(learning_rate, tf.Tensor)) self.assertTrue(isinstance(learning_rate, tf.Tensor))
def testBuildManualStepLearningRate(self): def testBuildManualStepLearningRate(self):
...@@ -67,11 +67,10 @@ class LearningRateBuilderTest(tf.test.TestCase): ...@@ -67,11 +67,10 @@ class LearningRateBuilderTest(tf.test.TestCase):
} }
} }
""" """
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate() learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto) text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate( learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries) learning_rate_proto)
self.assertTrue(isinstance(learning_rate, tf.Tensor)) self.assertTrue(isinstance(learning_rate, tf.Tensor))
def testBuildCosineDecayLearningRate(self): def testBuildCosineDecayLearningRate(self):
...@@ -83,22 +82,19 @@ class LearningRateBuilderTest(tf.test.TestCase): ...@@ -83,22 +82,19 @@ class LearningRateBuilderTest(tf.test.TestCase):
warmup_steps: 1000 warmup_steps: 1000
} }
""" """
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate() learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto) text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate( learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries) learning_rate_proto)
self.assertTrue(isinstance(learning_rate, tf.Tensor)) self.assertTrue(isinstance(learning_rate, tf.Tensor))
def testRaiseErrorOnEmptyLearningRate(self): def testRaiseErrorOnEmptyLearningRate(self):
learning_rate_text_proto = """ learning_rate_text_proto = """
""" """
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate() learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto) text_format.Merge(learning_rate_text_proto, learning_rate_proto)
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
optimizer_builder._create_learning_rate( optimizer_builder._create_learning_rate(learning_rate_proto)
learning_rate_proto, global_summaries)
class OptimizerBuilderTest(tf.test.TestCase): class OptimizerBuilderTest(tf.test.TestCase):
...@@ -119,10 +115,9 @@ class OptimizerBuilderTest(tf.test.TestCase): ...@@ -119,10 +115,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
} }
use_moving_average: false use_moving_average: false
""" """
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer() optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto) text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries) optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(isinstance(optimizer, tf.train.RMSPropOptimizer)) self.assertTrue(isinstance(optimizer, tf.train.RMSPropOptimizer))
def testBuildMomentumOptimizer(self): def testBuildMomentumOptimizer(self):
...@@ -137,10 +132,9 @@ class OptimizerBuilderTest(tf.test.TestCase): ...@@ -137,10 +132,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
} }
use_moving_average: false use_moving_average: false
""" """
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer() optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto) text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries) optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(isinstance(optimizer, tf.train.MomentumOptimizer)) self.assertTrue(isinstance(optimizer, tf.train.MomentumOptimizer))
def testBuildAdamOptimizer(self): def testBuildAdamOptimizer(self):
...@@ -154,10 +148,9 @@ class OptimizerBuilderTest(tf.test.TestCase): ...@@ -154,10 +148,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
} }
use_moving_average: false use_moving_average: false
""" """
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer() optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto) text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries) optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(isinstance(optimizer, tf.train.AdamOptimizer)) self.assertTrue(isinstance(optimizer, tf.train.AdamOptimizer))
def testBuildMovingAverageOptimizer(self): def testBuildMovingAverageOptimizer(self):
...@@ -171,10 +164,9 @@ class OptimizerBuilderTest(tf.test.TestCase): ...@@ -171,10 +164,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
} }
use_moving_average: True use_moving_average: True
""" """
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer() optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto) text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries) optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue( self.assertTrue(
isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)) isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer))
...@@ -190,23 +182,21 @@ class OptimizerBuilderTest(tf.test.TestCase): ...@@ -190,23 +182,21 @@ class OptimizerBuilderTest(tf.test.TestCase):
use_moving_average: True use_moving_average: True
moving_average_decay: 0.2 moving_average_decay: 0.2
""" """
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer() optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto) text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries) optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue( self.assertTrue(
isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)) isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer))
# TODO(rathodv): Find a way to not depend on the private members. # TODO: Find a way to not depend on the private members.
self.assertAlmostEqual(optimizer._ema._decay, 0.2) self.assertAlmostEqual(optimizer._ema._decay, 0.2)
def testBuildEmptyOptimizer(self): def testBuildEmptyOptimizer(self):
optimizer_text_proto = """ optimizer_text_proto = """
""" """
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer() optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto) text_format.Merge(optimizer_text_proto, optimizer_proto)
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
optimizer_builder.build(optimizer_proto, global_summaries) optimizer_builder.build(optimizer_proto)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -83,6 +83,7 @@ PREPROCESSING_FUNCTION_MAP = { ...@@ -83,6 +83,7 @@ PREPROCESSING_FUNCTION_MAP = {
'random_jitter_boxes': preprocessor.random_jitter_boxes, 'random_jitter_boxes': preprocessor.random_jitter_boxes,
'random_crop_to_aspect_ratio': preprocessor.random_crop_to_aspect_ratio, 'random_crop_to_aspect_ratio': preprocessor.random_crop_to_aspect_ratio,
'random_black_patches': preprocessor.random_black_patches, 'random_black_patches': preprocessor.random_black_patches,
'rgb_to_gray': preprocessor.rgb_to_gray,
'scale_boxes_to_pixel_coordinates': ( 'scale_boxes_to_pixel_coordinates': (
preprocessor.scale_boxes_to_pixel_coordinates), preprocessor.scale_boxes_to_pixel_coordinates),
'subtract_channel_mean': preprocessor.subtract_channel_mean, 'subtract_channel_mean': preprocessor.subtract_channel_mean,
......
...@@ -379,6 +379,16 @@ class PreprocessorBuilderTest(tf.test.TestCase): ...@@ -379,6 +379,16 @@ class PreprocessorBuilderTest(tf.test.TestCase):
'new_width': 100, 'new_width': 100,
'method': tf.image.ResizeMethod.BICUBIC}) 'method': tf.image.ResizeMethod.BICUBIC})
def test_build_rgb_to_gray(self):
preprocessor_text_proto = """
rgb_to_gray {}
"""
preprocessor_proto = preprocessor_pb2.PreprocessingStep()
text_format.Merge(preprocessor_text_proto, preprocessor_proto)
function, args = preprocessor_builder.build(preprocessor_proto)
self.assertEqual(function, preprocessor.rgb_to_gray)
self.assertEqual(args, {})
def test_build_subtract_channel_mean(self): def test_build_subtract_channel_mean(self):
preprocessor_text_proto = """ preprocessor_text_proto = """
subtract_channel_mean { subtract_channel_mean {
......
...@@ -53,7 +53,7 @@ py_library( ...@@ -53,7 +53,7 @@ py_library(
deps = [ deps = [
":box_list", ":box_list",
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/utils:shape_utils", "//tensorflow/models/research/object_detection/utils:shape_utils",
], ],
) )
...@@ -113,7 +113,7 @@ py_library( ...@@ -113,7 +113,7 @@ py_library(
":box_list", ":box_list",
":box_list_ops", ":box_list_ops",
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/utils:ops", "//tensorflow/models/research/object_detection/utils:ops",
], ],
) )
...@@ -123,6 +123,7 @@ py_library( ...@@ -123,6 +123,7 @@ py_library(
"matcher.py", "matcher.py",
], ],
deps = [ deps = [
"//tensorflow/models/research/object_detection/utils:ops",
], ],
) )
...@@ -160,8 +161,17 @@ py_library( ...@@ -160,8 +161,17 @@ py_library(
":box_list", ":box_list",
":box_list_ops", ":box_list_ops",
":keypoint_ops", ":keypoint_ops",
":preprocessor_cache",
":standard_fields", ":standard_fields",
"//tensorflow", "//tensorflow",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
py_library(
name = "preprocessor_cache",
srcs = [
"preprocessor_cache.py",
], ],
) )
...@@ -172,6 +182,7 @@ py_test( ...@@ -172,6 +182,7 @@ py_test(
], ],
deps = [ deps = [
":preprocessor", ":preprocessor",
":preprocessor_cache",
"//tensorflow", "//tensorflow",
], ],
) )
...@@ -211,6 +222,7 @@ py_library( ...@@ -211,6 +222,7 @@ py_library(
":box_list_ops", ":box_list_ops",
":standard_fields", ":standard_fields",
"//tensorflow", "//tensorflow",
"//tensorflow/models/research/object_detection/utils:shape_utils",
], ],
) )
...@@ -232,15 +244,16 @@ py_library( ...@@ -232,15 +244,16 @@ py_library(
], ],
deps = [ deps = [
":box_list", ":box_list",
":box_list_ops",
":matcher", ":matcher",
":region_similarity_calculator", ":region_similarity_calculator",
":standard_fields",
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/box_coders:faster_rcnn_box_coder", "//tensorflow/models/research/object_detection/box_coders:faster_rcnn_box_coder",
"//tensorflow_models/object_detection/box_coders:mean_stddev_box_coder", "//tensorflow/models/research/object_detection/box_coders:mean_stddev_box_coder",
"//tensorflow_models/object_detection/core:box_coder", "//tensorflow/models/research/object_detection/core:box_coder",
"//tensorflow_models/object_detection/matchers:argmax_matcher", "//tensorflow/models/research/object_detection/matchers:argmax_matcher",
"//tensorflow_models/object_detection/matchers:bipartite_matcher", "//tensorflow/models/research/object_detection/matchers:bipartite_matcher",
"//tensorflow/models/research/object_detection/utils:shape_utils",
], ],
) )
...@@ -254,8 +267,10 @@ py_test( ...@@ -254,8 +267,10 @@ py_test(
":region_similarity_calculator", ":region_similarity_calculator",
":target_assigner", ":target_assigner",
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/box_coders:mean_stddev_box_coder", "//tensorflow/models/research/object_detection/box_coders:keypoint_box_coder",
"//tensorflow_models/object_detection/matchers:bipartite_matcher", "//tensorflow/models/research/object_detection/box_coders:mean_stddev_box_coder",
"//tensorflow/models/research/object_detection/matchers:bipartite_matcher",
"//tensorflow/models/research/object_detection/utils:test_case",
], ],
) )
...@@ -274,9 +289,9 @@ py_library( ...@@ -274,9 +289,9 @@ py_library(
srcs = ["box_predictor.py"], srcs = ["box_predictor.py"],
deps = [ deps = [
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/utils:ops", "//tensorflow/models/research/object_detection/utils:ops",
"//tensorflow_models/object_detection/utils:shape_utils", "//tensorflow/models/research/object_detection/utils:shape_utils",
"//tensorflow_models/object_detection/utils:static_shape", "//tensorflow/models/research/object_detection/utils:static_shape",
], ],
) )
...@@ -286,8 +301,9 @@ py_test( ...@@ -286,8 +301,9 @@ py_test(
deps = [ deps = [
":box_predictor", ":box_predictor",
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/builders:hyperparams_builder", "//tensorflow/models/research/object_detection/builders:hyperparams_builder",
"//tensorflow_models/object_detection/protos:hyperparams_py_pb2", "//tensorflow/models/research/object_detection/protos:hyperparams_py_pb2",
"//tensorflow/models/research/object_detection/utils:test_case",
], ],
) )
...@@ -298,7 +314,7 @@ py_library( ...@@ -298,7 +314,7 @@ py_library(
], ],
deps = [ deps = [
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/core:box_list_ops", "//tensorflow/models/research/object_detection/core:box_list_ops",
], ],
) )
...@@ -309,7 +325,7 @@ py_test( ...@@ -309,7 +325,7 @@ py_test(
], ],
deps = [ deps = [
":region_similarity_calculator", ":region_similarity_calculator",
"//tensorflow_models/object_detection/core:box_list", "//tensorflow/models/research/object_detection/core:box_list",
], ],
) )
...@@ -330,7 +346,7 @@ py_library( ...@@ -330,7 +346,7 @@ py_library(
], ],
deps = [ deps = [
"//tensorflow", "//tensorflow",
"//tensorflow_models/object_detection/utils:ops", "//tensorflow/models/research/object_detection/utils:ops",
], ],
) )
......
...@@ -77,8 +77,8 @@ class AnchorGenerator(object): ...@@ -77,8 +77,8 @@ class AnchorGenerator(object):
def generate(self, feature_map_shape_list, **params): def generate(self, feature_map_shape_list, **params):
"""Generates a collection of bounding boxes to be used as anchors. """Generates a collection of bounding boxes to be used as anchors.
TODO: remove **params from argument list and make stride and offsets (for TODO: remove **params from argument list and make stride and
multiple_grid_anchor_generator) constructor arguments. offsets (for multiple_grid_anchor_generator) constructor arguments.
Args: Args:
feature_map_shape_list: list of (height, width) pairs in the format feature_map_shape_list: list of (height, width) pairs in the format
...@@ -140,3 +140,4 @@ class AnchorGenerator(object): ...@@ -140,3 +140,4 @@ class AnchorGenerator(object):
* feature_map_shape[0] * feature_map_shape[0]
* feature_map_shape[1]) * feature_map_shape[1])
return tf.assert_equal(expected_num_anchors, anchors.num_boxes()) return tf.assert_equal(expected_num_anchors, anchors.num_boxes())
...@@ -183,7 +183,8 @@ def prune_completely_outside_window(boxlist, window, scope=None): ...@@ -183,7 +183,8 @@ def prune_completely_outside_window(boxlist, window, scope=None):
scope: name scope. scope: name scope.
Returns: Returns:
pruned_corners: a tensor with shape [M_out, 4] where M_out <= M_in pruned_boxlist: a new BoxList with all bounding boxes partially or fully in
the window.
valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes
in the input tensor. in the input tensor.
""" """
...@@ -982,3 +983,79 @@ def pad_or_clip_box_list(boxlist, num_boxes, scope=None): ...@@ -982,3 +983,79 @@ def pad_or_clip_box_list(boxlist, num_boxes, scope=None):
boxlist.get_field(field), num_boxes) boxlist.get_field(field), num_boxes)
subboxlist.add_field(field, subfield) subboxlist.add_field(field, subfield)
return subboxlist return subboxlist
def select_random_box(boxlist,
default_box=None,
seed=None,
scope=None):
"""Selects a random bounding box from a `BoxList`.
Args:
boxlist: A BoxList.
default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
this default box will be returned. If None, will use a default box of
[[-1., -1., -1., -1.]].
seed: Random seed.
scope: Name scope.
Returns:
bbox: A [1, 4] tensor with a random bounding box.
valid: A bool tensor indicating whether a valid bounding box is returned
(True) or whether the default box is returned (False).
"""
with tf.name_scope(scope, 'SelectRandomBox'):
bboxes = boxlist.get()
combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes)
number_of_boxes = combined_shape[0]
default_box = default_box or tf.constant([[-1., -1., -1., -1.]])
def select_box():
random_index = tf.random_uniform([],
maxval=number_of_boxes,
dtype=tf.int32,
seed=seed)
return tf.expand_dims(bboxes[random_index], axis=0), tf.constant(True)
return tf.cond(
tf.greater_equal(number_of_boxes, 1),
true_fn=select_box,
false_fn=lambda: (default_box, tf.constant(False)))
def get_minimal_coverage_box(boxlist,
default_box=None,
scope=None):
"""Creates a single bounding box which covers all boxes in the boxlist.
Args:
boxlist: A Boxlist.
default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
this default box will be returned. If None, will use a default box of
[[0., 0., 1., 1.]].
scope: Name scope.
Returns:
A [1, 4] float32 tensor with a bounding box that tightly covers all the
boxes in the box list. If the boxlist does not contain any boxes, the
default box is returned.
"""
with tf.name_scope(scope, 'CreateCoverageBox'):
num_boxes = boxlist.num_boxes()
def coverage_box(bboxes):
y_min, x_min, y_max, x_max = tf.split(
value=bboxes, num_or_size_splits=4, axis=1)
y_min_coverage = tf.reduce_min(y_min, axis=0)
x_min_coverage = tf.reduce_min(x_min, axis=0)
y_max_coverage = tf.reduce_max(y_max, axis=0)
x_max_coverage = tf.reduce_max(x_max, axis=0)
return tf.stack(
[y_min_coverage, x_min_coverage, y_max_coverage, x_max_coverage],
axis=1)
default_box = default_box or tf.constant([[0., 0., 1., 1.]])
return tf.cond(
tf.greater_equal(num_boxes, 1),
true_fn=lambda: coverage_box(boxlist.get()),
false_fn=lambda: default_box)
...@@ -153,6 +153,25 @@ class BoxListOpsTest(tf.test.TestCase): ...@@ -153,6 +153,25 @@ class BoxListOpsTest(tf.test.TestCase):
extra_data_out = sess.run(pruned.get_field('extra_data')) extra_data_out = sess.run(pruned.get_field('extra_data'))
self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [6]]) self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [6]])
def test_prune_completely_outside_window_with_empty_boxlist(self):
window = tf.constant([0, 0, 9, 14], tf.float32)
corners = tf.zeros(shape=[0, 4], dtype=tf.float32)
boxes = box_list.BoxList(corners)
boxes.add_field('extra_data', tf.zeros(shape=[0], dtype=tf.int32))
pruned, keep_indices = box_list_ops.prune_completely_outside_window(boxes,
window)
pruned_boxes = pruned.get()
extra = pruned.get_field('extra_data')
exp_pruned_boxes = np.zeros(shape=[0, 4], dtype=np.float32)
exp_extra = np.zeros(shape=[0], dtype=np.int32)
with self.test_session() as sess:
pruned_boxes_out, keep_indices_out, extra_out = sess.run(
[pruned_boxes, keep_indices, extra])
self.assertAllClose(exp_pruned_boxes, pruned_boxes_out)
self.assertAllEqual([], keep_indices_out)
self.assertAllEqual(exp_extra, extra_out)
def test_intersection(self): def test_intersection(self):
corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]])
corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0],
...@@ -593,6 +612,58 @@ class BoxListOpsTest(tf.test.TestCase): ...@@ -593,6 +612,58 @@ class BoxListOpsTest(tf.test.TestCase):
self.assertAllEqual(expected_classes, classes_out) self.assertAllEqual(expected_classes, classes_out)
self.assertAllClose(expected_scores, scores_out) self.assertAllClose(expected_scores, scores_out)
def test_select_random_box(self):
boxes = [[0., 0., 1., 1.],
[0., 1., 2., 3.],
[0., 2., 3., 4.]]
corners = tf.constant(boxes, dtype=tf.float32)
boxlist = box_list.BoxList(corners)
random_bbox, valid = box_list_ops.select_random_box(boxlist)
with self.test_session() as sess:
random_bbox_out, valid_out = sess.run([random_bbox, valid])
norm_small = any(
[np.linalg.norm(random_bbox_out - box) < 1e-6 for box in boxes])
self.assertTrue(norm_small)
self.assertTrue(valid_out)
def test_select_random_box_with_empty_boxlist(self):
corners = tf.constant([], shape=[0, 4], dtype=tf.float32)
boxlist = box_list.BoxList(corners)
random_bbox, valid = box_list_ops.select_random_box(boxlist)
with self.test_session() as sess:
random_bbox_out, valid_out = sess.run([random_bbox, valid])
expected_bbox_out = np.array([[-1., -1., -1., -1.]], dtype=np.float32)
self.assertAllEqual(expected_bbox_out, random_bbox_out)
self.assertFalse(valid_out)
def test_get_minimal_coverage_box(self):
boxes = [[0., 0., 1., 1.],
[-1., 1., 2., 3.],
[0., 2., 3., 4.]]
expected_coverage_box = [[-1., 0., 3., 4.]]
corners = tf.constant(boxes, dtype=tf.float32)
boxlist = box_list.BoxList(corners)
coverage_box = box_list_ops.get_minimal_coverage_box(boxlist)
with self.test_session() as sess:
coverage_box_out = sess.run(coverage_box)
self.assertAllClose(expected_coverage_box, coverage_box_out)
def test_get_minimal_coverage_box_with_empty_boxlist(self):
corners = tf.constant([], shape=[0, 4], dtype=tf.float32)
boxlist = box_list.BoxList(corners)
coverage_box = box_list_ops.get_minimal_coverage_box(boxlist)
with self.test_session() as sess:
coverage_box_out = sess.run(coverage_box)
self.assertAllClose([[0.0, 0.0, 1.0, 1.0]], coverage_box_out)
class ConcatenateTest(tf.test.TestCase): class ConcatenateTest(tf.test.TestCase):
...@@ -958,5 +1029,6 @@ class BoxRefinementTest(tf.test.TestCase): ...@@ -958,5 +1029,6 @@ class BoxRefinementTest(tf.test.TestCase):
self.assertAllClose(expected_scores, scores_out) self.assertAllClose(expected_scores, scores_out)
self.assertAllEqual(extra_field_out, [0, 1, 1]) self.assertAllEqual(extra_field_out, [0, 1, 1])
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -27,6 +27,7 @@ These modules are separated from the main model since the same ...@@ -27,6 +27,7 @@ These modules are separated from the main model since the same
few box predictor architectures are shared across many models. few box predictor architectures are shared across many models.
""" """
from abc import abstractmethod from abc import abstractmethod
import math
import tensorflow as tf import tensorflow as tf
from object_detection.utils import ops from object_detection.utils import ops
from object_detection.utils import shape_utils from object_detection.utils import shape_utils
...@@ -59,8 +60,8 @@ class BoxPredictor(object): ...@@ -59,8 +60,8 @@ class BoxPredictor(object):
def num_classes(self): def num_classes(self):
return self._num_classes return self._num_classes
def predict(self, image_features, num_predictions_per_location, scope, def predict(self, image_features, num_predictions_per_location,
**params): scope=None, **params):
"""Computes encoded object locations and corresponding confidences. """Computes encoded object locations and corresponding confidences.
Takes a high level image feature map as input and produce two predictions, Takes a high level image feature map as input and produce two predictions,
...@@ -70,10 +71,10 @@ class BoxPredictor(object): ...@@ -70,10 +71,10 @@ class BoxPredictor(object):
and do not assume anything about their shapes. and do not assume anything about their shapes.
Args: Args:
image_features: A float tensor of shape [batch_size, height, width, image_features: A list of float tensors of shape [batch_size, height_i,
channels] containing features for a batch of images. width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box num_predictions_per_location: A list of integers representing the number
predictions to be made per spatial location in the feature map. of box predictions to be made per spatial location for each feature map.
scope: Variable and Op scope name. scope: Variable and Op scope name.
**params: Additional keyword arguments for specific implementations of **params: Additional keyword arguments for specific implementations of
BoxPredictor. BoxPredictor.
...@@ -86,10 +87,22 @@ class BoxPredictor(object): ...@@ -86,10 +87,22 @@ class BoxPredictor(object):
class_predictions_with_background: A float tensor of shape class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class [batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals. predictions for the proposals.
Raises:
ValueError: If length of `image_features` is not equal to length of
`num_predictions_per_location`.
""" """
with tf.variable_scope(scope): if len(image_features) != len(num_predictions_per_location):
return self._predict(image_features, num_predictions_per_location, raise ValueError('image_feature and num_predictions_per_location must '
**params) 'be of same length, found: {} vs {}'.
format(len(image_features),
len(num_predictions_per_location)))
if scope is not None:
with tf.variable_scope(scope):
return self._predict(image_features, num_predictions_per_location,
**params)
return self._predict(image_features, num_predictions_per_location,
**params)
# TODO: num_predictions_per_location could be moved to constructor. # TODO: num_predictions_per_location could be moved to constructor.
# This is currently only used by ConvolutionalBoxPredictor. # This is currently only used by ConvolutionalBoxPredictor.
...@@ -98,10 +111,10 @@ class BoxPredictor(object): ...@@ -98,10 +111,10 @@ class BoxPredictor(object):
"""Implementations must override this method. """Implementations must override this method.
Args: Args:
image_features: A float tensor of shape [batch_size, height, width, image_features: A list of float tensors of shape [batch_size, height_i,
channels] containing features for a batch of images. width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box num_predictions_per_location: A list of integers representing the number
predictions to be made per spatial location in the feature map. of box predictions to be made per spatial location for each feature map.
**params: Additional keyword arguments for specific implementations of **params: Additional keyword arguments for specific implementations of
BoxPredictor. BoxPredictor.
...@@ -169,28 +182,35 @@ class RfcnBoxPredictor(BoxPredictor): ...@@ -169,28 +182,35 @@ class RfcnBoxPredictor(BoxPredictor):
"""Computes encoded object locations and corresponding confidences. """Computes encoded object locations and corresponding confidences.
Args: Args:
image_features: A float tensor of shape [batch_size, height, width, image_features: A list of float tensors of shape [batch_size, height_i,
channels] containing features for a batch of images. width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box num_predictions_per_location: A list of integers representing the number
predictions to be made per spatial location in the feature map. of box predictions to be made per spatial location for each feature map.
Currently, this must be set to 1, or an error will be raised. Currently, this must be set to [1], or an error will be raised.
proposal_boxes: A float tensor of shape [batch_size, num_proposals, proposal_boxes: A float tensor of shape [batch_size, num_proposals,
box_code_size]. box_code_size].
Returns: Returns:
box_encodings: A float tensor of shape box_encodings: A float tensor of shape
[batch_size, 1, num_classes, code_size] representing the [batch_size, num_anchors, num_classes, code_size] representing the
location of the objects. location of the objects.
class_predictions_with_background: A float tensor of shape class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class [batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals. predictions for the proposals.
Raises: Raises:
ValueError: if num_predictions_per_location is not 1. ValueError: if num_predictions_per_location is not 1 or if
len(image_features) is not 1.
""" """
if num_predictions_per_location != 1: if (len(num_predictions_per_location) != 1 or
num_predictions_per_location[0] != 1):
raise ValueError('Currently RfcnBoxPredictor only supports ' raise ValueError('Currently RfcnBoxPredictor only supports '
'predicting a single box per class per location.') 'predicting a single box per class per location.')
if len(image_features) != 1:
raise ValueError('length of `image_features` must be 1. Found {}'.
format(len(image_features)))
image_feature = image_features[0]
num_predictions_per_location = num_predictions_per_location[0]
batch_size = tf.shape(proposal_boxes)[0] batch_size = tf.shape(proposal_boxes)[0]
num_boxes = tf.shape(proposal_boxes)[1] num_boxes = tf.shape(proposal_boxes)[1]
def get_box_indices(proposals): def get_box_indices(proposals):
...@@ -202,7 +222,7 @@ class RfcnBoxPredictor(BoxPredictor): ...@@ -202,7 +222,7 @@ class RfcnBoxPredictor(BoxPredictor):
tf.range(start=0, limit=proposals_shape[0]), 1) tf.range(start=0, limit=proposals_shape[0]), 1)
return tf.reshape(ones_mat * multiplier, [-1]) return tf.reshape(ones_mat * multiplier, [-1])
net = image_features net = image_feature
with slim.arg_scope(self._conv_hyperparams): with slim.arg_scope(self._conv_hyperparams):
net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth') net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth')
# Location predictions. # Location predictions.
...@@ -280,6 +300,7 @@ class MaskRCNNBoxPredictor(BoxPredictor): ...@@ -280,6 +300,7 @@ class MaskRCNNBoxPredictor(BoxPredictor):
predict_instance_masks=False, predict_instance_masks=False,
mask_height=14, mask_height=14,
mask_width=14, mask_width=14,
mask_prediction_num_conv_layers=2,
mask_prediction_conv_depth=256, mask_prediction_conv_depth=256,
predict_keypoints=False): predict_keypoints=False):
"""Constructor. """Constructor.
...@@ -304,13 +325,21 @@ class MaskRCNNBoxPredictor(BoxPredictor): ...@@ -304,13 +325,21 @@ class MaskRCNNBoxPredictor(BoxPredictor):
boxes. boxes.
mask_height: Desired output mask height. The default value is 14. mask_height: Desired output mask height. The default value is 14.
mask_width: Desired output mask width. The default value is 14. mask_width: Desired output mask width. The default value is 14.
mask_prediction_num_conv_layers: Number of convolution layers applied to
the image_features in mask prediction branch.
mask_prediction_conv_depth: The depth for the first conv2d_transpose op mask_prediction_conv_depth: The depth for the first conv2d_transpose op
applied to the image_features in the mask prediciton branch. applied to the image_features in the mask prediction branch. If set
to 0, the depth of the convolution layers will be automatically chosen
based on the number of object classes and the number of channels in the
image features.
predict_keypoints: Whether to predict keypoints insde detection boxes. predict_keypoints: Whether to predict keypoints insde detection boxes.
Raises: Raises:
ValueError: If predict_instance_masks or predict_keypoints is true. ValueError: If predict_instance_masks is true but conv_hyperparams is not
set.
ValueError: If predict_keypoints is true since it is not implemented yet.
ValueError: If mask_prediction_num_conv_layers is smaller than two.
""" """
super(MaskRCNNBoxPredictor, self).__init__(is_training, num_classes) super(MaskRCNNBoxPredictor, self).__init__(is_training, num_classes)
self._fc_hyperparams = fc_hyperparams self._fc_hyperparams = fc_hyperparams
...@@ -321,6 +350,7 @@ class MaskRCNNBoxPredictor(BoxPredictor): ...@@ -321,6 +350,7 @@ class MaskRCNNBoxPredictor(BoxPredictor):
self._predict_instance_masks = predict_instance_masks self._predict_instance_masks = predict_instance_masks
self._mask_height = mask_height self._mask_height = mask_height
self._mask_width = mask_width self._mask_width = mask_width
self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
self._mask_prediction_conv_depth = mask_prediction_conv_depth self._mask_prediction_conv_depth = mask_prediction_conv_depth
self._predict_keypoints = predict_keypoints self._predict_keypoints = predict_keypoints
if self._predict_keypoints: if self._predict_keypoints:
...@@ -329,52 +359,33 @@ class MaskRCNNBoxPredictor(BoxPredictor): ...@@ -329,52 +359,33 @@ class MaskRCNNBoxPredictor(BoxPredictor):
self._conv_hyperparams is None): self._conv_hyperparams is None):
raise ValueError('`conv_hyperparams` must be provided when predicting ' raise ValueError('`conv_hyperparams` must be provided when predicting '
'masks.') 'masks.')
if self._mask_prediction_num_conv_layers < 2:
raise ValueError(
'Mask prediction should consist of at least 2 conv layers')
@property
def num_classes(self):
  # Number of foreground classes; the background category is not included.
  return self._num_classes

@property
def predicts_instance_masks(self):
  # True when this predictor was constructed with predict_instance_masks,
  # i.e. _predict can emit MASK_PREDICTIONS as an auxiliary output.
  return self._predict_instance_masks
Flattens image_features and applies fully connected ops (with no
non-linearity) to predict box encodings and class predictions. In this
setting, anchors are not spatially arranged in any way and are assumed to
have been folded into the batch dimension. Thus we output 1 for the
anchors dimension.
Also optionally predicts instance masks. def _predict_boxes_and_classes(self, image_features):
The mask prediction head is based on the Mask RCNN paper with the following """Predicts boxes and class scores.
modifications: We replace the deconvolution layer with a bilinear resize
and a convolution.
Args: Args:
image_features: A float tensor of shape [batch_size, height, width, image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images. channels] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box
predictions to be made per spatial location in the feature map.
Currently, this must be set to 1, or an error will be raised.
Returns: Returns:
A dictionary containing the following tensors. box_encodings: A float tensor of shape
box_encodings: A float tensor of shape [batch_size, 1, num_classes, code_size] representing the location of the
[batch_size, 1, num_classes, code_size] representing the objects.
location of the objects. class_predictions_with_background: A float tensor of shape
class_predictions_with_background: A float tensor of shape [batch_size, 1, num_classes + 1] representing the class predictions for
[batch_size, 1, num_classes + 1] representing the class the proposals.
predictions for the proposals.
If predict_masks is True the dictionary also contains:
instance_masks: A float tensor of shape
[batch_size, 1, num_classes, image_height, image_width]
If predict_keypoints is True the dictionary also contains:
keypoints: [batch_size, 1, num_keypoints, 2]
Raises:
ValueError: if num_predictions_per_location is not 1.
""" """
if num_predictions_per_location != 1:
raise ValueError('Currently FullyConnectedBoxPredictor only supports '
'predicting a single box per class per location.')
spatial_averaged_image_features = tf.reduce_mean(image_features, [1, 2], spatial_averaged_image_features = tf.reduce_mean(image_features, [1, 2],
keep_dims=True, keep_dims=True,
name='AvgPool') name='AvgPool')
...@@ -398,34 +409,155 @@ class MaskRCNNBoxPredictor(BoxPredictor): ...@@ -398,34 +409,155 @@ class MaskRCNNBoxPredictor(BoxPredictor):
box_encodings, [-1, 1, self._num_classes, self._box_code_size]) box_encodings, [-1, 1, self._num_classes, self._box_code_size])
class_predictions_with_background = tf.reshape( class_predictions_with_background = tf.reshape(
class_predictions_with_background, [-1, 1, self._num_classes + 1]) class_predictions_with_background, [-1, 1, self._num_classes + 1])
return box_encodings, class_predictions_with_background
def _get_mask_predictor_conv_depth(self, num_feature_channels, num_classes,
class_weight=3.0, feature_weight=2.0):
"""Computes the depth of the mask predictor convolutions.
Computes the depth of the mask predictor convolutions given feature channels
and number of classes by performing a weighted average of the two in
log space to compute the number of convolution channels. The weights that
are used for computing the weighted average do not need to sum to 1.
Args:
num_feature_channels: An integer containing the number of feature
channels.
num_classes: An integer containing the number of classes.
class_weight: Class weight used in computing the weighted average.
feature_weight: Feature weight used in computing the weighted average.
predictions_dict = { Returns:
BOX_ENCODINGS: box_encodings, An integer containing the number of convolution channels used by mask
CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background predictor.
} """
num_feature_channels_log = math.log(float(num_feature_channels), 2.0)
if self._predict_instance_masks: num_classes_log = math.log(float(num_classes), 2.0)
with slim.arg_scope(self._conv_hyperparams): weighted_num_feature_channels_log = (
upsampled_features = tf.image.resize_bilinear( num_feature_channels_log * feature_weight)
image_features, weighted_num_classes_log = num_classes_log * class_weight
[self._mask_height, self._mask_width], total_weight = feature_weight + class_weight
align_corners=True) num_conv_channels_log = round(
(weighted_num_feature_channels_log + weighted_num_classes_log) /
total_weight)
return int(math.pow(2.0, num_conv_channels_log))
def _predict_masks(self, image_features):
  """Performs mask prediction.

  The mask head follows the Mask RCNN paper, except that the deconvolution
  layer is replaced by a bilinear resize followed by convolutions.

  Args:
    image_features: A float tensor of shape [batch_size, height, width,
      channels] containing features for a batch of images.

  Returns:
    instance_masks: A float tensor of shape
      [batch_size, 1, num_classes, image_height, image_width].
  """
  num_conv_channels = self._mask_prediction_conv_depth
  if num_conv_channels == 0:
    # A configured depth of 0 means "choose automatically" from the input
    # feature depth and the number of classes.
    num_feature_channels = image_features.get_shape().as_list()[3]
    num_conv_channels = self._get_mask_predictor_conv_depth(
        num_feature_channels, self.num_classes)
  with slim.arg_scope(self._conv_hyperparams):
    upsampled_features = tf.image.resize_bilinear(
        image_features,
        [self._mask_height, self._mask_width],
        align_corners=True)
    # All but the last conv layer keep the chosen depth; the final layer
    # projects to one mask channel per class with no activation.
    for _ in range(self._mask_prediction_num_conv_layers - 1):
      upsampled_features = slim.conv2d(
          upsampled_features,
          num_outputs=num_conv_channels,
          kernel_size=[3, 3])
    mask_predictions = slim.conv2d(upsampled_features,
                                   num_outputs=self.num_classes,
                                   activation_fn=None,
                                   kernel_size=[3, 3])
    # [batch, H, W, classes] -> [batch, 1, classes, H, W]: the extra axis is
    # the anchors dimension (always 1 for this predictor).
    return tf.expand_dims(
        tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
        axis=1,
        name='MaskPredictor')
def _predict(self, image_features, num_predictions_per_location,
             predict_boxes_and_classes=True, predict_auxiliary_outputs=False):
  """Optionally computes encoded object locations, confidences, and masks.

  Flattens image_features and applies fully connected ops (with no
  non-linearity) to predict box encodings and class predictions. In this
  setting, anchors are not spatially arranged in any way and are assumed to
  have been folded into the batch dimension. Thus we output 1 for the
  anchors dimension.

  Also optionally predicts instance masks.
  The mask prediction head is based on the Mask RCNN paper with the following
  modifications: We replace the deconvolution layer with a bilinear resize
  and a convolution.

  Args:
    image_features: A list of float tensors of shape [batch_size, height_i,
      width_i, channels_i] containing features for a batch of images.
    num_predictions_per_location: A list of integers representing the number
      of box predictions to be made per spatial location for each feature
      map. Currently, this must be set to [1], or an error will be raised.
    predict_boxes_and_classes: If true, the function will perform box
      refinement and classification.
    predict_auxiliary_outputs: If true, the function will perform other
      predictions such as mask, keypoint, boundaries, etc. if any.

  Returns:
    A dictionary containing the following tensors.
      box_encodings: A float tensor of shape
        [batch_size, 1, num_classes, code_size] representing the
        location of the objects.
      class_predictions_with_background: A float tensor of shape
        [batch_size, 1, num_classes + 1] representing the class
        predictions for the proposals.
    If predict_masks is True the dictionary also contains:
      instance_masks: A float tensor of shape
        [batch_size, 1, num_classes, image_height, image_width]
    If predict_keypoints is True the dictionary also contains:
      keypoints: [batch_size, 1, num_keypoints, 2]

  Raises:
    ValueError: If num_predictions_per_location is not 1 or if both
      predict_boxes_and_classes and predict_auxiliary_outputs are false or if
      len(image_features) is not 1.
  """
  if (len(num_predictions_per_location) != 1 or
      num_predictions_per_location[0] != 1):
    raise ValueError('Currently FullyConnectedBoxPredictor only supports '
                     'predicting a single box per class per location.')
  if not predict_boxes_and_classes and not predict_auxiliary_outputs:
    raise ValueError('Should perform at least one prediction.')
  if len(image_features) != 1:
    raise ValueError('length of `image_features` must be 1. Found {}'.
                     format(len(image_features)))
  # Exactly one feature map with one prediction per location is supported,
  # so unwrap the one-element lists.
  image_feature = image_features[0]
  num_predictions_per_location = num_predictions_per_location[0]
  predictions_dict = {}

  if predict_boxes_and_classes:
    (box_encodings, class_predictions_with_background
    ) = self._predict_boxes_and_classes(image_feature)
    predictions_dict[BOX_ENCODINGS] = box_encodings
    predictions_dict[
        CLASS_PREDICTIONS_WITH_BACKGROUND] = class_predictions_with_background

  # Masks are auxiliary outputs: produced only when requested AND the
  # predictor was configured with predict_instance_masks=True.
  if self._predict_instance_masks and predict_auxiliary_outputs:
    predictions_dict[MASK_PREDICTIONS] = self._predict_masks(image_feature)

  return predictions_dict
class _NoopVariableScope(object):
"""A dummy class that does not push any scope."""
def __enter__(self):
return None
def __exit__(self, exc_type, exc_value, traceback):
return False
class ConvolutionalBoxPredictor(BoxPredictor): class ConvolutionalBoxPredictor(BoxPredictor):
"""Convolutional Box Predictor. """Convolutional Box Predictor.
...@@ -450,7 +582,8 @@ class ConvolutionalBoxPredictor(BoxPredictor): ...@@ -450,7 +582,8 @@ class ConvolutionalBoxPredictor(BoxPredictor):
kernel_size, kernel_size,
box_code_size, box_code_size,
apply_sigmoid_to_scores=False, apply_sigmoid_to_scores=False,
class_prediction_bias_init=0.0): class_prediction_bias_init=0.0,
use_depthwise=False):
"""Constructor. """Constructor.
Args: Args:
...@@ -479,6 +612,8 @@ class ConvolutionalBoxPredictor(BoxPredictor): ...@@ -479,6 +612,8 @@ class ConvolutionalBoxPredictor(BoxPredictor):
class_predictions. class_predictions.
class_prediction_bias_init: constant value to initialize bias of the last class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction. conv2d layer before class prediction.
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
Raises: Raises:
ValueError: if min_depth > max_depth. ValueError: if min_depth > max_depth.
...@@ -496,15 +631,17 @@ class ConvolutionalBoxPredictor(BoxPredictor): ...@@ -496,15 +631,17 @@ class ConvolutionalBoxPredictor(BoxPredictor):
self._dropout_keep_prob = dropout_keep_prob self._dropout_keep_prob = dropout_keep_prob
self._apply_sigmoid_to_scores = apply_sigmoid_to_scores self._apply_sigmoid_to_scores = apply_sigmoid_to_scores
self._class_prediction_bias_init = class_prediction_bias_init self._class_prediction_bias_init = class_prediction_bias_init
self._use_depthwise = use_depthwise
def _predict(self, image_features, num_predictions_per_location): def _predict(self, image_features, num_predictions_per_location_list):
"""Computes encoded object locations and corresponding confidences. """Computes encoded object locations and corresponding confidences.
Args: Args:
image_features: A float tensor of shape [batch_size, height, width, image_features: A list of float tensors of shape [batch_size, height_i,
channels] containing features for a batch of images. width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box num_predictions_per_location_list: A list of integers representing the
predictions to be made per spatial location in the feature map. number of box predictions to be made per spatial location for each
feature map.
Returns: Returns:
A dictionary containing the following tensors. A dictionary containing the following tensors.
...@@ -514,53 +651,245 @@ class ConvolutionalBoxPredictor(BoxPredictor): ...@@ -514,53 +651,245 @@ class ConvolutionalBoxPredictor(BoxPredictor):
class_predictions_with_background: A float tensor of shape class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class [batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals. predictions for the proposals.
""" """
# Add a slot for the background class. box_encodings_list = []
num_class_slots = self.num_classes + 1 class_predictions_list = []
net = image_features # TODO: Come up with a better way to generate scope names
with slim.arg_scope(self._conv_hyperparams), \ # in box predictor once we have time to retrain all models in the zoo.
slim.arg_scope([slim.dropout], is_training=self._is_training): # The following lines create scope names to be backwards compatible with the
# Add additional conv layers before the class predictor. # existing checkpoints.
features_depth = static_shape.get_depth(image_features.get_shape()) box_predictor_scopes = [_NoopVariableScope()]
depth = max(min(features_depth, self._max_depth), self._min_depth) if len(image_features) > 1:
tf.logging.info('depth of additional conv before box predictor: {}'. box_predictor_scopes = [
format(depth)) tf.variable_scope('BoxPredictor_{}'.format(i))
if depth > 0 and self._num_layers_before_predictor > 0: for i in range(len(image_features))
for i in range(self._num_layers_before_predictor): ]
net = slim.conv2d(
net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) for (image_feature,
with slim.arg_scope([slim.conv2d], activation_fn=None, num_predictions_per_location, box_predictor_scope) in zip(
normalizer_fn=None, normalizer_params=None): image_features, num_predictions_per_location_list,
box_encodings = slim.conv2d( box_predictor_scopes):
net, num_predictions_per_location * self._box_code_size, with box_predictor_scope:
[self._kernel_size, self._kernel_size], # Add a slot for the background class.
scope='BoxEncodingPredictor') num_class_slots = self.num_classes + 1
if self._use_dropout: net = image_feature
net = slim.dropout(net, keep_prob=self._dropout_keep_prob) with slim.arg_scope(self._conv_hyperparams), \
class_predictions_with_background = slim.conv2d( slim.arg_scope([slim.dropout], is_training=self._is_training):
net, num_predictions_per_location * num_class_slots, # Add additional conv layers before the class predictor.
[self._kernel_size, self._kernel_size], scope='ClassPredictor', features_depth = static_shape.get_depth(image_feature.get_shape())
biases_initializer=tf.constant_initializer( depth = max(min(features_depth, self._max_depth), self._min_depth)
self._class_prediction_bias_init)) tf.logging.info('depth of additional conv before box predictor: {}'.
if self._apply_sigmoid_to_scores: format(depth))
class_predictions_with_background = tf.sigmoid( if depth > 0 and self._num_layers_before_predictor > 0:
class_predictions_with_background) for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
image_features) with slim.arg_scope([slim.conv2d], activation_fn=None,
box_encodings = tf.reshape( normalizer_fn=None, normalizer_params=None):
box_encodings, tf.stack([combined_feature_map_shape[0], if self._use_depthwise:
combined_feature_map_shape[1] * box_encodings = slim.separable_conv2d(
combined_feature_map_shape[2] * net, None, [self._kernel_size, self._kernel_size],
num_predictions_per_location, padding='SAME', depth_multiplier=1, stride=1,
1, self._box_code_size])) rate=1, scope='BoxEncodingPredictor_depthwise')
class_predictions_with_background = tf.reshape( box_encodings = slim.conv2d(
class_predictions_with_background, box_encodings,
tf.stack([combined_feature_map_shape[0], num_predictions_per_location * self._box_code_size, [1, 1],
combined_feature_map_shape[1] * scope='BoxEncodingPredictor')
combined_feature_map_shape[2] * else:
num_predictions_per_location, box_encodings = slim.conv2d(
num_class_slots])) net, num_predictions_per_location * self._box_code_size,
return {BOX_ENCODINGS: box_encodings, [self._kernel_size, self._kernel_size],
scope='BoxEncodingPredictor')
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
if self._use_depthwise:
class_predictions_with_background = slim.separable_conv2d(
net, None, [self._kernel_size, self._kernel_size],
padding='SAME', depth_multiplier=1, stride=1,
rate=1, scope='ClassPredictor_depthwise')
class_predictions_with_background = slim.conv2d(
class_predictions_with_background,
num_predictions_per_location * num_class_slots,
[1, 1], scope='ClassPredictor')
else:
class_predictions_with_background = slim.conv2d(
net, num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
scope='ClassPredictor',
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init))
if self._apply_sigmoid_to_scores:
class_predictions_with_background = tf.sigmoid(
class_predictions_with_background)
combined_feature_map_shape = (shape_utils.
combined_static_and_dynamic_shape(
image_feature))
box_encodings = tf.reshape(
box_encodings, tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
1, self._box_code_size]))
box_encodings_list.append(box_encodings)
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
num_class_slots]))
class_predictions_list.append(class_predictions_with_background)
return {BOX_ENCODINGS: tf.concat(box_encodings_list, axis=1),
CLASS_PREDICTIONS_WITH_BACKGROUND: CLASS_PREDICTIONS_WITH_BACKGROUND:
class_predictions_with_background} tf.concat(class_predictions_list, axis=1)}
# TODO: Merge the implementation with ConvolutionalBoxPredictor above
# since they are very similar.
class WeightSharedConvolutionalBoxPredictor(BoxPredictor):
  """Convolutional Box Predictor with weight sharing.

  Defines the box predictor as defined in
  https://arxiv.org/abs/1708.02002. This class differs from
  ConvolutionalBoxPredictor in that it shares weights and biases while
  predicting from different feature maps. Separate multi-layer towers are
  constructed for the box encoding and class predictors respectively.
  """

  def __init__(self,
               is_training,
               num_classes,
               conv_hyperparams,
               depth,
               num_layers_before_predictor,
               box_code_size,
               kernel_size=3,
               class_prediction_bias_init=0.0):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: number of classes.  Note that num_classes *does not*
        include the background category, so if groundtruth labels take values
        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
        assigned classification targets can range from {0,... K}).
      conv_hyperparams: Slim arg_scope with hyperparameters for convolution
        ops.
      depth: depth of conv layers.
      num_layers_before_predictor: Number of the additional conv layers
        before the predictor.
      box_code_size: Size of encoding for each box.
      kernel_size: Size of final convolution kernel.
      class_prediction_bias_init: constant value to initialize bias of the
        last conv2d layer before class prediction.
    """
    super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training,
                                                                num_classes)
    self._conv_hyperparams = conv_hyperparams
    self._depth = depth
    self._num_layers_before_predictor = num_layers_before_predictor
    self._box_code_size = box_code_size
    self._kernel_size = kernel_size
    self._class_prediction_bias_init = class_prediction_bias_init

  def _predict(self, image_features, num_predictions_per_location_list):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels] containing features for a batch of images. Note
        that all tensors in the list must have the same number of channels.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map. Note that all values must be the same since the weights
        are shared.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.

    Raises:
      ValueError: If the image feature maps do not have the same number of
        channels or if the num predictions per locations is differs between
        the feature maps.
    """
    # Weight sharing requires identical predictor shapes across feature maps.
    if len(set(num_predictions_per_location_list)) > 1:
      raise ValueError('num predictions per location must be same for all '
                       'feature maps, found: {}'.format(
                           num_predictions_per_location_list))
    feature_channels = [
        image_feature.shape[3].value for image_feature in image_features
    ]
    if len(set(feature_channels)) > 1:
      raise ValueError('all feature maps must have the same number of '
                       'channels, found: {}'.format(feature_channels))
    box_encodings_list = []
    class_predictions_list = []
    for (image_feature, num_predictions_per_location) in zip(
        image_features, num_predictions_per_location_list):
      # AUTO_REUSE makes every iteration (one per feature map) reuse the same
      # tower variables, which is what shares the weights.
      with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
                             reuse=tf.AUTO_REUSE):
        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        box_encodings_net = image_feature
        class_predictions_net = image_feature
        with slim.arg_scope(self._conv_hyperparams):
          # Box encoding tower followed by its final prediction layer.
          for i in range(self._num_layers_before_predictor):
            box_encodings_net = slim.conv2d(
                box_encodings_net,
                self._depth,
                [self._kernel_size, self._kernel_size],
                stride=1,
                padding='SAME',
                scope='BoxEncodingPredictionTower/conv2d_{}'.format(i))
          box_encodings = slim.conv2d(
              box_encodings_net,
              num_predictions_per_location * self._box_code_size,
              [self._kernel_size, self._kernel_size],
              activation_fn=None, stride=1, padding='SAME',
              scope='BoxEncodingPredictor')
          # Separate class prediction tower; final layer gets the configured
          # bias init (used e.g. by focal-loss setups).
          for i in range(self._num_layers_before_predictor):
            class_predictions_net = slim.conv2d(
                class_predictions_net,
                self._depth,
                [self._kernel_size, self._kernel_size],
                stride=1,
                padding='SAME',
                scope='ClassPredictionTower/conv2d_{}'.format(i))
          class_predictions_with_background = slim.conv2d(
              class_predictions_net,
              num_predictions_per_location * num_class_slots,
              [self._kernel_size, self._kernel_size],
              activation_fn=None, stride=1, padding='SAME',
              biases_initializer=tf.constant_initializer(
                  self._class_prediction_bias_init),
              scope='ClassPredictor')
        # Flatten the spatial grid into the anchors dimension.
        combined_feature_map_shape = (shape_utils.
                                      combined_static_and_dynamic_shape(
                                          image_feature))
        box_encodings = tf.reshape(
            box_encodings, tf.stack([combined_feature_map_shape[0],
                                     combined_feature_map_shape[1] *
                                     combined_feature_map_shape[2] *
                                     num_predictions_per_location,
                                     1, self._box_code_size]))
        box_encodings_list.append(box_encodings)
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([combined_feature_map_shape[0],
                      combined_feature_map_shape[1] *
                      combined_feature_map_shape[2] *
                      num_predictions_per_location,
                      num_class_slots]))
        class_predictions_list.append(class_predictions_with_background)
    # Concatenate per-feature-map predictions along the anchors dimension.
    return {BOX_ENCODINGS: tf.concat(box_encodings_list, axis=1),
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            tf.concat(class_predictions_list, axis=1)}
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
# ============================================================================== # ==============================================================================
"""Tests for object_detection.core.box_predictor.""" """Tests for object_detection.core.box_predictor."""
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
...@@ -22,6 +21,7 @@ from google.protobuf import text_format ...@@ -22,6 +21,7 @@ from google.protobuf import text_format
from object_detection.builders import hyperparams_builder from object_detection.builders import hyperparams_builder
from object_detection.core import box_predictor from object_detection.core import box_predictor
from object_detection.protos import hyperparams_pb2 from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class MaskRCNNBoxPredictorTest(tf.test.TestCase): class MaskRCNNBoxPredictorTest(tf.test.TestCase):
...@@ -55,7 +55,8 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase): ...@@ -55,7 +55,8 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase):
box_code_size=4, box_code_size=4,
) )
box_predictions = mask_box_predictor.predict( box_predictions = mask_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor') [image_features], num_predictions_per_location=[1],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[ class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
...@@ -93,12 +94,16 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase): ...@@ -93,12 +94,16 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase):
op_type=hyperparams_pb2.Hyperparams.CONV), op_type=hyperparams_pb2.Hyperparams.CONV),
predict_instance_masks=True) predict_instance_masks=True)
box_predictions = mask_box_predictor.predict( box_predictions = mask_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor') [image_features],
num_predictions_per_location=[1],
scope='BoxPredictor',
predict_boxes_and_classes=True,
predict_auxiliary_outputs=True)
mask_predictions = box_predictions[box_predictor.MASK_PREDICTIONS] mask_predictions = box_predictions[box_predictor.MASK_PREDICTIONS]
self.assertListEqual([2, 1, 5, 14, 14], self.assertListEqual([2, 1, 5, 14, 14],
mask_predictions.get_shape().as_list()) mask_predictions.get_shape().as_list())
def test_do_not_return_instance_masks_and_keypoints_without_request(self): def test_do_not_return_instance_masks_without_request(self):
image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32) image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32)
mask_box_predictor = box_predictor.MaskRCNNBoxPredictor( mask_box_predictor = box_predictor.MaskRCNNBoxPredictor(
is_training=False, is_training=False,
...@@ -108,7 +113,8 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase): ...@@ -108,7 +113,8 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase):
dropout_keep_prob=0.5, dropout_keep_prob=0.5,
box_code_size=4) box_code_size=4)
box_predictions = mask_box_predictor.predict( box_predictions = mask_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor') [image_features], num_predictions_per_location=[1],
scope='BoxPredictor')
self.assertEqual(len(box_predictions), 2) self.assertEqual(len(box_predictions), 2)
self.assertTrue(box_predictor.BOX_ENCODINGS in box_predictions) self.assertTrue(box_predictor.BOX_ENCODINGS in box_predictions)
self.assertTrue(box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND self.assertTrue(box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND
...@@ -156,7 +162,8 @@ class RfcnBoxPredictorTest(tf.test.TestCase): ...@@ -156,7 +162,8 @@ class RfcnBoxPredictorTest(tf.test.TestCase):
box_code_size=4 box_code_size=4
) )
box_predictions = rfcn_box_predictor.predict( box_predictions = rfcn_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor', [image_features], num_predictions_per_location=[1],
scope='BoxPredictor',
proposal_boxes=proposal_boxes) proposal_boxes=proposal_boxes)
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[ class_predictions_with_background = box_predictions[
...@@ -173,7 +180,7 @@ class RfcnBoxPredictorTest(tf.test.TestCase): ...@@ -173,7 +180,7 @@ class RfcnBoxPredictorTest(tf.test.TestCase):
self.assertAllEqual(class_predictions_shape, [8, 1, 3]) self.assertAllEqual(class_predictions_shape, [8, 1, 3])
class ConvolutionalBoxPredictorTest(tf.test.TestCase): class ConvolutionalBoxPredictorTest(test_case.TestCase):
def _build_arg_scope_with_conv_hyperparams(self): def _build_arg_scope_with_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams() conv_hyperparams = hyperparams_pb2.Hyperparams()
...@@ -192,7 +199,94 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase): ...@@ -192,7 +199,94 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
return hyperparams_builder.build(conv_hyperparams, is_training=True) return hyperparams_builder.build(conv_hyperparams, is_training=True)
def test_get_boxes_for_five_aspect_ratios_per_location(self):
  def graph_fn(image_features):
    conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
        is_training=False,
        num_classes=0,
        conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
        min_depth=0,
        max_depth=32,
        num_layers_before_predictor=1,
        use_dropout=True,
        dropout_keep_prob=0.8,
        kernel_size=1,
        box_code_size=4
    )
    box_predictions = conv_box_predictor.predict(
        [image_features], num_predictions_per_location=[5],
        scope='BoxPredictor')
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    objectness_predictions = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    return (box_encodings, objectness_predictions)
  image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
  (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                         [image_features])
  # 8 * 8 spatial locations * 5 predictions per location = 320 anchors.
  self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
  self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])
def test_get_boxes_for_one_aspect_ratio_per_location(self):
  def graph_fn(image_features):
    conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
        is_training=False,
        num_classes=0,
        conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
        min_depth=0,
        max_depth=32,
        num_layers_before_predictor=1,
        use_dropout=True,
        dropout_keep_prob=0.8,
        kernel_size=1,
        box_code_size=4
    )
    box_predictions = conv_box_predictor.predict(
        [image_features], num_predictions_per_location=[1],
        scope='BoxPredictor')
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    objectness_predictions = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    return (box_encodings, objectness_predictions)
  image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
  (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                         [image_features])
  # 8 * 8 spatial locations * 1 prediction per location = 64 anchors.
  self.assertAllEqual(box_encodings.shape, [4, 64, 1, 4])
  self.assertAllEqual(objectness_predictions.shape, [4, 64, 1])
def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
    self):
  num_classes_without_background = 6
  image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
  def graph_fn(image_features):
    conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
        is_training=False,
        num_classes=num_classes_without_background,
        conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
        min_depth=0,
        max_depth=32,
        num_layers_before_predictor=1,
        use_dropout=True,
        dropout_keep_prob=0.8,
        kernel_size=1,
        box_code_size=4
    )
    box_predictions = conv_box_predictor.predict(
        [image_features],
        num_predictions_per_location=[5],
        scope='BoxPredictor')
    box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
    class_predictions_with_background = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    return (box_encodings, class_predictions_with_background)
  (box_encodings,
   class_predictions_with_background) = self.execute(graph_fn,
                                                     [image_features])
  # 8 * 8 * 5 = 320 anchors; class predictions include one background slot.
  self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
  self.assertAllEqual(class_predictions_with_background.shape,
                      [4, 320, num_classes_without_background+1])
def test_get_predictions_with_feature_maps_of_dynamic_shape(
self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False, is_training=False,
num_classes=0, num_classes=0,
...@@ -206,22 +300,38 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase): ...@@ -206,22 +300,38 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
box_code_size=4 box_code_size=4
) )
box_predictions = conv_box_predictor.predict( box_predictions = conv_box_predictor.predict(
image_features, num_predictions_per_location=5, scope='BoxPredictor') [image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[ objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
init_op = tf.global_variables_initializer() init_op = tf.global_variables_initializer()
resolution = 32
expected_num_anchors = resolution*resolution*5
with self.test_session() as sess: with self.test_session() as sess:
sess.run(init_op) sess.run(init_op)
(box_encodings_shape, (box_encodings_shape,
objectness_predictions_shape) = sess.run( objectness_predictions_shape) = sess.run(
[tf.shape(box_encodings), tf.shape(objectness_predictions)]) [tf.shape(box_encodings), tf.shape(objectness_predictions)],
self.assertAllEqual(box_encodings_shape, [4, 320, 1, 4]) feed_dict={image_features:
self.assertAllEqual(objectness_predictions_shape, [4, 320, 1]) np.random.rand(4, resolution, resolution, 64)})
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1])
expected_variable_set = set([
'BoxPredictor/Conv2d_0_1x1_32/biases',
'BoxPredictor/Conv2d_0_1x1_32/weights',
'BoxPredictor/BoxEncodingPredictor/biases',
'BoxPredictor/BoxEncodingPredictor/weights',
'BoxPredictor/ClassPredictor/biases',
'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_get_boxes_for_one_aspect_ratio_per_location(self): def test_use_depthwise_convolution(self):
image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32) image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False, is_training=False,
num_classes=0, num_classes=0,
...@@ -229,77 +339,210 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase): ...@@ -229,77 +339,210 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
min_depth=0, min_depth=0,
max_depth=32, max_depth=32,
num_layers_before_predictor=1, num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8, dropout_keep_prob=0.8,
kernel_size=1, kernel_size=1,
box_code_size=4 box_code_size=4,
use_dropout=True,
use_depthwise=True
) )
box_predictions = conv_box_predictor.predict( box_predictions = conv_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor') [image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[ objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
init_op = tf.global_variables_initializer() init_op = tf.global_variables_initializer()
resolution = 32
expected_num_anchors = resolution*resolution*5
with self.test_session() as sess: with self.test_session() as sess:
sess.run(init_op) sess.run(init_op)
(box_encodings_shape, (box_encodings_shape,
objectness_predictions_shape) = sess.run( objectness_predictions_shape) = sess.run(
[tf.shape(box_encodings), tf.shape(objectness_predictions)]) [tf.shape(box_encodings), tf.shape(objectness_predictions)],
self.assertAllEqual(box_encodings_shape, [4, 64, 1, 4]) feed_dict={image_features:
self.assertAllEqual(objectness_predictions_shape, [4, 64, 1]) np.random.rand(4, resolution, resolution, 64)})
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1])
expected_variable_set = set([
'BoxPredictor/Conv2d_0_1x1_32/biases',
'BoxPredictor/Conv2d_0_1x1_32/weights',
'BoxPredictor/BoxEncodingPredictor_depthwise/biases',
'BoxPredictor/BoxEncodingPredictor_depthwise/depthwise_weights',
'BoxPredictor/BoxEncodingPredictor/biases',
'BoxPredictor/BoxEncodingPredictor/weights',
'BoxPredictor/ClassPredictor_depthwise/biases',
'BoxPredictor/ClassPredictor_depthwise/depthwise_weights',
'BoxPredictor/ClassPredictor/biases',
'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set)
class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
def _build_arg_scope_with_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
conv_hyperparams_text_proto = """
activation: RELU_6
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.build(conv_hyperparams, is_training=True)
def test_get_boxes_for_five_aspect_ratios_per_location(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, objectness_predictions)
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, objectness_predictions) = self.execute(
graph_fn, [image_features])
self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])
def test_get_multi_class_predictions_for_five_aspect_ratios_per_location( def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
self): self):
num_classes_without_background = 6 num_classes_without_background = 6
image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32) def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False, is_training=False,
num_classes=num_classes_without_background, num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(), conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0, depth=32,
max_depth=32, num_layers_before_predictor=1,
num_layers_before_predictor=1, box_code_size=4)
use_dropout=True, box_predictions = conv_box_predictor.predict(
dropout_keep_prob=0.8, [image_features],
kernel_size=1, num_predictions_per_location=[5],
box_code_size=4 scope='BoxPredictor')
) box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
box_predictions = conv_box_predictor.predict( class_predictions_with_background = box_predictions[
image_features, box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
num_predictions_per_location=5, return (box_encodings, class_predictions_with_background)
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
init_op = tf.global_variables_initializer() image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
with self.test_session() as sess: (box_encodings, class_predictions_with_background) = self.execute(
sess.run(init_op) graph_fn, [image_features])
(box_encodings_shape, class_predictions_with_background_shape self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
) = sess.run([ self.assertAllEqual(class_predictions_with_background.shape,
tf.shape(box_encodings), tf.shape(class_predictions_with_background)]) [4, 320, num_classes_without_background+1])
self.assertAllEqual(box_encodings_shape, [4, 320, 1, 4])
self.assertAllEqual(class_predictions_with_background_shape, def test_get_multi_class_predictions_from_two_feature_maps(
[4, 320, num_classes_without_background+1]) self):
def test_get_boxes_for_five_aspect_ratios_per_location_fully_convolutional( num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, class_predictions_with_background)
image_features1 = np.random.rand(4, 8, 8, 64).astype(np.float32)
image_features2 = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, class_predictions_with_background) = self.execute(
graph_fn, [image_features1, image_features2])
self.assertAllEqual(box_encodings.shape, [4, 640, 1, 4])
self.assertAllEqual(class_predictions_with_background.shape,
[4, 640, num_classes_without_background+1])
def test_predictions_from_multiple_feature_maps_share_weights(self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, class_predictions_with_background)
with self.test_session(graph=tf.Graph()):
graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32),
tf.random_uniform([4, 16, 16, 3], dtype=tf.float32))
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
expected_variable_set = set([
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_0/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_1/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_1/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictor/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictor/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_get_predictions_with_feature_maps_of_dynamic_shape(
self): self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64]) image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False, is_training=False,
num_classes=0, num_classes=0,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(), conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0, depth=32,
max_depth=32,
num_layers_before_predictor=1, num_layers_before_predictor=1,
use_dropout=True, box_code_size=4)
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
box_predictions = conv_box_predictor.predict( box_predictions = conv_box_predictor.predict(
image_features, num_predictions_per_location=5, scope='BoxPredictor') [image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[ objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
...@@ -318,6 +561,5 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase): ...@@ -318,6 +561,5 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
self.assertAllEqual(objectness_predictions_shape, self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1]) [4, expected_num_anchors, 1])
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -50,8 +50,10 @@ class Loss(object): ...@@ -50,8 +50,10 @@ class Loss(object):
"""Call the loss function. """Call the loss function.
Args: Args:
prediction_tensor: a tensor representing predicted quantities. prediction_tensor: an N-d tensor of shape [batch, anchors, ...]
target_tensor: a tensor representing regression or classification targets. representing predicted quantities.
target_tensor: an N-d tensor of shape [batch, anchors, ...] representing
regression or classification targets.
ignore_nan_targets: whether to ignore nan targets in the loss computation. ignore_nan_targets: whether to ignore nan targets in the loss computation.
E.g. can be used if the target tensor is missing groundtruth data that E.g. can be used if the target tensor is missing groundtruth data that
shouldn't be factored into the loss. shouldn't be factored into the loss.
...@@ -81,7 +83,8 @@ class Loss(object): ...@@ -81,7 +83,8 @@ class Loss(object):
the Loss. the Loss.
Returns: Returns:
loss: a tensor representing the value of the loss function loss: an N-d tensor of shape [batch, anchors, ...] containing the loss per
anchor
""" """
pass pass
...@@ -92,15 +95,6 @@ class WeightedL2LocalizationLoss(Loss): ...@@ -92,15 +95,6 @@ class WeightedL2LocalizationLoss(Loss):
Loss[b,a] = .5 * ||weights[b,a] * (prediction[b,a,:] - target[b,a,:])||^2 Loss[b,a] = .5 * ||weights[b,a] * (prediction[b,a,:] - target[b,a,:])||^2
""" """
def __init__(self, anchorwise_output=False):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
"""
self._anchorwise_output = anchorwise_output
def _compute_loss(self, prediction_tensor, target_tensor, weights): def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function. """Compute loss function.
...@@ -112,15 +106,13 @@ class WeightedL2LocalizationLoss(Loss): ...@@ -112,15 +106,13 @@ class WeightedL2LocalizationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors] weights: a float tensor of shape [batch_size, num_anchors]
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors] tensor
or a float tensor of shape [batch_size, num_anchors] representing the value of the loss function.
""" """
weighted_diff = (prediction_tensor - target_tensor) * tf.expand_dims( weighted_diff = (prediction_tensor - target_tensor) * tf.expand_dims(
weights, 2) weights, 2)
square_diff = 0.5 * tf.square(weighted_diff) square_diff = 0.5 * tf.square(weighted_diff)
if self._anchorwise_output: return tf.reduce_sum(square_diff, 2)
return tf.reduce_sum(square_diff, 2)
return tf.reduce_sum(square_diff)
class WeightedSmoothL1LocalizationLoss(Loss): class WeightedSmoothL1LocalizationLoss(Loss):
...@@ -132,15 +124,6 @@ class WeightedSmoothL1LocalizationLoss(Loss): ...@@ -132,15 +124,6 @@ class WeightedSmoothL1LocalizationLoss(Loss):
See also Equation (3) in the Fast R-CNN paper by Ross Girshick (ICCV 2015) See also Equation (3) in the Fast R-CNN paper by Ross Girshick (ICCV 2015)
""" """
def __init__(self, anchorwise_output=False):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
"""
self._anchorwise_output = anchorwise_output
def _compute_loss(self, prediction_tensor, target_tensor, weights): def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function. """Compute loss function.
...@@ -152,7 +135,8 @@ class WeightedSmoothL1LocalizationLoss(Loss): ...@@ -152,7 +135,8 @@ class WeightedSmoothL1LocalizationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors] weights: a float tensor of shape [batch_size, num_anchors]
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors] tensor
representing the value of the loss function.
""" """
diff = prediction_tensor - target_tensor diff = prediction_tensor - target_tensor
abs_diff = tf.abs(diff) abs_diff = tf.abs(diff)
...@@ -160,9 +144,7 @@ class WeightedSmoothL1LocalizationLoss(Loss): ...@@ -160,9 +144,7 @@ class WeightedSmoothL1LocalizationLoss(Loss):
anchorwise_smooth_l1norm = tf.reduce_sum( anchorwise_smooth_l1norm = tf.reduce_sum(
tf.where(abs_diff_lt_1, 0.5 * tf.square(abs_diff), abs_diff - 0.5), tf.where(abs_diff_lt_1, 0.5 * tf.square(abs_diff), abs_diff - 0.5),
2) * weights 2) * weights
if self._anchorwise_output: return anchorwise_smooth_l1norm
return anchorwise_smooth_l1norm
return tf.reduce_sum(anchorwise_smooth_l1norm)
class WeightedIOULocalizationLoss(Loss): class WeightedIOULocalizationLoss(Loss):
...@@ -184,27 +166,19 @@ class WeightedIOULocalizationLoss(Loss): ...@@ -184,27 +166,19 @@ class WeightedIOULocalizationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors] weights: a float tensor of shape [batch_size, num_anchors]
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors] tensor
representing the value of the loss function.
""" """
predicted_boxes = box_list.BoxList(tf.reshape(prediction_tensor, [-1, 4])) predicted_boxes = box_list.BoxList(tf.reshape(prediction_tensor, [-1, 4]))
target_boxes = box_list.BoxList(tf.reshape(target_tensor, [-1, 4])) target_boxes = box_list.BoxList(tf.reshape(target_tensor, [-1, 4]))
per_anchor_iou_loss = 1.0 - box_list_ops.matched_iou(predicted_boxes, per_anchor_iou_loss = 1.0 - box_list_ops.matched_iou(predicted_boxes,
target_boxes) target_boxes)
return tf.reduce_sum(tf.reshape(weights, [-1]) * per_anchor_iou_loss) return tf.reshape(weights, [-1]) * per_anchor_iou_loss
class WeightedSigmoidClassificationLoss(Loss): class WeightedSigmoidClassificationLoss(Loss):
"""Sigmoid cross entropy classification loss function.""" """Sigmoid cross entropy classification loss function."""
def __init__(self, anchorwise_output=False):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
"""
self._anchorwise_output = anchorwise_output
def _compute_loss(self, def _compute_loss(self,
prediction_tensor, prediction_tensor,
target_tensor, target_tensor,
...@@ -222,8 +196,8 @@ class WeightedSigmoidClassificationLoss(Loss): ...@@ -222,8 +196,8 @@ class WeightedSigmoidClassificationLoss(Loss):
If provided, computes loss only for the specified class indices. If provided, computes loss only for the specified class indices.
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors, num_classes]
or a float tensor of shape [batch_size, num_anchors] representing the value of the loss function.
""" """
weights = tf.expand_dims(weights, 2) weights = tf.expand_dims(weights, 2)
if class_indices is not None: if class_indices is not None:
...@@ -233,9 +207,7 @@ class WeightedSigmoidClassificationLoss(Loss): ...@@ -233,9 +207,7 @@ class WeightedSigmoidClassificationLoss(Loss):
[1, 1, -1]) [1, 1, -1])
per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits( per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits(
labels=target_tensor, logits=prediction_tensor)) labels=target_tensor, logits=prediction_tensor))
if self._anchorwise_output: return per_entry_cross_ent * weights
return tf.reduce_sum(per_entry_cross_ent * weights, 2)
return tf.reduce_sum(per_entry_cross_ent * weights)
class SigmoidFocalClassificationLoss(Loss): class SigmoidFocalClassificationLoss(Loss):
...@@ -245,15 +217,13 @@ class SigmoidFocalClassificationLoss(Loss): ...@@ -245,15 +217,13 @@ class SigmoidFocalClassificationLoss(Loss):
examples. See https://arxiv.org/pdf/1708.02002.pdf for the loss definition. examples. See https://arxiv.org/pdf/1708.02002.pdf for the loss definition.
""" """
def __init__(self, anchorwise_output=False, gamma=2.0, alpha=0.25): def __init__(self, gamma=2.0, alpha=0.25):
"""Constructor. """Constructor.
Args: Args:
anchorwise_output: Outputs loss per anchor. (default False)
gamma: exponent of the modulating factor (1 - p_t) ^ gamma. gamma: exponent of the modulating factor (1 - p_t) ^ gamma.
alpha: optional alpha weighting factor to balance positives vs negatives. alpha: optional alpha weighting factor to balance positives vs negatives.
""" """
self._anchorwise_output = anchorwise_output
self._alpha = alpha self._alpha = alpha
self._gamma = gamma self._gamma = gamma
...@@ -274,8 +244,8 @@ class SigmoidFocalClassificationLoss(Loss): ...@@ -274,8 +244,8 @@ class SigmoidFocalClassificationLoss(Loss):
If provided, computes loss only for the specified class indices. If provided, computes loss only for the specified class indices.
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors, num_classes]
or a float tensor of shape [batch_size, num_anchors] representing the value of the loss function.
""" """
weights = tf.expand_dims(weights, 2) weights = tf.expand_dims(weights, 2)
if class_indices is not None: if class_indices is not None:
...@@ -297,25 +267,21 @@ class SigmoidFocalClassificationLoss(Loss): ...@@ -297,25 +267,21 @@ class SigmoidFocalClassificationLoss(Loss):
(1 - target_tensor) * (1 - self._alpha)) (1 - target_tensor) * (1 - self._alpha))
focal_cross_entropy_loss = (modulating_factor * alpha_weight_factor * focal_cross_entropy_loss = (modulating_factor * alpha_weight_factor *
per_entry_cross_ent) per_entry_cross_ent)
if self._anchorwise_output: return focal_cross_entropy_loss * weights
return tf.reduce_sum(focal_cross_entropy_loss * weights, 2)
return tf.reduce_sum(focal_cross_entropy_loss * weights)
class WeightedSoftmaxClassificationLoss(Loss): class WeightedSoftmaxClassificationLoss(Loss):
"""Softmax loss function.""" """Softmax loss function."""
def __init__(self, anchorwise_output=False, logit_scale=1.0): def __init__(self, logit_scale=1.0):
"""Constructor. """Constructor.
Args: Args:
anchorwise_output: Whether to output loss per anchor (default False)
logit_scale: When this value is high, the prediction is "diffused" and logit_scale: When this value is high, the prediction is "diffused" and
when this value is low, the prediction is made peakier. when this value is low, the prediction is made peakier.
(default 1.0) (default 1.0)
""" """
self._anchorwise_output = anchorwise_output
self._logit_scale = logit_scale self._logit_scale = logit_scale
def _compute_loss(self, prediction_tensor, target_tensor, weights): def _compute_loss(self, prediction_tensor, target_tensor, weights):
...@@ -329,7 +295,8 @@ class WeightedSoftmaxClassificationLoss(Loss): ...@@ -329,7 +295,8 @@ class WeightedSoftmaxClassificationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors] weights: a float tensor of shape [batch_size, num_anchors]
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors]
representing the value of the loss function.
""" """
num_classes = prediction_tensor.get_shape().as_list()[-1] num_classes = prediction_tensor.get_shape().as_list()[-1]
prediction_tensor = tf.divide( prediction_tensor = tf.divide(
...@@ -337,9 +304,7 @@ class WeightedSoftmaxClassificationLoss(Loss): ...@@ -337,9 +304,7 @@ class WeightedSoftmaxClassificationLoss(Loss):
per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits( per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits(
labels=tf.reshape(target_tensor, [-1, num_classes]), labels=tf.reshape(target_tensor, [-1, num_classes]),
logits=tf.reshape(prediction_tensor, [-1, num_classes]))) logits=tf.reshape(prediction_tensor, [-1, num_classes])))
if self._anchorwise_output: return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
return tf.reduce_sum(per_row_cross_ent * tf.reshape(weights, [-1]))
class BootstrappedSigmoidClassificationLoss(Loss): class BootstrappedSigmoidClassificationLoss(Loss):
...@@ -359,14 +324,13 @@ class BootstrappedSigmoidClassificationLoss(Loss): ...@@ -359,14 +324,13 @@ class BootstrappedSigmoidClassificationLoss(Loss):
Reed et al. (ICLR 2015). Reed et al. (ICLR 2015).
""" """
def __init__(self, alpha, bootstrap_type='soft', anchorwise_output=False): def __init__(self, alpha, bootstrap_type='soft'):
"""Constructor. """Constructor.
Args: Args:
alpha: a float32 scalar tensor between 0 and 1 representing interpolation alpha: a float32 scalar tensor between 0 and 1 representing interpolation
weight weight
bootstrap_type: set to either 'hard' or 'soft' (default) bootstrap_type: set to either 'hard' or 'soft' (default)
anchorwise_output: Outputs loss per anchor. (default False)
Raises: Raises:
ValueError: if bootstrap_type is not either 'hard' or 'soft' ValueError: if bootstrap_type is not either 'hard' or 'soft'
...@@ -376,7 +340,6 @@ class BootstrappedSigmoidClassificationLoss(Loss): ...@@ -376,7 +340,6 @@ class BootstrappedSigmoidClassificationLoss(Loss):
'\'hard\' or \'soft.\'') '\'hard\' or \'soft.\'')
self._alpha = alpha self._alpha = alpha
self._bootstrap_type = bootstrap_type self._bootstrap_type = bootstrap_type
self._anchorwise_output = anchorwise_output
def _compute_loss(self, prediction_tensor, target_tensor, weights): def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function. """Compute loss function.
...@@ -389,8 +352,8 @@ class BootstrappedSigmoidClassificationLoss(Loss): ...@@ -389,8 +352,8 @@ class BootstrappedSigmoidClassificationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors] weights: a float tensor of shape [batch_size, num_anchors]
Returns: Returns:
loss: a (scalar) tensor representing the value of the loss function loss: a float tensor of shape [batch_size, num_anchors, num_classes]
or a float tensor of shape [batch_size, num_anchors] representing the value of the loss function.
""" """
if self._bootstrap_type == 'soft': if self._bootstrap_type == 'soft':
bootstrap_target_tensor = self._alpha * target_tensor + ( bootstrap_target_tensor = self._alpha * target_tensor + (
...@@ -401,9 +364,7 @@ class BootstrappedSigmoidClassificationLoss(Loss): ...@@ -401,9 +364,7 @@ class BootstrappedSigmoidClassificationLoss(Loss):
tf.sigmoid(prediction_tensor) > 0.5, tf.float32) tf.sigmoid(prediction_tensor) > 0.5, tf.float32)
per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits( per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits(
labels=bootstrap_target_tensor, logits=prediction_tensor)) labels=bootstrap_target_tensor, logits=prediction_tensor))
if self._anchorwise_output: return per_entry_cross_ent * tf.expand_dims(weights, 2)
return tf.reduce_sum(per_entry_cross_ent * tf.expand_dims(weights, 2), 2)
return tf.reduce_sum(per_entry_cross_ent * tf.expand_dims(weights, 2))
class HardExampleMiner(object): class HardExampleMiner(object):
......
...@@ -26,7 +26,7 @@ from object_detection.core import matcher ...@@ -26,7 +26,7 @@ from object_detection.core import matcher
class WeightedL2LocalizationLossTest(tf.test.TestCase): class WeightedL2LocalizationLossTest(tf.test.TestCase):
def testReturnsCorrectLoss(self): def testReturnsCorrectWeightedLoss(self):
batch_size = 3 batch_size = 3
num_anchors = 10 num_anchors = 10
code_size = 4 code_size = 4
...@@ -36,7 +36,8 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase): ...@@ -36,7 +36,8 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], tf.float32) [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], tf.float32)
loss_op = losses.WeightedL2LocalizationLoss() loss_op = losses.WeightedL2LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = tf.reduce_sum(loss_op(prediction_tensor, target_tensor,
weights=weights))
expected_loss = (3 * 5 * 4) / 2.0 expected_loss = (3 * 5 * 4) / 2.0
with self.test_session() as sess: with self.test_session() as sess:
...@@ -50,7 +51,7 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase): ...@@ -50,7 +51,7 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
prediction_tensor = tf.ones([batch_size, num_anchors, code_size]) prediction_tensor = tf.ones([batch_size, num_anchors, code_size])
target_tensor = tf.zeros([batch_size, num_anchors, code_size]) target_tensor = tf.zeros([batch_size, num_anchors, code_size])
weights = tf.ones([batch_size, num_anchors]) weights = tf.ones([batch_size, num_anchors])
loss_op = losses.WeightedL2LocalizationLoss(anchorwise_output=True) loss_op = losses.WeightedL2LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
expected_loss = np.ones((batch_size, num_anchors)) * 2 expected_loss = np.ones((batch_size, num_anchors)) * 2
...@@ -58,22 +59,6 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase): ...@@ -58,22 +59,6 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
loss_output = sess.run(loss) loss_output = sess.run(loss)
self.assertAllClose(loss_output, expected_loss) self.assertAllClose(loss_output, expected_loss)
def testReturnsCorrectLossSum(self):
batch_size = 3
num_anchors = 16
code_size = 4
prediction_tensor = tf.ones([batch_size, num_anchors, code_size])
target_tensor = tf.zeros([batch_size, num_anchors, code_size])
weights = tf.ones([batch_size, num_anchors])
loss_op = losses.WeightedL2LocalizationLoss(anchorwise_output=False)
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
expected_loss = tf.nn.l2_loss(prediction_tensor - target_tensor)
with self.test_session() as sess:
loss_output = sess.run(loss)
expected_loss_output = sess.run(expected_loss)
self.assertAllClose(loss_output, expected_loss_output)
def testReturnsCorrectNanLoss(self): def testReturnsCorrectNanLoss(self):
batch_size = 3 batch_size = 3
num_anchors = 10 num_anchors = 10
...@@ -87,6 +72,7 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase): ...@@ -87,6 +72,7 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
loss_op = losses.WeightedL2LocalizationLoss() loss_op = losses.WeightedL2LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights, loss = loss_op(prediction_tensor, target_tensor, weights=weights,
ignore_nan_targets=True) ignore_nan_targets=True)
loss = tf.reduce_sum(loss)
expected_loss = (3 * 5 * 4) / 2.0 expected_loss = (3 * 5 * 4) / 2.0
with self.test_session() as sess: with self.test_session() as sess:
...@@ -111,6 +97,7 @@ class WeightedSmoothL1LocalizationLossTest(tf.test.TestCase): ...@@ -111,6 +97,7 @@ class WeightedSmoothL1LocalizationLossTest(tf.test.TestCase):
[0, 3, 0]], tf.float32) [0, 3, 0]], tf.float32)
loss_op = losses.WeightedSmoothL1LocalizationLoss() loss_op = losses.WeightedSmoothL1LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = 7.695 exp_loss = 7.695
with self.test_session() as sess: with self.test_session() as sess:
...@@ -130,6 +117,7 @@ class WeightedIOULocalizationLossTest(tf.test.TestCase): ...@@ -130,6 +117,7 @@ class WeightedIOULocalizationLossTest(tf.test.TestCase):
weights = [[1.0, .5, 2.0]] weights = [[1.0, .5, 2.0]]
loss_op = losses.WeightedIOULocalizationLoss() loss_op = losses.WeightedIOULocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = 2.0 exp_loss = 2.0
with self.test_session() as sess: with self.test_session() as sess:
loss_output = sess.run(loss) loss_output = sess.run(loss)
...@@ -159,6 +147,7 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase): ...@@ -159,6 +147,7 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSigmoidClassificationLoss() loss_op = losses.WeightedSigmoidClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = -2 * math.log(.5) exp_loss = -2 * math.log(.5)
with self.test_session() as sess: with self.test_session() as sess:
...@@ -184,8 +173,9 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase): ...@@ -184,8 +173,9 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1], weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSigmoidClassificationLoss(True) loss_op = losses.WeightedSigmoidClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss, axis=2)
exp_loss = np.matrix([[0, 0, -math.log(.5), 0], exp_loss = np.matrix([[0, 0, -math.log(.5), 0],
[-math.log(.5), 0, 0, 0]]) [-math.log(.5), 0, 0, 0]])
...@@ -214,9 +204,10 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase): ...@@ -214,9 +204,10 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
# Ignores the last class. # Ignores the last class.
class_indices = tf.constant([0, 1, 2], tf.int32) class_indices = tf.constant([0, 1, 2], tf.int32)
loss_op = losses.WeightedSigmoidClassificationLoss(True) loss_op = losses.WeightedSigmoidClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights, loss = loss_op(prediction_tensor, target_tensor, weights=weights,
class_indices=class_indices) class_indices=class_indices)
loss = tf.reduce_sum(loss, axis=2)
exp_loss = np.matrix([[0, 0, -math.log(.5), 0], exp_loss = np.matrix([[0, 0, -math.log(.5), 0],
[-math.log(.5), 0, 0, 0]]) [-math.log(.5), 0, 0, 0]])
...@@ -245,14 +236,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -245,14 +236,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0], [0],
[0]]], tf.float32) [0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1, 1]], tf.float32) weights = tf.constant([[1, 1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=None)
anchorwise_output=True, gamma=2.0, alpha=None) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss( focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
anchorwise_output=True) weights=weights), axis=2)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
weights=weights) target_tensor,
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, weights=weights), axis=2)
weights=weights)
with self.test_session() as sess: with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss]) sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
...@@ -272,14 +262,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -272,14 +262,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0], [0],
[0]]], tf.float32) [0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32) weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=None)
anchorwise_output=True, gamma=2.0, alpha=None) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss( focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
anchorwise_output=True) weights=weights), axis=2)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
weights=weights) target_tensor,
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, weights=weights), axis=2)
weights=weights)
with self.test_session() as sess: with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss]) sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
...@@ -299,14 +288,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -299,14 +288,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0], [0],
[0]]], tf.float32) [0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32) weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=None)
anchorwise_output=False, gamma=2.0, alpha=None) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss( focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
anchorwise_output=False) weights=weights))
focal_loss = focal_loss_op(prediction_tensor, target_tensor, sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
weights=weights) target_tensor,
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, weights=weights))
weights=weights)
with self.test_session() as sess: with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss]) sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
...@@ -326,14 +314,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -326,14 +314,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0], [0],
[0]]], tf.float32) [0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32) weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=1.0)
anchorwise_output=True, gamma=2.0, alpha=1.0) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss( focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
anchorwise_output=True) weights=weights), axis=2)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
weights=weights) target_tensor,
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, weights=weights), axis=2)
weights=weights)
with self.test_session() as sess: with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss]) sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
...@@ -355,14 +342,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -355,14 +342,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0], [0],
[0]]], tf.float32) [0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32) weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=0.0)
anchorwise_output=True, gamma=2.0, alpha=0.0) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss( focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
anchorwise_output=True) weights=weights), axis=2)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
weights=weights) target_tensor,
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, weights=weights), axis=2)
weights=weights)
with self.test_session() as sess: with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss]) sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
...@@ -391,10 +377,8 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -391,10 +377,8 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1], weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=0.5, gamma=0.0)
anchorwise_output=True, alpha=0.5, gamma=0.0) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights) weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
...@@ -423,10 +407,8 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -423,10 +407,8 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1], weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=None, gamma=0.0)
anchorwise_output=True, alpha=None, gamma=0.0) sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights) weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor, sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
...@@ -456,11 +438,10 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -456,11 +438,10 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1], weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 1]], tf.float32) [1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=1.0, gamma=0.0)
anchorwise_output=False, alpha=1.0, gamma=0.0)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights) weights=weights))
with self.test_session() as sess: with self.test_session() as sess:
focal_loss = sess.run(focal_loss) focal_loss = sess.run(focal_loss)
self.assertAllClose( self.assertAllClose(
...@@ -489,11 +470,10 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase): ...@@ -489,11 +470,10 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1], weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 1]], tf.float32) [1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss( focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=0.75, gamma=0.0)
anchorwise_output=False, alpha=0.75, gamma=0.0)
focal_loss = focal_loss_op(prediction_tensor, target_tensor, focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights) weights=weights))
with self.test_session() as sess: with self.test_session() as sess:
focal_loss = sess.run(focal_loss) focal_loss = sess.run(focal_loss)
self.assertAllClose( self.assertAllClose(
...@@ -528,6 +508,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase): ...@@ -528,6 +508,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationLoss() loss_op = losses.WeightedSoftmaxClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = - 1.5 * math.log(.5) exp_loss = - 1.5 * math.log(.5)
with self.test_session() as sess: with self.test_session() as sess:
...@@ -553,7 +534,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase): ...@@ -553,7 +534,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1], weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationLoss(True) loss_op = losses.WeightedSoftmaxClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
exp_loss = np.matrix([[0, 0, - 0.5 * math.log(.5), 0], exp_loss = np.matrix([[0, 0, - 0.5 * math.log(.5), 0],
...@@ -564,7 +545,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase): ...@@ -564,7 +545,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
def testReturnsCorrectAnchorWiseLossWithHighLogitScaleSetting(self): def testReturnsCorrectAnchorWiseLossWithHighLogitScaleSetting(self):
"""At very high logit_scale, all predictions will be ~0.33.""" """At very high logit_scale, all predictions will be ~0.33."""
# TODO(yonib): Also test logit_scale with anchorwise=False. # TODO: Also test logit_scale with anchorwise=False.
logit_scale = 10e16 logit_scale = 10e16
prediction_tensor = tf.constant([[[-100, 100, -100], prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100], [100, -100, -100],
...@@ -584,8 +565,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase): ...@@ -584,8 +565,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32) [1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1], weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 1]], tf.float32) [1, 1, 1, 1]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationLoss( loss_op = losses.WeightedSoftmaxClassificationLoss(logit_scale=logit_scale)
anchorwise_output=True, logit_scale=logit_scale)
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
uniform_distribution_loss = - math.log(.33333333333) uniform_distribution_loss = - math.log(.33333333333)
...@@ -621,6 +601,7 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase): ...@@ -621,6 +601,7 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
loss_op = losses.BootstrappedSigmoidClassificationLoss( loss_op = losses.BootstrappedSigmoidClassificationLoss(
alpha, bootstrap_type='soft') alpha, bootstrap_type='soft')
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = -math.log(.5) exp_loss = -math.log(.5)
with self.test_session() as sess: with self.test_session() as sess:
loss_output = sess.run(loss) loss_output = sess.run(loss)
...@@ -649,6 +630,7 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase): ...@@ -649,6 +630,7 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
loss_op = losses.BootstrappedSigmoidClassificationLoss( loss_op = losses.BootstrappedSigmoidClassificationLoss(
alpha, bootstrap_type='hard') alpha, bootstrap_type='hard')
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = -math.log(.5) exp_loss = -math.log(.5)
with self.test_session() as sess: with self.test_session() as sess:
loss_output = sess.run(loss) loss_output = sess.run(loss)
...@@ -675,9 +657,9 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase): ...@@ -675,9 +657,9 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32) [1, 1, 1, 0]], tf.float32)
alpha = tf.constant(.5, tf.float32) alpha = tf.constant(.5, tf.float32)
loss_op = losses.BootstrappedSigmoidClassificationLoss( loss_op = losses.BootstrappedSigmoidClassificationLoss(
alpha, bootstrap_type='hard', anchorwise_output=True) alpha, bootstrap_type='hard')
loss = loss_op(prediction_tensor, target_tensor, weights=weights) loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss, axis=2)
exp_loss = np.matrix([[0, 0, -math.log(.5), 0], exp_loss = np.matrix([[0, 0, -math.log(.5), 0],
[-math.log(.5), 0, 0, 0]]) [-math.log(.5), 0, 0, 0]])
with self.test_session() as sess: with self.test_session() as sess:
......
...@@ -36,6 +36,8 @@ from abc import abstractmethod ...@@ -36,6 +36,8 @@ from abc import abstractmethod
import tensorflow as tf import tensorflow as tf
from object_detection.utils import ops
class Match(object): class Match(object):
"""Class to store results from the matcher. """Class to store results from the matcher.
...@@ -44,7 +46,7 @@ class Match(object): ...@@ -44,7 +46,7 @@ class Match(object):
convenient methods to query the matching results. convenient methods to query the matching results.
""" """
def __init__(self, match_results): def __init__(self, match_results, use_matmul_gather=False):
"""Constructs a Match object. """Constructs a Match object.
Args: Args:
...@@ -52,6 +54,8 @@ class Match(object): ...@@ -52,6 +54,8 @@ class Match(object):
meaning that column i is matched with row match_results[i]. meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched. (2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored. (3) match_results[i]=-2, meaning that column i is ignored.
use_matmul_gather: Use matrix multiplication based gather instead of
standard tf.gather. (Default: False).
Raises: Raises:
ValueError: if match_results does not have rank 1 or is not an ValueError: if match_results does not have rank 1 or is not an
...@@ -63,6 +67,9 @@ class Match(object): ...@@ -63,6 +67,9 @@ class Match(object):
raise ValueError('match_results should be an int32 or int64 scalar ' raise ValueError('match_results should be an int32 or int64 scalar '
'tensor') 'tensor')
self._match_results = match_results self._match_results = match_results
self._gather_op = tf.gather
if use_matmul_gather:
self._gather_op = ops.matmul_gather_on_zeroth_axis
@property @property
def match_results(self): def match_results(self):
...@@ -163,17 +170,55 @@ class Match(object): ...@@ -163,17 +170,55 @@ class Match(object):
row_indices: int32 tensor of shape [K] with row indices. row_indices: int32 tensor of shape [K] with row indices.
""" """
return self._reshape_and_cast( return self._reshape_and_cast(
tf.gather(self._match_results, self.matched_column_indices())) self._gather_op(self._match_results, self.matched_column_indices()))
def _reshape_and_cast(self, t): def _reshape_and_cast(self, t):
return tf.cast(tf.reshape(t, [-1]), tf.int32) return tf.cast(tf.reshape(t, [-1]), tf.int32)
def gather_based_on_match(self, input_tensor, unmatched_value,
ignored_value):
"""Gathers elements from `input_tensor` based on match results.
For columns that are matched to a row, gathered_tensor[col] is set to
input_tensor[match_results[col]]. For columns that are unmatched,
gathered_tensor[col] is set to unmatched_value. Finally, for columns that
are ignored gathered_tensor[col] is set to ignored_value.
Note that the input_tensor.shape[1:] must match with unmatched_value.shape
and ignored_value.shape
Args:
input_tensor: Tensor to gather values from.
unmatched_value: Constant tensor value for unmatched columns.
ignored_value: Constant tensor value for ignored columns.
Returns:
gathered_tensor: A tensor containing values gathered from input_tensor.
The shape of the gathered tensor is [match_results.shape[0]] +
input_tensor.shape[1:].
"""
input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]),
input_tensor], axis=0)
gather_indices = tf.maximum(self.match_results + 2, 0)
gathered_tensor = self._gather_op(input_tensor, gather_indices)
return gathered_tensor
class Matcher(object): class Matcher(object):
"""Abstract base class for matcher. """Abstract base class for matcher.
""" """
__metaclass__ = ABCMeta __metaclass__ = ABCMeta
def __init__(self, use_matmul_gather=False):
"""Constructs a Matcher.
Args:
use_matmul_gather: Force constructed match objects to use matrix
multiplication based gather instead of standard tf.gather.
(Default: False).
"""
self._use_matmul_gather = use_matmul_gather
def match(self, similarity_matrix, scope=None, **params): def match(self, similarity_matrix, scope=None, **params):
"""Computes matches among row and column indices and returns the result. """Computes matches among row and column indices and returns the result.
...@@ -191,11 +236,12 @@ class Matcher(object): ...@@ -191,11 +236,12 @@ class Matcher(object):
A Match object with the results of matching. A Match object with the results of matching.
""" """
with tf.name_scope(scope, 'Match', [similarity_matrix, params]) as scope: with tf.name_scope(scope, 'Match', [similarity_matrix, params]) as scope:
return Match(self._match(similarity_matrix, **params)) return Match(self._match(similarity_matrix, **params),
self._use_matmul_gather)
@abstractmethod @abstractmethod
def _match(self, similarity_matrix, **params): def _match(self, similarity_matrix, **params):
"""Method to be overriden by implementations. """Method to be overridden by implementations.
Args: Args:
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment