Unverified Commit fd7b6887, authored by Jonathan Huang, committed by GitHub

Merge pull request #3293 from pkulzc/master

Internal changes of object_detection 
parents f98ec55e 1efe98bb
"""Tests for ssd resnet v1 FPN feature extractors."""
import tensorflow as tf
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor_testbase
class SSDResnet50V1FeatureExtractorTest(
ssd_resnet_v1_fpn_feature_extractor_testbase.
SSDResnetFPNFeatureExtractorTestBase):
"""SSDResnet50v1Fpn feature extractor test."""
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple):
min_depth = 32
conv_hyperparams = {}
batch_norm_trainable = True
is_training = True
return ssd_resnet_v1_fpn_feature_extractor.SSDResnet50V1FpnFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable)
def _resnet_scope_name(self):
return 'resnet_v1_50'
class SSDResnet101V1FeatureExtractorTest(
ssd_resnet_v1_fpn_feature_extractor_testbase.
SSDResnetFPNFeatureExtractorTestBase):
"""SSDResnet101v1Fpn feature extractor test."""
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple):
min_depth = 32
conv_hyperparams = {}
batch_norm_trainable = True
is_training = True
return (
ssd_resnet_v1_fpn_feature_extractor.SSDResnet101V1FpnFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable))
def _resnet_scope_name(self):
return 'resnet_v1_101'
class SSDResnet152V1FeatureExtractorTest(
ssd_resnet_v1_fpn_feature_extractor_testbase.
SSDResnetFPNFeatureExtractorTestBase):
"""SSDResnet152v1Fpn feature extractor test."""
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple):
min_depth = 32
conv_hyperparams = {}
batch_norm_trainable = True
is_training = True
return (
ssd_resnet_v1_fpn_feature_extractor.SSDResnet152V1FpnFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable))
def _resnet_scope_name(self):
return 'resnet_v1_152'
if __name__ == '__main__':
tf.test.main()
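For readers skimming the diff, here is a minimal usage sketch (not part of this PR) showing how one of these extractors is exercised, assuming the TF 1.x graph-mode API and the constructor signature used by the tests above:

# Usage sketch only; mirrors the constructor arguments the tests above pass.
import tensorflow as tf
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor

extractor = ssd_resnet_v1_fpn_feature_extractor.SSDResnet50V1FpnFeatureExtractor(
    True,   # is_training
    1.0,    # depth_multiplier
    32,     # min_depth
    1,      # pad_to_multiple
    {},     # conv_hyperparams (the tests pass an empty dict)
    True)   # batch_norm_trainable

images = tf.placeholder(tf.float32, shape=(2, 256, 256, 3))
feature_maps = extractor.extract_features(extractor.preprocess(images))
# For a 256x256 input, the five FPN levels should be 32, 16, 8, 4 and 2 pixels
# on a side, matching expected_feature_map_shape in the test base below.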
"""Tests for ssd resnet v1 FPN feature extractors."""
import abc
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
class SSDResnetFPNFeatureExtractorTestBase(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
"""Helper test class for SSD Resnet v1 FPN feature extractors."""
@abc.abstractmethod
def _resnet_scope_name(self):
pass
@abc.abstractmethod
def _fpn_scope_name(self):
return 'fpn'
def test_extract_features_returns_correct_shapes_256(self):
image_height = 256
image_width = 256
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
(2, 8, 8, 256), (2, 4, 4, 256),
(2, 2, 2, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self):
image_height = 256
image_width = 256
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
(2, 8, 8, 256), (2, 4, 4, 256),
(2, 2, 2, 256)]
self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
image_height = 254
image_width = 254
depth_multiplier = 1.0
pad_to_multiple = 32
expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
(2, 8, 8, 256), (2, 4, 4, 256),
(2, 2, 2, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_raises_error_with_invalid_image_size(self):
image_height = 32
image_width = 32
depth_multiplier = 1.0
pad_to_multiple = 1
self.check_extract_features_raises_error_with_invalid_image_size(
image_height, image_width, depth_multiplier, pad_to_multiple)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
pad_to_multiple = 1
test_image = np.random.rand(4, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertAllClose(preprocessed_image,
test_image - [[123.68, 116.779, 103.939]])
def test_variables_only_created_in_scope(self):
depth_multiplier = 1
pad_to_multiple = 1
g = tf.Graph()
with g.as_default():
feature_extractor = self._create_feature_extractor(
depth_multiplier, pad_to_multiple)
preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
feature_extractor.extract_features(preprocessed_inputs)
variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
for variable in variables:
self.assertTrue(
variable.name.startswith(self._resnet_scope_name())
or variable.name.startswith(self._fpn_scope_name()))
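The expected_feature_map_shape values above are not arbitrary: the FPN emits five levels with strides 8 through 128, each with 256 channels. A sketch of the arithmetic:

# Derivation of the expected shapes used in the tests above (plain arithmetic).
batch, image_size, fpn_depth = 2, 256, 256
strides = [8, 16, 32, 64, 128]  # five FPN levels
expected = [(batch, image_size // s, image_size // s, fpn_depth)
            for s in strides]
assert expected == [(2, 32, 32, 256), (2, 16, 16, 256), (2, 8, 8, 256),
                    (2, 4, 4, 256), (2, 2, 2, 256)]

The same arithmetic explains the pad_to_multiple test: a 254x254 input padded up to a multiple of 32 becomes 256x256 and yields identical shapes.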
@@ -35,6 +35,7 @@
"from io import StringIO\n",
"from matplotlib import pyplot as plt\n",
"from PIL import Image\n",
"from object_detection.utils import ops as utils_ops\n",
"\n",
"if tf.__version__ < '1.4.0':\n",
" raise ImportError('Please upgrade your tensorflow installation to v1.4.* or later!')\n"
@@ -223,6 +224,59 @@
"IMAGE_SIZE = (12, 8)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def run_inference_for_single_image(image, graph):\n",
" with graph.as_default():\n",
" with tf.Session() as sess:\n",
" # Get handles to input and output tensors\n",
" ops = tf.get_default_graph().get_operations()\n",
" all_tensor_names = {output.name for op in ops for output in op.outputs}\n",
" tensor_dict = {}\n",
" for key in [\n",
" 'num_detections', 'detection_boxes', 'detection_scores',\n",
" 'detection_classes', 'detection_masks'\n",
" ]:\n",
" tensor_name = key + ':0'\n",
" if tensor_name in all_tensor_names:\n",
" tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(\n",
" tensor_name)\n",
" if 'detection_masks' in tensor_dict:\n",
" # The following processing is only for single image\n",
" detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])\n",
" detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])\n",
" # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.\n",
" real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)\n",
" detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])\n",
" detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])\n",
" detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(\n",
" detection_masks, detection_boxes, image.shape[0], image.shape[1])\n",
" detection_masks_reframed = tf.cast(\n",
" tf.greater(detection_masks_reframed, 0.5), tf.uint8)\n",
" # Follow the convention by adding back the batch dimension\n",
" tensor_dict['detection_masks'] = tf.expand_dims(\n",
" detection_masks_reframed, 0)\n",
" image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')\n",
"\n",
" # Run inference\n",
" output_dict = sess.run(tensor_dict,\n",
" feed_dict={image_tensor: np.expand_dims(image, 0)})\n",
"\n",
" # all outputs are float32 numpy arrays, so convert types as appropriate\n",
" output_dict['num_detections'] = int(output_dict['num_detections'][0])\n",
" output_dict['detection_classes'] = output_dict[\n",
" 'detection_classes'][0].astype(np.uint8)\n",
" output_dict['detection_boxes'] = output_dict['detection_boxes'][0]\n",
" output_dict['detection_scores'] = output_dict['detection_scores'][0]\n",
" if 'detection_masks' in output_dict:\n",
" output_dict['detection_masks'] = output_dict['detection_masks'][0]\n",
" return output_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -231,39 +285,27 @@
},
"outputs": [],
"source": [
- "with detection_graph.as_default():\n",
- "  with tf.Session(graph=detection_graph) as sess:\n",
- "    # Definite input and output Tensors for detection_graph\n",
- "    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')\n",
- "    # Each box represents a part of the image where a particular object was detected.\n",
- "    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')\n",
- "    # Each score represent how level of confidence for each of the objects.\n",
- "    # Score is shown on the result image, together with the class label.\n",
- "    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')\n",
- "    detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')\n",
- "    num_detections = detection_graph.get_tensor_by_name('num_detections:0')\n",
- "    for image_path in TEST_IMAGE_PATHS:\n",
- "      image = Image.open(image_path)\n",
- "      # the array based representation of the image will be used later in order to prepare the\n",
- "      # result image with boxes and labels on it.\n",
- "      image_np = load_image_into_numpy_array(image)\n",
- "      # Expand dimensions since the model expects images to have shape: [1, None, None, 3]\n",
- "      image_np_expanded = np.expand_dims(image_np, axis=0)\n",
- "      # Actual detection.\n",
- "      (boxes, scores, classes, num) = sess.run(\n",
- "          [detection_boxes, detection_scores, detection_classes, num_detections],\n",
- "          feed_dict={image_tensor: image_np_expanded})\n",
- "      # Visualization of the results of a detection.\n",
- "      vis_util.visualize_boxes_and_labels_on_image_array(\n",
- "          image_np,\n",
- "          np.squeeze(boxes),\n",
- "          np.squeeze(classes).astype(np.int32),\n",
- "          np.squeeze(scores),\n",
- "          category_index,\n",
- "          use_normalized_coordinates=True,\n",
- "          line_thickness=8)\n",
- "      plt.figure(figsize=IMAGE_SIZE)\n",
- "      plt.imshow(image_np)"
+ "for image_path in TEST_IMAGE_PATHS:\n",
+ "  image = Image.open(image_path)\n",
+ "  # the array based representation of the image will be used later in order to prepare the\n",
+ "  # result image with boxes and labels on it.\n",
+ "  image_np = load_image_into_numpy_array(image)\n",
+ "  # Expand dimensions since the model expects images to have shape: [1, None, None, 3]\n",
+ "  image_np_expanded = np.expand_dims(image_np, axis=0)\n",
+ "  # Actual detection.\n",
+ "  output_dict = run_inference_for_single_image(image_np, detection_graph)\n",
+ "  # Visualization of the results of a detection.\n",
+ "  vis_util.visualize_boxes_and_labels_on_image_array(\n",
+ "      image_np,\n",
+ "      output_dict['detection_boxes'],\n",
+ "      output_dict['detection_classes'],\n",
+ "      output_dict['detection_scores'],\n",
+ "      category_index,\n",
+ "      instance_masks=output_dict.get('detection_masks'),\n",
+ "      use_normalized_coordinates=True,\n",
+ "      line_thickness=8)\n",
+ "  plt.figure(figsize=IMAGE_SIZE)\n",
+ "  plt.imshow(image_np)"
]
},
{
@@ -275,6 +317,9 @@
}
],
"metadata": {
"colab": {
"version": "0.3.2"
},
"kernelspec": {
"display_name": "Python 2",
"language": "python",
......
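The loop above assumes a `detection_graph` built earlier in the notebook, which is elided from this hunk. For context, a hedged sketch of how that graph is typically loaded in this tutorial (the path is a placeholder):

# Sketch (assumed from the elided part of the notebook): load a frozen
# inference graph into `detection_graph`.
import tensorflow as tf

PATH_TO_FROZEN_GRAPH = 'frozen_inference_graph.pb'  # placeholder path

detection_graph = tf.Graph()
with detection_graph.as_default():
  od_graph_def = tf.GraphDef()
  with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
    od_graph_def.ParseFromString(fid.read())
    tf.import_graph_def(od_graph_def, name='')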
@@ -9,6 +9,7 @@ licenses(["notice"])
proto_library(
    name = "argmax_matcher_proto",
    srcs = ["argmax_matcher.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -20,6 +21,7 @@ py_proto_library(
proto_library(
    name = "bipartite_matcher_proto",
    srcs = ["bipartite_matcher.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -31,6 +33,7 @@ py_proto_library(
proto_library(
    name = "matcher_proto",
    srcs = ["matcher.proto"],
    cc_api_version = 2,
    deps = [
        ":argmax_matcher_proto",
        ":bipartite_matcher_proto",
@@ -46,6 +49,7 @@ py_proto_library(
proto_library(
    name = "faster_rcnn_box_coder_proto",
    srcs = ["faster_rcnn_box_coder.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -57,6 +61,7 @@ py_proto_library(
proto_library(
    name = "keypoint_box_coder_proto",
    srcs = ["keypoint_box_coder.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -68,6 +73,7 @@ py_proto_library(
proto_library(
    name = "mean_stddev_box_coder_proto",
    srcs = ["mean_stddev_box_coder.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -79,6 +85,7 @@ py_proto_library(
proto_library(
    name = "square_box_coder_proto",
    srcs = ["square_box_coder.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -90,6 +97,7 @@ py_proto_library(
proto_library(
    name = "box_coder_proto",
    srcs = ["box_coder.proto"],
    cc_api_version = 2,
    deps = [
        ":faster_rcnn_box_coder_proto",
        ":keypoint_box_coder_proto",
@@ -107,6 +115,7 @@ py_proto_library(
proto_library(
    name = "grid_anchor_generator_proto",
    srcs = ["grid_anchor_generator.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -118,6 +127,7 @@ py_proto_library(
proto_library(
    name = "ssd_anchor_generator_proto",
    srcs = ["ssd_anchor_generator.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -126,11 +136,25 @@ py_proto_library(
    deps = [":ssd_anchor_generator_proto"],
)

proto_library(
    name = "multiscale_anchor_generator_proto",
    srcs = ["multiscale_anchor_generator.proto"],
    cc_api_version = 2,
)

py_proto_library(
    name = "multiscale_anchor_generator_py_pb2",
    api_version = 2,
    deps = [":multiscale_anchor_generator_proto"],
)

proto_library(
    name = "anchor_generator_proto",
    srcs = ["anchor_generator.proto"],
    cc_api_version = 2,
    deps = [
        ":grid_anchor_generator_proto",
        ":multiscale_anchor_generator_proto",
        ":ssd_anchor_generator_proto",
    ],
)
@@ -144,6 +168,7 @@ py_proto_library(
proto_library(
    name = "input_reader_proto",
    srcs = ["input_reader.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -155,6 +180,7 @@ py_proto_library(
proto_library(
    name = "losses_proto",
    srcs = ["losses.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -166,6 +192,7 @@ py_proto_library(
proto_library(
    name = "optimizer_proto",
    srcs = ["optimizer.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -177,6 +204,7 @@ py_proto_library(
proto_library(
    name = "post_processing_proto",
    srcs = ["post_processing.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -188,6 +216,7 @@ py_proto_library(
proto_library(
    name = "hyperparams_proto",
    srcs = ["hyperparams.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -199,6 +228,7 @@ py_proto_library(
proto_library(
    name = "box_predictor_proto",
    srcs = ["box_predictor.proto"],
    cc_api_version = 2,
    deps = [":hyperparams_proto"],
)
@@ -211,6 +241,7 @@ py_proto_library(
proto_library(
    name = "region_similarity_calculator_proto",
    srcs = ["region_similarity_calculator.proto"],
    cc_api_version = 2,
    deps = [],
)
@@ -223,6 +254,7 @@ py_proto_library(
proto_library(
    name = "preprocessor_proto",
    srcs = ["preprocessor.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -234,6 +266,7 @@ py_proto_library(
proto_library(
    name = "train_proto",
    srcs = ["train.proto"],
    cc_api_version = 2,
    deps = [
        ":optimizer_proto",
        ":preprocessor_proto",
@@ -249,6 +282,7 @@ py_proto_library(
proto_library(
    name = "eval_proto",
    srcs = ["eval.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -260,6 +294,7 @@ py_proto_library(
proto_library(
    name = "image_resizer_proto",
    srcs = ["image_resizer.proto"],
    cc_api_version = 2,
)

py_proto_library(
@@ -271,19 +306,21 @@ py_proto_library(
proto_library(
    name = "faster_rcnn_proto",
    srcs = ["faster_rcnn.proto"],
    cc_api_version = 2,
    deps = [
        ":box_predictor_proto",
-       "//object_detection/protos:anchor_generator_proto",
-       "//object_detection/protos:hyperparams_proto",
-       "//object_detection/protos:image_resizer_proto",
-       "//object_detection/protos:losses_proto",
-       "//object_detection/protos:post_processing_proto",
+       "//tensorflow/models/research/object_detection/protos:anchor_generator_proto",
+       "//tensorflow/models/research/object_detection/protos:hyperparams_proto",
+       "//tensorflow/models/research/object_detection/protos:image_resizer_proto",
+       "//tensorflow/models/research/object_detection/protos:losses_proto",
+       "//tensorflow/models/research/object_detection/protos:post_processing_proto",
    ],
)

proto_library(
    name = "ssd_proto",
    srcs = ["ssd.proto"],
    cc_api_version = 2,
    deps = [
        ":anchor_generator_proto",
        ":box_coder_proto",
@@ -300,6 +337,7 @@ proto_library(
proto_library(
    name = "model_proto",
    srcs = ["model.proto"],
    cc_api_version = 2,
    deps = [
        ":faster_rcnn_proto",
        ":ssd_proto",
@@ -315,6 +353,7 @@ py_proto_library(
proto_library(
    name = "pipeline_proto",
    srcs = ["pipeline.proto"],
    cc_api_version = 2,
    deps = [
        ":eval_proto",
        ":input_reader_proto",
@@ -332,6 +371,7 @@ py_proto_library(
proto_library(
    name = "string_int_label_map_proto",
    srcs = ["string_int_label_map.proto"],
    cc_api_version = 2,
)

py_proto_library(
......
@@ -4,6 +4,7 @@ package object_detection.protos;
import "object_detection/protos/grid_anchor_generator.proto";
import "object_detection/protos/ssd_anchor_generator.proto";
import "object_detection/protos/multiscale_anchor_generator.proto";

// Configuration proto for the anchor generator to use in the object detection
// pipeline. See core/anchor_generator.py for details.
@@ -11,5 +12,6 @@ message AnchorGenerator {
  oneof anchor_generator_oneof {
    GridAnchorGenerator grid_anchor_generator = 1;
    SsdAnchorGenerator ssd_anchor_generator = 2;
    MultiscaleAnchorGenerator multiscale_anchor_generator = 3;
  }
}
@@ -22,4 +22,8 @@ message ArgMaxMatcher {
  // Whether to ensure each row is matched to at least one column.
  optional bool force_match_for_each_row = 5 [default = false];

  // Force constructed match objects to use matrix multiplication based gather
  // instead of standard tf.gather.
  optional bool use_matmul_gather = 6 [default = false];
}
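A hedged example of setting the new field in practice, parsing a matcher config through the generated Python proto (this assumes the standard protobuf text_format API and the matcher_pb2 module built from these protos):

# Example only: enable the new use_matmul_gather option on an ArgMaxMatcher.
from google.protobuf import text_format
from object_detection.protos import matcher_pb2

matcher_config = text_format.Parse("""
  argmax_matcher {
    matched_threshold: 0.5
    unmatched_threshold: 0.5
    use_matmul_gather: true
  }
""", matcher_pb2.Matcher())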
@@ -5,4 +5,7 @@ package object_detection.protos;
// Configuration proto for bipartite matcher. See
// matchers/bipartite_matcher.py for details.
message BipartiteMatcher {
  // Force constructed match objects to use matrix multiplication based gather
  // instead of standard tf.gather.
  optional bool use_matmul_gather = 6 [default = false];
}
@@ -11,6 +11,7 @@ message BoxPredictor {
    ConvolutionalBoxPredictor convolutional_box_predictor = 1;
    MaskRCNNBoxPredictor mask_rcnn_box_predictor = 2;
    RfcnBoxPredictor rfcn_box_predictor = 3;
    WeightSharedConvolutionalBoxPredictor weight_shared_convolutional_box_predictor = 4;
  }
}
@@ -46,10 +47,40 @@ message ConvolutionalBoxPredictor {
  optional int32 box_code_size = 8 [default = 4];

  // Whether to apply sigmoid to the output of class predictions.
- // TODO: Do we need this since we have a post processing module.?
+ // TODO(jonathanhuang): Do we need this since we have a post processing
+ // module?
  optional bool apply_sigmoid_to_scores = 9 [default = false];

  optional float class_prediction_bias_init = 10 [default = 0.0];

  // Whether to use depthwise separable convolution for box predictor layers.
  optional bool use_depthwise = 11 [default = false];
}

// Configuration proto for the weight-shared convolutional box predictor.
message WeightSharedConvolutionalBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Number of additional conv layers before the predictor.
  optional int32 num_layers_before_predictor = 4 [default = 0];

  // Output depth for the convolution ops prior to predicting box encodings
  // and class predictions.
  optional int32 depth = 2 [default = 0];

  // Size of the final convolution kernel. If the spatial resolution of the
  // feature map is smaller than the kernel size, the kernel size is set to
  // min(feature_width, feature_height).
  optional int32 kernel_size = 7 [default = 3];

  // Size of the encoding for boxes.
  optional int32 box_code_size = 8 [default = 4];

  // Bias initialization for class prediction. It has been shown to stabilize
  // training where there are a large number of negative boxes. See
  // https://arxiv.org/abs/1708.02002 for details.
  optional float class_prediction_bias_init = 10 [default = 0.0];
}

message MaskRCNNBoxPredictor {
@@ -71,12 +102,22 @@ message MaskRCNNBoxPredictor {
  // Whether to predict instance masks inside detection boxes.
  optional bool predict_instance_masks = 6 [default = false];

- // The depth for the first conv2d_transpose op applied to the
- // image_features in the mask prediciton branch
+ // The depth for the first conv2d_transpose op applied to the
+ // image_features in the mask prediction branch. If set to 0, the value
+ // will be set automatically based on the number of channels in the image
+ // features and the number of classes.
  optional int32 mask_prediction_conv_depth = 7 [default = 256];

  // Whether to predict keypoints inside detection boxes.
  optional bool predict_keypoints = 8 [default = false];

  // The height and the width of the predicted mask.
  optional int32 mask_height = 9 [default = 15];
  optional int32 mask_width = 10 [default = 15];

  // The number of convolutions applied to image_features in the mask
  // prediction branch.
  optional int32 mask_prediction_num_conv_layers = 11 [default = 2];
}

message RfcnBoxPredictor {
......
@@ -26,9 +26,8 @@ message EvalConfig {
  // BNS name of the TensorFlow master.
  optional string eval_master = 7 [default=""];

- // Type of metrics to use for evaluation. Currently supports only Pascal VOC
- // detection metrics.
- optional string metrics_set = 8 [default="pascal_voc_metrics"];
+ // Type of metrics to use for evaluation.
+ repeated string metrics_set = 8;

  // Path to export detections to COCO compatible JSON format.
  optional string export_path = 9 [default=''];
@@ -38,10 +37,35 @@ message EvalConfig {
  optional bool ignore_groundtruth = 10 [default=false];

  // Use exponential moving averages of variables for evaluation.
  // TODO(rathodv): When this is false make sure the model is constructed
  // without moving averages in restore_fn.
  optional bool use_moving_averages = 11 [default=false];

  // Whether to evaluate instance masks.
  // Note that since there is no evaluation code currently for instance
  // segmentation this option is unused.
  optional bool eval_instance_masks = 12 [default=false];

  // Minimum score threshold for a detected object box to be visualized.
  optional float min_score_threshold = 13 [default=0.5];

  // Maximum number of detections to visualize.
  optional int32 max_num_boxes_to_visualize = 14 [default=20];

  // When drawing a single detection, each label is by default visualized as
  // <label name> : <label score>. One can skip the name and/or score using the
  // following fields:
  optional bool skip_scores = 15 [default=false];
  optional bool skip_labels = 16 [default=false];

  // Whether to show groundtruth boxes in addition to detected boxes in
  // visualizations.
  optional bool visualize_groundtruth_boxes = 17 [default=false];

  // Box color for visualizing groundtruth boxes.
  optional string groundtruth_box_visualization_color = 18 [default="black"];

  // Whether to keep image identifier in filename when exported to
  // visualization_export_dir.
  optional bool keep_image_id_for_visualization_export = 19 [default=false];
}
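Since metrics_set is now repeated, several metric sets can be requested in one EvalConfig. A hedged sketch follows; the metric-set names are assumptions based on common values used by the API at the time, not taken from this diff:

# Example only: request multiple metric sets via the now-repeated field.
from google.protobuf import text_format
from object_detection.protos import eval_pb2

eval_config = text_format.Parse("""
  metrics_set: 'coco_detection_metrics'
  metrics_set: 'pascal_voc_metrics'
  num_examples: 8000
""", eval_pb2.EvalConfig())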
@@ -20,7 +20,7 @@ import "object_detection/protos/post_processing.proto";
message FasterRcnn {
  // Whether to construct only the Region Proposal Network (RPN).
- optional bool first_stage_only = 1 [default=false];
+ optional int32 number_of_stages = 1 [default=2];

  // Number of classes to predict.
  optional int32 num_classes = 3;
......
@@ -29,6 +29,14 @@ message KeepAspectRatioResizer {
  // Desired method when resizing image.
  optional ResizeType resize_method = 3 [default = BILINEAR];

  // Whether to pad the image with zeros so the output spatial size is
  // [max_dimension, max_dimension]. Note that the zeros are padded to the
  // bottom and the right of the resized image.
  optional bool pad_to_max_dimension = 4 [default = false];

  // Whether to also resize the image channels from 3 to 1 (RGB to grayscale).
  optional bool convert_to_grayscale = 5 [default = false];
}

// Configuration proto for image resizer that resizes to a fixed shape.
@@ -41,4 +49,7 @@ message FixedShapeResizer {
  // Desired method when resizing image.
  optional ResizeType resize_method = 3 [default = BILINEAR];

  // Whether to also resize the image channels from 3 to 1 (RGB to grayscale).
  optional bool convert_to_grayscale = 4 [default = false];
}
@@ -15,6 +15,13 @@ package object_detection.protos;
// 'groundtruth_instance_masks': (Optional), a [num_boxes, image_height,
// image_width] float tensor storing binary mask of the objects in boxes.

// Instance mask format. Note that PNG masks are much more space efficient.
enum InstanceMaskType {
  DEFAULT = 0;          // Default implementation, currently NUMERICAL_MASKS.
  NUMERICAL_MASKS = 1;  // [num_masks, H, W] float32 binary masks.
  PNG_MASKS = 2;        // Encoded PNG masks.
}

message InputReader {
  // Path to StringIntLabelMap pbtxt file specifying the mapping from string
  // labels to integer ids.
@@ -24,6 +31,12 @@ message InputReader {
  // shuffled randomly.
  optional bool shuffle = 2 [default=true];

  // Buffer size to be used when shuffling.
  optional uint32 shuffle_buffer_size = 11 [default = 100];

  // Buffer size to be used when shuffling file names.
  optional uint32 filenames_shuffle_buffer_size = 12 [default = 100];

  // Maximum number of records to keep in reader queue.
  optional uint32 queue_capacity = 3 [default=2000];
@@ -38,9 +51,15 @@ message InputReader {
  // Number of reader instances to create.
  optional uint32 num_readers = 6 [default=8];

  // Size of the buffer for prefetching (in batches).
  optional uint32 prefetch_buffer_size = 13 [default = 2];

  // Whether to load groundtruth instance masks.
  optional bool load_instance_masks = 7 [default = false];

  // Type of instance mask.
  optional InstanceMaskType mask_type = 10 [default = NUMERICAL_MASKS];

  oneof input_reader {
    TFRecordInputReader tf_record_input_reader = 8;
    ExternalInputReader external_input_reader = 9;
......
@@ -33,12 +33,14 @@ message LocalizationLoss {
// L2 location loss: 0.5 * ||weight * (a - b)|| ^ 2
message WeightedL2LocalizationLoss {
  // DEPRECATED, do not use.
  // Output loss per anchor.
  optional bool anchorwise_output = 1 [default=false];
}

// SmoothL1 (Huber) location loss: .5 * x ^ 2 if |x| < 1 else |x| - .5
message WeightedSmoothL1LocalizationLoss {
  // DEPRECATED, do not use.
  // Output loss per anchor.
  optional bool anchorwise_output = 1 [default=false];
}
@@ -59,6 +61,7 @@ message ClassificationLoss {
// Classification loss using a sigmoid function over class predictions.
message WeightedSigmoidClassificationLoss {
  // DEPRECATED, do not use.
  // Output loss per anchor.
  optional bool anchorwise_output = 1 [default=false];
}
@@ -66,6 +69,7 @@ message WeightedSigmoidClassificationLoss {
// Sigmoid Focal cross entropy loss as described in
// https://arxiv.org/abs/1708.02002
message SigmoidFocalClassificationLoss {
  // DEPRECATED, do not use.
  optional bool anchorwise_output = 1 [default = false];

  // Modulating factor for the loss.
  optional float gamma = 2 [default = 2.0];
@@ -75,6 +79,7 @@ message SigmoidFocalClassificationLoss {
// Classification loss using a softmax function over class predictions.
message WeightedSoftmaxClassificationLoss {
  // DEPRECATED, do not use.
  // Output loss per anchor.
  optional bool anchorwise_output = 1 [default=false];

  // Scale logit (input) value before calculating softmax classification loss.
@@ -93,6 +98,7 @@ message BootstrappedSigmoidClassificationLoss {
  // probabilities.
  optional bool hard_bootstrap = 2 [default=false];

  // DEPRECATED, do not use.
  // Output loss per anchor.
  optional bool anchorwise_output = 3 [default=false];
}
......
syntax = "proto2";
package object_detection.protos;
// Configuration proto for RetinaNet anchor generator described in
// https://arxiv.org/abs/1708.02002. See
// anchor_generators/multiscale_grid_anchor_generator.py for details.
message MultiscaleAnchorGenerator {
// minimum level in feature pyramid
optional int32 min_level = 1 [default = 3];
// maximum level in feature pyramid
optional int32 max_level = 2 [default = 7];
// Scale of anchor to feature stride
optional float anchor_scale = 3 [default = 4.0];
// Aspect ratios for anchors at each grid point.
repeated float aspect_ratios = 4;
// Number of intermediate scale each scale octave
optional int32 scales_per_octave = 5 [default = 2];
}
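How scales_per_octave expands into per-location anchor sizes, sketched as plain arithmetic rather than the library code:

# With anchor_scale = 4.0 at a stride-8 level and scales_per_octave = 2, the
# generator places anchors at scales 2^0 and 2^(1/2), i.e. base sizes of
# 32 and ~45.25 pixels per aspect ratio.
anchor_scale, stride, scales_per_octave = 4.0, 8, 2
octave_scales = [2 ** (i / float(scales_per_octave))
                 for i in range(scales_per_octave)]
base_sizes = [anchor_scale * stride * s for s in octave_scales]
# base_sizes == [32.0, 45.254833...]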
@@ -15,4 +15,5 @@ message TrainEvalPipelineConfig {
  optional InputReader train_input_reader = 3;
  optional EvalConfig eval_config = 4;
  optional InputReader eval_input_reader = 5;
  extensions 1000 to max;
}
@@ -3,7 +3,7 @@ syntax = "proto2";
package object_detection.protos;

// Message for defining a preprocessing operation on input data.
- // See: //object_detection/core/preprocessor.py
+ // See: //third_party/tensorflow_models/object_detection/core/preprocessor.py
message PreprocessingStep {
  oneof preprocessing_step {
    NormalizeImage normalize_image = 1;
@@ -32,6 +32,7 @@ message PreprocessingStep {
    SSDRandomCropPadFixedAspectRatio ssd_random_crop_pad_fixed_aspect_ratio = 24;
    RandomVerticalFlip random_vertical_flip = 25;
    RandomRotation90 random_rotation90 = 26;
    RGBtoGray rgb_to_gray = 27;
  }
}
@@ -202,7 +203,7 @@ message RandomCropPadImage {
  repeated float max_padded_size_ratio = 9;

  // Color of the padding. If unset, will pad using average color of the input
- // image.
+ // image. This field should be of length 3.
  repeated float pad_color = 10;
}
@@ -236,6 +237,11 @@ message RandomResizeMethod {
  optional float target_width = 2;
}

// Converts the RGB image to a grayscale image. This also converts the image
// depth from 3 to 1, unlike RandomRGBtoGray which does not change the image
// depth.
message RGBtoGray {}

// Scales boxes from normalized coordinates to pixel coordinates.
message ScaleBoxesToPixelCoordinates {
}
......
@@ -82,4 +82,12 @@ message SsdFeatureExtractor {
  // will apply only to the additional layers that are added and are outside the
  // canned arg_scope.
  optional bool batch_norm_trainable = 6 [default=true];

  // Whether to use explicit padding when extracting SSD multiresolution
  // features. Note that this does not apply to the base feature extractor.
  optional bool use_explicit_padding = 7 [default=false];

  // Whether to use depthwise separable convolutions to extract additional
  // feature maps added by SSD.
  optional bool use_depthwise = 8 [default=false];
}
@@ -35,6 +35,11 @@ message TrainConfig {
  // If false, it assumes the checkpoint was an object classification model.
  optional bool from_detection_checkpoint = 8 [default=false];

  // Whether to load all checkpoint vars that match model variable names and
  // sizes. This option is only available if `from_detection_checkpoint` is
  // True.
  optional bool load_all_detection_checkpoint_vars = 19 [default = false];

  // Number of steps to train the DetectionModel for. If 0, will train the
  // model indefinitely.
  optional uint32 num_steps = 9 [default=0];
@@ -66,4 +71,21 @@ message TrainConfig {
  // This is useful when each box can have multiple labels.
  // Note that only Sigmoid classification losses should be used.
  optional bool merge_multiple_label_boxes = 17 [default=false];

  // Whether to add regularization loss to `total_loss`. This is true by
  // default and adds all regularization losses defined in the model to
  // `total_loss`.
  // Setting this option to false is very useful while debugging the model and
  // losses.
  optional bool add_regularization_loss = 18 [default=true];

  // Maximum number of boxes used during training.
  // Set this to at least the maximum number of boxes in the input data.
  // Otherwise, it may cause "Data loss: Attempted to pad to a smaller size
  // than the input element" errors.
  optional int32 max_number_of_boxes = 20 [default=50];

  // Whether to remove padding along the `num_boxes` dimension of the
  // groundtruth tensors.
  optional bool unpad_groundtruth_tensors = 21 [default=true];
}
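A hedged TrainConfig fragment exercising the newly added fields; the values are illustrative, and load_all_detection_checkpoint_vars requires from_detection_checkpoint per the comment above:

# Example only: a TrainConfig using the fields added in this change.
from google.protobuf import text_format
from object_detection.protos import train_pb2

train_config = text_format.Parse("""
  batch_size: 32
  from_detection_checkpoint: true
  load_all_detection_checkpoint_vars: true
  add_regularization_loss: true
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: true
""", train_pb2.TrainConfig())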
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])

exports_files([
    "faster_rcnn_resnet50_pets.config",
    "ssd_inception_v2_pets.config",
    "ssd_mobilenet_v1_focal_loss_pets.config",
])
# Embedded SSD with Mobilenet v1 configuration for MSCOCO Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
  ssd {
    num_classes: 90
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    anchor_generator {
      ssd_anchor_generator {
        num_layers: 5
        min_scale: 0.2
        max_scale: 0.95
        aspect_ratios: 1.0
        aspect_ratios: 2.0
        aspect_ratios: 0.5
        aspect_ratios: 3.0
        aspect_ratios: 0.3333
      }
    }
    image_resizer {
      fixed_shape_resizer {
        height: 256
        width: 256
      }
    }
    box_predictor {
      convolutional_box_predictor {
        min_depth: 0
        max_depth: 0
        num_layers_before_predictor: 0
        use_dropout: false
        dropout_keep_probability: 0.8
        kernel_size: 1
        box_code_size: 4
        apply_sigmoid_to_scores: false
        conv_hyperparams {
          activation: RELU_6,
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            truncated_normal_initializer {
              stddev: 0.03
              mean: 0.0
            }
          }
          batch_norm {
            train: true,
            scale: true,
            center: true,
            decay: 0.9997,
            epsilon: 0.001,
          }
        }
      }
    }
    feature_extractor {
      type: 'embedded_ssd_mobilenet_v1'
      min_depth: 16
      depth_multiplier: 0.125
      conv_hyperparams {
        activation: RELU_6,
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          train: true,
          scale: true,
          center: true,
          decay: 0.9997,
          epsilon: 0.001,
        }
      }
    }
    loss {
      classification_loss {
        weighted_sigmoid {
        }
      }
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      hard_example_miner {
        num_hard_examples: 3000
        iou_threshold: 0.99
        loss_type: CLASSIFICATION
        max_negatives_per_positive: 3
        min_negatives_per_image: 0
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    normalize_loss_by_num_matches: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: 1e-8
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}

train_config: {
  batch_size: 32
  optimizer {
    rms_prop_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.004
          decay_steps: 800720
          decay_factor: 0.95
        }
      }
      momentum_optimizer_value: 0.9
      decay: 0.9
      epsilon: 1.0
    }
  }
  fine_tune_checkpoint: "/PATH_TO_BE_CONFIGURED/model.ckpt"
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    ssd_random_crop {
    }
  }
}

train_input_reader: {
  tf_record_input_reader {
    input_path: "PATH_TO_BE_CONFIGURED/mscoco_train.record"
  }
  label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
}

eval_config: {
  num_examples: 8000
  use_moving_averages: true
}

eval_input_reader: {
  tf_record_input_reader {
    input_path: "PATH_TO_BE_CONFIGURED/mscoco_val.record"
  }
  label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
  shuffle: false
  num_readers: 1
}
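To sanity-check a config like the one above, it can be parsed into the TrainEvalPipelineConfig message defined in pipeline.proto. A hedged sketch, with the file path as a placeholder:

# Example only: parse the sample config into the pipeline proto.
import tensorflow as tf
from google.protobuf import text_format
from object_detection.protos import pipeline_pb2

pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
with tf.gfile.GFile('embedded_ssd_mobilenet_v1_coco.config', 'r') as f:  # placeholder path
  text_format.Merge(f.read(), pipeline_config)
print(pipeline_config.model.ssd.num_classes)  # 90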