Commit a4944a57 authored by derekjchow, committed by Sergio Guadarrama

Add Tensorflow Object Detection API. (#1561)

For details see our paper:
"Speed/accuracy trade-offs for modern convolutional object detectors."
Huang J, Rathod V, Sun C, Zhu M, Korattikara A, Fathi A, Fischer I,
Wojna Z, Song Y, Guadarrama S, Murphy K, CVPR 2017
https://arxiv.org/abs/1611.10012
parent 60c3ed2e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.models.ssd_inception_v2_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_inception_v2_feature_extractor
class SsdInceptionV2FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase,
tf.test.TestCase):
def _create_feature_extractor(self, depth_multiplier):
"""Constructs a SsdInceptionV2FeatureExtractor.
Args:
depth_multiplier: float depth multiplier for feature extractor
Returns:
an ssd_inception_v2_feature_extractor.SSDInceptionV2FeatureExtractor.
"""
min_depth = 32
conv_hyperparams = {}
return ssd_inception_v2_feature_extractor.SSDInceptionV2FeatureExtractor(
depth_multiplier, min_depth, conv_hyperparams)
def test_extract_features_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
expected_feature_map_shape = [(4, 8, 8, 576), (4, 4, 4, 1024),
(4, 2, 2, 512), (4, 1, 1, 256),
(4, 1, 1, 256), (4, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
image_height, image_width, depth_multiplier, expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_299(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
expected_feature_map_shape = [(4, 19, 19, 576), (4, 10, 10, 1024),
(4, 5, 5, 512), (4, 3, 3, 256),
(4, 2, 2, 256), (4, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
image_height, image_width, depth_multiplier, expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
image_height = 299
image_width = 299
depth_multiplier = 0.5**12
expected_feature_map_shape = [(4, 19, 19, 128), (4, 10, 10, 128),
(4, 5, 5, 32), (4, 3, 3, 32),
(4, 2, 2, 32), (4, 1, 1, 32)]
self.check_extract_features_returns_correct_shape(
image_height, image_width, depth_multiplier, expected_feature_map_shape)
def test_extract_features_raises_error_with_invalid_image_size(self):
image_height = 32
image_width = 32
depth_multiplier = 1.0
self.check_extract_features_raises_error_with_invalid_image_size(
image_height, image_width, depth_multiplier)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
test_image = np.random.rand(4, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self):
depth_multiplier = 1
scope_name = 'InceptionV2'
self.check_feature_extractor_variables_under_scope(depth_multiplier,
scope_name)
if __name__ == '__main__':
tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for MobilenetV1 features."""
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from nets import mobilenet_v1
slim = tf.contrib.slim
class SSDMobileNetV1FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""SSD Feature Extractor using MobilenetV1 features."""
def __init__(self,
depth_multiplier,
min_depth,
conv_hyperparams,
reuse_weights=None):
"""MobileNetV1 Feature Extractor for SSD Models.
Args:
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
reuse_weights: Whether to reuse variables. Default is None.
"""
super(SSDMobileNetV1FeatureExtractor, self).__init__(
depth_multiplier, min_depth, conv_hyperparams, reuse_weights)
def preprocess(self, resized_inputs):
"""SSD preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
def extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
preprocessed_inputs.get_shape().assert_has_rank(4)
shape_assert = tf.Assert(
tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
['image size must be at least 33 in both height and width.'])
feature_map_layout = {
'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
'', ''],
'layer_depth': [-1, -1, 512, 256, 256, 128],
}
with tf.control_dependencies([shape_assert]):
with slim.arg_scope(self._conv_hyperparams):
with tf.variable_scope('MobilenetV1',
reuse=self._reuse_weights) as scope:
_, image_features = mobilenet_v1.mobilenet_v1_base(
preprocessed_inputs,
final_endpoint='Conv2d_13_pointwise',
min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier,
scope=scope)
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=self._depth_multiplier,
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
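As a side note, the preprocess method above rescales pixel intensities with (2.0 / 255.0) * x - 1.0, producing the [-1, 1] range the MobilenetV1 backbone expects. A minimal sketch (illustration only, not part of this change) checking the endpoints of that mapping:

import numpy as np

pixels = np.array([0.0, 127.5, 255.0])
print((2.0 / 255.0) * pixels - 1.0)  # [-1.  0.  1.]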
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ssd_mobilenet_v1_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_mobilenet_v1_feature_extractor
class SsdMobilenetV1FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase, tf.test.TestCase):
def _create_feature_extractor(self, depth_multiplier):
"""Constructs a new feature extractor.
Args:
depth_multiplier: float depth multiplier for feature extractor
Returns:
an ssd_meta_arch.SSDFeatureExtractor object.
"""
min_depth = 32
conv_hyperparams = {}
return ssd_mobilenet_v1_feature_extractor.SSDMobileNetV1FeatureExtractor(
depth_multiplier, min_depth, conv_hyperparams)
def test_extract_features_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
expected_feature_map_shape = [(4, 8, 8, 512), (4, 4, 4, 1024),
(4, 2, 2, 512), (4, 1, 1, 256),
(4, 1, 1, 256), (4, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
image_height, image_width, depth_multiplier, expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_299(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
expected_feature_map_shape = [(4, 19, 19, 512), (4, 10, 10, 1024),
(4, 5, 5, 512), (4, 3, 3, 256),
(4, 2, 2, 256), (4, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
image_height, image_width, depth_multiplier, expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
image_height = 299
image_width = 299
depth_multiplier = 0.5**12
expected_feature_map_shape = [(4, 19, 19, 32), (4, 10, 10, 32),
(4, 5, 5, 32), (4, 3, 3, 32),
(4, 2, 2, 32), (4, 1, 1, 32)]
self.check_extract_features_returns_correct_shape(
image_height, image_width, depth_multiplier, expected_feature_map_shape)
def test_extract_features_raises_error_with_invalid_image_size(self):
image_height = 32
image_width = 32
depth_multiplier = 1.0
self.check_extract_features_raises_error_with_invalid_image_size(
image_height, image_width, depth_multiplier)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
test_image = np.random.rand(4, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self):
depth_multiplier = 1
scope_name = 'MobilenetV1'
self.check_feature_extractor_variables_under_scope(depth_multiplier,
scope_name)
if __name__ == '__main__':
tf.test.main()
include "devtools/blueprint/ncl/blueprint_file.ncl";
include "releasetools/rapid/ncl/rapid_config.ncl";
blueprint_file = ::blueprint::BlueprintFile(
project_name = "open_tf_object_detection",
project_grouping = ["Search", "Search Features", "Image Search", "Visual Search"],
mdb_groups = ["vale-project"],
tech_lead = ["jonathanhuang", "kpmurphy"],
dev_mailing_list = "object-detection-reviews@google.com",
buganizer_component_ids = [163596],
owned_code_depotpaths = [
"//depot/google3/third_party/tensorflow_models/object_detection/...",
],
buildable_units = [
::blueprint::BuildableUnit(
name = "open_tf_object_detection.fastbuild",
enable_continuous_build = true,
enable_release = false,
continuous_build_email = ::blueprint::ContinuousBuildEmailInfo(
build_cop_email_addrs = ["vale-project+tap@google.com"]),
build_patterns = [
"third_party/tensorflow_models/object_detection/...",
],
build_flags = [
"--compilation_mode=fastbuild",
],
test_patterns = [
"third_party/tensorflow_models/object_detection/...",
],
enable_coverage = true,
),
::blueprint::BuildableUnit(
name = "open_tf_object_detection.opt",
enable_continuous_build = true,
enable_release = false,
continuous_build_email = ::blueprint::ContinuousBuildEmailInfo(
build_cop_email_addrs = ["vale-project+tap@google.com"]),
build_patterns = [
"third_party/tensorflow_models/object_detection/...",
"image/understanding/object_detection/...",
],
build_flags = [
"--compilation_mode=opt",
],
test_patterns = [
"third_party/tensorflow_models/object_detection/...",
"image/understanding/object_detection/...",
],
),
],
);
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Object Detection Demo\n",
"Welcome to the object detection inference walkthrough! This notebook will walk you step by step through the process of using a pre-trained model to detect objects in an image."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
"import sys\n",
"import tensorflow as tf\n",
"\n",
"from collections import defaultdict\n",
"from io import StringIO\n",
"from matplotlib import pyplot as plt\n",
"from PIL import Image"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Env setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# This is needed to display the images.\n",
"%matplotlib inline\n",
"\n",
"# This is needed since the notebook is stored in the object_detection folder.\n",
"sys.path.append(\"..\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Object detection imports\n",
"Here are the imports from the object detection module."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from utils import label_map_util\n",
"\n",
"from utils import visualization_utils as vis_util"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model preparation "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Path to frozen detection graph. This is the actual model that is used for the object detection.\n",
"PATH_TO_CKPT = os.path.join('test_ckpt', 'ssd_inception_v2.pb')\n",
"\n",
"# List of the strings that is used to add correct label for each box.\n",
"PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')\n",
"\n",
"NUM_CLASSES = 90"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load a (frozen) Tensorflow model into memory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"detection_graph = tf.Graph()\n",
"with detection_graph.as_default():\n",
" od_graph_def = tf.GraphDef()\n",
" with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:\n",
" serialized_graph = fid.read()\n",
" od_graph_def.ParseFromString(serialized_graph)\n",
" tf.import_graph_def(od_graph_def, name='')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading label map\n",
"Label maps map indices to category names, so that when our convolution network predicts `5`, we know that this corresponds to `airplane`. Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"label_map = label_map_util.load_labelmap(PATH_TO_LABELS)\n",
"categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)\n",
"category_index = label_map_util.create_category_index(categories)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Helper code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def load_image_into_numpy_array(image):\n",
" (im_width, im_height) = image.size\n",
" return np.array(image.getdata()).reshape(\n",
" (im_height, im_width, 3)).astype(np.uint8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# For the sake of simplicity we will use only 2 images:\n",
"# image1.jpg\n",
"# image2.jpg\n",
"# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.\n",
"PATH_TO_TEST_IMAGES_DIR = 'test_images'\n",
"TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(1, 3) ]\n",
"\n",
"# Size, in inches, of the output images.\n",
"IMAGE_SIZE = (12, 8)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with detection_graph.as_default():\n",
" with tf.Session(graph=detection_graph) as sess:\n",
" for image_path in TEST_IMAGE_PATHS:\n",
" image = Image.open(image_path)\n",
" # the array based representation of the image will be used later in order to prepare the\n",
" # result image with boxes and labels on it.\n",
" image_np = load_image_into_numpy_array(image)\n",
" # Expand dimensions since the model expects images to have shape: [1, None, None, 3]\n",
" image_np_expanded = np.expand_dims(image_np, axis=0)\n",
" image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')\n",
" # Each box represents a part of the image where a particular object was detected.\n",
" boxes = detection_graph.get_tensor_by_name('detection_boxes:0')\n",
" # Each score represent how level of confidence for each of the objects.\n",
" # Score is shown on the result image, together with the class label.\n",
" scores = detection_graph.get_tensor_by_name('detection_scores:0')\n",
" classes = detection_graph.get_tensor_by_name('detection_classes:0')\n",
" num_detections = detection_graph.get_tensor_by_name('num_detections:0')\n",
" # Actual detection.\n",
" (boxes, scores, classes, num_detections) = sess.run(\n",
" [boxes, scores, classes, num_detections],\n",
" feed_dict={image_tensor: image_np_expanded})\n",
" # Visualization of the results of a detection.\n",
" vis_util.visualize_boxes_and_labels_on_image_array(\n",
" image_np,\n",
" np.squeeze(boxes),\n",
" np.squeeze(classes).astype(np.int32),\n",
" np.squeeze(scores),\n",
" category_index,\n",
" use_normalized_coordinates=True)\n",
" plt.figure(figsize=IMAGE_SIZE)\n",
" plt.imshow(image_np)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# TensorFlow Object Detection API: Configuration protos.
package(
default_visibility = ["//visibility:public"],
)
licenses(["notice"])
proto_library(
name = "argmax_matcher_proto",
srcs = ["argmax_matcher.proto"],
)
py_proto_library(
name = "argmax_matcher_py_pb2",
api_version = 2,
deps = [":argmax_matcher_proto"],
)
proto_library(
name = "bipartite_matcher_proto",
srcs = ["bipartite_matcher.proto"],
)
py_proto_library(
name = "bipartite_matcher_py_pb2",
api_version = 2,
deps = [":bipartite_matcher_proto"],
)
proto_library(
name = "matcher_proto",
srcs = ["matcher.proto"],
deps = [
":argmax_matcher_proto",
":bipartite_matcher_proto",
],
)
py_proto_library(
name = "matcher_py_pb2",
api_version = 2,
deps = [":matcher_proto"],
)
proto_library(
name = "faster_rcnn_box_coder_proto",
srcs = ["faster_rcnn_box_coder.proto"],
)
py_proto_library(
name = "faster_rcnn_box_coder_py_pb2",
api_version = 2,
deps = [":faster_rcnn_box_coder_proto"],
)
proto_library(
name = "mean_stddev_box_coder_proto",
srcs = ["mean_stddev_box_coder.proto"],
)
py_proto_library(
name = "mean_stddev_box_coder_py_pb2",
api_version = 2,
deps = [":mean_stddev_box_coder_proto"],
)
proto_library(
name = "square_box_coder_proto",
srcs = ["square_box_coder.proto"],
)
py_proto_library(
name = "square_box_coder_py_pb2",
api_version = 2,
deps = [":square_box_coder_proto"],
)
proto_library(
name = "box_coder_proto",
srcs = ["box_coder.proto"],
deps = [
":faster_rcnn_box_coder_proto",
":mean_stddev_box_coder_proto",
":square_box_coder_proto",
],
)
py_proto_library(
name = "box_coder_py_pb2",
api_version = 2,
deps = [":box_coder_proto"],
)
proto_library(
name = "grid_anchor_generator_proto",
srcs = ["grid_anchor_generator.proto"],
)
py_proto_library(
name = "grid_anchor_generator_py_pb2",
api_version = 2,
deps = [":grid_anchor_generator_proto"],
)
proto_library(
name = "ssd_anchor_generator_proto",
srcs = ["ssd_anchor_generator.proto"],
)
py_proto_library(
name = "ssd_anchor_generator_py_pb2",
api_version = 2,
deps = [":ssd_anchor_generator_proto"],
)
proto_library(
name = "anchor_generator_proto",
srcs = ["anchor_generator.proto"],
deps = [
":grid_anchor_generator_proto",
":ssd_anchor_generator_proto",
],
)
py_proto_library(
name = "anchor_generator_py_pb2",
api_version = 2,
deps = [":anchor_generator_proto"],
)
proto_library(
name = "input_reader_proto",
srcs = ["input_reader.proto"],
)
py_proto_library(
name = "input_reader_py_pb2",
api_version = 2,
deps = [":input_reader_proto"],
)
proto_library(
name = "losses_proto",
srcs = ["losses.proto"],
)
py_proto_library(
name = "losses_py_pb2",
api_version = 2,
deps = [":losses_proto"],
)
proto_library(
name = "optimizer_proto",
srcs = ["optimizer.proto"],
)
py_proto_library(
name = "optimizer_py_pb2",
api_version = 2,
deps = [":optimizer_proto"],
)
proto_library(
name = "post_processing_proto",
srcs = ["post_processing.proto"],
)
py_proto_library(
name = "post_processing_py_pb2",
api_version = 2,
deps = [":post_processing_proto"],
)
proto_library(
name = "hyperparams_proto",
srcs = ["hyperparams.proto"],
)
py_proto_library(
name = "hyperparams_py_pb2",
api_version = 2,
deps = [":hyperparams_proto"],
)
proto_library(
name = "box_predictor_proto",
srcs = ["box_predictor.proto"],
deps = [":hyperparams_proto"],
)
py_proto_library(
name = "box_predictor_py_pb2",
api_version = 2,
deps = [":box_predictor_proto"],
)
proto_library(
name = "region_similarity_calculator_proto",
srcs = ["region_similarity_calculator.proto"],
deps = [],
)
py_proto_library(
name = "region_similarity_calculator_py_pb2",
api_version = 2,
deps = [":region_similarity_calculator_proto"],
)
proto_library(
name = "preprocessor_proto",
srcs = ["preprocessor.proto"],
)
py_proto_library(
name = "preprocessor_py_pb2",
api_version = 2,
deps = [":preprocessor_proto"],
)
proto_library(
name = "train_proto",
srcs = ["train.proto"],
deps = [
":optimizer_proto",
":preprocessor_proto",
],
)
py_proto_library(
name = "train_py_pb2",
api_version = 2,
deps = [":train_proto"],
)
proto_library(
name = "eval_proto",
srcs = ["eval.proto"],
)
py_proto_library(
name = "eval_py_pb2",
api_version = 2,
deps = [":eval_proto"],
)
proto_library(
name = "image_resizer_proto",
srcs = ["image_resizer.proto"],
)
py_proto_library(
name = "image_resizer_py_pb2",
api_version = 2,
deps = [":image_resizer_proto"],
)
proto_library(
name = "faster_rcnn_proto",
srcs = ["faster_rcnn.proto"],
deps = [
":box_predictor_proto",
"//object_detection/protos:anchor_generator_proto",
"//object_detection/protos:hyperparams_proto",
"//object_detection/protos:image_resizer_proto",
"//object_detection/protos:losses_proto",
"//object_detection/protos:post_processing_proto",
],
)
proto_library(
name = "ssd_proto",
srcs = ["ssd.proto"],
deps = [
":anchor_generator_proto",
":box_coder_proto",
":box_predictor_proto",
":hyperparams_proto",
":image_resizer_proto",
":losses_proto",
":matcher_proto",
":post_processing_proto",
":region_similarity_calculator_proto",
],
)
proto_library(
name = "model_proto",
srcs = ["model.proto"],
deps = [
":faster_rcnn_proto",
":ssd_proto",
],
)
py_proto_library(
name = "model_py_pb2",
api_version = 2,
deps = [":model_proto"],
)
proto_library(
name = "pipeline_proto",
srcs = ["pipeline.proto"],
deps = [
":eval_proto",
":input_reader_proto",
":model_proto",
":train_proto",
],
)
py_proto_library(
name = "pipeline_py_pb2",
api_version = 2,
deps = [":pipeline_proto"],
)
proto_library(
name = "string_int_label_map_proto",
srcs = ["string_int_label_map.proto"],
)
py_proto_library(
name = "string_int_label_map_py_pb2",
api_version = 2,
deps = [":string_int_label_map_proto"],
)
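Each py_proto_library target above generates a Python module for its proto. A minimal sketch, assuming the generated module is importable as object_detection.protos.eval_pb2 (module name inferred from the eval_py_pb2 target, not confirmed by this change), of parsing a text-format config into one of the messages defined further down:

from google.protobuf import text_format
from object_detection.protos import eval_pb2  # assumed module name

eval_config = eval_pb2.EvalConfig()
text_format.Merge('num_examples: 2000 metrics_set: "pascal_voc_metrics"', eval_config)
print(eval_config.num_examples)  # 2000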
syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/grid_anchor_generator.proto";
import "object_detection/protos/ssd_anchor_generator.proto";
// Configuration proto for the anchor generator to use in the object detection
// pipeline. See core/anchor_generator.py for details.
message AnchorGenerator {
oneof anchor_generator_oneof {
GridAnchorGenerator grid_anchor_generator = 1;
SsdAnchorGenerator ssd_anchor_generator = 2;
}
}
syntax = "proto2";
package object_detection.protos;
// Configuration proto for ArgMaxMatcher. See
// matchers/argmax_matcher.py for details.
message ArgMaxMatcher {
// Threshold for positive matches.
optional float matched_threshold = 1 [default = 0.5];
// Threshold for negative matches.
optional float unmatched_threshold = 2 [default = 0.5];
// Whether to construct ArgMaxMatcher without thresholds.
optional bool ignore_thresholds = 3 [default = false];
// If True then negative matches are the ones below the unmatched_threshold,
// whereas ignored matches are in between the matched and unmatched
// threshold. If False, then negative matches are in between the matched
// and unmatched threshold, and everything lower than unmatched is ignored.
optional bool negatives_lower_than_unmatched = 4 [default = true];
// Whether to ensure each row is matched to at least one column.
optional bool force_match_for_each_row = 5 [default = false];
}
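To make the threshold semantics above concrete, a minimal sketch (not the actual matchers/argmax_matcher.py code) of how matched_threshold and unmatched_threshold partition IOU values when negatives_lower_than_unmatched is true:

def classify_iou(iou, matched_threshold=0.5, unmatched_threshold=0.5):
    # Positive at or above matched_threshold, negative below unmatched_threshold,
    # ignored in between (only possible when the two thresholds differ).
    if iou >= matched_threshold:
        return 'positive'
    if iou < unmatched_threshold:
        return 'negative'
    return 'ignored'

for iou in (0.3, 0.45, 0.7):
    print(iou, classify_iou(iou, matched_threshold=0.6, unmatched_threshold=0.4))
# 0.3 negative, 0.45 ignored, 0.7 positive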
syntax = "proto2";
package object_detection.protos;
// Configuration proto for bipartite matcher. See
// matchers/bipartite_matcher.py for details.
message BipartiteMatcher {
}
syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/faster_rcnn_box_coder.proto";
import "object_detection/protos/mean_stddev_box_coder.proto";
import "object_detection/protos/square_box_coder.proto";
// Configuration proto for the box coder to be used in the object detection
// pipeline. See core/box_coder.py for details.
message BoxCoder {
oneof box_coder_oneof {
FasterRcnnBoxCoder faster_rcnn_box_coder = 1;
MeanStddevBoxCoder mean_stddev_box_coder = 2;
SquareBoxCoder square_box_coder = 3;
}
}
syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/hyperparams.proto";
// Configuration proto for box predictor. See core/box_predictor.py for details.
message BoxPredictor {
oneof box_predictor_oneof {
ConvolutionalBoxPredictor convolutional_box_predictor = 1;
MaskRCNNBoxPredictor mask_rcnn_box_predictor = 2;
RfcnBoxPredictor rfcn_box_predictor = 3;
}
}
// Configuration proto for Convolutional box predictor.
message ConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
// Minimum feature depth prior to predicting box encodings and class
// predictions.
optional int32 min_depth = 2 [default = 0];
// Maximum feature depth prior to predicting box encodings and class
// predictions. If max_depth is set to 0, no additional feature map will be
// inserted before location and class predictions.
optional int32 max_depth = 3 [default = 0];
// Number of additional conv layers before the predictor.
optional int32 num_layers_before_predictor = 4 [default = 0];
// Whether to use dropout for class prediction.
optional bool use_dropout = 5 [default = true];
// Keep probability for dropout
optional float dropout_keep_probability = 6 [default = 0.8];
// Size of final convolution kernel. If the spatial resolution of the feature
// map is smaller than the kernel size, then the kernel size is set to
// min(feature_width, feature_height).
optional int32 kernel_size = 7 [default = 1];
// Size of the encoding for boxes.
optional int32 box_code_size = 8 [default = 4];
// Whether to apply sigmoid to the output of class predictions.
// TODO: Do we need this since we have a post-processing module?
optional bool apply_sigmoid_to_scores = 9 [default = false];
}
message MaskRCNNBoxPredictor {
// Hyperparameters for fully connected ops used in the box predictor.
optional Hyperparams fc_hyperparams = 1;
// Whether to use a dropout op prior to both the box and class predictions.
optional bool use_dropout = 2 [default= false];
// Keep probability for dropout. This is only used if use_dropout is true.
optional float dropout_keep_probability = 3 [default = 0.5];
// Size of the encoding for the boxes.
optional int32 box_code_size = 4 [default = 4];
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 5;
// Whether to predict instance masks inside detection boxes.
optional bool predict_instance_masks = 6 [default = false];
// The depth for the first conv2d_transpose op applied to the
// image_features in the mask prediction branch.
optional int32 mask_prediction_conv_depth = 7 [default = 256];
// Whether to predict keypoints inside detection boxes.
optional bool predict_keypoints = 8 [default = false];
}
message RfcnBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
// Bin sizes for RFCN crops.
optional int32 num_spatial_bins_height = 2 [default = 3];
optional int32 num_spatial_bins_width = 3 [default = 3];
// Target depth to reduce the input image features to.
optional int32 depth = 4 [default=1024];
// Size of the encoding for the boxes.
optional int32 box_code_size = 5 [default = 4];
// Size to resize the rfcn crops to.
optional int32 crop_height = 6 [default= 12];
optional int32 crop_width = 7 [default=12];
}
syntax = "proto2";
package object_detection.protos;
// Message for configuring DetectionModel evaluation jobs (eval.py).
message EvalConfig {
// Number of visualization images to generate.
optional uint32 num_visualizations = 1 [default=10];
// Number of examples to process for evaluation.
optional uint32 num_examples = 2 [default=5000];
// How often to run evaluation.
optional uint32 eval_interval_secs = 3 [default=300];
// Maximum number of times to run evaluation. If set to 0, will run forever.
optional uint32 max_evals = 4 [default=0];
// Whether the TensorFlow graph used for evaluation should be saved to disk.
optional bool save_graph = 5 [default=false];
// Path to directory to store visualizations in. If empty, visualization
// images are not exported (only shown on TensorBoard).
optional string visualization_export_dir = 6 [default=""];
// BNS name of the TensorFlow master.
optional string eval_master = 7 [default=""];
// Type of metrics to use for evaluation. Currently supports only Pascal VOC
// detection metrics.
optional string metrics_set = 8 [default="pascal_voc_metrics"];
// Path to export detections to COCO compatible JSON format.
optional string export_path = 9 [default=''];
// Option to not read groundtruth labels and only export detections to
// COCO-compatible JSON file.
optional bool ignore_groundtruth = 10 [default=false];
// Use exponential moving averages of variables for evaluation.
// TODO: When this is false make sure the model is constructed
// without moving averages in restore_fn.
optional bool use_moving_averages = 11 [default=false];
// Whether to evaluate instance masks.
optional bool eval_instance_masks = 12 [default=false];
}
syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/anchor_generator.proto";
import "object_detection/protos/box_predictor.proto";
import "object_detection/protos/hyperparams.proto";
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/post_processing.proto";
// Configuration for Faster R-CNN models.
// See meta_architectures/faster_rcnn_meta_arch.py and models/model_builder.py
//
// Naming conventions:
// Faster R-CNN models have two stages: a first stage region proposal network
// (or RPN) and a second stage box classifier. We thus use the prefixes
// `first_stage_` and `second_stage_` to indicate the stage to which each
// parameter pertains when relevant.
message FasterRcnn {
// Whether to construct only the Region Proposal Network (RPN).
optional bool first_stage_only = 1 [default=false];
// Number of classes to predict.
optional int32 num_classes = 3;
// Image resizer for preprocessing the input image.
optional ImageResizer image_resizer = 4;
// Feature extractor config.
optional FasterRcnnFeatureExtractor feature_extractor = 5;
// (First stage) region proposal network (RPN) parameters.
// Anchor generator to compute RPN anchors.
optional AnchorGenerator first_stage_anchor_generator = 6;
// Atrous rate for the convolution op applied to the
// `first_stage_features_to_crop` tensor to obtain box predictions.
optional int32 first_stage_atrous_rate = 7 [default=1];
// Hyperparameters for the convolutional RPN box predictor.
optional Hyperparams first_stage_box_predictor_conv_hyperparams = 8;
// Kernel size to use for the convolution op just prior to RPN box
// predictions.
optional int32 first_stage_box_predictor_kernel_size = 9 [default=3];
// Output depth for the convolution op just prior to RPN box predictions.
optional int32 first_stage_box_predictor_depth = 10 [default=512];
// The batch size to use for computing the first stage objectness and
// location losses.
optional int32 first_stage_minibatch_size = 11 [default=256];
// Fraction of positive examples per image for the RPN.
optional float first_stage_positive_balance_fraction = 12 [default=0.5];
// Non max suppression score threshold applied to first stage RPN proposals.
optional float first_stage_nms_score_threshold = 13 [default=0.0];
// Non max suppression IOU threshold applied to first stage RPN proposals.
optional float first_stage_nms_iou_threshold = 14 [default=0.7];
// Maximum number of RPN proposals retained after first stage postprocessing.
optional int32 first_stage_max_proposals = 15 [default=300];
// First stage RPN localization loss weight.
optional float first_stage_localization_loss_weight = 16 [default=1.0];
// First stage RPN objectness loss weight.
optional float first_stage_objectness_loss_weight = 17 [default=1.0];
// Per-region cropping parameters.
// Note that if an R-FCN model is constructed, the per-region cropping
// parameters below are ignored.
// Output size (width and height are set to be the same) of the initial
// bilinear interpolation based cropping during ROI pooling.
optional int32 initial_crop_size = 18;
// Kernel size of the max pool op on the cropped feature map during
// ROI pooling.
optional int32 maxpool_kernel_size = 19;
// Stride of the max pool op on the cropped feature map during ROI pooling.
optional int32 maxpool_stride = 20;
// (Second stage) box classifier parameters
// Hyperparameters for the second stage box predictor. If box predictor type
// is set to rfcn_box_predictor, an R-FCN model is constructed, otherwise a
// Faster R-CNN model is constructed.
optional BoxPredictor second_stage_box_predictor = 21;
// The batch size per image used for computing the classification and refined
// location loss of the box classifier.
// Note that this field is ignored if `hard_example_miner` is configured.
optional int32 second_stage_batch_size = 22 [default=64];
// Fraction of positive examples to use per image for the box classifier.
optional float second_stage_balance_fraction = 23 [default=0.25];
// Post processing to apply on the second stage box classifier predictions.
// Note: the `score_converter` provided to the FasterRCNNMetaArch constructor
// is taken from this `second_stage_post_processing` proto.
optional PostProcessing second_stage_post_processing = 24;
// Second stage refined localization loss weight.
optional float second_stage_localization_loss_weight = 25 [default=1.0];
// Second stage classification loss weight
optional float second_stage_classification_loss_weight = 26 [default=1.0];
// If not left to default, applies hard example mining.
optional HardExampleMiner hard_example_miner = 27;
}
message FasterRcnnFeatureExtractor {
// Type of Faster R-CNN model (e.g., 'faster_rcnn_resnet101';
// see models/model_builder.py for expected types).
optional string type = 1;
// Output stride of extracted RPN feature map.
optional int32 first_stage_features_stride = 2 [default=16];
}
syntax = "proto2";
package object_detection.protos;
// Configuration proto for FasterRCNNBoxCoder. See
// box_coders/faster_rcnn_box_coder.py for details.
message FasterRcnnBoxCoder {
// Scale factor for anchor encoded box center.
optional float y_scale = 1 [default = 10.0];
optional float x_scale = 2 [default = 10.0];
// Scale factor for anchor encoded box height.
optional float height_scale = 3 [default = 5.0];
// Scale factor for anchor encoded box width.
optional float width_scale = 4 [default = 5.0];
}
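These scale factors multiply the standard Faster R-CNN box encoding. A minimal sketch of that encoding under the defaults (the real implementation is box_coders/faster_rcnn_box_coder.py, not this snippet):

import math

def encode(box, anchor, y_scale=10.0, x_scale=10.0, height_scale=5.0, width_scale=5.0):
    # Boxes and anchors are given as (ycenter, xcenter, height, width).
    ycenter, xcenter, h, w = box
    ycenter_a, xcenter_a, ha, wa = anchor
    ty = y_scale * (ycenter - ycenter_a) / ha
    tx = x_scale * (xcenter - xcenter_a) / wa
    th = height_scale * math.log(h / ha)
    tw = width_scale * math.log(w / wa)
    return ty, tx, th, tw

print(encode((0.55, 0.5, 0.2, 0.2), (0.5, 0.5, 0.2, 0.2)))  # (2.5, 0.0, 0.0, 0.0)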
syntax = "proto2";
package object_detection.protos;
// Configuration proto for GridAnchorGenerator. See
// anchor_generators/grid_anchor_generator.py for details.
message GridAnchorGenerator {
// Anchor height in pixels.
optional int32 height = 1 [default = 256];
// Anchor width in pixels.
optional int32 width = 2 [default = 256];
// Anchor stride in height dimension in pixels.
optional int32 height_stride = 3 [default = 16];
// Anchor stride in width dimension in pixels.
optional int32 width_stride = 4 [default = 16];
// Anchor height offset in pixels.
optional int32 height_offset = 5 [default = 0];
// Anchor width offset in pixels.
optional int32 width_offset = 6 [default = 0];
// At any given location, len(scales) * len(aspect_ratios) anchors are
// generated with all possible combinations of scales and aspect ratios.
// List of scales for the anchors.
repeated float scales = 7;
// List of aspect ratios for the anchors.
repeated float aspect_ratios = 8;
}
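To illustrate the scales/aspect_ratios combination described above, a minimal sketch (assumed formula, not the code in anchor_generators/grid_anchor_generator.py) enumerating per-location anchor sizes from the 256x256 base anchor:

import math

def anchor_sizes(scales, aspect_ratios, base_height=256, base_width=256):
    # aspect_ratio is treated as width / height; each scale keeps the base area
    # scaled by scale**2 while the ratio reshapes it.
    sizes = []
    for scale in scales:
        for ratio in aspect_ratios:
            sizes.append((scale * base_height / math.sqrt(ratio),
                          scale * base_width * math.sqrt(ratio)))
    return sizes

print(anchor_sizes(scales=[0.5, 1.0], aspect_ratios=[0.5, 1.0, 2.0]))
# 2 scales x 3 aspect ratios = 6 anchors at every grid location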
syntax = "proto2";
package object_detection.protos;
// Configuration proto for the convolution op hyperparameters to use in the
// object detection pipeline.
message Hyperparams {
// Operations affected by hyperparameters.
enum Op {
// Convolution, Separable Convolution, Convolution transpose.
CONV = 1;
// Fully connected
FC = 2;
}
optional Op op = 1 [default = CONV];
// Regularizer for the weights of the convolution op.
optional Regularizer regularizer = 2;
// Initializer for the weights of the convolution op.
optional Initializer initializer = 3;
// Type of activation to apply after convolution.
enum Activation {
// Use None (no activation)
NONE = 0;
// Use tf.nn.relu
RELU = 1;
// Use tf.nn.relu6
RELU_6 = 2;
}
optional Activation activation = 4 [default = RELU];
// BatchNorm hyperparameters. If this parameter is NOT set then BatchNorm is
// not applied!
optional BatchNorm batch_norm = 5;
}
// Proto with one-of field for regularizers.
message Regularizer {
oneof regularizer_oneof {
L1Regularizer l1_regularizer = 1;
L2Regularizer l2_regularizer = 2;
}
}
// Configuration proto for L1 Regularizer.
// See https://www.tensorflow.org/api_docs/python/tf/contrib/layers/l1_regularizer
message L1Regularizer {
optional float weight = 1 [default = 1.0];
}
// Configuration proto for L2 Regularizer.
// See https://www.tensorflow.org/api_docs/python/tf/contrib/layers/l2_regularizer
message L2Regularizer {
optional float weight = 1 [default = 1.0];
}
// Proto with one-of field for initializers.
message Initializer {
oneof initializer_oneof {
TruncatedNormalInitializer truncated_normal_initializer = 1;
VarianceScalingInitializer variance_scaling_initializer = 2;
}
}
// Configuration proto for truncated normal initializer. See
// https://www.tensorflow.org/api_docs/python/tf/truncated_normal_initializer
message TruncatedNormalInitializer {
optional float mean = 1 [default = 0.0];
optional float stddev = 2 [default = 1.0];
}
// Configuration proto for variance scaling initializer. See
// https://www.tensorflow.org/api_docs/python/tf/contrib/layers/
// variance_scaling_initializer
message VarianceScalingInitializer {
optional float factor = 1 [default = 2.0];
optional bool uniform = 2 [default = false];
enum Mode {
FAN_IN = 0;
FAN_OUT = 1;
FAN_AVG = 2;
}
optional Mode mode = 3 [default = FAN_IN];
}
// Configuration proto for batch norm to apply after convolution op. See
// https://www.tensorflow.org/api_docs/python/tf/contrib/layers/batch_norm
message BatchNorm {
optional float decay = 1 [default = 0.999];
optional bool center = 2 [default = true];
optional bool scale = 3 [default = false];
optional float epsilon = 4 [default = 0.001];
// Whether to train the batch norm variables. If this is set to false during
// training, the current values of the batch_norm variables are used for the
// forward pass but they are never updated.
optional bool train = 5 [default = true];
}
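The Hyperparams message is what ultimately becomes the conv_hyperparams arg_scope used by the feature extractors earlier in this change. A minimal sketch, assuming a CONV op with an L2 regularizer, truncated normal initializer and RELU activation, of roughly how such a message could map onto tf-slim (the actual conversion is done by the hyperparams builder, not shown here):

import tensorflow as tf

slim = tf.contrib.slim

def build_conv_hyperparams(l2_weight=1.0, stddev=1.0):
    # Returns a reusable scope dict; it can later be re-entered with
    # slim.arg_scope(build_conv_hyperparams()), which is how the feature
    # extractors above consume self._conv_hyperparams.
    with slim.arg_scope(
        [slim.conv2d, slim.separable_conv2d],
        weights_regularizer=slim.l2_regularizer(l2_weight),
        weights_initializer=tf.truncated_normal_initializer(stddev=stddev),
        activation_fn=tf.nn.relu) as sc:
        return sc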
syntax = "proto2";
package object_detection.protos;
// Configuration proto for image resizing operations.
// See builders/image_resizer_builder.py for details.
message ImageResizer {
oneof image_resizer_oneof {
KeepAspectRatioResizer keep_aspect_ratio_resizer = 1;
FixedShapeResizer fixed_shape_resizer = 2;
}
}
// Configuration proto for image resizer that keeps aspect ratio.
message KeepAspectRatioResizer {
// Desired size of the smaller image dimension in pixels.
optional int32 min_dimension = 1 [default = 600];
// Desired size of the larger image dimension in pixels.
optional int32 max_dimension = 2 [default = 1024];
}
// Configuration proto for image resizer that resizes to a fixed shape.
message FixedShapeResizer {
// Desired height of image in pixels.
optional int32 height = 1 [default = 300];
// Desired width of image in pixels.
optional int32 width = 2 [default = 300];
}
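A minimal sketch of the output size implied by KeepAspectRatioResizer: the smaller side is scaled to min_dimension unless that would push the larger side past max_dimension (illustration only; the TensorFlow resize op is built in builders/image_resizer_builder.py):

def keep_aspect_ratio_size(height, width, min_dimension=600, max_dimension=1024):
    scale = min_dimension / float(min(height, width))
    if scale * max(height, width) > max_dimension:
        scale = max_dimension / float(max(height, width))
    return int(round(height * scale)), int(round(width * scale))

print(keep_aspect_ratio_size(480, 640))   # (600, 800): smaller side reaches 600
print(keep_aspect_ratio_size(480, 1280))  # (384, 1024): larger side capped at 1024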
syntax = "proto2";
package object_detection.protos;
// Configuration proto for defining input readers that generate Object Detection
// Examples from input sources. Input readers are expected to generate a
// dictionary of tensors, with the following fields populated:
//
// 'image': an [image_height, image_width, channels] image tensor that detection
// will be run on.
// 'groundtruth_classes': a [num_boxes] int32 tensor storing the class
// labels of the groundtruth boxes in the image.
// 'groundtruth_boxes': a [num_boxes, 4] float tensor storing the coordinates of
// the groundtruth boxes in the image.
// 'groundtruth_instance_masks': (Optional), a [num_boxes, image_height,
// image_width] float tensor storing binary mask of the objects in boxes.
message InputReader {
// Path to StringIntLabelMap pbtxt file specifying the mapping from string
// labels to integer ids.
optional string label_map_path = 1 [default=""];
// Whether data should be processed in the order it is read in, or
// shuffled randomly.
optional bool shuffle = 2 [default=true];
// Maximum number of records to keep in reader queue.
optional uint32 queue_capacity = 3 [default=2000];
// Minimum number of records to keep in reader queue. A large value is needed
// to generate a good random shuffle.
optional uint32 min_after_dequeue = 4 [default=1000];
// The number of times a data source is read. If set to zero, the data source
// will be reused indefinitely.
optional uint32 num_epochs = 5 [default=0];
// Number of reader instances to create.
optional uint32 num_readers = 6 [default=8];
// Whether to load groundtruth instance masks.
optional bool load_instance_masks = 7 [default = false];
oneof input_reader {
TFRecordInputReader tf_record_input_reader = 8;
ExternalInputReader external_input_reader = 9;
}
}
// An input reader that reads TF Example protos from local TFRecord files.
message TFRecordInputReader {
// Path to TFRecordFile.
optional string input_path = 1 [default=""];
}
// An externally defined input reader. Users may define an extension to this
// proto to interface their own input readers.
message ExternalInputReader {
extensions 1 to 999;
}
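For illustration only, a dummy instance of the tensor dictionary described in the header comment above, for a single example with two groundtruth boxes (all values are placeholders):

import numpy as np

example = {
    'image': np.zeros((480, 640, 3), dtype=np.uint8),
    'groundtruth_classes': np.array([1, 18], dtype=np.int32),
    'groundtruth_boxes': np.array([[0.1, 0.1, 0.5, 0.5],
                                   [0.2, 0.6, 0.9, 0.95]], dtype=np.float32),
    # Only present when load_instance_masks is true.
    'groundtruth_instance_masks': np.zeros((2, 480, 640), dtype=np.float32),
}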
syntax = "proto2";
package object_detection.protos;
// Message for configuring the localization loss, classification loss and hard
// example miner used for training object detection models. See core/losses.py
// for details
message Loss {
// Localization loss to use.
optional LocalizationLoss localization_loss = 1;
// Classification loss to use.
optional ClassificationLoss classification_loss = 2;
// If not left to default, applies hard example mining.
optional HardExampleMiner hard_example_miner = 3;
// Classification loss weight.
optional float classification_weight = 4 [default=1.0];
// Localization loss weight.
optional float localization_weight = 5 [default=1.0];
}
// Configuration for bounding box localization loss function.
message LocalizationLoss {
oneof localization_loss {
WeightedL2LocalizationLoss weighted_l2 = 1;
WeightedSmoothL1LocalizationLoss weighted_smooth_l1 = 2;
WeightedIOULocalizationLoss weighted_iou = 3;
}
}
// L2 location loss: 0.5 * ||weight * (a - b)|| ^ 2
message WeightedL2LocalizationLoss {
// Output loss per anchor.
optional bool anchorwise_output = 1 [default=false];
}
// SmoothL1 (Huber) location loss: .5 * x ^ 2 if |x| < 1 else |x| - .5
message WeightedSmoothL1LocalizationLoss {
// Output loss per anchor.
optional bool anchorwise_output = 1 [default=false];
}
// Intersection over union location loss: 1 - IOU
message WeightedIOULocalizationLoss {
}
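A minimal numpy sketch of the two weighted localization loss formulas quoted above (the batched, anchorwise implementations live in core/losses.py):

import numpy as np

def weighted_l2(a, b, weight=1.0):
    # 0.5 * ||weight * (a - b)|| ^ 2
    return 0.5 * np.sum((weight * (a - b)) ** 2)

def smooth_l1(a, b):
    # .5 * x ^ 2 if |x| < 1 else |x| - .5, summed over coordinates
    x = np.abs(a - b)
    return np.sum(np.where(x < 1.0, 0.5 * x ** 2, x - 0.5))

a, b = np.array([0.0, 2.0]), np.array([0.5, 0.0])
print(weighted_l2(a, b))  # 2.125
print(smooth_l1(a, b))    # 1.625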
// Configuration for class prediction loss function.
message ClassificationLoss {
oneof classification_loss {
WeightedSigmoidClassificationLoss weighted_sigmoid = 1;
WeightedSoftmaxClassificationLoss weighted_softmax = 2;
BootstrappedSigmoidClassificationLoss bootstrapped_sigmoid = 3;
}
}
// Classification loss using a sigmoid function over class predictions.
message WeightedSigmoidClassificationLoss {
// Output loss per anchor.
optional bool anchorwise_output = 1 [default=false];
}
// Classification loss using a softmax function over class predictions.
message WeightedSoftmaxClassificationLoss {
// Output loss per anchor.
optional bool anchorwise_output = 1 [default=false];
}
// Classification loss using a sigmoid function over the class prediction with
// the highest prediction score.
message BootstrappedSigmoidClassificationLoss {
// Interpolation weight between 0 and 1.
optional float alpha = 1;
// Whether hard bootstrapping should be used or not. If true, will only use
// the one class favored by the model. Otherwise, will use all predicted class
// probabilities.
optional bool hard_bootstrap = 2 [default=false];
// Output loss per anchor.
optional bool anchorwise_output = 3 [default=false];
}
// Configuration for the hard example miner.
message HardExampleMiner {
// Maximum number of hard examples to be selected per image (prior to
// enforcing max negative to positive ratio constraint). If set to 0,
// all examples obtained after NMS are considered.
optional int32 num_hard_examples = 1 [default=64];
// Minimum intersection over union for an example to be discarded during NMS.
optional float iou_threshold = 2 [default=0.7];
// Whether to use classification losses ('cls', default), localization losses
// ('loc') or both losses ('both'). In the case of 'both', cls_loss_weight and
// loc_loss_weight are used to compute weighted sum of the two losses.
enum LossType {
BOTH = 0;
CLASSIFICATION = 1;
LOCALIZATION = 2;
}
optional LossType loss_type = 3 [default=BOTH];
// Maximum number of negatives to retain for each positive anchor. If
// num_negatives_per_positive is 0 no prespecified negative:positive ratio is
// enforced.
optional int32 max_negatives_per_positive = 4 [default=0];
// Minimum number of negative anchors to sample for a given image. Setting
// this to a positive number allows sampling negatives in an image without any
// positive anchors, and thus does not bias the model towards having at least
// one detection per image.
optional int32 min_negatives_per_image = 5 [default=0];
}
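A minimal sketch of the negative:positive ratio constraint described above, applied to examples already sorted by decreasing loss (illustration only; the miner in core/losses.py additionally applies NMS, num_hard_examples and the loss_type weighting):

def enforce_ratio(sorted_examples, max_negatives_per_positive=3,
                  min_negatives_per_image=0):
    # sorted_examples: list of (is_negative, loss), highest loss first.
    # A max_negatives_per_positive of 0 would mean no ratio is enforced
    # (not handled in this sketch).
    num_pos, num_neg, kept = 0, 0, []
    for is_negative, loss in sorted_examples:
        if is_negative:
            allowed = max(min_negatives_per_image,
                          max_negatives_per_positive * num_pos)
            if num_neg >= allowed:
                continue
            num_neg += 1
        else:
            num_pos += 1
        kept.append((is_negative, loss))
    return kept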