Commit 27b4acd4 authored by Aman Gupta's avatar Aman Gupta
Browse files

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD MobilenetV2 FPN Feature Extractor."""
import copy
import functools
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import context_manager
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets.mobilenet import mobilenet
from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim
# A modified config of mobilenet v2 that makes it more detection friendly.
def _create_modified_mobilenet_config():
  """Creates a detection-friendly copy of the MobileNet v2 conv defs.

  Replaces the final op in the MobileNet v2 spec with a 256-channel 1x1
  convolution (instead of the default wide final layer), which is more
  suitable as an FPN backbone endpoint.

  Returns:
    A modified copy of `mobilenet_v2.V2_DEF`.
  """
  # Use deepcopy: copy.copy is shallow, so mutating conv_defs['spec'][-1]
  # below would clobber the shared mobilenet_v2.V2_DEF for all other users.
  conv_defs = copy.deepcopy(mobilenet_v2.V2_DEF)
  conv_defs['spec'][-1] = mobilenet.op(
      slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256)
  return conv_defs


# Built once at import time; used by extract_features when use_depthwise=True.
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 FPN features.

  Runs a MobileNet v2 base network up to 'layer_19', then builds a feature
  pyramid (FPN) on top of selected backbone endpoints, optionally extending
  it with extra stride-2 convolutions for coarser levels.
  """

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams_fn,
               fpn_min_level=3,
               fpn_max_level=7,
               additional_layer_depth=256,
               reuse_weights=None,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False):
    """SSD FPN feature extractor based on Mobilenet v2 architecture.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
        and separable_conv2d ops in the layers that are added on top of the
        base feature extractor.
      fpn_min_level: the highest resolution feature map to use in FPN. The
        valid values are {2, 3, 4, 5} which map to MobileNet v2 layers
        {layer_4, layer_7, layer_14, layer_19}, respectively.
      fpn_max_level: the smallest resolution feature map to construct or use in
        FPN. FPN constructions uses features maps starting from fpn_min_level
        upto the fpn_max_level. In the case that there are not enough feature
        maps in the backbone network, additional feature maps are created by
        applying stride 2 convolutions until we get the desired number of fpn
        levels.
      additional_layer_depth: additional feature map layer channel depth
        (applied through depth_multiplier/min_depth before use).
      reuse_weights: whether to reuse variables. Default is None.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
        Also switches the backbone to the modified conv defs (_CONV_DEFS).
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams_fn`.
    """
    super(SSDMobileNetV2FpnFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams_fn=conv_hyperparams_fn,
        reuse_weights=reuse_weights,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams)
    # Pyramid level range requested by the caller; levels beyond the backbone
    # (> 5) are synthesized with stride-2 convs in extract_features.
    self._fpn_min_level = fpn_min_level
    self._fpn_max_level = fpn_max_level
    self._additional_layer_depth = additional_layer_depth

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values from [0, 255] to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    return (2.0 / 255.0) * resized_inputs - 1.0

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i], ordered from fpn_min_level up to
        fpn_max_level.
    """
    # MobileNet v2 requires at least 33x33 inputs (enforced here).
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      # is_training=None keeps batch-norm training mode controlled by the
      # outer graph; bn_decay matches the value used for detection training.
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        # Optionally override the backbone's hyperparams with the SSD ones;
        # otherwise the identity context manager leaves them untouched.
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              depth_multiplier=self._depth_multiplier,
              conv_defs=_CONV_DEFS if self._use_depthwise else None,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      # Channel depth for FPN layers, scaled and clamped like the backbone.
      depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope('fpn', reuse=self._reuse_weights):
          # Backbone endpoints for pyramid levels 2..5 respectively.
          feature_blocks = [
              'layer_4', 'layer_7', 'layer_14', 'layer_19'
          ]
          # The backbone only provides endpoints up to level 5; higher levels
          # are built below from the coarsest FPN output.
          base_fpn_max_level = min(self._fpn_max_level, 5)
          feature_block_list = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_block_list.append(feature_blocks[level - 2])
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key]) for key in feature_block_list],
              depth=depth_fn(self._additional_layer_depth),
              use_depthwise=self._use_depthwise)
          feature_maps = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_maps.append(fpn_features['top_down_{}'.format(
                feature_blocks[level - 2])])
          last_feature_map = fpn_features['top_down_{}'.format(
              feature_blocks[base_fpn_max_level - 2])]
          # Construct coarse features (levels above what the backbone offers)
          # by repeatedly halving resolution with stride-2 convolutions.
          for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
            if self._use_depthwise:
              conv_op = functools.partial(
                  slim.separable_conv2d, depth_multiplier=1)
            else:
              conv_op = slim.conv2d
            last_feature_map = conv_op(
                last_feature_map,
                num_outputs=depth_fn(self._additional_layer_depth),
                kernel_size=[3, 3],
                stride=2,
                padding='SAME',
                # Scope numbering continues after the 19 backbone layers.
                scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
            feature_maps.append(last_feature_map)
    return feature_maps
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ssd_mobilenet_v2_fpn_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_mobilenet_v2_fpn_feature_extractor
slim = tf.contrib.slim
class SsdMobilenetV2FpnFeatureExtractorTest(
    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
  """Tests for SSDMobileNetV2FpnFeatureExtractor."""

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, use_explicit_padding=False):
    """Constructs a new feature extractor.

    Args:
      depth_multiplier: float depth multiplier for feature extractor
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      is_training: whether the network is in training mode.
      use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
        inputs so that the output dimensions are the same as if 'SAME' padding
        were used.

    Returns:
      an ssd_meta_arch.SSDFeatureExtractor object.
    """
    min_depth = 32
    return (ssd_mobilenet_v2_fpn_feature_extractor.
            SSDMobileNetV2FpnFeatureExtractor(
                is_training,
                depth_multiplier,
                min_depth,
                pad_to_multiple,
                self.conv_hyperparams_fn,
                use_explicit_padding=use_explicit_padding))

  def test_extract_features_returns_correct_shapes_256(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
                                  (2, 8, 8, 256), (2, 4, 4, 256),
                                  (2, 2, 2, 256)]
    # Exercise both padding modes; shapes must agree in either case.
    for use_explicit_padding in (False, True):
      self.check_extract_features_returns_correct_shape(
          2, image_height, image_width, depth_multiplier, pad_to_multiple,
          expected_feature_map_shape,
          use_explicit_padding=use_explicit_padding)

  def test_extract_features_returns_correct_shapes_320(self):
    # NOTE: renamed from ..._384 — the method actually tests 320x320 inputs.
    image_height = 320
    image_width = 320
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 40, 40, 256), (2, 20, 20, 256),
                                  (2, 10, 10, 256), (2, 5, 5, 256),
                                  (2, 3, 3, 256)]
    for use_explicit_padding in (False, True):
      self.check_extract_features_returns_correct_shape(
          2, image_height, image_width, depth_multiplier, pad_to_multiple,
          expected_feature_map_shape,
          use_explicit_padding=use_explicit_padding)

  def test_extract_features_with_dynamic_image_shape(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
                                  (2, 8, 8, 256), (2, 4, 4, 256),
                                  (2, 2, 2, 256)]
    for use_explicit_padding in (False, True):
      self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
          2, image_height, image_width, depth_multiplier, pad_to_multiple,
          expected_feature_map_shape,
          use_explicit_padding=use_explicit_padding)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
    # 299 is not a multiple of 32, so inputs get zero-padded up to 320.
    image_height = 299
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 32
    expected_feature_map_shape = [(2, 40, 40, 256), (2, 20, 20, 256),
                                  (2, 10, 10, 256), (2, 5, 5, 256),
                                  (2, 3, 3, 256)]
    for use_explicit_padding in (False, True):
      self.check_extract_features_returns_correct_shape(
          2, image_height, image_width, depth_multiplier, pad_to_multiple,
          expected_feature_map_shape,
          use_explicit_padding=use_explicit_padding)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
    # A vanishingly small depth multiplier forces all layers to min_depth=32.
    image_height = 256
    image_width = 256
    depth_multiplier = 0.5**12
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 32), (2, 16, 16, 32),
                                  (2, 8, 8, 32), (2, 4, 4, 32),
                                  (2, 2, 2, 32)]
    for use_explicit_padding in (False, True):
      self.check_extract_features_returns_correct_shape(
          2, image_height, image_width, depth_multiplier, pad_to_multiple,
          expected_feature_map_shape,
          use_explicit_padding=use_explicit_padding)

  def test_extract_features_raises_error_with_invalid_image_size(self):
    # 32x32 is below the backbone's 33-pixel minimum dimension.
    image_height = 32
    image_width = 32
    depth_multiplier = 1.0
    pad_to_multiple = 1
    self.check_extract_features_raises_error_with_invalid_image_size(
        image_height, image_width, depth_multiplier, pad_to_multiple)

  def test_preprocess_returns_correct_value_range(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    test_image = np.random.rand(2, image_height, image_width, 3)
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(test_image)
    # Preprocessing maps [0, 255] into [-1, 1].
    self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))

  def test_variables_only_created_in_scope(self):
    depth_multiplier = 1
    pad_to_multiple = 1
    scope_name = 'MobilenetV2'
    self.check_feature_extractor_variables_under_scope(
        depth_multiplier, pad_to_multiple, scope_name)

  def test_fused_batchnorm(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    image_placeholder = tf.placeholder(tf.float32,
                                       [1, image_height, image_width, 3])
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(image_placeholder)
    _ = feature_extractor.extract_features(preprocessed_image)
    # The extractor should produce fused batch-norm ops for performance.
    self.assertTrue(
        any(op.type == 'FusedBatchNorm'
            for op in tf.get_default_graph().get_operations()))

  def test_get_expected_feature_map_variable_names(self):
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_maps_variables = set([
        # Mobilenet V2 feature maps
        'MobilenetV2/expanded_conv_4/depthwise/depthwise_weights',
        'MobilenetV2/expanded_conv_7/depthwise/depthwise_weights',
        'MobilenetV2/expanded_conv_14/depthwise/depthwise_weights',
        'MobilenetV2/Conv_1/weights',
        # FPN layers
        'MobilenetV2/fpn/bottom_up_Conv2d_20/weights',
        'MobilenetV2/fpn/bottom_up_Conv2d_21/weights',
        'MobilenetV2/fpn/smoothing_1/weights',
        'MobilenetV2/fpn/smoothing_2/weights',
        'MobilenetV2/fpn/projection_1/weights',
        'MobilenetV2/fpn/projection_2/weights',
        'MobilenetV2/fpn/projection_3/weights',
    ])
    g = tf.Graph()
    with g.as_default():
      preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
      feature_extractor = self._create_feature_extractor(
          depth_multiplier, pad_to_multiple)
      feature_extractor.extract_features(preprocessed_inputs)
      actual_variable_set = set([
          var.op.name for var in g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
      ])
      # Every expected variable must exist in the graph (extras are allowed).
      variable_intersection = expected_feature_maps_variables.intersection(
          actual_variable_set)
      self.assertSetEqual(expected_feature_maps_variables,
                          variable_intersection)
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for MobilenetV2 features."""
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.models.keras_applications import mobilenet_v2
from object_detection.utils import ops
from object_detection.utils import shape_utils
class SSDMobileNetV2KerasFeatureExtractor(
    ssd_meta_arch.SSDKerasFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False,
               name=None):
    """MobileNetV2 Feature Extractor for SSD Models.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor (Functions
        as a width multiplier for the mobilenet_v2 network itself).
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams: `hyperparams_builder.KerasLayerHyperparams` object
        containing convolution hyperparameters for the layers added on top of
        the base feature extractor.
      freeze_batchnorm: Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      inplace_batchnorm_update: Whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams`.
      name: A string name scope to assign to the model. If 'None', Keras
        will auto-generate one from the class name.
    """
    super(SSDMobileNetV2KerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams,
        name=name)
    # Layout of the multi-resolution feature maps built on top of the
    # backbone: two backbone endpoints plus four extra downsampling layers.
    feature_map_layout = {
        'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_depthwise': self._use_depthwise,
        'use_explicit_padding': self._use_explicit_padding,
    }
    with tf.name_scope('MobilenetV2'):
      full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
          batchnorm_training=(is_training and not freeze_batchnorm),
          conv_hyperparams=(conv_hyperparams
                            if self._override_base_feature_extractor_hyperparams
                            else None),
          weights=None,
          use_explicit_padding=use_explicit_padding,
          alpha=self._depth_multiplier,
          min_depth=self._min_depth,
          include_top=False)
      # Keras layer names for the two endpoints used as feature sources:
      # 'block_13_expand_relu' corresponds to layer_15/expansion_output and
      # 'out_relu' to layer_19 in the slim naming scheme.
      conv2d_11_pointwise = full_mobilenet_v2.get_layer(
          name='block_13_expand_relu').output
      conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
      # Truncated backbone producing only the two endpoints needed above.
      self.mobilenet_v2 = tf.keras.Model(
          inputs=full_mobilenet_v2.inputs,
          outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
      self.feature_map_generator = (
          feature_map_generators.KerasMultiResolutionFeatureMaps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              is_training=is_training,
              conv_hyperparams=conv_hyperparams,
              freeze_batchnorm=freeze_batchnorm,
              name='FeatureMaps'))

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values from [0, 255] to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    return (2.0 / 255.0) * resized_inputs - 1.0

  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    image_features = self.mobilenet_v2(
        ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple))
    feature_maps = self.feature_map_generator({
        'layer_15/expansion_output': image_features[0],
        'layer_19': image_features[1]})
    # Materialize as a list: in Python 3, dict.values() is a view that does
    # not support indexing, but callers index into the returned feature maps.
    return list(feature_maps.values())
...@@ -43,6 +43,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -43,6 +43,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_scope_name, fpn_scope_name,
fpn_min_level=3, fpn_min_level=3,
fpn_max_level=7, fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None, reuse_weights=None,
use_explicit_padding=False, use_explicit_padding=False,
use_depthwise=False, use_depthwise=False,
...@@ -72,6 +73,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -72,6 +73,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
maps in the backbone network, additional feature maps are created by maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of fpn applying stride 2 convolutions until we get the desired number of fpn
levels. levels.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None. reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently. features. Default is False. UNUSED currently.
...@@ -104,6 +106,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -104,6 +106,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_scope_name = fpn_scope_name self._fpn_scope_name = fpn_scope_name
self._fpn_min_level = fpn_min_level self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth
def preprocess(self, resized_inputs): def preprocess(self, resized_inputs):
"""SSD preprocessing. """SSD preprocessing.
...@@ -177,7 +180,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -177,7 +180,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
feature_block_list.append('block{}'.format(level - 1)) feature_block_list.append('block{}'.format(level - 1))
fpn_features = feature_map_generators.fpn_top_down_feature_maps( fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list], [(key, image_features[key]) for key in feature_block_list],
depth=256) depth=self._additional_layer_depth)
feature_maps = [] feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1): for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append( feature_maps.append(
...@@ -188,7 +191,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -188,7 +191,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
for i in range(base_fpn_max_level, self._fpn_max_level): for i in range(base_fpn_max_level, self._fpn_max_level):
last_feature_map = slim.conv2d( last_feature_map = slim.conv2d(
last_feature_map, last_feature_map,
num_outputs=256, num_outputs=self._additional_layer_depth,
kernel_size=[3, 3], kernel_size=[3, 3],
stride=2, stride=2,
padding='SAME', padding='SAME',
...@@ -208,6 +211,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -208,6 +211,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn, conv_hyperparams_fn,
fpn_min_level=3, fpn_min_level=3,
fpn_max_level=7, fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None, reuse_weights=None,
use_explicit_padding=False, use_explicit_padding=False,
use_depthwise=False, use_depthwise=False,
...@@ -226,6 +230,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -226,6 +230,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor. base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks. fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks. fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None. reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently. features. Default is False. UNUSED currently.
...@@ -245,6 +250,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -245,6 +250,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn', 'fpn',
fpn_min_level, fpn_min_level,
fpn_max_level, fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights, reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding, use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise, use_depthwise=use_depthwise,
...@@ -263,6 +269,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -263,6 +269,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn, conv_hyperparams_fn,
fpn_min_level=3, fpn_min_level=3,
fpn_max_level=7, fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None, reuse_weights=None,
use_explicit_padding=False, use_explicit_padding=False,
use_depthwise=False, use_depthwise=False,
...@@ -281,6 +288,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -281,6 +288,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor. base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks. fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks. fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None. reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently. features. Default is False. UNUSED currently.
...@@ -300,6 +308,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -300,6 +308,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn', 'fpn',
fpn_min_level, fpn_min_level,
fpn_max_level, fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights, reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding, use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise, use_depthwise=use_depthwise,
...@@ -318,6 +327,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -318,6 +327,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn, conv_hyperparams_fn,
fpn_min_level=3, fpn_min_level=3,
fpn_max_level=7, fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None, reuse_weights=None,
use_explicit_padding=False, use_explicit_padding=False,
use_depthwise=False, use_depthwise=False,
...@@ -336,6 +346,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -336,6 +346,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor. base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks. fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks. fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None. reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently. features. Default is False. UNUSED currently.
...@@ -355,6 +366,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): ...@@ -355,6 +366,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn', 'fpn',
fpn_min_level, fpn_min_level,
fpn_max_level, fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights, reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding, use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise, use_depthwise=use_depthwise,
......
...@@ -36,7 +36,6 @@ ...@@ -36,7 +36,6 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"from distutils.version import StrictVersion\n",
"import numpy as np\n", "import numpy as np\n",
"import os\n", "import os\n",
"import six.moves.urllib as urllib\n", "import six.moves.urllib as urllib\n",
...@@ -45,6 +44,7 @@ ...@@ -45,6 +44,7 @@
"import tensorflow as tf\n", "import tensorflow as tf\n",
"import zipfile\n", "import zipfile\n",
"\n", "\n",
"from distutils.version import StrictVersion\n",
"from collections import defaultdict\n", "from collections import defaultdict\n",
"from io import StringIO\n", "from io import StringIO\n",
"from matplotlib import pyplot as plt\n", "from matplotlib import pyplot as plt\n",
...@@ -166,9 +166,7 @@ ...@@ -166,9 +166,7 @@
"PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'\n", "PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'\n",
"\n", "\n",
"# List of the strings that is used to add correct label for each box.\n", "# List of the strings that is used to add correct label for each box.\n",
"PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')\n", "PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')"
"\n",
"NUM_CLASSES = 90"
] ]
}, },
{ {
...@@ -265,9 +263,7 @@ ...@@ -265,9 +263,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"label_map = label_map_util.load_labelmap(PATH_TO_LABELS)\n", "category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)"
"categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)\n",
"category_index = label_map_util.create_category_index(categories)"
] ]
}, },
{ {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# ============================================================================== # ==============================================================================
"""Convolutional Box Predictors with and without weight sharing.""" """Convolutional Box Predictors with and without weight sharing."""
import functools
import tensorflow as tf import tensorflow as tf
from object_detection.core import box_predictor from object_detection.core import box_predictor
from object_detection.utils import static_shape from object_detection.utils import static_shape
...@@ -163,7 +164,7 @@ class ConvolutionalBoxPredictor(box_predictor.BoxPredictor): ...@@ -163,7 +164,7 @@ class ConvolutionalBoxPredictor(box_predictor.BoxPredictor):
else: else:
head_obj = self._other_heads[head_name] head_obj = self._other_heads[head_name]
prediction = head_obj.predict( prediction = head_obj.predict(
features=image_feature, features=net,
num_predictions_per_location=num_predictions_per_location) num_predictions_per_location=num_predictions_per_location)
predictions[head_name].append(prediction) predictions[head_name].append(prediction)
return predictions return predictions
...@@ -203,7 +204,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor): ...@@ -203,7 +204,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
num_layers_before_predictor, num_layers_before_predictor,
kernel_size=3, kernel_size=3,
apply_batch_norm=False, apply_batch_norm=False,
share_prediction_tower=False): share_prediction_tower=False,
use_depthwise=False):
"""Constructor. """Constructor.
Args: Args:
...@@ -226,6 +228,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor): ...@@ -226,6 +228,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
this predictor. this predictor.
share_prediction_tower: Whether to share the multi-layer tower between box share_prediction_tower: Whether to share the multi-layer tower between box
prediction and class prediction heads. prediction and class prediction heads.
use_depthwise: Whether to use depthwise separable conv2d instead of
regular conv2d.
""" """
super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training, super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training,
num_classes) num_classes)
...@@ -238,6 +242,7 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor): ...@@ -238,6 +242,7 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
self._kernel_size = kernel_size self._kernel_size = kernel_size
self._apply_batch_norm = apply_batch_norm self._apply_batch_norm = apply_batch_norm
self._share_prediction_tower = share_prediction_tower self._share_prediction_tower = share_prediction_tower
self._use_depthwise = use_depthwise
@property @property
def num_classes(self): def num_classes(self):
...@@ -270,7 +275,11 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor): ...@@ -270,7 +275,11 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
inserted_layer_counter): inserted_layer_counter):
net = image_feature net = image_feature
for i in range(self._num_layers_before_predictor): for i in range(self._num_layers_before_predictor):
net = slim.conv2d( if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
net = conv_op(
net, net,
self._depth, [self._kernel_size, self._kernel_size], self._depth, [self._kernel_size, self._kernel_size],
stride=1, stride=1,
......
...@@ -234,6 +234,40 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase): ...@@ -234,6 +234,40 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
'BoxPredictor/ClassPredictor/weights']) 'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set) self.assertEqual(expected_variable_set, actual_variable_set)
  def test_no_dangling_outputs(self):
    """Checks that every 'BoxPredictor' op feeds into the consumed outputs.

    Builds a depthwise convolutional box predictor on a placeholder feature
    map, consumes both prediction outputs, then scans the default graph for
    'BoxPredictor' ops whose outputs have no consumers. Only a small
    whitelist of op types is allowed to dangle.
    """
    image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
    conv_box_predictor = (
        box_predictor_builder.build_convolutional_box_predictor(
            is_training=False,
            num_classes=0,
            conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
            min_depth=0,
            max_depth=32,
            num_layers_before_predictor=1,
            dropout_keep_prob=0.8,
            kernel_size=1,
            box_code_size=4,
            use_dropout=True,
            use_depthwise=True))
    box_predictions = conv_box_predictor.predict(
        [image_features], num_predictions_per_location=[5],
        scope='BoxPredictor')
    # Consume both prediction outputs so that only genuinely dangling ops
    # remain without consumers below.
    tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1)
    bad_dangling_ops = []
    # Op types whose unconsumed outputs are expected (presumably variable
    # initialization and constant folding artifacts — see whitelist).
    types_safe_to_dangle = set(['Assign', 'Mul', 'Const'])
    for op in tf.get_default_graph().get_operations():
      # An op "dangles" if it produces no outputs or its first output has
      # no downstream consumers.
      if (not op.outputs) or (not op.outputs[0].consumers()):
        if 'BoxPredictor' in op.name:
          if op.type not in types_safe_to_dangle:
            bad_dangling_ops.append(op)
    self.assertEqual(bad_dangling_ops, [])
class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase): class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
...@@ -545,6 +579,79 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase): ...@@ -545,6 +579,79 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
'ClassPredictor/biases')]) 'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set) self.assertEqual(expected_variable_set, actual_variable_set)
  def test_predictions_multiple_feature_maps_share_weights_with_depthwise(
      self):
    """Verifies the exact trainable-variable set with use_depthwise=True.

    Runs the weight-shared predictor over two feature maps and asserts that
    exactly one shared set of depthwise/pointwise weights is created per
    tower layer and head (i.e. no per-feature-map duplicates).
    """
    num_classes_without_background = 6
    def graph_fn(image_features1, image_features2):
      conv_box_predictor = (
          box_predictor_builder.build_weight_shared_convolutional_box_predictor(
              is_training=False,
              num_classes=num_classes_without_background,
              conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
              depth=32,
              num_layers_before_predictor=2,
              box_code_size=4,
              apply_batch_norm=False,
              use_depthwise=True))
      box_predictions = conv_box_predictor.predict(
          [image_features1, image_features2],
          num_predictions_per_location=[5, 5],
          scope='BoxPredictor')
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      class_predictions_with_background = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, class_predictions_with_background)
    # Build the graph once so the trainable variables exist, then compare
    # the created variable names against the expected shared set.
    with self.test_session(graph=tf.Graph()):
      graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32),
               tf.random_uniform([4, 16, 16, 3], dtype=tf.float32))
      actual_variable_set = set(
          [var.op.name for var in tf.trainable_variables()])
    expected_variable_set = set([
        # Box prediction tower
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictionTower/conv2d_0/depthwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictionTower/conv2d_0/pointwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictionTower/conv2d_0/biases'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictionTower/conv2d_1/depthwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictionTower/conv2d_1/pointwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictionTower/conv2d_1/biases'),
        # Box prediction head
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictor/depthwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictor/pointwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'BoxPredictor/biases'),
        # Class prediction tower
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictionTower/conv2d_0/depthwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictionTower/conv2d_0/pointwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictionTower/conv2d_0/biases'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictionTower/conv2d_1/depthwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictionTower/conv2d_1/pointwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictionTower/conv2d_1/biases'),
        # Class prediction head
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictor/depthwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictor/pointwise_weights'),
        ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
         'ClassPredictor/biases')])
    self.assertEqual(expected_variable_set, actual_variable_set)
def test_no_batchnorm_params_when_batchnorm_is_not_configured(self): def test_no_batchnorm_params_when_batchnorm_is_not_configured(self):
num_classes_without_background = 6 num_classes_without_background = 6
def graph_fn(image_features1, image_features2): def graph_fn(image_features1, image_features2):
......
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convolutional Box Predictors with and without weight sharing."""
import collections
import tensorflow as tf
from object_detection.core import box_predictor
from object_detection.utils import static_shape
keras = tf.keras.layers
BOX_ENCODINGS = box_predictor.BOX_ENCODINGS
CLASS_PREDICTIONS_WITH_BACKGROUND = (
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND)
MASK_PREDICTIONS = box_predictor.MASK_PREDICTIONS
class _NoopVariableScope(object):
"""A dummy class that does not push any scope."""
def __enter__(self):
return None
def __exit__(self, exc_type, exc_value, traceback):
return False
class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
  """Convolutional Keras Box Predictor.

  Optionally add an intermediate 1x1 convolutional layer after features and
  predict in parallel branches box_encodings and
  class_predictions_with_background.

  Currently this box predictor assumes that predictions are "shared" across
  classes --- that is each anchor makes box predictions which do not depend
  on class.
  """

  def __init__(self,
               is_training,
               num_classes,
               box_prediction_heads,
               class_prediction_heads,
               other_heads,
               conv_hyperparams,
               num_layers_before_predictor,
               min_depth,
               max_depth,
               freeze_batchnorm,
               inplace_batchnorm_update,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: number of classes.  Note that num_classes *does not*
        include the background category, so if groundtruth labels take values
        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
        assigned classification targets can range from {0,... K}).
      box_prediction_heads: A list of heads that predict the boxes.
      class_prediction_heads: A list of heads that predict the classes.
      other_heads: A dictionary mapping head names to lists of convolutional
        heads.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      num_layers_before_predictor: Number of the additional conv layers before
        the predictor.
      min_depth: Minimum feature depth prior to predicting box encodings
        and class predictions.
      max_depth: Maximum feature depth prior to predicting box encodings
        and class predictions. If max_depth is set to 0, no additional
        feature map will be inserted before location and class predictions.
      freeze_batchnorm: Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      inplace_batchnorm_update: Whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.

    Raises:
      ValueError: if min_depth > max_depth.
      ValueError: if the lists of prediction heads have different lengths.
    """
    super(ConvolutionalBoxPredictor, self).__init__(
        is_training, num_classes, freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        name=name)
    if min_depth > max_depth:
      raise ValueError('min_depth should be less than or equal to max_depth')
    # Every feature map must have one head of each kind, so all head lists
    # (box, class, and any extra heads) must be the same length.
    if len(box_prediction_heads) != len(class_prediction_heads):
      raise ValueError('All lists of heads must be the same length.')
    for other_head_list in other_heads.values():
      if len(box_prediction_heads) != len(other_head_list):
        raise ValueError('All lists of heads must be the same length.')

    self._prediction_heads = {
        BOX_ENCODINGS: box_prediction_heads,
        CLASS_PREDICTIONS_WITH_BACKGROUND: class_prediction_heads,
    }
    if other_heads:
      self._prediction_heads.update(other_heads)

    self._conv_hyperparams = conv_hyperparams
    self._min_depth = min_depth
    self._max_depth = max_depth
    self._num_layers_before_predictor = num_layers_before_predictor

    # One shared Sequential stack of pre-head convolutions per feature map;
    # populated lazily in `build` once input shapes are known.
    self._shared_nets = []

  def build(self, input_shapes):
    """Creates the variables of the layer.

    Args:
      input_shapes: A list of shapes, one per input feature map.

    Raises:
      ValueError: if the number of input shapes does not match the number of
        heads this predictor was constructed with.
    """
    if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]):
      # NOTE: fixed missing space between the two message fragments
      # (previously rendered as "...%d heads,but there are %d inputs.").
      raise ValueError('This box predictor was constructed with %d heads, '
                       'but there are %d inputs.' %
                       (len(self._prediction_heads[BOX_ENCODINGS]),
                        len(input_shapes)))
    for stack_index, input_shape in enumerate(input_shapes):
      net = tf.keras.Sequential(name='PreHeadConvolutions_%d' % stack_index)
      self._shared_nets.append(net)

      # Add additional conv layers before the class predictor, clamping the
      # chosen depth to [min_depth, max_depth].
      features_depth = static_shape.get_depth(input_shape)
      depth = max(min(features_depth, self._max_depth), self._min_depth)
      tf.logging.info(
          'depth of additional conv before box predictor: {}'.format(depth))

      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net.add(keras.Conv2D(depth, [1, 1],
                               name='Conv2d_%d_1x1_%d' % (i, depth),
                               padding='SAME',
                               **self._conv_hyperparams.params()))
          # Batch norm runs in inference mode when training is off or
          # batch norm is frozen.
          net.add(self._conv_hyperparams.build_batch_norm(
              training=(self._is_training and not self._freeze_batchnorm),
              name='Conv2d_%d_1x1_%d_norm' % (i, depth)))
          net.add(self._conv_hyperparams.build_activation_layer(
              name='Conv2d_%d_1x1_%d_activation' % (i, depth),
          ))
    self.built = True

  def _predict(self, image_features):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.
    """
    predictions = collections.defaultdict(list)
    for (index, image_feature) in enumerate(image_features):
      # Apply shared conv layers before the head predictors.
      net = self._shared_nets[index](image_feature)
      for head_name in self._prediction_heads:
        head_obj = self._prediction_heads[head_name][index]
        prediction = head_obj(net)
        predictions[head_name].append(prediction)
    return predictions
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.convolutional_keras_box_predictor."""
import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
from object_detection.predictors import convolutional_keras_box_predictor as box_predictor
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasBoxPredictorTest(test_case.TestCase):
  """Tests for the Keras-based convolutional box predictor."""

  def _build_conv_hyperparams(self):
    # Builds a minimal KerasLayerHyperparams object (RELU_6 activation,
    # l2 regularizer, truncated-normal initializer) for the predictor.
    conv_hyperparams = hyperparams_pb2.Hyperparams()
    conv_hyperparams_text_proto = """
      activation: RELU_6
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
    return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)

  def test_get_boxes_for_five_aspect_ratios_per_location(self):
    # 8x8 feature map with 5 predictions/location -> 320 anchors.
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=0,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[5],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      objectness_predictions = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, objectness_predictions)
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                           [image_features])
    self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
    self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])

  def test_get_boxes_for_one_aspect_ratio_per_location(self):
    # 8x8 feature map with 1 prediction/location -> 64 anchors.
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=0,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[1],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      objectness_predictions = tf.concat(box_predictions[
          box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1)
      return (box_encodings, objectness_predictions)
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                           [image_features])
    self.assertAllEqual(box_encodings.shape, [4, 64, 1, 4])
    self.assertAllEqual(objectness_predictions.shape, [4, 64, 1])

  def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
      self):
    # With num_classes > 0 the class dimension is num_classes + background.
    num_classes_without_background = 6
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=num_classes_without_background,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[5],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      class_predictions_with_background = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, class_predictions_with_background)
    (box_encodings,
     class_predictions_with_background) = self.execute(graph_fn,
                                                       [image_features])
    self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
    self.assertAllEqual(class_predictions_with_background.shape,
                        [4, 320, num_classes_without_background+1])

  def test_get_predictions_with_feature_maps_of_dynamic_shape(
      self):
    # Height/width are unknown at graph-build time; also checks the exact
    # set of trainable variable names the Keras predictor creates.
    image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
    conv_box_predictor = (
        box_predictor_builder.build_convolutional_keras_box_predictor(
            is_training=False,
            num_classes=0,
            conv_hyperparams=self._build_conv_hyperparams(),
            freeze_batchnorm=False,
            inplace_batchnorm_update=False,
            num_predictions_per_location_list=[5],
            min_depth=0,
            max_depth=32,
            num_layers_before_predictor=1,
            use_dropout=True,
            dropout_keep_prob=0.8,
            kernel_size=1,
            box_code_size=4
        ))
    box_predictions = conv_box_predictor([image_features])
    box_encodings = tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    objectness_predictions = tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1)
    init_op = tf.global_variables_initializer()

    resolution = 32
    expected_num_anchors = resolution*resolution*5
    with self.test_session() as sess:
      sess.run(init_op)
      (box_encodings_shape,
       objectness_predictions_shape) = sess.run(
           [tf.shape(box_encodings), tf.shape(objectness_predictions)],
           feed_dict={image_features:
                      np.random.rand(4, resolution, resolution, 64)})
      actual_variable_set = set(
          [var.op.name for var in tf.trainable_variables()])
    self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
    self.assertAllEqual(objectness_predictions_shape,
                        [4, expected_num_anchors, 1])
    expected_variable_set = set([
        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/bias',
        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/kernel',
        'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/bias',
        'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/kernel',
        'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/bias',
        'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/kernel'])
    self.assertEqual(expected_variable_set, actual_variable_set)
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
if __name__ == '__main__':
  # Run all test cases in this module via the TensorFlow test runner.
  tf.test.main()
...@@ -19,6 +19,7 @@ Contains Box prediction head classes for different meta architectures. ...@@ -19,6 +19,7 @@ Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a predict function that receives the All the box prediction heads have a predict function that receives the
`features` as the first argument and returns `box_encodings`. `features` as the first argument and returns `box_encodings`.
""" """
import functools
import tensorflow as tf import tensorflow as tf
from object_detection.predictors.heads import head from object_detection.predictors.heads import head
...@@ -196,18 +197,22 @@ class WeightSharedConvolutionalBoxHead(head.Head): ...@@ -196,18 +197,22 @@ class WeightSharedConvolutionalBoxHead(head.Head):
def __init__(self, def __init__(self,
box_code_size, box_code_size,
kernel_size=3, kernel_size=3,
class_prediction_bias_init=0.0): use_depthwise=False,
box_encodings_clip_range=None):
"""Constructor. """Constructor.
Args: Args:
box_code_size: Size of encoding for each box. box_code_size: Size of encoding for each box.
kernel_size: Size of final convolution kernel. kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last use_depthwise: Whether to use depthwise convolutions for prediction steps.
conv2d layer before class prediction. Default is False.
box_encodings_clip_range: Min and max values for clipping box_encodings.
""" """
super(WeightSharedConvolutionalBoxHead, self).__init__() super(WeightSharedConvolutionalBoxHead, self).__init__()
self._box_code_size = box_code_size self._box_code_size = box_code_size
self._kernel_size = kernel_size self._kernel_size = kernel_size
self._use_depthwise = use_depthwise
self._box_encodings_clip_range = box_encodings_clip_range
def predict(self, features, num_predictions_per_location): def predict(self, features, num_predictions_per_location):
"""Predicts boxes. """Predicts boxes.
...@@ -224,7 +229,11 @@ class WeightSharedConvolutionalBoxHead(head.Head): ...@@ -224,7 +229,11 @@ class WeightSharedConvolutionalBoxHead(head.Head):
the objects. the objects.
""" """
box_encodings_net = features box_encodings_net = features
box_encodings = slim.conv2d( if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
box_encodings = conv_op(
box_encodings_net, box_encodings_net,
num_predictions_per_location * self._box_code_size, num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size], [self._kernel_size, self._kernel_size],
...@@ -234,6 +243,11 @@ class WeightSharedConvolutionalBoxHead(head.Head): ...@@ -234,6 +243,11 @@ class WeightSharedConvolutionalBoxHead(head.Head):
batch_size = features.get_shape().as_list()[0] batch_size = features.get_shape().as_list()[0]
if batch_size is None: if batch_size is None:
batch_size = tf.shape(features)[0] batch_size = tf.shape(features)[0]
# Clipping the box encodings to make the inference graph TPU friendly.
if self._box_encodings_clip_range is not None:
box_encodings = tf.clip_by_value(
box_encodings, self._box_encodings_clip_range.min,
self._box_encodings_clip_range.max)
box_encodings = tf.reshape(box_encodings, box_encodings = tf.reshape(box_encodings,
[batch_size, -1, self._box_code_size]) [batch_size, -1, self._box_code_size])
return box_encodings return box_encodings
...@@ -19,6 +19,7 @@ Contains Class prediction head classes for different meta architectures. ...@@ -19,6 +19,7 @@ Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background. `features` as the first argument and returns class predictions with background.
""" """
import functools
import tensorflow as tf import tensorflow as tf
from object_detection.predictors.heads import head from object_detection.predictors.heads import head
...@@ -211,7 +212,9 @@ class WeightSharedConvolutionalClassHead(head.Head): ...@@ -211,7 +212,9 @@ class WeightSharedConvolutionalClassHead(head.Head):
kernel_size=3, kernel_size=3,
class_prediction_bias_init=0.0, class_prediction_bias_init=0.0,
use_dropout=False, use_dropout=False,
dropout_keep_prob=0.8): dropout_keep_prob=0.8,
use_depthwise=False,
score_converter_fn=tf.identity):
"""Constructor. """Constructor.
Args: Args:
...@@ -224,6 +227,10 @@ class WeightSharedConvolutionalClassHead(head.Head): ...@@ -224,6 +227,10 @@ class WeightSharedConvolutionalClassHead(head.Head):
conv2d layer before class prediction. conv2d layer before class prediction.
use_dropout: Whether to apply dropout to class prediction head. use_dropout: Whether to apply dropout to class prediction head.
dropout_keep_prob: Probability of keeping activiations. dropout_keep_prob: Probability of keeping activiations.
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
score_converter_fn: Callable elementwise nonlinearity (that takes tensors
as inputs and returns tensors).
""" """
super(WeightSharedConvolutionalClassHead, self).__init__() super(WeightSharedConvolutionalClassHead, self).__init__()
self._num_classes = num_classes self._num_classes = num_classes
...@@ -231,6 +238,8 @@ class WeightSharedConvolutionalClassHead(head.Head): ...@@ -231,6 +238,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
self._class_prediction_bias_init = class_prediction_bias_init self._class_prediction_bias_init = class_prediction_bias_init
self._use_dropout = use_dropout self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob self._dropout_keep_prob = dropout_keep_prob
self._use_depthwise = use_depthwise
self._score_converter_fn = score_converter_fn
def predict(self, features, num_predictions_per_location): def predict(self, features, num_predictions_per_location):
"""Predicts boxes. """Predicts boxes.
...@@ -252,7 +261,11 @@ class WeightSharedConvolutionalClassHead(head.Head): ...@@ -252,7 +261,11 @@ class WeightSharedConvolutionalClassHead(head.Head):
if self._use_dropout: if self._use_dropout:
class_predictions_net = slim.dropout( class_predictions_net = slim.dropout(
class_predictions_net, keep_prob=self._dropout_keep_prob) class_predictions_net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d( if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
class_predictions_with_background = conv_op(
class_predictions_net, class_predictions_net,
num_predictions_per_location * num_class_slots, num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size], [self._kernel_size, self._kernel_size],
...@@ -264,6 +277,8 @@ class WeightSharedConvolutionalClassHead(head.Head): ...@@ -264,6 +277,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
batch_size = features.get_shape().as_list()[0] batch_size = features.get_shape().as_list()[0]
if batch_size is None: if batch_size is None:
batch_size = tf.shape(features)[0] batch_size = tf.shape(features)[0]
class_predictions_with_background = self._score_converter_fn(
class_predictions_with_background)
class_predictions_with_background = tf.reshape( class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots]) class_predictions_with_background, [batch_size, -1, num_class_slots])
return class_predictions_with_background return class_predictions_with_background
...@@ -36,6 +36,8 @@ Mask RCNN box predictor. ...@@ -36,6 +36,8 @@ Mask RCNN box predictor.
""" """
from abc import abstractmethod from abc import abstractmethod
import tensorflow as tf
class Head(object): class Head(object):
"""Mask RCNN head base class.""" """Mask RCNN head base class."""
...@@ -57,3 +59,23 @@ class Head(object): ...@@ -57,3 +59,23 @@ class Head(object):
A tf.float32 tensor. A tf.float32 tensor.
""" """
pass pass
class KerasHead(tf.keras.Model):
  """Keras head base class.

  Subclasses implement `_predict`; the standard Keras `call` entry point
  simply forwards to it.
  """

  def call(self, features):
    """The Keras model call will delegate to the `_predict` method."""
    return self._predict(features)

  @abstractmethod
  def _predict(self, features):
    """Returns the head's predictions.

    Args:
      features: A float tensor of features.

    Returns:
      A tf.float32 tensor.
    """
    pass
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Box Head.
Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a _predict function that receives the
`features` as the first argument and returns `box_encodings`.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalBoxHead(head.KerasHead):
  """Convolutional box prediction head.

  Runs the incoming feature map through a stack of Keras layers (either a
  single conv, or a depthwise conv + batchnorm + activation + 1x1 conv) and
  reshapes the result into per-anchor box encodings.
  """

  def __init__(self,
               is_training,
               box_code_size,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               use_depthwise=True,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      box_code_size: Size of encoding for each box.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Defaults to True.
        NOTE(review): the sibling class/mask heads default this to False and
        the previous docstring claimed False here as well -- confirm the
        asymmetric default is intentional.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalBoxHead, self).__init__(name=name)
    self._is_training = is_training
    self._box_code_size = box_code_size
    self._kernel_size = kernel_size
    self._num_predictions_per_location = num_predictions_per_location
    self._use_depthwise = use_depthwise
    # Layers are applied sequentially, in list order, by `_predict`.
    self._box_encoder_layers = []
    if self._use_depthwise:
      # Depthwise-separable path: spatial depthwise conv, then batchnorm,
      # activation, and a 1x1 pointwise conv producing the box encodings.
      self._box_encoder_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='BoxEncodingPredictor_depthwise',
              **conv_hyperparams.params()))
      self._box_encoder_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='BoxEncodingPredictor_depthwise_batchnorm'))
      self._box_encoder_layers.append(
          conv_hyperparams.build_activation_layer(
              name='BoxEncodingPredictor_depthwise_activation'))
      self._box_encoder_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._box_code_size, [1, 1],
              name='BoxEncodingPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Single full convolution producing the box encodings directly.
      self._box_encoder_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._box_code_size,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='BoxEncodingPredictor',
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts boxes.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      box_encodings: A float tensor of shape
        [batch_size, num_anchors, q, code_size] representing the location of
        the objects; q is always 1 for this head (the reshape below uses a
        literal 1, i.e. encodings are shared across classes).
    """
    box_encodings = features
    for layer in self._box_encoder_layers:
      box_encodings = layer(box_encodings)
    batch_size = features.get_shape().as_list()[0]
    # Fall back to the dynamic batch dimension when the static one is unknown.
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    box_encodings = tf.reshape(box_encodings,
                               [batch_size, -1, 1, self._box_code_size])
    return box_encodings
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.box_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_box_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasBoxHeadTest(test_case.TestCase):
  """Output-shape tests for the Keras convolutional box head."""

  def _build_conv_hyperparams(self):
    """Builds minimal Keras conv hyperparams from a text proto."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_proto = """
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(text_proto, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def test_prediction_size_depthwise_false(self):
    box_head = keras_box_head.ConvolutionalBoxHead(
        is_training=True,
        box_code_size=4,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    encodings = box_head(features)
    # 17 * 19 spatial locations * 1 prediction each = 323 anchors.
    self.assertAllEqual([64, 323, 1, 4], encodings.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Script entry point: runs the test suite above.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class Head.
Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalClassHead(head.KerasHead):
  """Convolutional class prediction head.

  Runs the incoming feature map through optional dropout and a stack of conv
  layers, then reshapes the result into per-anchor class predictions
  (including one extra slot for the background class).
  """

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               class_prediction_bias_init=0.0,
               use_depthwise=False,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes.
      use_dropout: Option to use dropout or not. Note that a single dropout
        op is applied here prior to both box and class predictions, which
        stands in contrast to ConvolutionalBoxPredictor.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      class_prediction_bias_init: constant value to initialize bias of the last
        conv2d layer before class prediction.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalClassHead, self).__init__(name=name)
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._class_prediction_bias_init = class_prediction_bias_init
    self._use_depthwise = use_depthwise
    # One extra slot for the background class.
    self._num_class_slots = self._num_classes + 1
    # Layers are applied sequentially, in list order, by `_predict`.
    self._class_predictor_layers = []
    if self._use_dropout:
      self._class_predictor_layers.append(
          # The Dropout layer's `training` parameter for the call method must
          # be set implicitly by the Keras set_learning_phase. The object
          # detection training code takes care of this.
          tf.keras.layers.Dropout(rate=1.0 - self._dropout_keep_prob))
    if self._use_depthwise:
      # Depthwise-separable path: spatial depthwise conv, then batchnorm,
      # activation, and a 1x1 pointwise conv producing the class scores.
      self._class_predictor_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='ClassPredictor_depthwise',
              **conv_hyperparams.params()))
      self._class_predictor_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='ClassPredictor_depthwise_batchnorm'))
      self._class_predictor_layers.append(
          conv_hyperparams.build_activation_layer(
              name='ClassPredictor_depthwise_activation'))
      self._class_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._num_class_slots, [1, 1],
              name='ClassPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Single full convolution producing the class scores directly; the bias
      # initializer lets callers skew initial background/foreground scores.
      self._class_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._num_class_slots,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='ClassPredictor',
              bias_initializer=tf.constant_initializer(
                  self._class_prediction_bias_init),
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts class scores (including a background slot) per anchor.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      class_predictions_with_background: A float tensor of shape
        [batch_size, num_anchors, num_classes + 1] representing the class
        predictions for the proposals.
    """
    class_predictions_with_background = features
    for layer in self._class_predictor_layers:
      class_predictions_with_background = layer(
          class_predictions_with_background)
    batch_size = features.get_shape().as_list()[0]
    # Fall back to the dynamic batch dimension when the static one is unknown.
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [batch_size, -1, self._num_class_slots])
    return class_predictions_with_background
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.class_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_class_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
  """Output-shape tests for the Keras convolutional class head."""

  def _build_conv_hyperparams(self):
    """Builds minimal Keras conv hyperparams from a text proto."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_proto = """
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(text_proto, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def test_prediction_size_depthwise_false(self):
    class_head = keras_class_head.ConvolutionalClassHead(
        is_training=True,
        num_classes=20,
        use_dropout=True,
        dropout_keep_prob=0.5,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = class_head(features)
    # 17 * 19 locations = 323 anchors; 20 classes + 1 background slot = 21.
    self.assertAllEqual([64, 323, 21],
                        predictions.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Script entry point: runs the test suite above.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras Mask Heads.
Contains Mask prediction head classes for different meta architectures.
All the mask prediction heads have a predict function that receives the
`features` as the first argument and returns `mask_predictions`.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalMaskHead(head.KerasHead):
  """Convolutional mask prediction head.

  Runs the incoming feature map through optional dropout and a stack of conv
  layers, then reshapes the result into per-anchor instance-mask logits of
  shape [mask_height, mask_width] for each mask channel.
  """

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               use_depthwise=False,
               mask_height=7,
               mask_width=7,
               masks_are_class_agnostic=False,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes.
      use_dropout: Option to use dropout or not. Note that a single dropout
        op is applied here prior to both box and class predictions, which
        stands in contrast to ConvolutionalBoxPredictor.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      mask_height: Desired output mask height. The default value is 7.
      mask_width: Desired output mask width. The default value is 7.
      masks_are_class_agnostic: Boolean determining if the mask-head is
        class-agnostic or not.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalMaskHead, self).__init__(name=name)
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._num_predictions_per_location = num_predictions_per_location
    self._use_depthwise = use_depthwise
    self._mask_height = mask_height
    self._mask_width = mask_width
    self._masks_are_class_agnostic = masks_are_class_agnostic
    # Layers are applied sequentially, in list order, by `_predict`.
    self._mask_predictor_layers = []
    # A class-agnostic head predicts a single mask shared by all classes;
    # otherwise one mask per class is predicted.
    if self._masks_are_class_agnostic:
      self._num_masks = 1
    else:
      self._num_masks = self._num_classes
    # Flattened channel count: each mask contributes height * width logits.
    num_mask_channels = self._num_masks * self._mask_height * self._mask_width
    if self._use_dropout:
      self._mask_predictor_layers.append(
          # The Dropout layer's `training` parameter for the call method must
          # be set implicitly by the Keras set_learning_phase. The object
          # detection training code takes care of this.
          tf.keras.layers.Dropout(rate=1.0 - self._dropout_keep_prob))
    if self._use_depthwise:
      # Depthwise-separable path: spatial depthwise conv, then batchnorm,
      # activation, and a 1x1 pointwise conv producing the mask logits.
      self._mask_predictor_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='MaskPredictor_depthwise',
              **conv_hyperparams.params()))
      self._mask_predictor_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='MaskPredictor_depthwise_batchnorm'))
      self._mask_predictor_layers.append(
          conv_hyperparams.build_activation_layer(
              name='MaskPredictor_depthwise_activation'))
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * num_mask_channels, [1, 1],
              name='MaskPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Single full convolution producing the mask logits directly.
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * num_mask_channels,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='MaskPredictor',
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts instance masks.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      mask_predictions: A float tensors of shape
        [batch_size, num_anchors, num_masks, mask_height, mask_width]
        representing the mask predictions for the proposals.
    """
    mask_predictions = features
    for layer in self._mask_predictor_layers:
      mask_predictions = layer(mask_predictions)
    batch_size = features.get_shape().as_list()[0]
    # Fall back to the dynamic batch dimension when the static one is unknown.
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    mask_predictions = tf.reshape(
        mask_predictions,
        [batch_size, -1, self._num_masks, self._mask_height, self._mask_width])
    return mask_predictions
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.mask_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_mask_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalMaskPredictorTest(test_case.TestCase):
  """Output-shape tests for the Keras convolutional mask head."""

  def _build_conv_hyperparams(self):
    """Builds minimal Keras conv hyperparams from a text proto."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_proto = """
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(text_proto, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def _make_mask_head(self, masks_are_class_agnostic):
    """Builds a mask head with the shared test configuration."""
    return keras_mask_head.ConvolutionalMaskHead(
        is_training=True,
        num_classes=20,
        use_dropout=True,
        dropout_keep_prob=0.5,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False,
        mask_height=7,
        mask_width=7,
        masks_are_class_agnostic=masks_are_class_agnostic)

  def test_prediction_size_use_depthwise_false(self):
    mask_head = self._make_mask_head(masks_are_class_agnostic=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    # 17 * 19 locations = 323 anchors; one 7x7 mask per class (20 classes).
    self.assertAllEqual([64, 323, 20, 7, 7],
                        mask_head(features).get_shape().as_list())

  def test_class_agnostic_prediction_size_use_depthwise_false(self):
    mask_head = self._make_mask_head(masks_are_class_agnostic=True)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    # Class-agnostic heads emit a single shared 7x7 mask per anchor.
    self.assertAllEqual([64, 323, 1, 7, 7],
                        mask_head(features).get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Script entry point: runs the test suite above.
if __name__ == '__main__':
  tf.test.main()
...@@ -148,6 +148,7 @@ class MaskRCNNMaskHead(head.Head): ...@@ -148,6 +148,7 @@ class MaskRCNNMaskHead(head.Head):
upsampled_features, upsampled_features,
num_outputs=num_masks, num_outputs=num_masks,
activation_fn=None, activation_fn=None,
normalizer_fn=None,
kernel_size=[3, 3]) kernel_size=[3, 3])
return tf.expand_dims( return tf.expand_dims(
tf.transpose(mask_predictions, perm=[0, 3, 1, 2]), tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
......
...@@ -15,7 +15,21 @@ message BoxPredictor { ...@@ -15,7 +15,21 @@ message BoxPredictor {
} }
} }
// Configuration proto for MaskHead in predictors.
// Next id: 4
message MaskHead {
  // The height and the width of the predicted mask. Only used when
  // predict_instance_masks is true.
  optional int32 mask_height = 1 [default = 15];
  optional int32 mask_width = 2 [default = 15];

  // Whether to predict class agnostic masks (a single mask shared by all
  // classes) rather than one mask per class. Only used when
  // predict_instance_masks is true.
  optional bool masks_are_class_agnostic = 3 [default = true];
}
// Configuration proto for Convolutional box predictor. // Configuration proto for Convolutional box predictor.
// Next id: 13
message ConvolutionalBoxPredictor { message ConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor. // Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1; optional Hyperparams conv_hyperparams = 1;
...@@ -55,9 +69,13 @@ message ConvolutionalBoxPredictor { ...@@ -55,9 +69,13 @@ message ConvolutionalBoxPredictor {
// Whether to use depthwise separable convolution for box predictor layers. // Whether to use depthwise separable convolution for box predictor layers.
optional bool use_depthwise = 11 [default = false]; optional bool use_depthwise = 11 [default = false];
// Configs for a mask prediction head.
optional MaskHead mask_head = 12;
} }
// Configuration proto for weight shared convolutional box predictor. // Configuration proto for weight shared convolutional box predictor.
// Next id: 18
message WeightSharedConvolutionalBoxPredictor { message WeightSharedConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor. // Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1; optional Hyperparams conv_hyperparams = 1;
...@@ -85,12 +103,37 @@ message WeightSharedConvolutionalBoxPredictor { ...@@ -85,12 +103,37 @@ message WeightSharedConvolutionalBoxPredictor {
// Whether to use dropout for class prediction. // Whether to use dropout for class prediction.
optional bool use_dropout = 11 [default = false]; optional bool use_dropout = 11 [default = false];
// Keep probability for dropout // Keep probability for dropout.
optional float dropout_keep_probability = 12 [default = 0.8]; optional float dropout_keep_probability = 12 [default = 0.8];
// Whether to share the multi-layer tower between box prediction and class // Whether to share the multi-layer tower between box prediction and class
// prediction heads. // prediction heads.
optional bool share_prediction_tower = 13 [default = false]; optional bool share_prediction_tower = 13 [default = false];
// Whether to use depthwise separable convolution for box predictor layers.
optional bool use_depthwise = 14 [default = false];
// Configs for a mask prediction head.
optional MaskHead mask_head = 15;
// Enum to specify how to convert the detection scores at inference time.
enum ScoreConverter {
// Input scores equals output scores.
IDENTITY = 0;
// Applies a sigmoid on input scores.
SIGMOID = 1;
}
// Callable elementwise score converter at inference time.
optional ScoreConverter score_converter = 16 [default = IDENTITY];
// If specified, apply clipping to box encodings.
message BoxEncodingsClipRange {
optional float min = 1;
optional float max = 2;
}
optional BoxEncodingsClipRange box_encodings_clip_range = 17;
} }
// TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn // TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment