Commit 27b4acd4 authored by Aman Gupta

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD MobilenetV2 FPN Feature Extractor."""
import copy
import functools
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import context_manager
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets.mobilenet import mobilenet
from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim
# A modified config of mobilenet v2 that makes it more detection friendly.
def _create_modified_mobilenet_config():
  """Creates a detection-friendly copy of the MobileNet v2 conv defs.

  Replaces the final op of the architecture with a stride-1 1x1 convolution
  producing 256 channels, which is a better fit for the FPN feature extractor
  than the original final layer.

  Returns:
    A conv_defs dict (same structure as `mobilenet_v2.V2_DEF`) with the last
    entry of `conv_defs['spec']` replaced.
  """
  # Use deepcopy: copy.copy is shallow, so the 'spec' list would still be
  # shared with mobilenet_v2.V2_DEF and the assignment below would mutate
  # the library's module-level default architecture for every other user.
  conv_defs = copy.deepcopy(mobilenet_v2.V2_DEF)
  conv_defs['spec'][-1] = mobilenet.op(
      slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256)
  return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 FPN features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams_fn,
               fpn_min_level=3,
               fpn_max_level=7,
               additional_layer_depth=256,
               reuse_weights=None,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False):
    """SSD FPN feature extractor based on Mobilenet v2 architecture.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
        and separable_conv2d ops in the layers that are added on top of the base
        feature extractor.
      fpn_min_level: the highest resolution feature map to use in FPN. The valid
        values are {2, 3, 4, 5} which map to MobileNet v2 layers
        {layer_4, layer_7, layer_14, layer_19}, respectively.
      fpn_max_level: the smallest resolution feature map to construct or use in
        FPN. FPN constructions uses features maps starting from fpn_min_level
        upto the fpn_max_level. In the case that there are not enough feature
        maps in the backbone network, additional feature maps are created by
        applying stride 2 convolutions until we get the desired number of fpn
        levels.
      additional_layer_depth: additional feature map layer channel depth.
      reuse_weights: whether to reuse variables. Default is None.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams_fn`.
    """
    super(SSDMobileNetV2FpnFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams_fn=conv_hyperparams_fn,
        reuse_weights=reuse_weights,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams)
    # FPN-specific configuration, consumed in extract_features below.
    self._fpn_min_level = fpn_min_level
    self._fpn_max_level = fpn_max_level
    self._additional_layer_depth = additional_layer_depth

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    # Linear rescale from [0, 255] pixel values to [-1, 1].
    return (2.0 / 255.0) * resized_inputs - 1.0

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    # Reject images smaller than 33 pixels on a side (minimum supported by
    # this extractor; see the corresponding invalid-image-size test).
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      # is_training=None leaves the batch-norm training flag to be configured
      # elsewhere; bn_decay overrides the backbone's batch-norm decay.
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        # Optionally override the backbone's hyperparams with the ones
        # supplied via conv_hyperparams_fn; otherwise use a no-op context.
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              # The modified conv defs are only used with depthwise convs.
              conv_defs=_CONV_DEFS if self._use_depthwise else None,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      # Scales a channel depth by the depth multiplier, never dropping below
      # the configured minimum depth.
      depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope('fpn', reuse=self._reuse_weights):
          # Backbone endpoints corresponding to FPN levels 2..5, in order.
          feature_blocks = [
              'layer_4', 'layer_7', 'layer_14', 'layer_19'
          ]
          # The backbone only provides endpoints up to level 5; any coarser
          # levels are synthesized further below with stride-2 convolutions.
          base_fpn_max_level = min(self._fpn_max_level, 5)
          feature_block_list = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            # feature_blocks[0] corresponds to level 2, hence the -2 offset.
            feature_block_list.append(feature_blocks[level - 2])
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key]) for key in feature_block_list],
              depth=depth_fn(self._additional_layer_depth),
              use_depthwise=self._use_depthwise)
          # Collect the top-down FPN outputs, finest to coarsest.
          feature_maps = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_maps.append(fpn_features['top_down_{}'.format(
                feature_blocks[level - 2])])
          last_feature_map = fpn_features['top_down_{}'.format(
              feature_blocks[base_fpn_max_level - 2])]
          # Construct coarse features by repeatedly downsampling the coarsest
          # FPN output with stride-2 convs until fpn_max_level is reached.
          for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
            if self._use_depthwise:
              conv_op = functools.partial(
                  slim.separable_conv2d, depth_multiplier=1)
            else:
              conv_op = slim.conv2d
            last_feature_map = conv_op(
                last_feature_map,
                num_outputs=depth_fn(self._additional_layer_depth),
                kernel_size=[3, 3],
                stride=2,
                padding='SAME',
                # Numbering continues past the 19 backbone layers, so the
                # first extra map is bottom_up_Conv2d_20.
                scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
            feature_maps.append(last_feature_map)
    return feature_maps
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ssd_mobilenet_v2_fpn_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_mobilenet_v2_fpn_feature_extractor
slim = tf.contrib.slim
class SsdMobilenetV2FpnFeatureExtractorTest(
    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
  """Tests for SSDMobileNetV2FpnFeatureExtractor."""

  def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
                                is_training=True, use_explicit_padding=False):
    """Constructs a new feature extractor.

    Args:
      depth_multiplier: float depth multiplier for feature extractor
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      is_training: whether the network is in training mode.
      use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
        inputs so that the output dimensions are the same as if 'SAME' padding
        were used.

    Returns:
      an ssd_meta_arch.SSDFeatureExtractor object.
    """
    min_depth = 32
    return (ssd_mobilenet_v2_fpn_feature_extractor.
            SSDMobileNetV2FpnFeatureExtractor(
                is_training,
                depth_multiplier,
                min_depth,
                pad_to_multiple,
                self.conv_hyperparams_fn,
                use_explicit_padding=use_explicit_padding))

  def test_extract_features_returns_correct_shapes_256(self):
    """256x256 input yields FPN maps from 32x32 down to 2x2, depth 256."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
                                  (2, 8, 8, 256), (2, 4, 4, 256),
                                  (2, 2, 2, 256)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  # NOTE(review): the method name says 384 but the test uses 320x320 inputs
  # (and the expected shapes match 320) — consider renaming to ..._320.
  def test_extract_features_returns_correct_shapes_384(self):
    """320x320 input yields FPN maps from 40x40 down to 3x3, depth 256."""
    image_height = 320
    image_width = 320
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 40, 40, 256), (2, 20, 20, 256),
                                  (2, 10, 10, 256), (2, 5, 5, 256),
                                  (2, 3, 3, 256)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_with_dynamic_image_shape(self):
    """Shapes also hold when the input height/width are dynamic."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256),
                                  (2, 8, 8, 256), (2, 4, 4, 256),
                                  (2, 2, 2, 256)]
    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shapes_with_dynamic_inputs(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
    """299x299 input padded up to multiples of 32 behaves like 320x320."""
    image_height = 299
    image_width = 299
    depth_multiplier = 1.0
    pad_to_multiple = 32
    expected_feature_map_shape = [(2, 40, 40, 256), (2, 20, 20, 256),
                                  (2, 10, 10, 256), (2, 5, 5, 256),
                                  (2, 3, 3, 256)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
    """A tiny depth multiplier is clamped to the min_depth of 32 channels."""
    image_height = 256
    image_width = 256
    depth_multiplier = 0.5**12
    pad_to_multiple = 1
    expected_feature_map_shape = [(2, 32, 32, 32), (2, 16, 16, 32),
                                  (2, 8, 8, 32), (2, 4, 4, 32),
                                  (2, 2, 2, 32)]
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        2, image_height, image_width, depth_multiplier, pad_to_multiple,
        expected_feature_map_shape, use_explicit_padding=True)

  def test_extract_features_raises_error_with_invalid_image_size(self):
    """Inputs smaller than the 33-pixel minimum are rejected."""
    image_height = 32
    image_width = 32
    depth_multiplier = 1.0
    pad_to_multiple = 1
    self.check_extract_features_raises_error_with_invalid_image_size(
        image_height, image_width, depth_multiplier, pad_to_multiple)

  def test_preprocess_returns_correct_value_range(self):
    """preprocess maps [0, 1) pixel values into [-1, 1]."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    test_image = np.random.rand(2, image_height, image_width, 3)
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(test_image)
    self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))

  def test_variables_only_created_in_scope(self):
    """All variables live under the 'MobilenetV2' scope."""
    depth_multiplier = 1
    pad_to_multiple = 1
    scope_name = 'MobilenetV2'
    self.check_feature_extractor_variables_under_scope(
        depth_multiplier, pad_to_multiple, scope_name)

  def test_fused_batchnorm(self):
    """Batch norm is lowered to the fused FusedBatchNorm op."""
    image_height = 256
    image_width = 256
    depth_multiplier = 1
    pad_to_multiple = 1
    image_placeholder = tf.placeholder(tf.float32,
                                       [1, image_height, image_width, 3])
    feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                       pad_to_multiple)
    preprocessed_image = feature_extractor.preprocess(image_placeholder)
    _ = feature_extractor.extract_features(preprocessed_image)
    self.assertTrue(
        any(op.type == 'FusedBatchNorm'
            for op in tf.get_default_graph().get_operations()))

  def test_get_expected_feature_map_variable_names(self):
    """Checks a sample of backbone and FPN variable names exist."""
    depth_multiplier = 1.0
    pad_to_multiple = 1
    expected_feature_maps_variables = set([
        # Mobilenet V2 feature maps
        'MobilenetV2/expanded_conv_4/depthwise/depthwise_weights',
        'MobilenetV2/expanded_conv_7/depthwise/depthwise_weights',
        'MobilenetV2/expanded_conv_14/depthwise/depthwise_weights',
        'MobilenetV2/Conv_1/weights',
        # FPN layers
        'MobilenetV2/fpn/bottom_up_Conv2d_20/weights',
        'MobilenetV2/fpn/bottom_up_Conv2d_21/weights',
        'MobilenetV2/fpn/smoothing_1/weights',
        'MobilenetV2/fpn/smoothing_2/weights',
        'MobilenetV2/fpn/projection_1/weights',
        'MobilenetV2/fpn/projection_2/weights',
        'MobilenetV2/fpn/projection_3/weights',
    ])
    g = tf.Graph()
    with g.as_default():
      preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
      feature_extractor = self._create_feature_extractor(
          depth_multiplier, pad_to_multiple)
      feature_extractor.extract_features(preprocessed_inputs)
      actual_variable_set = set([
          var.op.name for var in g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
      ])
      # The expected names must all be present (intersection equality).
      variable_intersection = expected_feature_maps_variables.intersection(
          actual_variable_set)
      self.assertSetEqual(expected_feature_maps_variables,
                          variable_intersection)
# Run all test cases when this module is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for MobilenetV2 features."""
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.models.keras_applications import mobilenet_v2
from object_detection.utils import ops
from object_detection.utils import shape_utils
class SSDMobileNetV2KerasFeatureExtractor(
    ssd_meta_arch.SSDKerasFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False,
               name=None):
    """MobileNetV2 Feature Extractor for SSD Models.

    Builds the backbone (a truncated Keras MobileNetV2) and the multi
    resolution feature map generator eagerly, at construction time.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor (Functions
        as a width multiplier for the mobilenet_v2 network itself).
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams: `hyperparams_builder.KerasLayerHyperparams` object
        containing convolution hyperparameters for the layers added on top of
        the base feature extractor.
      freeze_batchnorm: Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      inplace_batchnorm_update: Whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams`.
      name: A string name scope to assign to the model. If 'None', Keras
        will auto-generate one from the class name.
    """
    super(SSDMobileNetV2KerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams,
        name=name)
    # Two maps come from backbone endpoints ('' entries are generated on top
    # with the listed channel depths; -1 keeps the endpoint's own depth).
    feature_map_layout = {
        'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_depthwise': self._use_depthwise,
        'use_explicit_padding': self._use_explicit_padding,
    }
    with tf.name_scope('MobilenetV2'):
      full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
          # Batch norm only trains when training AND not frozen.
          batchnorm_training=(is_training and not freeze_batchnorm),
          conv_hyperparams=(conv_hyperparams
                            if self._override_base_feature_extractor_hyperparams
                            else None),
          weights=None,
          use_explicit_padding=use_explicit_padding,
          alpha=self._depth_multiplier,
          min_depth=self._min_depth,
          include_top=False)
      # NOTE(review): the local names say conv2d_11/13_pointwise, but the
      # layers fetched are 'block_13_expand_relu' and 'out_relu' — the names
      # look inherited from the MobileNet v1 extractor; confirm/rename.
      conv2d_11_pointwise = full_mobilenet_v2.get_layer(
          name='block_13_expand_relu').output
      conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
      # Truncated backbone exposing exactly the two endpoints needed above.
      self.mobilenet_v2 = tf.keras.Model(
          inputs=full_mobilenet_v2.inputs,
          outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
      self.feature_map_generator = (
          feature_map_generators.KerasMultiResolutionFeatureMaps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              is_training=is_training,
              conv_hyperparams=conv_hyperparams,
              freeze_batchnorm=freeze_batchnorm,
              name='FeatureMaps'))

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    # Linear rescale from [0, 255] pixel values to [-1, 1].
    return (2.0 / 255.0) * resized_inputs - 1.0

  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    # Outputs arrive in the order declared when building self.mobilenet_v2:
    # [block_13_expand_relu, out_relu].
    image_features = self.mobilenet_v2(
        ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple))
    feature_maps = self.feature_map_generator({
        'layer_15/expansion_output': image_features[0],
        'layer_19': image_features[1]})
    # The generator maps names to tensors; return just the tensors
    # (presumably in feature_map_layout order — insertion-ordered mapping).
    return feature_maps.values()
......@@ -43,6 +43,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_scope_name,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -72,6 +73,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of fpn
levels.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -104,6 +106,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_scope_name = fpn_scope_name
self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth
def preprocess(self, resized_inputs):
"""SSD preprocessing.
......@@ -177,7 +180,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
feature_block_list.append('block{}'.format(level - 1))
fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list],
depth=256)
depth=self._additional_layer_depth)
feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(
......@@ -188,7 +191,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
for i in range(base_fpn_max_level, self._fpn_max_level):
last_feature_map = slim.conv2d(
last_feature_map,
num_outputs=256,
num_outputs=self._additional_layer_depth,
kernel_size=[3, 3],
stride=2,
padding='SAME',
......@@ -208,6 +211,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -226,6 +230,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -245,6 +250,7 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn',
fpn_min_level,
fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
......@@ -263,6 +269,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -281,6 +288,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -300,6 +308,7 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn',
fpn_min_level,
fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
......@@ -318,6 +327,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
conv_hyperparams_fn,
fpn_min_level=3,
fpn_max_level=7,
additional_layer_depth=256,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
......@@ -336,6 +346,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
base feature extractor.
fpn_min_level: the minimum level in feature pyramid networks.
fpn_max_level: the maximum level in feature pyramid networks.
additional_layer_depth: additional feature map layer channel depth.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
......@@ -355,6 +366,7 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
'fpn',
fpn_min_level,
fpn_max_level,
additional_layer_depth,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
......
......@@ -36,7 +36,6 @@
},
"outputs": [],
"source": [
"from distutils.version import StrictVersion\n",
"import numpy as np\n",
"import os\n",
"import six.moves.urllib as urllib\n",
......@@ -45,6 +44,7 @@
"import tensorflow as tf\n",
"import zipfile\n",
"\n",
"from distutils.version import StrictVersion\n",
"from collections import defaultdict\n",
"from io import StringIO\n",
"from matplotlib import pyplot as plt\n",
......@@ -166,9 +166,7 @@
"PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'\n",
"\n",
"# List of the strings that is used to add correct label for each box.\n",
"PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')\n",
"\n",
"NUM_CLASSES = 90"
"PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')"
]
},
{
......@@ -265,9 +263,7 @@
},
"outputs": [],
"source": [
"label_map = label_map_util.load_labelmap(PATH_TO_LABELS)\n",
"categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)\n",
"category_index = label_map_util.create_category_index(categories)"
"category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)"
]
},
{
......
......@@ -14,6 +14,7 @@
# ==============================================================================
"""Convolutional Box Predictors with and without weight sharing."""
import functools
import tensorflow as tf
from object_detection.core import box_predictor
from object_detection.utils import static_shape
......@@ -163,7 +164,7 @@ class ConvolutionalBoxPredictor(box_predictor.BoxPredictor):
else:
head_obj = self._other_heads[head_name]
prediction = head_obj.predict(
features=image_feature,
features=net,
num_predictions_per_location=num_predictions_per_location)
predictions[head_name].append(prediction)
return predictions
......@@ -203,7 +204,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
num_layers_before_predictor,
kernel_size=3,
apply_batch_norm=False,
share_prediction_tower=False):
share_prediction_tower=False,
use_depthwise=False):
"""Constructor.
Args:
......@@ -226,6 +228,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
this predictor.
share_prediction_tower: Whether to share the multi-layer tower between box
prediction and class prediction heads.
use_depthwise: Whether to use depthwise separable conv2d instead of
regular conv2d.
"""
super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training,
num_classes)
......@@ -238,6 +242,7 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
self._kernel_size = kernel_size
self._apply_batch_norm = apply_batch_norm
self._share_prediction_tower = share_prediction_tower
self._use_depthwise = use_depthwise
@property
def num_classes(self):
......@@ -270,7 +275,11 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
inserted_layer_counter):
net = image_feature
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
net = conv_op(
net,
self._depth, [self._kernel_size, self._kernel_size],
stride=1,
......
......@@ -234,6 +234,40 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_no_dangling_outputs(self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4,
use_dropout=True,
use_depthwise=True))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
tf.concat(
box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
tf.concat(
box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
axis=1)
bad_dangling_ops = []
types_safe_to_dangle = set(['Assign', 'Mul', 'Const'])
for op in tf.get_default_graph().get_operations():
if (not op.outputs) or (not op.outputs[0].consumers()):
if 'BoxPredictor' in op.name:
if op.type not in types_safe_to_dangle:
bad_dangling_ops.append(op)
self.assertEqual(bad_dangling_ops, [])
class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
......@@ -545,6 +579,79 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_predictions_multiple_feature_maps_share_weights_with_depthwise(
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
apply_batch_norm=False,
use_depthwise=True))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
scope='BoxPredictor')
box_encodings = tf.concat(
box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
class_predictions_with_background = tf.concat(
box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
axis=1)
return (box_encodings, class_predictions_with_background)
with self.test_session(graph=tf.Graph()):
graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32),
tf.random_uniform([4, 16, 16, 3], dtype=tf.float32))
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
expected_variable_set = set([
# Box prediction tower
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_0/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_0/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_1/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_1/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictionTower/conv2d_1/biases'),
# Box prediction head
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictor/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictor/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxPredictor/biases'),
# Class prediction tower
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/biases'),
# Class prediction head
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/depthwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/pointwise_weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_no_batchnorm_params_when_batchnorm_is_not_configured(self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
......
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convolutional Box Predictors with and without weight sharing."""
import collections
import tensorflow as tf
from object_detection.core import box_predictor
from object_detection.utils import static_shape
keras = tf.keras.layers
BOX_ENCODINGS = box_predictor.BOX_ENCODINGS
CLASS_PREDICTIONS_WITH_BACKGROUND = (
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND)
MASK_PREDICTIONS = box_predictor.MASK_PREDICTIONS
class _NoopVariableScope(object):
  """No-op context manager used where a variable scope is optional.

  Entering yields nothing and exiting never suppresses exceptions, so an
  instance can stand in wherever a real variable-scope context is expected
  but no scope should actually be pushed.
  """

  def __enter__(self):
    # Nothing to push; callers receive no scope object.
    return None

  def __exit__(self, exc_type, exc_value, traceback):
    # Returning False lets any exception raised inside the block propagate.
    return False
class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
  """Convolutional Keras Box Predictor.

  Optionally add an intermediate 1x1 convolutional layer after features and
  predict in parallel branches box_encodings and
  class_predictions_with_background.

  Currently this box predictor assumes that predictions are "shared" across
  classes --- that is each anchor makes box predictions which do not depend
  on class.
  """

  def __init__(self,
               is_training,
               num_classes,
               box_prediction_heads,
               class_prediction_heads,
               other_heads,
               conv_hyperparams,
               num_layers_before_predictor,
               min_depth,
               max_depth,
               freeze_batchnorm,
               inplace_batchnorm_update,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: number of classes.  Note that num_classes *does not*
        include the background category, so if groundtruth labels take values
        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
        assigned classification targets can range from {0,... K}).
      box_prediction_heads: A list of heads that predict the boxes.
      class_prediction_heads: A list of heads that predict the classes.
      other_heads: A dictionary mapping head names to lists of convolutional
        heads.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      num_layers_before_predictor: Number of the additional conv layers before
        the predictor.
      min_depth: Minimum feature depth prior to predicting box encodings
        and class predictions.
      max_depth: Maximum feature depth prior to predicting box encodings
        and class predictions. If max_depth is set to 0, no additional
        feature map will be inserted before location and class predictions.
      freeze_batchnorm: Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      inplace_batchnorm_update: Whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.

    Raises:
      ValueError: if min_depth > max_depth, or if the head lists do not all
        have the same length.
    """
    super(ConvolutionalBoxPredictor, self).__init__(
        is_training, num_classes, freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        name=name)
    if min_depth > max_depth:
      raise ValueError('min_depth should be less than or equal to max_depth')
    if len(box_prediction_heads) != len(class_prediction_heads):
      raise ValueError('All lists of heads must be the same length.')
    for other_head_list in other_heads.values():
      if len(box_prediction_heads) != len(other_head_list):
        raise ValueError('All lists of heads must be the same length.')

    self._prediction_heads = {
        BOX_ENCODINGS: box_prediction_heads,
        CLASS_PREDICTIONS_WITH_BACKGROUND: class_prediction_heads,
    }
    if other_heads:
      self._prediction_heads.update(other_heads)

    self._conv_hyperparams = conv_hyperparams
    self._min_depth = min_depth
    self._max_depth = max_depth
    self._num_layers_before_predictor = num_layers_before_predictor

    # One shared tf.keras.Sequential of pre-head conv layers per feature map;
    # populated lazily in build().
    self._shared_nets = []

  def build(self, input_shapes):
    """Creates the variables of the layer.

    Args:
      input_shapes: A list of input shapes, one per input feature map (must
        match the number of heads this predictor was constructed with).

    Raises:
      ValueError: if the number of input shapes does not match the number of
        heads.
    """
    if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]):
      # BUGFIX: the two implicitly-concatenated string fragments previously
      # rendered as "heads,but" with no separating space.
      raise ValueError('This box predictor was constructed with %d heads, '
                       'but there are %d inputs.' %
                       (len(self._prediction_heads[BOX_ENCODINGS]),
                        len(input_shapes)))
    for stack_index, input_shape in enumerate(input_shapes):
      net = tf.keras.Sequential(name='PreHeadConvolutions_%d' % stack_index)
      self._shared_nets.append(net)

      # Add additional conv layers before the class predictor; the depth is
      # the feature depth clamped to [min_depth, max_depth].
      features_depth = static_shape.get_depth(input_shape)
      depth = max(min(features_depth, self._max_depth), self._min_depth)
      tf.logging.info(
          'depth of additional conv before box predictor: {}'.format(depth))

      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net.add(keras.Conv2D(depth, [1, 1],
                               name='Conv2d_%d_1x1_%d' % (i, depth),
                               padding='SAME',
                               **self._conv_hyperparams.params()))
          net.add(self._conv_hyperparams.build_batch_norm(
              training=(self._is_training and not self._freeze_batchnorm),
              name='Conv2d_%d_1x1_%d_norm' % (i, depth)))
          net.add(self._conv_hyperparams.build_activation_layer(
              name='Conv2d_%d_1x1_%d_activation' % (i, depth),
          ))
    self.built = True

  def _predict(self, image_features):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.
    """
    predictions = collections.defaultdict(list)
    for (index, image_feature) in enumerate(image_features):
      # Apply shared conv layers before the head predictors.
      net = self._shared_nets[index](image_feature)
      for head_name in self._prediction_heads:
        head_obj = self._prediction_heads[head_name][index]
        prediction = head_obj(net)
        predictions[head_name].append(prediction)
    return predictions
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.convolutional_keras_box_predictor."""
import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
from object_detection.predictors import convolutional_keras_box_predictor as box_predictor
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasBoxPredictorTest(test_case.TestCase):
  """Tests for convolutional_keras_box_predictor.ConvolutionalBoxPredictor."""

  def _build_conv_hyperparams(self):
    """Builds KerasLayerHyperparams from a minimal RELU_6 text-proto config."""
    conv_hyperparams = hyperparams_pb2.Hyperparams()
    conv_hyperparams_text_proto = """
      activation: RELU_6
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
    return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)

  def test_get_boxes_for_five_aspect_ratios_per_location(self):
    """Checks output shapes with 5 predictions per spatial location."""
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=0,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[5],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      objectness_predictions = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, objectness_predictions)
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                           [image_features])
    # 8 * 8 spatial cells * 5 predictions per location = 320 anchors.
    self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
    self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])

  def test_get_boxes_for_one_aspect_ratio_per_location(self):
    """Checks output shapes with a single prediction per spatial location."""
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=0,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[1],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      objectness_predictions = tf.concat(box_predictions[
          box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1)
      return (box_encodings, objectness_predictions)
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    (box_encodings, objectness_predictions) = self.execute(graph_fn,
                                                           [image_features])
    # 8 * 8 spatial cells * 1 prediction per location = 64 anchors.
    self.assertAllEqual(box_encodings.shape, [4, 64, 1, 4])
    self.assertAllEqual(objectness_predictions.shape, [4, 64, 1])

  def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
      self):
    """Checks class-prediction shape when num_classes > 0."""
    num_classes_without_background = 6
    image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
    def graph_fn(image_features):
      conv_box_predictor = (
          box_predictor_builder.build_convolutional_keras_box_predictor(
              is_training=False,
              num_classes=num_classes_without_background,
              conv_hyperparams=self._build_conv_hyperparams(),
              freeze_batchnorm=False,
              inplace_batchnorm_update=False,
              num_predictions_per_location_list=[5],
              min_depth=0,
              max_depth=32,
              num_layers_before_predictor=1,
              use_dropout=True,
              dropout_keep_prob=0.8,
              kernel_size=1,
              box_code_size=4
          ))
      box_predictions = conv_box_predictor([image_features])
      box_encodings = tf.concat(
          box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
      class_predictions_with_background = tf.concat(
          box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
          axis=1)
      return (box_encodings, class_predictions_with_background)
    (box_encodings,
     class_predictions_with_background) = self.execute(graph_fn,
                                                       [image_features])
    self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
    # +1 for the background class slot.
    self.assertAllEqual(class_predictions_with_background.shape,
                        [4, 320, num_classes_without_background+1])

  def test_get_predictions_with_feature_maps_of_dynamic_shape(
      self):
    """Graph-mode test feeding a feature map with unknown spatial dims."""
    image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
    conv_box_predictor = (
        box_predictor_builder.build_convolutional_keras_box_predictor(
            is_training=False,
            num_classes=0,
            conv_hyperparams=self._build_conv_hyperparams(),
            freeze_batchnorm=False,
            inplace_batchnorm_update=False,
            num_predictions_per_location_list=[5],
            min_depth=0,
            max_depth=32,
            num_layers_before_predictor=1,
            use_dropout=True,
            dropout_keep_prob=0.8,
            kernel_size=1,
            box_code_size=4
        ))
    box_predictions = conv_box_predictor([image_features])
    box_encodings = tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    objectness_predictions = tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1)
    init_op = tf.global_variables_initializer()

    resolution = 32
    expected_num_anchors = resolution*resolution*5
    with self.test_session() as sess:
      sess.run(init_op)
      (box_encodings_shape,
       objectness_predictions_shape) = sess.run(
           [tf.shape(box_encodings), tf.shape(objectness_predictions)],
           feed_dict={image_features:
                      np.random.rand(4, resolution, resolution, 64)})
      actual_variable_set = set(
          [var.op.name for var in tf.trainable_variables()])
    self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
    self.assertAllEqual(objectness_predictions_shape,
                        [4, expected_num_anchors, 1])
    # Variable names are fixed by the Keras layer names chosen in the
    # predictor's build(); the set equality pins the exact graph structure.
    expected_variable_set = set([
        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/bias',
        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/kernel',
        'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/bias',
        'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/kernel',
        'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/bias',
        'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/kernel'])
    self.assertEqual(expected_variable_set, actual_variable_set)
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
......@@ -19,6 +19,7 @@ Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a predict function that receives the
`features` as the first argument and returns `box_encodings`.
"""
import functools
import tensorflow as tf
from object_detection.predictors.heads import head
......@@ -196,18 +197,22 @@ class WeightSharedConvolutionalBoxHead(head.Head):
def __init__(self,
box_code_size,
kernel_size=3,
class_prediction_bias_init=0.0):
use_depthwise=False,
box_encodings_clip_range=None):
"""Constructor.
Args:
box_code_size: Size of encoding for each box.
kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
use_depthwise: Whether to use depthwise convolutions for prediction steps.
Default is False.
box_encodings_clip_range: Min and max values for clipping box_encodings.
"""
super(WeightSharedConvolutionalBoxHead, self).__init__()
self._box_code_size = box_code_size
self._kernel_size = kernel_size
self._use_depthwise = use_depthwise
self._box_encodings_clip_range = box_encodings_clip_range
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
......@@ -224,7 +229,11 @@ class WeightSharedConvolutionalBoxHead(head.Head):
the objects.
"""
box_encodings_net = features
box_encodings = slim.conv2d(
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
box_encodings = conv_op(
box_encodings_net,
num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
......@@ -234,6 +243,11 @@ class WeightSharedConvolutionalBoxHead(head.Head):
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
# Clipping the box encodings to make the inference graph TPU friendly.
if self._box_encodings_clip_range is not None:
box_encodings = tf.clip_by_value(
box_encodings, self._box_encodings_clip_range.min,
self._box_encodings_clip_range.max)
box_encodings = tf.reshape(box_encodings,
[batch_size, -1, self._box_code_size])
return box_encodings
......@@ -19,6 +19,7 @@ Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background.
"""
import functools
import tensorflow as tf
from object_detection.predictors.heads import head
......@@ -211,7 +212,9 @@ class WeightSharedConvolutionalClassHead(head.Head):
kernel_size=3,
class_prediction_bias_init=0.0,
use_dropout=False,
dropout_keep_prob=0.8):
dropout_keep_prob=0.8,
use_depthwise=False,
score_converter_fn=tf.identity):
"""Constructor.
Args:
......@@ -224,6 +227,10 @@ class WeightSharedConvolutionalClassHead(head.Head):
conv2d layer before class prediction.
use_dropout: Whether to apply dropout to class prediction head.
dropout_keep_prob: Probability of keeping activiations.
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
score_converter_fn: Callable elementwise nonlinearity (that takes tensors
as inputs and returns tensors).
"""
super(WeightSharedConvolutionalClassHead, self).__init__()
self._num_classes = num_classes
......@@ -231,6 +238,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
self._class_prediction_bias_init = class_prediction_bias_init
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
self._use_depthwise = use_depthwise
self._score_converter_fn = score_converter_fn
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
......@@ -252,7 +261,11 @@ class WeightSharedConvolutionalClassHead(head.Head):
if self._use_dropout:
class_predictions_net = slim.dropout(
class_predictions_net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d(
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
class_predictions_with_background = conv_op(
class_predictions_net,
num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
......@@ -264,6 +277,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
class_predictions_with_background = self._score_converter_fn(
class_predictions_with_background)
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
return class_predictions_with_background
......@@ -36,6 +36,8 @@ Mask RCNN box predictor.
"""
from abc import abstractmethod
import tensorflow as tf
class Head(object):
"""Mask RCNN head base class."""
......@@ -57,3 +59,23 @@ class Head(object):
A tf.float32 tensor.
"""
pass
class KerasHead(tf.keras.Model):
  """Keras head base class.

  Subclasses implement `_predict`; the Keras `call` entry point simply
  delegates to it.
  """

  def call(self, features):
    """The Keras model call will delegate to the `_predict` method."""
    return self._predict(features)

  # NOTE(review): this class does not use ABCMeta, so @abstractmethod is
  # documentation-only here — instantiating a subclass that forgets to
  # implement `_predict` is not blocked at construction time; confirm.
  @abstractmethod
  def _predict(self, features):
    """Returns the head's predictions.

    Args:
      features: A float tensor of features.

    Returns:
      A tf.float32 tensor.
    """
    pass
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Box Head.
Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a _predict function that receives the
`features` as the first argument and returns `box_encodings`.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalBoxHead(head.KerasHead):
  """Convolutional box prediction head.

  Predicts box encodings either with a single conv layer or, when
  `use_depthwise` is set, with a depthwise-separable stack
  (depthwise conv -> batch norm -> activation -> 1x1 conv).
  """

  def __init__(self,
               is_training,
               box_code_size,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               use_depthwise=True,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      box_code_size: Size of encoding for each box.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Defaults to True in this signature. NOTE(review): the previous
        docstring said the default is False, and the sibling
        ConvolutionalClassHead defaults to False — confirm which default is
        intended before relying on it.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalBoxHead, self).__init__(name=name)
    self._is_training = is_training
    self._box_code_size = box_code_size
    self._kernel_size = kernel_size
    self._num_predictions_per_location = num_predictions_per_location
    self._use_depthwise = use_depthwise

    # Layers are applied in list order by _predict.
    self._box_encoder_layers = []
    if self._use_depthwise:
      # Depthwise-separable path: depthwise KxK conv, then batch norm and
      # activation, then a 1x1 pointwise conv producing the encodings.
      self._box_encoder_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='BoxEncodingPredictor_depthwise',
              **conv_hyperparams.params()))
      self._box_encoder_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='BoxEncodingPredictor_depthwise_batchnorm'))
      self._box_encoder_layers.append(
          conv_hyperparams.build_activation_layer(
              name='BoxEncodingPredictor_depthwise_activation'))
      self._box_encoder_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._box_code_size, [1, 1],
              name='BoxEncodingPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Plain path: a single KxK conv with no activation on the output.
      self._box_encoder_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._box_code_size,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='BoxEncodingPredictor',
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts boxes.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      box_encodings: A float tensor of shape
        [batch_size, num_anchors, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes.
    """
    box_encodings = features
    for layer in self._box_encoder_layers:
      box_encodings = layer(box_encodings)
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      # Fall back to the dynamic batch size when it is not known statically.
      batch_size = tf.shape(features)[0]
    # q is hard-coded to 1 here: box predictions are shared across classes.
    box_encodings = tf.reshape(box_encodings,
                               [batch_size, -1, 1, self._box_code_size])
    return box_encodings
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.box_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_box_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasBoxHeadTest(test_case.TestCase):
  """Tests for keras_box_head.ConvolutionalBoxHead."""

  def _build_conv_hyperparams(self):
    """Builds KerasLayerHyperparams from a small text-proto config."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_format.Merge("""
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def test_prediction_size_depthwise_false(self):
    """Output is [batch, anchors, 1, code_size] for the plain-conv head."""
    box_head = keras_box_head.ConvolutionalBoxHead(
        is_training=True,
        box_code_size=4,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    encodings = box_head(features)
    # 17 * 19 spatial cells * 1 prediction per location = 323 anchors.
    self.assertAllEqual([64, 323, 1, 4], encodings.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class Head.
Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalClassHead(head.KerasHead):
  """Convolutional class prediction head.

  Predicts per-anchor class scores (including a background slot) either with
  a single conv layer or, when `use_depthwise` is set, with a
  depthwise-separable stack (depthwise conv -> batch norm -> activation ->
  1x1 conv).
  """

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               class_prediction_bias_init=0.0,
               use_depthwise=False,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes (not counting the background class; one
        extra background slot is added internally).
      use_dropout: Option to use dropout or not. Note that a single dropout
        op is applied here prior to both box and class predictions, which stands
        in contrast to the ConvolutionalBoxPredictor below.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      class_prediction_bias_init: constant value to initialize bias of the last
        conv2d layer before class prediction. Only applied on the
        non-depthwise path (see the review note in the depthwise branch).
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalClassHead, self).__init__(name=name)
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._class_prediction_bias_init = class_prediction_bias_init
    self._use_depthwise = use_depthwise
    # One extra slot for the background class.
    self._num_class_slots = self._num_classes + 1

    # Layers are applied in list order by _predict.
    self._class_predictor_layers = []
    if self._use_dropout:
      self._class_predictor_layers.append(
          # The Dropout layer's `training` parameter for the call method must
          # be set implicitly by the Keras set_learning_phase. The object
          # detection training code takes care of this.
          tf.keras.layers.Dropout(rate=1.0 - self._dropout_keep_prob))
    if self._use_depthwise:
      self._class_predictor_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='ClassPredictor_depthwise',
              **conv_hyperparams.params()))
      self._class_predictor_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='ClassPredictor_depthwise_batchnorm'))
      self._class_predictor_layers.append(
          conv_hyperparams.build_activation_layer(
              name='ClassPredictor_depthwise_activation'))
      # NOTE(review): unlike the non-depthwise branch below, this final 1x1
      # conv does not set bias_initializer from class_prediction_bias_init —
      # confirm whether that asymmetry is intentional.
      self._class_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._num_class_slots, [1, 1],
              name='ClassPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      self._class_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * self._num_class_slots,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='ClassPredictor',
              bias_initializer=tf.constant_initializer(
                  self._class_prediction_bias_init),
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts class scores (with background) for each anchor.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      class_predictions_with_background: A float tensor of shape
        [batch_size, num_anchors, num_classes + 1] representing the class
        predictions for the proposals.
    """
    # Add a slot for the background class.
    class_predictions_with_background = features
    for layer in self._class_predictor_layers:
      class_predictions_with_background = layer(
          class_predictions_with_background)
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      # Fall back to the dynamic batch size when it is not known statically.
      batch_size = tf.shape(features)[0]
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [batch_size, -1, self._num_class_slots])
    return class_predictions_with_background
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.class_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_class_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
  """Tests for keras_class_head.ConvolutionalClassHead."""

  def _build_conv_hyperparams(self):
    """Builds KerasLayerHyperparams from a small text-proto config."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    text_format.Merge("""
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def test_prediction_size_depthwise_false(self):
    """Output is [batch, anchors, num_classes + 1] for the plain-conv head."""
    class_head = keras_class_head.ConvolutionalClassHead(
        is_training=True,
        num_classes=20,
        use_dropout=True,
        dropout_keep_prob=0.5,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False)
    features = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = class_head(features)
    # 17 * 19 spatial cells; 20 classes plus 1 background slot.
    self.assertAllEqual([64, 323, 21],
                        predictions.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TF test entry point: discovers and runs the test cases above.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras Mask Heads.
Contains Mask prediction head classes for different meta architectures.
All the mask prediction heads have a predict function that receives the
`features` as the first argument and returns `mask_predictions`.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
class ConvolutionalMaskHead(head.KerasHead):
  """Convolutional mask prediction head.

  Builds a small stack of Keras layers (optional dropout, then either a
  depthwise-separable or a regular convolution) that maps image features to
  per-anchor instance mask logits.
  """

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               num_predictions_per_location,
               conv_hyperparams,
               freeze_batchnorm,
               use_depthwise=False,
               mask_height=7,
               mask_width=7,
               masks_are_class_agnostic=False,
               name=None):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes.
      use_dropout: Option to use dropout or not. Note that a single dropout
        op is applied here prior to the mask prediction.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the
        spatial resolution of the feature map is smaller than the kernel size,
        then the kernel size is automatically set to be
        min(feature_width, feature_height).
      num_predictions_per_location: Number of box predictions to be made per
        spatial location. Int specifying number of boxes per location.
      conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
        containing hyperparameters for convolution ops.
      freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
        training or not. When training with a small batch size (e.g. 1), it is
        desirable to freeze batch norm update and use pretrained batch norm
        params.
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      mask_height: Desired output mask height. The default value is 7.
      mask_width: Desired output mask width. The default value is 7.
      masks_are_class_agnostic: Boolean determining if the mask-head is
        class-agnostic or not.
      name: A string name scope to assign to the model. If `None`, Keras
        will auto-generate one from the class name.
    """
    super(ConvolutionalMaskHead, self).__init__(name=name)
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._num_predictions_per_location = num_predictions_per_location
    self._use_depthwise = use_depthwise
    self._mask_height = mask_height
    self._mask_width = mask_width
    self._masks_are_class_agnostic = masks_are_class_agnostic
    self._mask_predictor_layers = []
    # A class-agnostic head predicts a single shared mask; otherwise one mask
    # is predicted per class.
    if self._masks_are_class_agnostic:
      self._num_masks = 1
    else:
      self._num_masks = self._num_classes
    # (num_masks, mask_height, mask_width) is flattened into the channel
    # dimension of the final convolution and un-flattened in _predict.
    num_mask_channels = self._num_masks * self._mask_height * self._mask_width
    if self._use_dropout:
      self._mask_predictor_layers.append(
          # The Dropout layer's `training` parameter for the call method must
          # be set implicitly by the Keras set_learning_phase. The object
          # detection training code takes care of this.
          tf.keras.layers.Dropout(rate=1.0 - self._dropout_keep_prob))
    if self._use_depthwise:
      # Depthwise kxk conv + batchnorm + activation, followed by a 1x1
      # pointwise conv (no activation) that produces the mask channels.
      self._mask_predictor_layers.append(
          tf.keras.layers.DepthwiseConv2D(
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              depth_multiplier=1,
              strides=1,
              dilation_rate=1,
              name='MaskPredictor_depthwise',
              **conv_hyperparams.params()))
      self._mask_predictor_layers.append(
          conv_hyperparams.build_batch_norm(
              training=(is_training and not freeze_batchnorm),
              name='MaskPredictor_depthwise_batchnorm'))
      self._mask_predictor_layers.append(
          conv_hyperparams.build_activation_layer(
              name='MaskPredictor_depthwise_activation'))
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * num_mask_channels, [1, 1],
              name='MaskPredictor',
              **conv_hyperparams.params(activation=None)))
    else:
      # Single kxk conv (no activation) producing all mask channels at once.
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_predictions_per_location * num_mask_channels,
              [self._kernel_size, self._kernel_size],
              padding='SAME',
              name='MaskPredictor',
              **conv_hyperparams.params(activation=None)))

  def _predict(self, features):
    """Predicts instance masks.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.

    Returns:
      mask_predictions: A float tensor of shape
        [batch_size, num_anchors, num_masks, mask_height, mask_width]
        representing the mask predictions for the proposals.
    """
    mask_predictions = features
    for layer in self._mask_predictor_layers:
      mask_predictions = layer(mask_predictions)
    # Prefer the statically-known batch size; fall back to the dynamic shape
    # when the batch dimension is unknown at graph-construction time.
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    mask_predictions = tf.reshape(
        mask_predictions,
        [batch_size, -1, self._num_masks, self._mask_height, self._mask_width])
    return mask_predictions
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.heads.mask_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.heads import keras_mask_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ConvolutionalMaskPredictorTest(test_case.TestCase):
  """Tests for keras_mask_head.ConvolutionalMaskHead."""

  def _build_conv_hyperparams(self):
    """Returns KerasLayerHyperparams parsed from a minimal text proto."""
    hyperparams_proto = hyperparams_pb2.Hyperparams()
    hyperparams_text_proto = """
      activation: NONE
      regularizer {
        l2_regularizer {
        }
      }
      initializer {
        truncated_normal_initializer {
        }
      }
    """
    text_format.Merge(hyperparams_text_proto, hyperparams_proto)
    return hyperparams_builder.KerasLayerHyperparams(hyperparams_proto)

  def _build_mask_head(self, masks_are_class_agnostic):
    """Builds a ConvolutionalMaskHead with the shared test configuration."""
    return keras_mask_head.ConvolutionalMaskHead(
        is_training=True,
        num_classes=20,
        use_dropout=True,
        dropout_keep_prob=0.5,
        kernel_size=3,
        conv_hyperparams=self._build_conv_hyperparams(),
        freeze_batchnorm=False,
        num_predictions_per_location=1,
        use_depthwise=False,
        mask_height=7,
        mask_width=7,
        masks_are_class_agnostic=masks_are_class_agnostic)

  def test_prediction_size_use_depthwise_false(self):
    """Per-class head yields one 7x7 mask per class for each anchor."""
    mask_head = self._build_mask_head(masks_are_class_agnostic=False)
    image_feature = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = mask_head(image_feature)
    self.assertAllEqual([64, 323, 20, 7, 7],
                        predictions.get_shape().as_list())

  def test_class_agnostic_prediction_size_use_depthwise_false(self):
    """Class-agnostic head yields a single 7x7 mask for each anchor."""
    mask_head = self._build_mask_head(masks_are_class_agnostic=True)
    image_feature = tf.random_uniform(
        [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
    predictions = mask_head(image_feature)
    self.assertAllEqual([64, 323, 1, 7, 7],
                        predictions.get_shape().as_list())
# TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
# Standard TF test entry point: discovers and runs the test cases above.
if __name__ == '__main__':
  tf.test.main()
......@@ -148,6 +148,7 @@ class MaskRCNNMaskHead(head.Head):
upsampled_features,
num_outputs=num_masks,
activation_fn=None,
normalizer_fn=None,
kernel_size=[3, 3])
return tf.expand_dims(
tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
......
......@@ -15,7 +15,21 @@ message BoxPredictor {
}
}
// Configuration proto for MaskHead in predictors.
// Next id: 4
message MaskHead {
// The height and the width of the predicted mask. Only used when
// predict_instance_masks is true.
optional int32 mask_height = 1 [default = 15];
optional int32 mask_width = 2 [default = 15];
// Whether to predict class agnostic masks. Only used when
// predict_instance_masks is true.
optional bool masks_are_class_agnostic = 3 [default = true];
}
// Configuration proto for Convolutional box predictor.
// Next id: 13
message ConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
......@@ -55,9 +69,13 @@ message ConvolutionalBoxPredictor {
// Whether to use depthwise separable convolution for box predictor layers.
optional bool use_depthwise = 11 [default = false];
// Configs for a mask prediction head.
optional MaskHead mask_head = 12;
}
// Configuration proto for weight shared convolutional box predictor.
// Next id: 18
message WeightSharedConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
......@@ -85,12 +103,37 @@ message WeightSharedConvolutionalBoxPredictor {
// Whether to use dropout for class prediction.
optional bool use_dropout = 11 [default = false];
// Keep probability for dropout
// Keep probability for dropout.
optional float dropout_keep_probability = 12 [default = 0.8];
// Whether to share the multi-layer tower between box prediction and class
// prediction heads.
optional bool share_prediction_tower = 13 [default = false];
// Whether to use depthwise separable convolution for box predictor layers.
optional bool use_depthwise = 14 [default = false];
// Configs for a mask prediction head.
optional MaskHead mask_head = 15;
// Enum to specify how to convert the detection scores at inference time.
enum ScoreConverter {
// Input scores equals output scores.
IDENTITY = 0;
// Applies a sigmoid on input scores.
SIGMOID = 1;
}
// Callable elementwise score converter at inference time.
optional ScoreConverter score_converter = 16 [default = IDENTITY];
// If specified, apply clipping to box encodings.
message BoxEncodingsClipRange {
optional float min = 1;
optional float max = 2;
}
optional BoxEncodingsClipRange box_encodings_clip_range = 17;
}
// TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment