"vscode:/vscode.git/clone" did not exist on "2e41d8ca790fada5a98b46883f80f7a2d89d2790"
Commit e00e0e13 authored by dreamdragon's avatar dreamdragon
Browse files

Merge remote-tracking branch 'upstream/master'

parents b915db4e 402b561b
@@ -85,41 +85,44 @@ class SSDMobileNetV2KerasFeatureExtractor(
         override_base_feature_extractor_hyperparams=
         override_base_feature_extractor_hyperparams,
         name=name)
-    feature_map_layout = {
+    self._feature_map_layout = {
         'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
         'layer_depth': [-1, -1, 512, 256, 256, 128],
         'use_depthwise': self._use_depthwise,
         'use_explicit_padding': self._use_explicit_padding,
     }
-    with tf.name_scope('MobilenetV2'):
-      full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
-          batchnorm_training=(is_training and not freeze_batchnorm),
-          conv_hyperparams=(conv_hyperparams
-                            if self._override_base_feature_extractor_hyperparams
-                            else None),
-          weights=None,
-          use_explicit_padding=use_explicit_padding,
-          alpha=self._depth_multiplier,
-          min_depth=self._min_depth,
-          include_top=False)
-      conv2d_11_pointwise = full_mobilenet_v2.get_layer(
-          name='block_13_expand_relu').output
-      conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
-      self.mobilenet_v2 = tf.keras.Model(
-          inputs=full_mobilenet_v2.inputs,
-          outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
-      self.feature_map_generator = (
-          feature_map_generators.KerasMultiResolutionFeatureMaps(
-              feature_map_layout=feature_map_layout,
-              depth_multiplier=self._depth_multiplier,
-              min_depth=self._min_depth,
-              insert_1x1_conv=True,
-              is_training=is_training,
-              conv_hyperparams=conv_hyperparams,
-              freeze_batchnorm=freeze_batchnorm,
-              name='FeatureMaps'))
+    self.mobilenet_v2 = None
+    self.feature_map_generator = None
+
+  def build(self, input_shape):
+    full_mobilenet_v2 = mobilenet_v2.mobilenet_v2(
+        batchnorm_training=(self._is_training and not self._freeze_batchnorm),
+        conv_hyperparams=(self._conv_hyperparams
+                          if self._override_base_feature_extractor_hyperparams
+                          else None),
+        weights=None,
+        use_explicit_padding=self._use_explicit_padding,
+        alpha=self._depth_multiplier,
+        min_depth=self._min_depth,
+        include_top=False)
+    conv2d_11_pointwise = full_mobilenet_v2.get_layer(
+        name='block_13_expand_relu').output
+    conv2d_13_pointwise = full_mobilenet_v2.get_layer(name='out_relu').output
+    self.mobilenet_v2 = tf.keras.Model(
+        inputs=full_mobilenet_v2.inputs,
+        outputs=[conv2d_11_pointwise, conv2d_13_pointwise])
+    self.feature_map_generator = (
+        feature_map_generators.KerasMultiResolutionFeatureMaps(
+            feature_map_layout=self._feature_map_layout,
+            depth_multiplier=self._depth_multiplier,
+            min_depth=self._min_depth,
+            insert_1x1_conv=True,
+            is_training=self._is_training,
+            conv_hyperparams=self._conv_hyperparams,
+            freeze_batchnorm=self._freeze_batchnorm,
+            name='FeatureMaps'))
+    self.built = True

   def preprocess(self, resized_inputs):
     """SSD preprocessing.
...
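The left side of this hunk built every sub-layer eagerly in __init__; the right side defers construction to a Keras-style build() method and records completion via self.built. A minimal sketch of that deferred-construction pattern, with illustrative names that are not part of this commit:

import tensorflow as tf

class DeferredFeatureExtractor(tf.keras.Model):
  """Stores configuration in __init__; creates sub-layers in build()."""

  def __init__(self, depth):
    super(DeferredFeatureExtractor, self).__init__()
    self._depth = depth
    self.conv = None  # Created lazily, once build() runs.

  def build(self, input_shape):
    self.conv = tf.keras.layers.Conv2D(self._depth, [1, 1], padding='SAME')
    self.built = True

  def call(self, inputs):
    return self.conv(inputs)

Keras invokes build() automatically on first call, so configuration errors surface only when real input shapes are available rather than at construction time.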
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSDFeatureExtractor for PNASNet features.
Based on PNASNet ImageNet model: https://arxiv.org/abs/1712.00559
"""
import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import context_manager
from object_detection.utils import ops
from nets.nasnet import pnasnet
slim = tf.contrib.slim
def pnasnet_large_arg_scope_for_detection(is_batch_norm_training=False):
"""Defines the default arg scope for the PNASNet Large for object detection.
This provides a small edit to switch batch norm training on and off.
Args:
is_batch_norm_training: Boolean indicating whether to train with batch norm.
Default is False.
Returns:
An `arg_scope` to use for the PNASNet Large Model.
"""
imagenet_scope = pnasnet.pnasnet_large_arg_scope()
with slim.arg_scope(imagenet_scope):
with slim.arg_scope([slim.batch_norm],
is_training=is_batch_norm_training) as sc:
return sc
class SSDPNASNetFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""SSD Feature Extractor using PNASNet features."""
def __init__(self,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
override_base_feature_extractor_hyperparams=False):
"""PNASNet Feature Extractor for SSD Models.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding
were used.
use_depthwise: Whether to use depthwise convolutions.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(SSDPNASNetFeatureExtractor, self).__init__(
is_training=is_training,
depth_multiplier=depth_multiplier,
min_depth=min_depth,
pad_to_multiple=pad_to_multiple,
conv_hyperparams_fn=conv_hyperparams_fn,
reuse_weights=reuse_weights,
use_explicit_padding=use_explicit_padding,
use_depthwise=use_depthwise,
override_base_feature_extractor_hyperparams=
override_base_feature_extractor_hyperparams)
def preprocess(self, resized_inputs):
"""SSD preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
def extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
feature_map_layout = {
'from_layer': ['Cell_7', 'Cell_11', '', '', '', ''],
'layer_depth': [-1, -1, 512, 256, 256, 128],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
with slim.arg_scope(
pnasnet_large_arg_scope_for_detection(
is_batch_norm_training=self._is_training)):
with slim.arg_scope([slim.conv2d, slim.batch_norm, slim.separable_conv2d],
reuse=self._reuse_weights):
with (slim.arg_scope(self._conv_hyperparams_fn())
if self._override_base_feature_extractor_hyperparams else
context_manager.IdentityContextManager()):
_, image_features = pnasnet.build_pnasnet_large(
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
num_classes=None,
is_training=self._is_training,
final_endpoint='Cell_11')
with tf.variable_scope('SSD_feature_maps', reuse=self._reuse_weights):
with slim.arg_scope(self._conv_hyperparams_fn()):
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=self._depth_multiplier,
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
"""Returns a map of variables to load from a foreign checkpoint.
Note that this overrides the default implementation in
ssd_meta_arch.SSDFeatureExtractor which does not work for PNASNet
checkpoints.
Args:
feature_extractor_scope: A scope name for the first stage feature
extractor.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
"""
variables_to_restore = {}
for variable in tf.global_variables():
if variable.op.name.startswith(feature_extractor_scope):
var_name = variable.op.name.replace(feature_extractor_scope + '/', '')
var_name += '/ExponentialMovingAverage'
variables_to_restore[var_name] = variable
return variables_to_restore
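The function above keys the returned dict by checkpoint names with an '/ExponentialMovingAverage' suffix, which is exactly the var_list form a TF1 Saver accepts. A minimal usage sketch under that assumption (the helper name and scope handling here are hypothetical):

import tensorflow as tf

def restore_with_ema_names(sess, checkpoint_path, feature_extractor_scope):
  """Restores PNASNet weights from a classification checkpoint (sketch)."""
  # Mirror restore_from_classification_checkpoint_fn: map the EMA-suffixed
  # checkpoint name to the corresponding variable in the current graph.
  variables_to_restore = {}
  for variable in tf.global_variables():
    if variable.op.name.startswith(feature_extractor_scope):
      var_name = variable.op.name.replace(feature_extractor_scope + '/', '')
      variables_to_restore[var_name + '/ExponentialMovingAverage'] = variable
  saver = tf.train.Saver(var_list=variables_to_restore)
  saver.restore(sess, checkpoint_path)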
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ssd_pnas_feature_extractor."""
import numpy as np
import tensorflow as tf
from object_detection.models import ssd_feature_extractor_test
from object_detection.models import ssd_pnasnet_feature_extractor
slim = tf.contrib.slim
class SsdPnasNetFeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
is_training=True, use_explicit_padding=False):
"""Constructs a new feature extractor.
Args:
depth_multiplier: float depth multiplier for feature extractor
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
is_training: whether the network is in training mode.
use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding
were used.
Returns:
an ssd_meta_arch.SSDFeatureExtractor object.
"""
min_depth = 32
return ssd_pnasnet_feature_extractor.SSDPNASNetFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding)
def test_extract_features_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 8, 8, 2160), (2, 4, 4, 4320),
(2, 2, 2, 512), (2, 1, 1, 256),
(2, 1, 1, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_299(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 19, 19, 2160), (2, 10, 10, 4320),
(2, 5, 5, 512), (2, 3, 3, 256),
(2, 2, 2, 256), (2, 1, 1, 128)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
pad_to_multiple = 1
test_image = np.random.rand(2, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
if __name__ == '__main__':
tf.test.main()
@@ -113,6 +113,8 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
     VGG style channel mean subtraction as described here:
     https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-mdnge.
+    Note that if the number of channels is not equal to 3, the mean subtraction
+    will be skipped and the original resized_inputs will be returned.

     Args:
       resized_inputs: a [batch, height, width, channels] float tensor
@@ -122,8 +124,11 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       preprocessed_inputs: a [batch, height, width, channels] float tensor
         representing a batch of images.
     """
-    channel_means = [123.68, 116.779, 103.939]
-    return resized_inputs - [[channel_means]]
+    if resized_inputs.shape.as_list()[3] == 3:
+      channel_means = [123.68, 116.779, 103.939]
+      return resized_inputs - [[channel_means]]
+    else:
+      return resized_inputs

   def _filter_features(self, image_features):
     # TODO(rathodv): Change resnet endpoint to strip scope prefixes instead
...
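Concretely, the new guard subtracts the VGG channel means only when the input has three channels and passes anything else through untouched. A NumPy sketch of the same arithmetic (the function name here is hypothetical):

import numpy as np

def preprocess(resized_inputs):
  """Subtracts VGG channel means, skipping non-RGB inputs (sketch)."""
  if resized_inputs.shape[-1] == 3:
    channel_means = np.array([123.68, 116.779, 103.939])
    return resized_inputs - channel_means
  return resized_inputs

# A mid-gray RGB pixel maps to roughly [3.32, 10.22, 23.06]:
print(preprocess(np.full((1, 1, 1, 3), 127.0)))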
@@ -82,12 +82,15 @@ class SSDResnetFPNFeatureExtractorTestBase(
     image_width = 128
     depth_multiplier = 1
     pad_to_multiple = 1
-    test_image = np.random.rand(4, image_height, image_width, 3)
+    test_image = tf.constant(np.random.rand(4, image_height, image_width, 3))
     feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                        pad_to_multiple)
     preprocessed_image = feature_extractor.preprocess(test_image)
-    self.assertAllClose(preprocessed_image,
-                        test_image - [[123.68, 116.779, 103.939]])
+    with self.test_session() as sess:
+      test_image_out, preprocessed_image_out = sess.run(
+          [test_image, preprocessed_image])
+      self.assertAllClose(preprocessed_image_out,
+                          test_image_out - [[123.68, 116.779, 103.939]])

   def test_variables_only_created_in_scope(self):
     depth_multiplier = 1
@@ -103,5 +106,3 @@ class SSDResnetFPNFeatureExtractorTestBase(
     self.assertTrue(
         variable.name.startswith(self._resnet_scope_name())
         or variable.name.startswith(self._fpn_scope_name()))
@@ -98,6 +98,8 @@ class _SSDResnetPpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
     VGG style channel mean subtraction as described here:
     https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-mdnge.
+    Note that if the number of channels is not equal to 3, the mean subtraction
+    will be skipped and the original resized_inputs will be returned.

     Args:
       resized_inputs: a [batch, height, width, channels] float tensor
@@ -107,8 +109,11 @@ class _SSDResnetPpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
       preprocessed_inputs: a [batch, height, width, channels] float tensor
         representing a batch of images.
     """
-    channel_means = [123.68, 116.779, 103.939]
-    return resized_inputs - [[channel_means]]
+    if resized_inputs.shape.as_list()[3] == 3:
+      channel_means = [123.68, 116.779, 103.939]
+      return resized_inputs - [[channel_means]]
+    else:
+      return resized_inputs

   def extract_features(self, preprocessed_inputs):
     """Extract features from preprocessed inputs.
...
@@ -15,6 +15,7 @@
 """Tests for ssd resnet v1 feature extractors."""
 import abc
 import numpy as np
+import tensorflow as tf

 from object_detection.models import ssd_feature_extractor_test
@@ -64,12 +65,15 @@ class SSDResnetPpnFeatureExtractorTestBase(
     image_width = 128
     depth_multiplier = 1
     pad_to_multiple = 1
-    test_image = np.random.rand(4, image_height, image_width, 3)
+    test_image = tf.constant(np.random.rand(4, image_height, image_width, 3))
     feature_extractor = self._create_feature_extractor(depth_multiplier,
                                                        pad_to_multiple)
     preprocessed_image = feature_extractor.preprocess(test_image)
-    self.assertAllClose(preprocessed_image,
-                        test_image - [[123.68, 116.779, 103.939]])
+    with self.test_session() as sess:
+      test_image_out, preprocessed_image_out = sess.run(
+          [test_image, preprocessed_image])
+      self.assertAllClose(preprocessed_image_out,
+                          test_image_out - [[123.68, 116.779, 103.939]])

   def test_variables_only_created_in_scope(self):
     depth_multiplier = 1
...
@@ -134,26 +134,32 @@ class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
           (len(self._prediction_heads[BOX_ENCODINGS]),
            len(input_shapes)))
     for stack_index, input_shape in enumerate(input_shapes):
-      net = tf.keras.Sequential(name='PreHeadConvolutions_%d' % stack_index)
-      self._shared_nets.append(net)
+      net = []

       # Add additional conv layers before the class predictor.
       features_depth = static_shape.get_depth(input_shape)
       depth = max(min(features_depth, self._max_depth), self._min_depth)
       tf.logging.info(
           'depth of additional conv before box predictor: {}'.format(depth))
       if depth > 0 and self._num_layers_before_predictor > 0:
         for i in range(self._num_layers_before_predictor):
-          net.add(keras.Conv2D(depth, [1, 1],
-                               name='Conv2d_%d_1x1_%d' % (i, depth),
-                               padding='SAME',
-                               **self._conv_hyperparams.params()))
-          net.add(self._conv_hyperparams.build_batch_norm(
-              training=(self._is_training and not self._freeze_batchnorm),
-              name='Conv2d_%d_1x1_%d_norm' % (i, depth)))
-          net.add(self._conv_hyperparams.build_activation_layer(
-              name='Conv2d_%d_1x1_%d_activation' % (i, depth),
-          ))
+          net.append(keras.Conv2D(depth, [1, 1],
+                                  name='SharedConvolutions_%d/Conv2d_%d_1x1_%d'
+                                  % (stack_index, i, depth),
+                                  padding='SAME',
+                                  **self._conv_hyperparams.params()))
+          net.append(self._conv_hyperparams.build_batch_norm(
+              training=(self._is_training and not self._freeze_batchnorm),
+              name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm'
+              % (stack_index, i, depth)))
+          net.append(self._conv_hyperparams.build_activation_layer(
+              name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation'
+              % (stack_index, i, depth),
+          ))
+
+      # Until certain bugs are fixed in checkpointable lists,
+      # this net must be appended only once it's been filled with layers
+      self._shared_nets.append(net)

     self.built = True

   def _predict(self, image_features):
@@ -175,10 +181,11 @@ class ConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
     """
     predictions = collections.defaultdict(list)

-    for (index, image_feature) in enumerate(image_features):
+    for (index, net) in enumerate(image_features):
       # Apply shared conv layers before the head predictors.
-      net = self._shared_nets[index](image_feature)
+      for layer in self._shared_nets[index]:
+        net = layer(net)

       for head_name in self._prediction_heads:
         head_obj = self._prediction_heads[head_name][index]
...
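The predictor now keeps each shared stack as a plain Python list of layers and applies them in a loop, rather than wrapping them in tf.keras.Sequential (which, per the commit's own comment, interacts badly with checkpointable lists). A minimal sketch of the pattern with illustrative layer choices:

import tensorflow as tf

# Build the stack as a plain list instead of tf.keras.Sequential.
shared_net = [
    tf.keras.layers.Conv2D(32, [1, 1], padding='SAME'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
]

def apply_shared_net(features):
  net = features
  for layer in shared_net:  # Apply each layer in order, like Sequential would.
    net = layer(net)
  return net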
@@ -181,8 +181,8 @@ class ConvolutionalKerasBoxPredictorTest(test_case.TestCase):
     self.assertAllEqual(objectness_predictions_shape,
                         [4, expected_num_anchors, 1])
     expected_variable_set = set([
-        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/bias',
-        'BoxPredictor/PreHeadConvolutions_0/Conv2d_0_1x1_32/kernel',
+        'BoxPredictor/SharedConvolutions_0/Conv2d_0_1x1_32/bias',
+        'BoxPredictor/SharedConvolutions_0/Conv2d_0_1x1_32/kernel',
         'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/bias',
         'BoxPredictor/ConvolutionalBoxHead_0/BoxEncodingPredictor/kernel',
         'BoxPredictor/ConvolutionalClassHead_0/ClassPredictor/bias',
...
@@ -34,16 +34,18 @@ class MaskRCNNClassHead(head.Head):
   https://arxiv.org/abs/1703.06870
   """

-  def __init__(self, is_training, num_classes, fc_hyperparams_fn,
-               use_dropout, dropout_keep_prob):
+  def __init__(self,
+               is_training,
+               num_class_slots,
+               fc_hyperparams_fn,
+               use_dropout,
+               dropout_keep_prob):
     """Constructor.

     Args:
       is_training: Indicates whether the BoxPredictor is in training mode.
-      num_classes: number of classes. Note that num_classes *does not*
-        include the background category, so if groundtruth labels take values
-        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
-        assigned classification targets can range from {0,... K}).
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       fc_hyperparams_fn: A function to generate tf-slim arg_scope with
         hyperparameters for fully connected ops.
       use_dropout: Option to use dropout or not. Note that a single dropout
@@ -54,7 +56,7 @@ class MaskRCNNClassHead(head.Head):
     """
     super(MaskRCNNClassHead, self).__init__()
     self._is_training = is_training
-    self._num_classes = num_classes
+    self._num_class_slots = num_class_slots
     self._fc_hyperparams_fn = fc_hyperparams_fn
     self._use_dropout = use_dropout
     self._dropout_keep_prob = dropout_keep_prob
@@ -70,7 +72,7 @@ class MaskRCNNClassHead(head.Head):
     Returns:
       class_predictions_with_background: A float tensor of shape
-        [batch_size, 1, num_classes + 1] representing the class predictions for
+        [batch_size, 1, num_class_slots] representing the class predictions for
         the proposals.

     Raises:
@@ -91,11 +93,12 @@ class MaskRCNNClassHead(head.Head):
     with slim.arg_scope(self._fc_hyperparams_fn()):
       class_predictions_with_background = slim.fully_connected(
           flattened_roi_pooled_features,
-          self._num_classes + 1,
+          self._num_class_slots,
           activation_fn=None,
           scope='ClassPredictor')
     class_predictions_with_background = tf.reshape(
-        class_predictions_with_background, [-1, 1, self._num_classes + 1])
+        class_predictions_with_background,
+        [-1, 1, self._num_class_slots])
     return class_predictions_with_background
@@ -104,7 +107,7 @@ class ConvolutionalClassHead(head.Head):

   def __init__(self,
                is_training,
-               num_classes,
+               num_class_slots,
                use_dropout,
                dropout_keep_prob,
                kernel_size,
@@ -115,7 +118,8 @@ class ConvolutionalClassHead(head.Head):
     Args:
       is_training: Indicates whether the BoxPredictor is in training mode.
-      num_classes: Number of classes.
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       use_dropout: Option to use dropout or not. Note that a single dropout
         op is applied here prior to both box and class predictions, which stands
         in contrast to the ConvolutionalBoxPredictor below.
@@ -137,7 +141,7 @@ class ConvolutionalClassHead(head.Head):
     """
     super(ConvolutionalClassHead, self).__init__()
     self._is_training = is_training
-    self._num_classes = num_classes
+    self._num_class_slots = num_class_slots
     self._use_dropout = use_dropout
     self._dropout_keep_prob = dropout_keep_prob
     self._kernel_size = kernel_size
@@ -156,12 +160,10 @@ class ConvolutionalClassHead(head.Head):
     Returns:
       class_predictions_with_background: A float tensors of shape
-        [batch_size, num_anchors, num_classes + 1] representing the class
+        [batch_size, num_anchors, num_class_slots] representing the class
         predictions for the proposals.
     """
     net = features
-    # Add a slot for the background class.
-    num_class_slots = self._num_classes + 1
     if self._use_dropout:
       net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
     if self._use_depthwise:
@@ -171,7 +173,7 @@ class ConvolutionalClassHead(head.Head):
           rate=1, scope='ClassPredictor_depthwise')
       class_predictions_with_background = slim.conv2d(
           class_predictions_with_background,
-          num_predictions_per_location * num_class_slots, [1, 1],
+          num_predictions_per_location * self._num_class_slots, [1, 1],
           activation_fn=None,
           normalizer_fn=None,
           normalizer_params=None,
@@ -179,7 +181,7 @@ class ConvolutionalClassHead(head.Head):
     else:
       class_predictions_with_background = slim.conv2d(
           net,
-          num_predictions_per_location * num_class_slots,
+          num_predictions_per_location * self._num_class_slots,
           [self._kernel_size, self._kernel_size],
           activation_fn=None,
           normalizer_fn=None,
@@ -194,7 +196,8 @@ class ConvolutionalClassHead(head.Head):
     if batch_size is None:
       batch_size = tf.shape(features)[0]
     class_predictions_with_background = tf.reshape(
-        class_predictions_with_background, [batch_size, -1, num_class_slots])
+        class_predictions_with_background,
+        [batch_size, -1, self._num_class_slots])
     return class_predictions_with_background
@@ -208,7 +211,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
   """

   def __init__(self,
-               num_classes,
+               num_class_slots,
                kernel_size=3,
                class_prediction_bias_init=0.0,
                use_dropout=False,
@@ -218,10 +221,8 @@ class WeightSharedConvolutionalClassHead(head.Head):
     """Constructor.

     Args:
-      num_classes: number of classes. Note that num_classes *does not*
-        include the background category, so if groundtruth labels take values
-        in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
-        assigned classification targets can range from {0,... K}).
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       kernel_size: Size of final convolution kernel.
       class_prediction_bias_init: constant value to initialize bias of the last
         conv2d layer before class prediction.
@@ -233,7 +234,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
       as inputs and returns tensors).
     """
     super(WeightSharedConvolutionalClassHead, self).__init__()
-    self._num_classes = num_classes
+    self._num_class_slots = num_class_slots
     self._kernel_size = kernel_size
     self._class_prediction_bias_init = class_prediction_bias_init
     self._use_dropout = use_dropout
@@ -252,12 +253,10 @@ class WeightSharedConvolutionalClassHead(head.Head):
     Returns:
       class_predictions_with_background: A tensor of shape
-        [batch_size, num_anchors, num_classes + 1] representing the class
+        [batch_size, num_anchors, num_class_slots] representing the class
         predictions for the proposals.
     """
     class_predictions_net = features
-    num_class_slots = self._num_classes + 1
-    # Add a slot for the background class.
     if self._use_dropout:
       class_predictions_net = slim.dropout(
           class_predictions_net, keep_prob=self._dropout_keep_prob)
@@ -267,7 +266,7 @@ class WeightSharedConvolutionalClassHead(head.Head):
       conv_op = slim.conv2d
     class_predictions_with_background = conv_op(
         class_predictions_net,
-        num_predictions_per_location * num_class_slots,
+        num_predictions_per_location * self._num_class_slots,
         [self._kernel_size, self._kernel_size],
         activation_fn=None, stride=1, padding='SAME',
         normalizer_fn=None,
@@ -280,5 +279,6 @@ class WeightSharedConvolutionalClassHead(head.Head):
     class_predictions_with_background = self._score_converter_fn(
         class_predictions_with_background)
     class_predictions_with_background = tf.reshape(
-        class_predictions_with_background, [batch_size, -1, num_class_slots])
+        class_predictions_with_background,
+        [batch_size, -1, self._num_class_slots])
     return class_predictions_with_background
@@ -46,7 +46,7 @@ class MaskRCNNClassHeadTest(test_case.TestCase):
   def test_prediction_size(self):
     class_prediction_head = class_head.MaskRCNNClassHead(
         is_training=False,
-        num_classes=20,
+        num_class_slots=20,
         fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
         use_dropout=True,
         dropout_keep_prob=0.5)
@@ -54,7 +54,7 @@ class MaskRCNNClassHeadTest(test_case.TestCase):
         [64, 7, 7, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
     prediction = class_prediction_head.predict(
         features=roi_pooled_features, num_predictions_per_location=1)
-    self.assertAllEqual([64, 1, 21], prediction.get_shape().as_list())
+    self.assertAllEqual([64, 1, 20], prediction.get_shape().as_list())

 class ConvolutionalClassPredictorTest(test_case.TestCase):
@@ -80,7 +80,7 @@ class ConvolutionalClassPredictorTest(test_case.TestCase):
   def test_prediction_size(self):
     class_prediction_head = class_head.ConvolutionalClassHead(
         is_training=True,
-        num_classes=20,
+        num_class_slots=20,
         use_dropout=True,
         dropout_keep_prob=0.5,
         kernel_size=3)
@@ -89,7 +89,7 @@ class ConvolutionalClassPredictorTest(test_case.TestCase):
     class_predictions = class_prediction_head.predict(
         features=image_feature,
         num_predictions_per_location=1)
-    self.assertAllEqual([64, 323, 21],
+    self.assertAllEqual([64, 323, 20],
                         class_predictions.get_shape().as_list())
@@ -115,13 +115,13 @@ class WeightSharedConvolutionalClassPredictorTest(test_case.TestCase):
   def test_prediction_size(self):
     class_prediction_head = (
-        class_head.WeightSharedConvolutionalClassHead(num_classes=20))
+        class_head.WeightSharedConvolutionalClassHead(num_class_slots=20))
     image_feature = tf.random_uniform(
         [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
     class_predictions = class_prediction_head.predict(
         features=image_feature,
         num_predictions_per_location=1)
-    self.assertAllEqual([64, 323, 21], class_predictions.get_shape().as_list())
+    self.assertAllEqual([64, 323, 20], class_predictions.get_shape().as_list())

 if __name__ == '__main__':
...
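Note what the num_classes to num_class_slots refactor changes at the call site: the tests above now pass num_class_slots=20 and expect 20 output slots, where the old num_classes=20 produced 21 because the head silently appended a background slot. A minimal sketch of the caller-side conversion between the two conventions (variable names illustrative):

num_classes = 20          # Foreground classes only.
add_background_class = True

# Old API: the head computed num_classes + 1 internally.
# New API: the caller passes the total number of slots explicitly.
num_class_slots = num_classes + 1 if add_background_class else num_classes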
@@ -91,7 +91,7 @@ class ConvolutionalBoxHead(head.KerasHead):
           tf.keras.layers.Conv2D(
               num_predictions_per_location * self._box_code_size, [1, 1],
               name='BoxEncodingPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))
     else:
       self._box_encoder_layers.append(
           tf.keras.layers.Conv2D(
@@ -99,7 +99,7 @@ class ConvolutionalBoxHead(head.KerasHead):
               [self._kernel_size, self._kernel_size],
               padding='SAME',
               name='BoxEncodingPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))

   def _predict(self, features):
     """Predicts boxes.
...
@@ -29,7 +29,7 @@ class ConvolutionalClassHead(head.KerasHead):

   def __init__(self,
                is_training,
-               num_classes,
+               num_class_slots,
                use_dropout,
                dropout_keep_prob,
                kernel_size,
@@ -43,7 +43,8 @@ class ConvolutionalClassHead(head.KerasHead):
     Args:
       is_training: Indicates whether the BoxPredictor is in training mode.
-      num_classes: Number of classes.
+      num_class_slots: number of class slots. Note that num_class_slots may or
+        may not include an implicit background category.
       use_dropout: Option to use dropout or not. Note that a single dropout
         op is applied here prior to both box and class predictions, which stands
         in contrast to the ConvolutionalBoxPredictor below.
@@ -73,13 +74,12 @@ class ConvolutionalClassHead(head.KerasHead):
     """
     super(ConvolutionalClassHead, self).__init__(name=name)
     self._is_training = is_training
-    self._num_classes = num_classes
     self._use_dropout = use_dropout
     self._dropout_keep_prob = dropout_keep_prob
     self._kernel_size = kernel_size
     self._class_prediction_bias_init = class_prediction_bias_init
     self._use_depthwise = use_depthwise
-    self._num_class_slots = self._num_classes + 1
+    self._num_class_slots = num_class_slots
     self._class_predictor_layers = []
@@ -110,7 +110,7 @@ class ConvolutionalClassHead(head.KerasHead):
           tf.keras.layers.Conv2D(
               num_predictions_per_location * self._num_class_slots, [1, 1],
               name='ClassPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))
     else:
       self._class_predictor_layers.append(
           tf.keras.layers.Conv2D(
@@ -120,7 +120,7 @@ class ConvolutionalClassHead(head.KerasHead):
               name='ClassPredictor',
               bias_initializer=tf.constant_initializer(
                   self._class_prediction_bias_init),
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))

   def _predict(self, features):
     """Predicts boxes.
@@ -131,7 +131,7 @@ class ConvolutionalClassHead(head.KerasHead):
     Returns:
       class_predictions_with_background: A float tensor of shape
-        [batch_size, num_anchors, num_classes + 1] representing the class
+        [batch_size, num_anchors, num_class_slots] representing the class
         predictions for the proposals.
     """
     # Add a slot for the background class.
...
@@ -45,7 +45,7 @@ class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
     conv_hyperparams = self._build_conv_hyperparams()
     class_prediction_head = keras_class_head.ConvolutionalClassHead(
         is_training=True,
-        num_classes=20,
+        num_class_slots=20,
         use_dropout=True,
         dropout_keep_prob=0.5,
         kernel_size=3,
@@ -56,7 +56,7 @@ class ConvolutionalKerasClassPredictorTest(test_case.TestCase):
     image_feature = tf.random_uniform(
         [64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
     class_predictions = class_prediction_head(image_feature,)
-    self.assertAllEqual([64, 323, 21],
+    self.assertAllEqual([64, 323, 20],
                         class_predictions.get_shape().as_list())

 # TODO(kaftan): Remove conditional after CMLE moves to TF 1.10
...
@@ -124,7 +124,7 @@ class ConvolutionalMaskHead(head.KerasHead):
           tf.keras.layers.Conv2D(
               num_predictions_per_location * num_mask_channels, [1, 1],
               name='MaskPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))
     else:
       self._mask_predictor_layers.append(
           tf.keras.layers.Conv2D(
@@ -132,7 +132,7 @@ class ConvolutionalMaskHead(head.KerasHead):
               [self._kernel_size, self._kernel_size],
               padding='SAME',
               name='MaskPredictor',
-              **conv_hyperparams.params(activation=None)))
+              **conv_hyperparams.params(use_bias=True)))

   def _predict(self, features):
     """Predicts boxes.
...
@@ -23,6 +23,7 @@ import math
 import tensorflow as tf

 from object_detection.predictors.heads import head
+from object_detection.utils import ops

 slim = tf.contrib.slim
@@ -41,7 +42,8 @@ class MaskRCNNMaskHead(head.Head):
                mask_width=14,
                mask_prediction_num_conv_layers=2,
                mask_prediction_conv_depth=256,
-               masks_are_class_agnostic=False):
+               masks_are_class_agnostic=False,
+               convolve_then_upsample=False):
     """Constructor.

     Args:
@@ -62,6 +64,10 @@ class MaskRCNNMaskHead(head.Head):
         image features.
       masks_are_class_agnostic: Boolean determining if the mask-head is
         class-agnostic or not.
+      convolve_then_upsample: Whether to apply convolutions on mask features
+        before upsampling using nearest neighbor resizing. Otherwise, mask
+        features are resized to [`mask_height`, `mask_width`] using bilinear
+        resizing before applying convolutions.

     Raises:
       ValueError: conv_hyperparams_fn is None.
@@ -74,6 +80,7 @@ class MaskRCNNMaskHead(head.Head):
     self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
     self._mask_prediction_conv_depth = mask_prediction_conv_depth
     self._masks_are_class_agnostic = masks_are_class_agnostic
+    self._convolve_then_upsample = convolve_then_upsample
     if conv_hyperparams_fn is None:
       raise ValueError('conv_hyperparams_fn is None.')
@@ -135,17 +142,30 @@ class MaskRCNNMaskHead(head.Head):
     num_conv_channels = self._get_mask_predictor_conv_depth(
         num_feature_channels, self._num_classes)
     with slim.arg_scope(self._conv_hyperparams_fn()):
-      upsampled_features = tf.image.resize_bilinear(
-          features, [self._mask_height, self._mask_width],
-          align_corners=True)
+      if not self._convolve_then_upsample:
+        features = tf.image.resize_bilinear(
+            features, [self._mask_height, self._mask_width],
+            align_corners=True)
       for _ in range(self._mask_prediction_num_conv_layers - 1):
-        upsampled_features = slim.conv2d(
-            upsampled_features,
+        features = slim.conv2d(
+            features,
+            num_outputs=num_conv_channels,
+            kernel_size=[3, 3])
+      if self._convolve_then_upsample:
+        # Replace Transposed Convolution with a Nearest Neighbor upsampling step
+        # followed by 3x3 convolution.
+        height_scale = self._mask_height / features.shape[1].value
+        width_scale = self._mask_width / features.shape[2].value
+        features = ops.nearest_neighbor_upsampling(
+            features, height_scale=height_scale, width_scale=width_scale)
+        features = slim.conv2d(
+            features,
             num_outputs=num_conv_channels,
             kernel_size=[3, 3])
       num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
       mask_predictions = slim.conv2d(
-          upsampled_features,
+          features,
           num_outputs=num_masks,
           activation_fn=None,
           normalizer_fn=None,
...
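Here ops.nearest_neighbor_upsampling stands in for a transposed convolution: the features are resized by pixel replication, then a 3x3 convolution smooths the result. A minimal sketch of integer-factor nearest neighbor upsampling in TF1-style code (this is not the library's exact implementation):

import tensorflow as tf

def nearest_neighbor_upsample(features, scale):
  """Upsamples a [batch, h, w, c] tensor by an integer factor.

  Each input pixel is replicated into a scale x scale block, which is
  exactly what nearest neighbor resizing does for integer factors.
  """
  shape = tf.shape(features)
  batch, height, width = shape[0], shape[1], shape[2]
  channels = features.get_shape().as_list()[3]
  # Insert singleton axes, tile them, then merge back into height/width.
  out = tf.reshape(features, [batch, height, 1, width, 1, channels])
  out = tf.tile(out, [1, 1, scale, 1, scale, 1])
  return tf.reshape(out, [batch, height * scale, width * scale, channels])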
@@ -58,6 +58,22 @@ class MaskRCNNMaskHeadTest(test_case.TestCase):
         features=roi_pooled_features, num_predictions_per_location=1)
     self.assertAllEqual([64, 1, 20, 14, 14], prediction.get_shape().as_list())

+  def test_prediction_size_with_convolve_then_upsample(self):
+    mask_prediction_head = mask_head.MaskRCNNMaskHead(
+        num_classes=20,
+        conv_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
+        mask_height=28,
+        mask_width=28,
+        mask_prediction_num_conv_layers=2,
+        mask_prediction_conv_depth=256,
+        masks_are_class_agnostic=True,
+        convolve_then_upsample=True)
+    roi_pooled_features = tf.random_uniform(
+        [64, 14, 14, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
+    prediction = mask_prediction_head.predict(
+        features=roi_pooled_features, num_predictions_per_location=1)
+    self.assertAllEqual([64, 1, 1, 28, 28], prediction.get_shape().as_list())

 class ConvolutionalMaskPredictorTest(test_case.TestCase):
...
@@ -138,6 +138,7 @@ message WeightSharedConvolutionalBoxPredictor {

 // TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
 // head easily.
+// Next id: 15
 message MaskRCNNBoxPredictor {
   // Hyperparameters for fully connected ops used in the box predictor.
   optional Hyperparams fc_hyperparams = 1;
@@ -178,6 +179,12 @@ message MaskRCNNBoxPredictor {
   // Whether to use one box for all classes rather than a different box for each
   // class.
   optional bool share_box_across_classes = 13 [default = false];
+
+  // Whether to apply convolutions on mask features before upsampling using
+  // nearest neighbor resizing.
+  // By default, mask features are resized to [`mask_height`, `mask_width`]
+  // before applying convolutions and predicting masks.
+  optional bool convolve_then_upsample_masks = 14 [default = false];
 }

 message RfcnBoxPredictor {
...
@@ -164,6 +164,10 @@ message FasterRcnn {
   // Whether the masks present in groundtruth should be resized in the model to
   // match the image size.
   optional bool resize_masks = 36 [default = true];
+
+  // If True, uses implementation of ops with static shape guarantees when
+  // running evaluation (specifically not is_training if False).
+  optional bool use_static_shapes_for_eval = 37 [default = false];
 }
...
@@ -155,6 +155,9 @@ message RandomCropImage {
   // value, it is removed from the new image.
   optional float overlap_thresh = 6 [default=0.3];

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability of keeping the original image.
   optional float random_coef = 7 [default=0.0];
 }
@@ -194,6 +197,9 @@ message RandomCropPadImage {
   // value, it is removed from the new image.
   optional float overlap_thresh = 6 [default=0.3];

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 11 [default=true];
+
   // Probability of keeping the original image during the crop operation.
   optional float random_coef = 7 [default=0.0];
@@ -217,6 +223,9 @@ message RandomCropToAspectRatio {
   // ratio between a cropped bounding box and the original is less than this
   // value, it is removed from the new image.
   optional float overlap_thresh = 2 [default=0.3];
+
+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 3 [default=true];
 }

 // Randomly adds black square patches to an image.
@@ -285,6 +294,9 @@ message SSDRandomCropOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
 }
@@ -315,6 +327,9 @@ message SSDRandomCropPadOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 13 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
@@ -353,6 +368,9 @@ message SSDRandomCropFixedAspectRatioOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
 }
@@ -387,6 +405,9 @@ message SSDRandomCropPadFixedAspectRatioOperation {
   // Cropped box area ratio must be above this threhold to be kept.
   optional float overlap_thresh = 6;

+  // Whether to clip the boxes to the cropped image.
+  optional bool clip_boxes = 8 [default=true];
+
   // Probability a crop operation is skipped.
   optional float random_coef = 7;
 }
...
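The new clip_boxes fields become settable from a pipeline config once the protos are regenerated. A hedged sketch of toggling the flag programmatically, assuming the usual generated preprocessor_pb2 bindings expose RandomCropImage under a PreprocessingStep oneof:

from object_detection.protos import preprocessor_pb2

# Configure a random crop that does not clip boxes to the crop window.
step = preprocessor_pb2.PreprocessingStep()
step.random_crop_image.clip_boxes = False
step.random_crop_image.overlap_thresh = 0.3
print(step)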