Commit d305396d authored by A. Unique TensorFlower

ShapeMask Heads and Losses.

PiperOrigin-RevId: 312624281
parent eb49ae73
@@ -29,6 +29,11 @@ def evaluator_generator(params):
   elif params.type == 'box_and_mask':
     evaluator = coco_evaluator.COCOEvaluator(
         annotation_file=params.val_json_file, include_mask=True)
+  elif params.type == 'shapemask_box_and_mask':
+    evaluator = coco_evaluator.ShapeMaskCOCOEvaluator(
+        mask_eval_class=params.mask_eval_class,
+        annotation_file=params.val_json_file, include_mask=True)
   else:
     raise ValueError('Evaluator %s is not supported.' % params.type)
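For orientation, here is a minimal sketch of how the extended factory might be driven. The `EvalConfig` class and its field values are hypothetical stand-ins for the project's params object, not part of the commit:

```python
# Hypothetical params object mirroring the fields the factory reads above.
from dataclasses import dataclass

@dataclass
class EvalConfig:
  type: str = 'shapemask_box_and_mask'
  val_json_file: str = 'instances_val2017.json'  # assumed annotation path
  mask_eval_class: str = 'all'                   # assumed default value

# evaluator_generator(EvalConfig()) would now return a
# coco_evaluator.ShapeMaskCOCOEvaluator instead of raising ValueError.
```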
@@ -85,8 +85,8 @@ def retinanet_head_generator(params):
 def rpn_head_generator(params):
-  head_params = params.rpn_head
   """Generator function for RPN head architecture."""
+  head_params = params.rpn_head
   return heads.RpnHead(
       params.architecture.min_level,
       params.architecture.max_level,
@@ -126,3 +126,38 @@ def mask_rcnn_head_generator(params):
       params.norm_activation.activation,
       head_params.use_batch_norm,
       norm_activation=norm_activation_generator(params.norm_activation))
+
+
+def shapeprior_head_generator(params):
+  """Generator function for shape prior head architecture."""
+  head_params = params.shapemask_head
+  return heads.ShapemaskPriorHead(
+      params.architecture.num_classes,
+      head_params.num_downsample_channels,
+      head_params.mask_crop_size,
+      head_params.use_category_for_mask,
+      head_params.shape_prior_path)
+
+
+def coarsemask_head_generator(params):
+  """Generator function for ShapeMask coarse mask head architecture."""
+  head_params = params.shapemask_head
+  return heads.ShapemaskCoarsemaskHead(
+      params.architecture.num_classes,
+      head_params.num_downsample_channels,
+      head_params.mask_crop_size,
+      head_params.use_category_for_mask,
+      head_params.num_convs,
+      norm_activation=norm_activation_generator(params.norm_activation))
+
+
+def finemask_head_generator(params):
+  """Generator function for ShapeMask fine mask head architecture."""
+  head_params = params.shapemask_head
+  return heads.ShapemaskFinemaskHead(
+      params.architecture.num_classes,
+      head_params.num_downsample_channels,
+      head_params.mask_crop_size,
+      head_params.use_category_for_mask,
+      head_params.num_convs,
+      head_params.upsample_factor)
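Taken together, the three new generators build the full ShapeMask mask branch. A sketch of how they might be chained; `params` is the config object consumed above, while `fpn_feats`, `boxes`, `outer_boxes`, and `classes` are hypothetical tensors from the detection pipeline:

```python
# Sketch only: chaining the three ShapeMask heads produced by these factories.
prior_head = shapeprior_head_generator(params)
coarse_head = coarsemask_head_generator(params)
fine_head = finemask_head_generator(params)

# Call signatures follow the head definitions in heads.py below.
instance_feats, priors = prior_head(
    fpn_feats, boxes, outer_boxes, classes, is_training=True)
coarse_logits = coarse_head(instance_feats, priors, classes, is_training=True)
fine_logits = fine_head(instance_feats, coarse_logits, classes,
                        is_training=True)
```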
@@ -19,9 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
-import pickle
 
-from absl import logging
 import numpy as np
 import tensorflow as tf
 from tensorflow.python.keras import backend
@@ -56,6 +54,7 @@ class RpnHead(tf.keras.layers.Layer):
         intermediate conv layers.
       use_separable_conv: `bool`, indicating whether the separable conv layers
         is used.
+      activation: activation function. Support 'relu' and 'swish'.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
       norm_activation: an operation that includes a normalization layer
         followed by an optional activation layer.
@@ -165,6 +164,7 @@ class FastrcnnHead(tf.keras.layers.Layer):
         predictions.
       fc_dims: `int` number that represents the number of dimension of the FC
         layers.
+      activation: activation function. Support 'relu' and 'swish'.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
       norm_activation: an operation that includes a normalization layer
         followed by an optional activation layer.
@@ -296,6 +296,7 @@ class MaskrcnnHead(tf.keras.layers.Layer):
         intermediate conv layers.
       use_separable_conv: `bool`, indicating whether the separable conv layers
         is used.
+      activation: activation function. Support 'relu' and 'swish'.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
       norm_activation: an operation that includes a normalization layer
         followed by an optional activation layer.
@@ -566,8 +567,8 @@ class RetinanetHead(object):
       with self._class_name_scope:
         for i in range(self._num_convs):
           features = self._class_conv[i](features)
-          # The convolution layers in the class net are shared among all levels, but
-          # each level has its batch normlization to capture the statistical
+          # The convolution layers in the class net are shared among all levels,
+          # but each level has its batch normalization to capture the statistical
           # difference among different levels.
           name = self._class_net_batch_norm_name(i, level)
           features = self._class_norm_activation[name](
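The comment above describes a standard RetinaNet trick: convolution kernels shared across pyramid levels, with a separate batch norm per level. A minimal standalone sketch of that pattern (all names invented for illustration):

```python
import tensorflow as tf

num_convs, num_levels, channels = 2, 3, 8
convs = [tf.keras.layers.Conv2D(channels, 3, padding='same',
                                name='class-conv-%d' % i)
         for i in range(num_convs)]
# One batch norm per (conv, level) pair: feature statistics differ per level.
norms = {(i, l): tf.keras.layers.BatchNormalization(
             name='class-bn-%d-l%d' % (i, l))
         for i in range(num_convs) for l in range(num_levels)}

feats = {l: tf.random.normal([1, 32 // 2**l, 32 // 2**l, channels])
         for l in range(num_levels)}
for l, x in feats.items():
  for i in range(num_convs):
    x = convs[i](x)        # weights shared across all levels
    x = norms[(i, l)](x)   # level-specific normalization
  feats[l] = x
```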
@@ -601,12 +602,7 @@ class ShapemaskPriorHead(object):
                num_downsample_channels,
                mask_crop_size,
                use_category_for_mask,
-               num_of_instances,
-               min_mask_level,
-               max_mask_level,
-               num_clusters,
-               temperature,
-               shape_prior_path=None):
+               shape_prior_path):
     """Initialize params to build RetinaNet head.
 
     Args:
@@ -614,30 +610,18 @@ class ShapemaskPriorHead(object):
       num_downsample_channels: number of channels in mask branch.
       mask_crop_size: feature crop size.
       use_category_for_mask: use class information in mask branch.
-      num_of_instances: number of instances to sample in training time.
-      min_mask_level: minimum FPN level to crop mask feature from.
-      max_mask_level: maximum FPN level to crop mask feature from.
-      num_clusters: number of clusters to use in K-Means.
-      temperature: the temperature for shape prior learning.
       shape_prior_path: the path to load shape priors.
     """
-    self._mask_num_classes = num_classes
+    self._mask_num_classes = num_classes if use_category_for_mask else 1
     self._num_downsample_channels = num_downsample_channels
     self._mask_crop_size = mask_crop_size
-    self._use_category_for_mask = use_category_for_mask
-    self._num_of_instances = num_of_instances
-    self._min_mask_level = min_mask_level
-    self._max_mask_level = max_mask_level
-    self._num_clusters = num_clusters
-    self._temperature = temperature
     self._shape_prior_path = shape_prior_path
+    self._use_category_for_mask = use_category_for_mask
+
+    self._shape_prior_fc = tf.keras.layers.Dense(
+        self._num_downsample_channels, name='shape-prior-fc')
 
-  def __call__(self,
-               fpn_features,
-               boxes,
-               outer_boxes,
-               classes,
-               is_training=None):
+  def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training):
     """Generate the detection priors from the box detections and FPN features.
 
     This corresponds to the Fig. 4 of the ShapeMask paper at
@@ -654,221 +638,96 @@ class ShapemaskPriorHead(object):
       is_training: training mode or not.
 
     Returns:
-      crop_features: a float Tensor of shape [batch_size * num_instances,
+      instance_features: a float Tensor of shape [batch_size * num_instances,
         mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
         instance feature crop.
       detection_priors: A float Tensor of shape [batch_size * num_instances,
         mask_size, mask_size, 1].
     """
-    with backend.get_graph().as_default():
-      # loads class specific or agnostic shape priors
-      if self._shape_prior_path:
-        if self._use_category_for_mask:
-          fid = tf.io.gfile.GFile(self._shape_prior_path, 'rb')
-          # The encoding='bytes' options is for incompatibility between python2
-          # and python3 pickle.
-          class_tups = pickle.load(fid, encoding='bytes')
-          max_class_id = class_tups[-1][0] + 1
-          class_masks = np.zeros((max_class_id, self._num_clusters,
-                                  self._mask_crop_size, self._mask_crop_size),
-                                 dtype=np.float32)
-          for cls_id, _, cls_mask in class_tups:
-            assert cls_mask.shape == (self._num_clusters,
-                                      self._mask_crop_size**2)
-            class_masks[cls_id] = cls_mask.reshape(self._num_clusters,
-                                                   self._mask_crop_size,
-                                                   self._mask_crop_size)
-          self.class_priors = tf.convert_to_tensor(
-              value=class_masks, dtype=tf.float32)
-        else:
-          npy_path = tf.io.gfile.GFile(self._shape_prior_path)
-          class_np_masks = np.load(npy_path)
-          assert class_np_masks.shape == (
-              self._num_clusters, self._mask_crop_size,
-              self._mask_crop_size), 'Invalid priors!!!'
-          self.class_priors = tf.convert_to_tensor(
-              value=class_np_masks, dtype=tf.float32)
-      else:
-        self.class_priors = tf.zeros(
-            [self._num_clusters, self._mask_crop_size, self._mask_crop_size],
-            tf.float32)
-
-      batch_size = boxes.get_shape()[0]
-      min_level_shape = fpn_features[self._min_mask_level].get_shape().as_list()
-      self._max_feature_size = min_level_shape[1]
-      detection_prior_levels = self._compute_box_levels(boxes)
-      level_outer_boxes = outer_boxes / tf.pow(
-          2., tf.expand_dims(detection_prior_levels, -1))
-      detection_prior_levels = tf.cast(detection_prior_levels, tf.int32)
-      uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
-          tf.ones([
-              batch_size, self._num_of_instances, self._mask_crop_size,
-              self._mask_crop_size
-          ], tf.float32), boxes, outer_boxes, self._mask_crop_size)
-
-      # Prepare crop features.
-      multi_level_features = self._get_multilevel_features(fpn_features)
-      crop_features = spatial_transform_ops.single_level_feature_crop(
-          multi_level_features, level_outer_boxes, detection_prior_levels,
-          self._min_mask_level, self._mask_crop_size)
-
-      # Predict and fuse shape priors.
-      shape_weights = self._classify_and_fuse_detection_priors(
-          uniform_priors, classes, crop_features)
-      fused_shape_priors = self._fuse_priors(shape_weights, classes)
-      fused_shape_priors = tf.reshape(fused_shape_priors, [
-          batch_size, self._num_of_instances, self._mask_crop_size,
-          self._mask_crop_size
-      ])
-      predicted_detection_priors = spatial_transform_ops.crop_mask_in_target_box(
-          fused_shape_priors, boxes, outer_boxes, self._mask_crop_size)
-      predicted_detection_priors = tf.reshape(
-          predicted_detection_priors,
-          [-1, self._mask_crop_size, self._mask_crop_size, 1])
-
-      return crop_features, predicted_detection_priors
+    with backend.get_graph().as_default(), tf.name_scope('prior_mask'):
+      batch_size, num_instances, _ = boxes.get_shape().as_list()
+      outer_boxes = tf.cast(outer_boxes, tf.float32)
+      boxes = tf.cast(boxes, tf.float32)
+      instance_features = spatial_transform_ops.multilevel_crop_and_resize(
+          fpn_features, outer_boxes, output_size=self._mask_crop_size)
+      instance_features = self._shape_prior_fc(instance_features)
+
+      shape_priors = self._get_priors()
+
+      # Get uniform priors for each outer box.
+      uniform_priors = tf.ones([batch_size, num_instances,
+                                self._mask_crop_size, self._mask_crop_size])
+      uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
+          uniform_priors, boxes, outer_boxes, self._mask_crop_size)
+
+      # Classify shape priors using uniform priors + instance features.
+      prior_distribution = self._classify_shape_priors(
+          tf.cast(instance_features, tf.float32), uniform_priors, classes)
+
+      instance_priors = tf.gather(shape_priors, classes)
+      instance_priors *= tf.expand_dims(tf.expand_dims(
+          tf.cast(prior_distribution, tf.float32), axis=-1), axis=-1)
+      instance_priors = tf.reduce_sum(instance_priors, axis=2)
+      detection_priors = spatial_transform_ops.crop_mask_in_target_box(
+          instance_priors, boxes, outer_boxes, self._mask_crop_size)
+
+      return instance_features, detection_priors
+
+  def _get_priors(self):
+    """Load shape priors from file."""
+    # loads class specific or agnostic shape priors
+    if self._shape_prior_path:
+      # Priors are loaded into shape [mask_num_classes, num_clusters, 32, 32].
+      priors = np.load(tf.io.gfile.GFile(self._shape_prior_path, 'rb'))
+      priors = tf.convert_to_tensor(priors, dtype=tf.float32)
+      self._num_clusters = priors.get_shape().as_list()[1]
+    else:
+      # If prior path does not exist, do not use priors, i.e., priors equal to
+      # uniform empty 32x32 patch.
+      self._num_clusters = 1
+      priors = tf.zeros([self._mask_num_classes, self._num_clusters,
+                         self._mask_crop_size, self._mask_crop_size])
+    return priors
 
-  def _get_multilevel_features(self, fpn_features):
-    """Get multilevel features from FPN feature dictionary into one tensor.
-
-    Args:
-      fpn_features: a dictionary of FPN features.
-
-    Returns:
-      features: a float tensor of shape [batch_size, num_levels,
-        max_feature_size, max_feature_size, num_downsample_channels].
-    """
-    # TODO(yeqing): Recover reuse=tf.AUTO_REUSE logic.
-    with tf.name_scope('masknet'):
-      mask_feats = {}
-      # Reduce the feature dimension at each FPN level by convolution.
-      for feat_level in range(self._min_mask_level, self._max_mask_level + 1):
-        mask_feats[feat_level] = tf.keras.layers.Conv2D(
-            self._num_downsample_channels,
-            kernel_size=(1, 1),
-            bias_initializer=tf.zeros_initializer(),
-            kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
-            padding='same',
-            name='mask-downsample')(
-                fpn_features[feat_level])
-      # Concat features through padding to the max size.
-      features = [mask_feats[self._min_mask_level]]
-      for feat_level in range(self._min_mask_level + 1,
-                              self._max_mask_level + 1):
-        features.append(tf.image.pad_to_bounding_box(
-            mask_feats[feat_level], 0, 0,
-            self._max_feature_size, self._max_feature_size))
-      features = tf.stack(features, axis=1)
-    return features
-
-  def _compute_box_levels(self, boxes):
-    """Compute the box FPN levels.
-
-    Args:
-      boxes: a float tensor of shape [batch_size, num_instances, 4].
-
-    Returns:
-      levels: a int tensor of shape [batch_size, num_instances].
-    """
-    object_sizes = tf.stack([
-        boxes[:, :, 2] - boxes[:, :, 0],
-        boxes[:, :, 3] - boxes[:, :, 1],
-    ], axis=2)
-    object_sizes = tf.reduce_max(input_tensor=object_sizes, axis=2)
-    ratios = object_sizes / self._mask_crop_size
-    levels = tf.math.ceil(tf.math.log(ratios) / tf.math.log(2.))
-    levels = tf.maximum(tf.minimum(levels, self._max_mask_level),
-                        self._min_mask_level)
-    return levels
-
-  def _classify_and_fuse_detection_priors(self, uniform_priors,
-                                          detection_prior_classes,
-                                          crop_features):
+  def _classify_shape_priors(self, features, uniform_priors, classes):
     """Classify the uniform prior by predicting the shape modes.
 
     Classify the object crop features into K modes of the clusters for each
     category.
 
     Args:
+      features: A float Tensor of shape [batch_size, num_instances,
+        mask_size, mask_size, num_channels].
       uniform_priors: A float Tensor of shape [batch_size, num_instances,
         mask_size, mask_size] representing the uniform detection priors.
-      detection_prior_classes: A int Tensor of shape [batch_size, num_instances]
+      classes: A int Tensor of shape [batch_size, num_instances]
         of detection class ids.
-      crop_features: A float Tensor of shape [batch_size * num_instances,
-        mask_size, mask_size, num_channels].
 
     Returns:
-      shape_weights: A float Tensor of shape
-        [batch_size * num_instances, num_clusters] representing the classifier
+      prior_distribution: A float Tensor of shape
+        [batch_size, num_instances, num_clusters] representing the classifier
        output probability over all possible shapes.
     """
-    location_detection_priors = tf.reshape(
-        uniform_priors, [-1, self._mask_crop_size, self._mask_crop_size, 1])
-    # Generate image embedding to shape.
-    fused_shape_features = crop_features * location_detection_priors
-    shape_embedding = tf.reduce_mean(
-        input_tensor=fused_shape_features, axis=(1, 2))
-    if not self._use_category_for_mask:
-      # TODO(weicheng) use custom op for performance
-      shape_logits = tf.keras.layers.Dense(
-          self._num_clusters,
-          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01))(
-              shape_embedding)
-      shape_logits = tf.reshape(shape_logits,
-                                [-1, self._num_clusters]) / self._temperature
-      shape_weights = tf.nn.softmax(shape_logits, name='shape_prior_weights')
-    else:
-      shape_logits = tf.keras.layers.Dense(
-          self._mask_num_classes * self._num_clusters,
-          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01))(
-              shape_embedding)
-      shape_logits = tf.reshape(
-          shape_logits, [-1, self._mask_num_classes, self._num_clusters])
-      training_classes = tf.reshape(detection_prior_classes, [-1])
-      class_idx = tf.stack(
-          [tf.range(tf.size(input=training_classes)), training_classes - 1],
-          axis=1)
-      shape_logits = tf.gather_nd(shape_logits, class_idx) / self._temperature
-      shape_weights = tf.nn.softmax(shape_logits, name='shape_prior_weights')
-    return shape_weights
-
-  def _fuse_priors(self, shape_weights, detection_prior_classes):
-    """Fuse shape priors by the predicted shape probability.
-
-    Args:
-      shape_weights: A float Tensor of shape [batch_size * num_instances,
-        num_clusters] of predicted shape probability distribution.
-      detection_prior_classes: A int Tensor of shape [batch_size, num_instances]
-        of detection class ids.
-
-    Returns:
-      detection_priors: A float Tensor of shape [batch_size * num_instances,
-        mask_size, mask_size, 1].
-    """
-    if self._use_category_for_mask:
-      object_class_priors = tf.gather(
-          self.class_priors, detection_prior_classes)
-    else:
-      num_batch_instances = shape_weights.get_shape()[0]
-      object_class_priors = tf.tile(
-          tf.expand_dims(self.class_priors, 0),
-          [num_batch_instances, 1, 1, 1])
-    vector_class_priors = tf.reshape(
-        object_class_priors,
-        [-1, self._num_clusters,
-         self._mask_crop_size * self._mask_crop_size])
-    detection_priors = tf.matmul(
-        tf.expand_dims(shape_weights, 1), vector_class_priors)[:, 0, :]
-    detection_priors = tf.reshape(
-        detection_priors, [-1, self._mask_crop_size, self._mask_crop_size, 1])
-    return detection_priors
+    batch_size, num_instances, _, _, _ = features.get_shape().as_list()
+    features *= tf.expand_dims(uniform_priors, axis=-1)
+    # Reduce spatial dimension of features. The features have shape
+    # [batch_size, num_instances, num_channels].
+    features = tf.reduce_mean(features, axis=(2, 3))
+    logits = tf.keras.layers.Dense(
+        self._mask_num_classes * self._num_clusters,
+        kernel_initializer=tf.random_normal_initializer(stddev=0.01))(features)
+    logits = tf.reshape(logits,
+                        [batch_size, num_instances,
+                         self._mask_num_classes, self._num_clusters])
+    if self._use_category_for_mask:
+      logits = tf.gather(logits, tf.expand_dims(classes, axis=-1),
+                         batch_dims=2)
+      logits = tf.squeeze(logits, axis=2)
+    else:
+      logits = logits[:, :, 0, :]
+
+    distribution = tf.nn.softmax(logits, name='shape_prior_weights')
+    return distribution
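The core of the new prior head is the fusion step: gather each instance's class-specific cluster bank, then average the K cluster prototypes weighted by the predicted distribution. A standalone toy sketch with invented shapes, not from the commit:

```python
import tensorflow as tf

batch, n_inst, n_cls, n_clusters, crop = 2, 3, 5, 4, 32
shape_priors = tf.random.uniform([n_cls, n_clusters, crop, crop])
classes = tf.random.uniform([batch, n_inst], maxval=n_cls, dtype=tf.int32)
prior_distribution = tf.nn.softmax(
    tf.random.normal([batch, n_inst, n_clusters]))  # classifier output

# Pick each instance's class-specific cluster bank ...
instance_priors = tf.gather(shape_priors, classes)         # [B, N, K, H, W]
# ... then take the distribution-weighted average over the K prototypes.
weights = prior_distribution[..., tf.newaxis, tf.newaxis]  # [B, N, K, 1, 1]
fused = tf.reduce_sum(instance_priors * weights, axis=2)   # [B, N, H, W]
```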
 
 
 class ShapemaskCoarsemaskHead(object):
@@ -879,7 +738,8 @@ class ShapemaskCoarsemaskHead(object):
                num_downsample_channels,
                mask_crop_size,
                use_category_for_mask,
-               num_convs):
+               num_convs,
+               norm_activation=nn_ops.norm_activation_builder()):
     """Initialize params to build ShapeMask coarse and fine prediction head.
 
     Args:
@@ -889,118 +749,106 @@ class ShapemaskCoarsemaskHead(object):
       use_category_for_mask: use class information in mask branch.
       num_convs: `int` number of stacked convolution before the last prediction
         layer.
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
     """
-    self._mask_num_classes = num_classes
+    self._mask_num_classes = num_classes if use_category_for_mask else 1
+    self._use_category_for_mask = use_category_for_mask
     self._num_downsample_channels = num_downsample_channels
     self._mask_crop_size = mask_crop_size
-    self._use_category_for_mask = use_category_for_mask
     self._num_convs = num_convs
-    if not use_category_for_mask:
-      assert num_classes == 1
+    self._norm_activation = norm_activation
+
+    self._coarse_mask_fc = tf.keras.layers.Dense(
+        self._num_downsample_channels, name='coarse-mask-fc')
+
+    self._class_conv = []
+    self._class_norm_activation = []
+
+    for i in range(self._num_convs):
+      self._class_conv.append(tf.keras.layers.Conv2D(
+          self._num_downsample_channels,
+          kernel_size=(3, 3),
+          bias_initializer=tf.zeros_initializer(),
+          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+          padding='same',
+          name='coarse-mask-class-%d' % i))
+
+      self._class_norm_activation.append(
+          norm_activation(name='coarse-mask-class-%d-bn' % i))
+
+    self._class_predict = tf.keras.layers.Conv2D(
+        self._mask_num_classes,
+        kernel_size=(1, 1),
+        # Focal loss bias initialization to have foreground 0.01 probability.
+        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        padding='same',
+        name='coarse-mask-class-predict')
 
-  def __call__(self,
-               crop_features,
-               detection_priors,
-               inst_classes,
-               is_training=None):
+  def __call__(self, features, detection_priors, classes, is_training):
     """Generate instance masks from FPN features and detection priors.
 
     This corresponds to the Fig. 5-6 of the ShapeMask paper at
     https://arxiv.org/pdf/1904.03239.pdf
 
     Args:
-      crop_features: a float Tensor of shape [batch_size * num_instances,
+      features: a float Tensor of shape [batch_size, num_instances,
         mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
         instance feature crop.
-      detection_priors: a float Tensor of shape [batch_size * num_instances,
+      detection_priors: a float Tensor of shape [batch_size, num_instances,
         mask_crop_size, mask_crop_size, 1]. This is the detection prior for
         the instance.
-      inst_classes: a int Tensor of shape [batch_size, num_instances]
+      classes: a int Tensor of shape [batch_size, num_instances]
         of instance classes.
       is_training: a bool indicating whether in training mode.
 
     Returns:
       mask_outputs: instance mask prediction as a float Tensor of shape
-        [batch_size * num_instances, mask_size, mask_size, num_classes].
+        [batch_size, num_instances, mask_size, mask_size].
     """
-    # Embed the anchor map into some feature space for anchor conditioning.
-    detection_prior_features = tf.keras.layers.Conv2D(
-        self._num_downsample_channels,
-        kernel_size=(1, 1),
-        bias_initializer=tf.zeros_initializer(),
-        kernel_initializer=tf.keras.initializers.RandomNormal(
-            mean=0., stddev=0.01),
-        padding='same',
-        name='anchor-conv')(
-            detection_priors)
-
-    prior_conditioned_features = crop_features + detection_prior_features
-    coarse_output_features = self.coarsemask_decoder_net(
-        prior_conditioned_features, is_training)
-
-    coarse_mask_classes = tf.keras.layers.Conv2D(
-        self._mask_num_classes,
-        kernel_size=(1, 1),
-        # Focal loss bias initialization to have foreground 0.01 probability.
-        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
-        kernel_initializer=tf.keras.initializers.RandomNormal(
-            mean=0, stddev=0.01),
-        padding='same',
-        name='class-predict')(
-            coarse_output_features)
-
-    if self._use_category_for_mask:
-      inst_classes = tf.cast(tf.reshape(inst_classes, [-1]), tf.int32)
-      coarse_mask_classes_t = tf.transpose(
-          a=coarse_mask_classes, perm=(0, 3, 1, 2))
-      # pylint: disable=g-long-lambda
-      coarse_mask_logits = tf.cond(
-          pred=tf.size(input=inst_classes) > 0,
-          true_fn=lambda: tf.gather_nd(
-              coarse_mask_classes_t,
-              tf.stack(
-                  [tf.range(tf.size(input=inst_classes)), inst_classes - 1],
-                  axis=1)),
-          false_fn=lambda: coarse_mask_classes_t[:, 0, :, :])
-      # pylint: enable=g-long-lambda
-      coarse_mask_logits = tf.expand_dims(coarse_mask_logits, -1)
-    else:
-      coarse_mask_logits = coarse_mask_classes
-
-    coarse_class_probs = tf.nn.sigmoid(coarse_mask_logits)
-    class_probs = tf.cast(coarse_class_probs, prior_conditioned_features.dtype)
-
-    return coarse_mask_classes, class_probs, prior_conditioned_features
+    with backend.get_graph().as_default(), tf.name_scope('coarse_mask'):
+      # Transform detection priors to have the same dimension as features.
+      detection_priors = tf.expand_dims(detection_priors, axis=-1)
+      detection_priors = self._coarse_mask_fc(detection_priors)
+
+      features += detection_priors
+      mask_logits = self.decoder_net(features, is_training)
+      # Gather the logits with right input class.
+      if self._use_category_for_mask:
+        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
+        mask_logits = tf.gather(mask_logits, tf.expand_dims(classes, -1),
+                                batch_dims=2)
+        mask_logits = tf.squeeze(mask_logits, axis=2)
+      else:
+        mask_logits = mask_logits[..., 0]
+
+      return mask_logits
 
-  def coarsemask_decoder_net(self,
-                             images,
-                             is_training=None,
-                             norm_activation=nn_ops.norm_activation_builder()):
+  def decoder_net(self, features, is_training=False):
     """Coarse mask decoder network architecture.
 
     Args:
-      images: A tensor of size [batch, height_in, width_in, channels_in].
+      features: A tensor of size [batch, height_in, width_in, channels_in].
       is_training: Whether batch_norm layers are in training mode.
-      norm_activation: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
 
     Returns:
       images: A feature tensor of size [batch, output_size, output_size,
        num_channels]
     """
+    (batch_size, num_instances, height, width,
+     num_channels) = features.get_shape().as_list()
+    features = tf.reshape(features, [batch_size * num_instances, height, width,
+                                     num_channels])
     for i in range(self._num_convs):
-      images = tf.keras.layers.Conv2D(
-          self._num_downsample_channels,
-          kernel_size=(3, 3),
-          bias_initializer=tf.zeros_initializer(),
-          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
-          activation=None,
-          padding='same',
-          name='coarse-class-%d' % i)(
-              images)
-      images = norm_activation(name='coarse-class-%d-bn' % i)(
-          images, is_training=is_training)
+      features = self._class_conv[i](features)
+      features = self._class_norm_activation[i](features,
                                                 is_training=is_training)
 
-    return images
+    mask_logits = self._class_predict(features)
+    mask_logits = tf.reshape(mask_logits, [batch_size, num_instances, height,
+                                           width, self._mask_num_classes])
+    return mask_logits
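Both the coarse and fine heads select each instance's per-class mask logits with `tf.gather(..., batch_dims=2)`. A toy illustration of that indexing step alone, with invented shapes:

```python
import tensorflow as tf

batch, n_inst, size, n_cls = 2, 3, 16, 5
mask_logits = tf.random.normal([batch, n_inst, size, size, n_cls])
classes = tf.random.uniform([batch, n_inst], maxval=n_cls, dtype=tf.int32)

logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])         # [B, N, C, H, W]
logits = tf.gather(logits, tf.expand_dims(classes, -1), batch_dims=2)
logits = tf.squeeze(logits, axis=2)                         # [B, N, H, W]
```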
 
 
 class ShapemaskFinemaskHead(object):
@@ -1010,9 +858,9 @@ class ShapemaskFinemaskHead(object):
                num_classes,
                num_downsample_channels,
                mask_crop_size,
+               use_category_for_mask,
                num_convs,
-               coarse_mask_thr,
-               gt_upsample_scale,
+               upsample_factor,
                norm_activation=nn_ops.norm_activation_builder()):
     """Initialize params to build ShapeMask coarse and fine prediction head.
 
@@ -1020,33 +868,29 @@ class ShapemaskFinemaskHead(object):
       num_classes: `int` number of mask classification categories.
       num_downsample_channels: `int` number of filters at mask head.
       mask_crop_size: feature crop size.
+      use_category_for_mask: use class information in mask branch.
       num_convs: `int` number of stacked convolution before the last prediction
         layer.
-      coarse_mask_thr: the threshold for suppressing noisy coarse prediction.
-      gt_upsample_scale: scale for upsampling groundtruths.
+      upsample_factor: `int` number of fine mask upsampling factor.
       norm_activation: an operation that includes a batch normalization layer
        followed by a relu layer(optional).
     """
-    self._mask_num_classes = num_classes
+    self._use_category_for_mask = use_category_for_mask
+    self._mask_num_classes = num_classes if use_category_for_mask else 1
     self._num_downsample_channels = num_downsample_channels
     self._mask_crop_size = mask_crop_size
     self._num_convs = num_convs
-    self._coarse_mask_thr = coarse_mask_thr
-    self._gt_upsample_scale = gt_upsample_scale
+    self.up_sample_factor = upsample_factor
+
+    self._fine_mask_fc = tf.keras.layers.Dense(
+        self._num_downsample_channels, name='fine-mask-fc')
 
-    self._class_predict_conv = tf.keras.layers.Conv2D(
-        self._mask_num_classes,
-        kernel_size=(1, 1),
-        # Focal loss bias initialization to have foreground 0.01 probability.
-        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
-        kernel_initializer=tf.keras.initializers.RandomNormal(
-            mean=0, stddev=0.01),
-        padding='same',
-        name='affinity-class-predict')
     self._upsample_conv = tf.keras.layers.Conv2DTranspose(
-        self._num_downsample_channels // 2,
-        (self._gt_upsample_scale, self._gt_upsample_scale),
-        (self._gt_upsample_scale, self._gt_upsample_scale))
+        self._num_downsample_channels,
+        (self.up_sample_factor, self.up_sample_factor),
+        (self.up_sample_factor, self.up_sample_factor),
+        name='fine-mask-conv2d-tran')
+
     self._fine_class_conv = []
     self._fine_class_bn = []
     for i in range(self._num_convs):
@@ -1059,60 +903,73 @@ class ShapemaskFinemaskHead(object):
               stddev=0.01),
           activation=None,
          padding='same',
-          name='fine-class-%d' % i))
-      self._fine_class_bn.append(norm_activation(name='fine-class-%d-bn' % i))
+          name='fine-mask-class-%d' % i))
+      self._fine_class_bn.append(norm_activation(
+          name='fine-mask-class-%d-bn' % i))
+
+    self._class_predict_conv = tf.keras.layers.Conv2D(
+        self._mask_num_classes,
+        kernel_size=(1, 1),
+        # Focal loss bias initialization to have foreground 0.01 probability.
+        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        padding='same',
+        name='fine-mask-class-predict')
 
-  def __call__(self, prior_conditioned_features, class_probs, is_training=None):
+  def __call__(self, features, mask_logits, classes, is_training):
     """Generate instance masks from FPN features and detection priors.
 
     This corresponds to the Fig. 5-6 of the ShapeMask paper at
     https://arxiv.org/pdf/1904.03239.pdf
 
     Args:
-      prior_conditioned_features: a float Tensor of shape [batch_size *
-        num_instances, mask_crop_size, mask_crop_size, num_downsample_channels].
-        This is the instance feature crop.
-      class_probs: a float Tensor of shape [batch_size * num_instances,
-        mask_crop_size, mask_crop_size, 1]. This is the class probability of
-        instance segmentation.
+      features: a float Tensor of shape
+        [batch_size, num_instances, mask_crop_size, mask_crop_size,
        num_downsample_channels]. This is the instance feature crop.
+      mask_logits: a float Tensor of shape
+        [batch_size, num_instances, mask_crop_size, mask_crop_size] indicating
+        predicted mask logits.
+      classes: a int Tensor of shape [batch_size, num_instances]
+        of instance classes.
       is_training: a bool indicating whether in training mode.
 
     Returns:
       mask_outputs: instance mask prediction as a float Tensor of shape
-        [batch_size * num_instances, mask_size, mask_size, num_classes].
+        [batch_size, num_instances, mask_size, mask_size].
     """
-    with backend.get_graph().as_default(), tf.name_scope('affinity-masknet'):
-      # Extract the foreground mean features
-      point_samp_prob_thr = 1. / (1. + tf.exp(-self._coarse_mask_thr))
-      point_samp_prob_thr = tf.cast(point_samp_prob_thr, class_probs.dtype)
-      class_probs = tf.where(
-          tf.greater(class_probs, point_samp_prob_thr), class_probs,
-          tf.zeros_like(class_probs))
-      weighted_features = class_probs * prior_conditioned_features
-      sum_class_vector = tf.reduce_sum(
-          input_tensor=class_probs, axis=(1, 2)) + tf.constant(
-              1e-20, class_probs.dtype)
-      instance_embedding = tf.reduce_sum(
-          input_tensor=weighted_features, axis=(1, 2)) / sum_class_vector
-      # Take the difference between crop features and mean instance features.
-      instance_features = prior_conditioned_features - tf.reshape(
-          instance_embedding, (-1, 1, 1, self._num_downsample_channels))
-      # Decoder to generate upsampled segmentation mask.
-      affinity_output_features = self.finemask_decoder_net(
-          instance_features, is_training)
-      # Predict per-class instance masks.
-      affinity_mask_classes = self._class_predict_conv(affinity_output_features)
-      return affinity_mask_classes
+    # Extract the foreground mean features
+    # with tf.variable_scope('fine_mask', reuse=tf.AUTO_REUSE):
+    with backend.get_graph().as_default(), tf.name_scope('fine_mask'):
+      mask_probs = tf.nn.sigmoid(mask_logits)
+      # Compute instance embedding for hard average.
+      binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype)
+      instance_embedding = tf.reduce_sum(
+          features * tf.expand_dims(binary_mask, axis=-1), axis=(2, 3))
+      instance_embedding /= tf.expand_dims(
+          tf.reduce_sum(binary_mask, axis=(2, 3)) + 1e-20, axis=-1)
+      # Take the difference between crop features and mean instance features.
+      features -= tf.expand_dims(
+          tf.expand_dims(instance_embedding, axis=2), axis=2)
+
+      features += self._fine_mask_fc(tf.expand_dims(mask_probs, axis=-1))
+
+      # Decoder to generate upsampled segmentation mask.
+      mask_logits = self.decoder_net(features, is_training)
+      if self._use_category_for_mask:
+        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
+        mask_logits = tf.gather(mask_logits,
+                                tf.expand_dims(classes, -1), batch_dims=2)
+        mask_logits = tf.squeeze(mask_logits, axis=2)
+      else:
+        mask_logits = mask_logits[..., 0]
+
+    return mask_logits
 
-  def finemask_decoder_net(self, images, is_training=None):
+  def decoder_net(self, features, is_training=False):
     """Fine mask decoder network architecture.
 
     Args:
-      images: A tensor of size [batch, height_in, width_in, channels_in].
+      features: A tensor of size [batch, height_in, width_in, channels_in].
       is_training: Whether batch_norm layers are in training mode.
 
     Returns:
@@ -1120,11 +977,23 @@ class ShapemaskFinemaskHead(object):
        num_channels], where output size is self._gt_upsample_scale times
        that of input.
     """
+    (batch_size, num_instances, height, width,
+     num_channels) = features.get_shape().as_list()
+    features = tf.reshape(features, [batch_size * num_instances, height, width,
+                                     num_channels])
     for i in range(self._num_convs):
-      images = self._fine_class_conv[i](images)
-      images = self._fine_class_bn[i](images, is_training=is_training)
+      features = self._fine_class_conv[i](features)
+      features = self._fine_class_bn[i](features, is_training=is_training)
 
-    if self._gt_upsample_scale > 1:
-      images = self._upsample_conv(images)
-    return images
+    if self.up_sample_factor > 1:
+      features = self._upsample_conv(features)
+
+    # Predict per-class instance masks.
+    mask_logits = self._class_predict_conv(features)
+
+    mask_logits = tf.reshape(mask_logits,
+                             [batch_size, num_instances,
+                              height * self.up_sample_factor,
+                              width * self.up_sample_factor,
+                              self._mask_num_classes])
+    return mask_logits
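The fine head's "hard average" instance embedding can be seen in isolation. A standalone toy sketch of that step, with shapes and names invented for illustration:

```python
import tensorflow as tf

batch, n_inst, size, ch = 2, 3, 32, 8
features = tf.random.normal([batch, n_inst, size, size, ch])
mask_probs = tf.random.uniform([batch, n_inst, size, size])

# Threshold coarse probabilities into a hard foreground mask ...
binary_mask = tf.cast(mask_probs > 0.5, features.dtype)        # [B, N, H, W]
# ... average features over foreground pixels (1e-20 avoids divide-by-zero) ...
embedding = tf.reduce_sum(
    features * binary_mask[..., tf.newaxis], axis=(2, 3))
embedding /= tf.reduce_sum(binary_mask, axis=(2, 3))[..., tf.newaxis] + 1e-20
# ... and center the crop features on the per-instance mean.
centered = features - embedding[:, :, tf.newaxis, tf.newaxis, :]
```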
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
-from absl import logging
 import tensorflow as tf
@@ -105,5 +104,5 @@ def norm_activation_builder(momentum=0.997,
       momentum=momentum,
       epsilon=epsilon,
       trainable=trainable,
-      activation='relu',
+      activation=activation,
       **kwargs)
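This one-line change fixes a bug where `norm_activation_builder` ignored its `activation` argument and always built ReLU ops. A quick sketch of the intended behavior; the keyword usage is assumed from the builder's signature shown in the hunk header:

```python
# After the fix, a requested 'swish' is actually threaded through instead of
# being silently replaced by 'relu'.
relu_norm_act = nn_ops.norm_activation_builder()                   # default
swish_norm_act = nn_ops.norm_activation_builder(activation='swish')

# Each builder is then used as in the head code above, e.g.:
# layer = swish_norm_act(name='coarse-mask-class-0-bn')
```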
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Losses used for Mask-RCNN."""
+"""Losses used for detection models."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -479,12 +479,62 @@ class RetinanetBoxLoss(object):
 class ShapemaskMseLoss(object):
   """ShapeMask mask Mean Squared Error loss function wrapper."""
 
-  def __init__(self):
-    raise NotImplementedError('Not Implemented.')
+  def __call__(self, probs, labels, valid_mask):
+    """Compute instance segmentation loss.
+
+    Args:
+      probs: A Tensor of shape [batch_size * num_points, height, width,
+        num_classes]. The logits are not necessarily between 0 and 1.
+      labels: A float32/float16 Tensor of shape [batch_size, num_instances,
+        mask_size, mask_size], where mask_size =
+        mask_crop_size * gt_upsample_scale for fine mask, or mask_crop_size
+        for coarse masks and shape priors.
+      valid_mask: a binary mask indicating valid training masks.
+
+    Returns:
+      loss: a float tensor representing total mask classification loss.
+    """
+    with tf.name_scope('shapemask_prior_loss'):
+      batch_size, num_instances = valid_mask.get_shape().as_list()[:2]
+      diff = (tf.cast(labels, dtype=tf.float32) -
+              tf.cast(probs, dtype=tf.float32))
+      diff *= tf.cast(
+          tf.reshape(valid_mask, [batch_size, num_instances, 1, 1]),
+          tf.float32)
+      # Adding 0.001 in the denominator to avoid division by zero.
+      loss = tf.nn.l2_loss(diff) / (tf.reduce_sum(labels) + 0.001)
+      return loss
 
 
 class ShapemaskLoss(object):
   """ShapeMask mask loss function wrapper."""
 
   def __init__(self):
-    raise NotImplementedError('Not Implemented.')
+    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
+
+  def __call__(self, logits, labels, valid_mask):
+    """ShapeMask mask cross entropy loss function wrapper.
+
+    Args:
+      logits: A Tensor of shape [batch_size * num_instances, height, width,
+        num_classes]. The logits are not necessarily between 0 and 1.
+      labels: A float16/float32 Tensor of shape [batch_size, num_instances,
+        mask_size, mask_size], where mask_size =
+        mask_crop_size * gt_upsample_scale for fine mask, or mask_crop_size
+        for coarse masks and shape priors.
+      valid_mask: a binary mask of shape [batch_size, num_instances]
+        indicating valid training masks.
+
+    Returns:
+      loss: a float tensor representing total mask classification loss.
+    """
+    with tf.name_scope('shapemask_loss'):
+      batch_size, num_instances = valid_mask.get_shape().as_list()[:2]
+      labels = tf.cast(labels, tf.float32)
+      logits = tf.cast(logits, tf.float32)
+      loss = self._binary_crossentropy(labels, logits)
+      loss *= tf.cast(tf.reshape(
+          valid_mask, [batch_size, num_instances, 1, 1]), loss.dtype)
+      # Adding 0.001 in the denominator to avoid division by zero.
+      loss = tf.reduce_sum(loss) / (tf.reduce_sum(labels) + 0.001)
+      return loss
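A toy invocation of the two losses, with random tensors standing in for real predictions and groundtruths (shapes only; not from the commit):

```python
import tensorflow as tf

batch, n_inst, size = 2, 8, 32
logits = tf.random.normal([batch, n_inst, size, size])
labels = tf.cast(
    tf.random.uniform([batch, n_inst, size, size]) > 0.5, tf.float32)
valid = tf.ones([batch, n_inst])  # every sampled instance contributes

ce = ShapemaskLoss()(logits, labels, valid)                     # cross entropy
mse = ShapemaskMseLoss()(tf.nn.sigmoid(logits), labels, valid)  # squared error
```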