Commit 9a264c9f authored by Yeqing Li's avatar Yeqing Li Committed by A. Unique TensorFlower
Browse files

Deprecating official/vision/detection folder.

The folder is archived in the official/legacy/detection

PiperOrigin-RevId: 419643226
parent 8fb4e6a6
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate schedule."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf
from official.modeling.hyperparams import params_dict
class StepLearningRateWithLinearWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Class to generate learning rate tensor."""
def __init__(self, total_steps, params):
"""Creates the step learning rate tensor with linear warmup."""
super(StepLearningRateWithLinearWarmup, self).__init__()
self._total_steps = total_steps
assert isinstance(params, (dict, params_dict.ParamsDict))
if isinstance(params, dict):
params = params_dict.ParamsDict(params)
self._params = params
def __call__(self, global_step):
warmup_lr = self._params.warmup_learning_rate
warmup_steps = self._params.warmup_steps
init_lr = self._params.init_learning_rate
lr_levels = self._params.learning_rate_levels
lr_steps = self._params.learning_rate_steps
linear_warmup = (
warmup_lr + tf.cast(global_step, dtype=tf.float32) / warmup_steps *
(init_lr - warmup_lr))
learning_rate = tf.where(global_step < warmup_steps, linear_warmup, init_lr)
for next_learning_rate, start_step in zip(lr_levels, lr_steps):
learning_rate = tf.where(global_step >= start_step, next_learning_rate,
learning_rate)
return learning_rate
def get_config(self):
return {'_params': self._params.as_dict()}
class CosineLearningRateWithLinearWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Class to generate learning rate tensor."""
def __init__(self, total_steps, params):
"""Creates the consine learning rate tensor with linear warmup."""
super(CosineLearningRateWithLinearWarmup, self).__init__()
self._total_steps = total_steps
assert isinstance(params, (dict, params_dict.ParamsDict))
if isinstance(params, dict):
params = params_dict.ParamsDict(params)
self._params = params
def __call__(self, global_step):
global_step = tf.cast(global_step, dtype=tf.float32)
warmup_lr = self._params.warmup_learning_rate
warmup_steps = self._params.warmup_steps
init_lr = self._params.init_learning_rate
total_steps = self._total_steps
linear_warmup = (
warmup_lr + global_step / warmup_steps * (init_lr - warmup_lr))
cosine_learning_rate = (
init_lr * (tf.cos(np.pi * (global_step - warmup_steps) /
(total_steps - warmup_steps)) + 1.0) / 2.0)
learning_rate = tf.where(global_step < warmup_steps, linear_warmup,
cosine_learning_rate)
return learning_rate
def get_config(self):
return {'_params': self._params.as_dict()}
def learning_rate_generator(total_steps, params):
"""The learning rate function generator."""
if params.type == 'step':
return StepLearningRateWithLinearWarmup(total_steps, params)
elif params.type == 'cosine':
return CosineLearningRateWithLinearWarmup(total_steps, params)
else:
raise ValueError('Unsupported learning rate type: {}.'.format(params.type))
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Losses used for detection models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import tensorflow as tf
def focal_loss(logits, targets, alpha, gamma, normalizer):
"""Compute the focal loss between `logits` and the golden `target` values.
Focal loss = -(1-pt)^gamma * log(pt)
where pt is the probability of being classified to the true class.
Args:
logits: A float32 tensor of size
[batch, height_in, width_in, num_predictions].
targets: A float32 tensor of size
[batch, height_in, width_in, num_predictions].
alpha: A float32 scalar multiplying alpha to the loss from positive examples
and (1-alpha) to the loss from negative examples.
gamma: A float32 scalar modulating loss from hard and easy examples.
normalizer: A float32 scalar normalizes the total loss from all examples.
Returns:
loss: A float32 Tensor of size [batch, height_in, width_in, num_predictions]
representing normalized loss on the prediction map.
"""
with tf.name_scope('focal_loss'):
positive_label_mask = tf.math.equal(targets, 1.0)
cross_entropy = (
tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits))
# Below are comments/derivations for computing modulator.
# For brevity, let x = logits, z = targets, r = gamma, and p_t = sigmod(x)
# for positive samples and 1 - sigmoid(x) for negative examples.
#
# The modulator, defined as (1 - P_t)^r, is a critical part in focal loss
# computation. For r > 0, it puts more weights on hard examples, and less
# weights on easier ones. However if it is directly computed as (1 - P_t)^r,
# its back-propagation is not stable when r < 1. The implementation here
# resolves the issue.
#
# For positive samples (labels being 1),
# (1 - p_t)^r
# = (1 - sigmoid(x))^r
# = (1 - (1 / (1 + exp(-x))))^r
# = (exp(-x) / (1 + exp(-x)))^r
# = exp(log((exp(-x) / (1 + exp(-x)))^r))
# = exp(r * log(exp(-x)) - r * log(1 + exp(-x)))
# = exp(- r * x - r * log(1 + exp(-x)))
#
# For negative samples (labels being 0),
# (1 - p_t)^r
# = (sigmoid(x))^r
# = (1 / (1 + exp(-x)))^r
# = exp(log((1 / (1 + exp(-x)))^r))
# = exp(-r * log(1 + exp(-x)))
#
# Therefore one unified form for positive (z = 1) and negative (z = 0)
# samples is:
# (1 - p_t)^r = exp(-r * z * x - r * log(1 + exp(-x))).
neg_logits = -1.0 * logits
modulator = tf.math.exp(gamma * targets * neg_logits -
gamma * tf.math.log1p(tf.math.exp(neg_logits)))
loss = modulator * cross_entropy
weighted_loss = tf.where(positive_label_mask, alpha * loss,
(1.0 - alpha) * loss)
weighted_loss /= normalizer
return weighted_loss
class RpnScoreLoss(object):
"""Region Proposal Network score loss function."""
def __init__(self, params):
self._rpn_batch_size_per_im = params.rpn_batch_size_per_im
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, score_outputs, labels):
"""Computes total RPN detection loss.
Computes total RPN detection loss including box and score from all levels.
Args:
score_outputs: an OrderDict with keys representing levels and values
representing scores in [batch_size, height, width, num_anchors].
labels: the dictionary that returned from dataloader that includes
groundturth targets.
Returns:
rpn_score_loss: a scalar tensor representing total score loss.
"""
with tf.name_scope('rpn_loss'):
levels = sorted(score_outputs.keys())
score_losses = []
for level in levels:
score_losses.append(
self._rpn_score_loss(
score_outputs[level],
labels[level],
normalizer=tf.cast(
tf.shape(score_outputs[level])[0] *
self._rpn_batch_size_per_im, dtype=tf.float32)))
# Sums per level losses to total loss.
return tf.math.add_n(score_losses)
def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
"""Computes score loss."""
# score_targets has three values:
# (1) score_targets[i]=1, the anchor is a positive sample.
# (2) score_targets[i]=0, negative.
# (3) score_targets[i]=-1, the anchor is don't care (ignore).
with tf.name_scope('rpn_score_loss'):
mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
tf.math.equal(score_targets, 0))
score_targets = tf.math.maximum(score_targets,
tf.zeros_like(score_targets))
score_targets = tf.expand_dims(score_targets, axis=-1)
score_outputs = tf.expand_dims(score_outputs, axis=-1)
score_loss = self._binary_crossentropy(
score_targets, score_outputs, sample_weight=mask)
score_loss /= normalizer
return score_loss
class RpnBoxLoss(object):
"""Region Proposal Network box regression loss function."""
def __init__(self, params):
logging.info('RpnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
# The delta is typically around the mean value of regression target.
# for instances, the regression targets of 512x512 input with 6 anchors on
# P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf.keras.losses.Huber(
delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, labels):
"""Computes total RPN detection loss.
Computes total RPN detection loss including box and score from all levels.
Args:
box_outputs: an OrderDict with keys representing levels and values
representing box regression targets in
[batch_size, height, width, num_anchors * 4].
labels: the dictionary that returned from dataloader that includes
groundturth targets.
Returns:
rpn_box_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('rpn_loss'):
levels = sorted(box_outputs.keys())
box_losses = []
for level in levels:
box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level]))
# Sum per level losses to total loss.
return tf.add_n(box_losses)
def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('rpn_box_loss'):
mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
# The loss is normalized by the sum of non-zero weights and additional
# normalizer provided by the function caller. Using + 0.01 here to avoid
# division by zero.
box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
return box_loss
class OlnRpnCenterLoss(object):
"""Object Localization Network RPN centerness regression loss function."""
def __init__(self):
self._l1_loss = tf.keras.losses.MeanAbsoluteError(
reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, center_outputs, labels):
"""Computes total RPN centerness regression loss.
Computes total RPN centerness score regression loss from all levels.
Args:
center_outputs: an OrderDict with keys representing levels and values
representing anchor centerness regression targets in
[batch_size, height, width, num_anchors * 4].
labels: the dictionary that returned from dataloader that includes
groundturth targets.
Returns:
rpn_center_loss: a scalar tensor representing total centerness regression
loss.
"""
with tf.name_scope('rpn_loss'):
# Normalizer.
levels = sorted(center_outputs.keys())
num_valid = 0
# 0<pos<1, neg=0, ign=-1
for level in levels:
num_valid += tf.reduce_sum(tf.cast(
tf.greater(labels[level], -1.0), tf.float32)) # in and out of box
num_valid += 1e-12
# Centerness loss over multi levels.
center_losses = []
for level in levels:
center_losses.append(
self._rpn_center_l1_loss(
center_outputs[level], labels[level],
normalizer=num_valid))
# Sum per level losses to total loss.
return tf.add_n(center_losses)
def _rpn_center_l1_loss(self, center_outputs, center_targets,
normalizer=1.0):
"""Computes centerness regression loss."""
# for instances, the regression targets of 512x512 input with 6 anchors on
# P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
with tf.name_scope('rpn_center_loss'):
# mask = tf.greater(center_targets, 0.0) # inside box only.
mask = tf.greater(center_targets, -1.0) # in and out of box.
center_targets = tf.maximum(center_targets, tf.zeros_like(center_targets))
center_outputs = tf.sigmoid(center_outputs)
center_targets = tf.expand_dims(center_targets, -1)
center_outputs = tf.expand_dims(center_outputs, -1)
mask = tf.cast(mask, dtype=tf.float32)
center_loss = self._l1_loss(center_targets, center_outputs,
sample_weight=mask)
center_loss /= normalizer
return center_loss
class OlnRpnIoULoss(object):
"""Object Localization Network RPN box-lrtb regression iou loss function."""
def __call__(self, box_outputs, labels, center_targets):
"""Computes total RPN detection loss.
Computes total RPN box regression loss from all levels.
Args:
box_outputs: an OrderDict with keys representing levels and values
representing box regression targets in
[batch_size, height, width, num_anchors * 4].
last channel: (left, right, top, bottom).
labels: the dictionary that returned from dataloader that includes
groundturth targets (left, right, top, bottom).
center_targets: valid_target mask.
Returns:
rpn_iou_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('rpn_loss'):
# Normalizer.
levels = sorted(box_outputs.keys())
normalizer = 0.
for level in levels:
# center_targets pos>0, neg=0, ign=-1.
mask_ = tf.cast(tf.logical_and(
tf.greater(center_targets[level][..., 0], 0.0),
tf.greater(tf.reduce_min(labels[level], -1), 0.0)), tf.float32)
normalizer += tf.reduce_sum(mask_)
normalizer += 1e-8
# iou_loss over multi levels.
iou_losses = []
for level in levels:
iou_losses.append(
self._rpn_iou_loss(
box_outputs[level], labels[level],
center_weight=center_targets[level][..., 0],
normalizer=normalizer))
# Sum per level losses to total loss.
return tf.add_n(iou_losses)
def _rpn_iou_loss(self, box_outputs, box_targets,
center_weight=None, normalizer=1.0):
"""Computes box regression loss."""
# for instances, the regression targets of 512x512 input with 6 anchors on
# P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
with tf.name_scope('rpn_iou_loss'):
mask = tf.logical_and(
tf.greater(center_weight, 0.0),
tf.greater(tf.reduce_min(box_targets, -1), 0.0))
pred_left = box_outputs[..., 0]
pred_right = box_outputs[..., 1]
pred_top = box_outputs[..., 2]
pred_bottom = box_outputs[..., 3]
gt_left = box_targets[..., 0]
gt_right = box_targets[..., 1]
gt_top = box_targets[..., 2]
gt_bottom = box_targets[..., 3]
inter_width = (tf.minimum(pred_left, gt_left) +
tf.minimum(pred_right, gt_right))
inter_height = (tf.minimum(pred_top, gt_top) +
tf.minimum(pred_bottom, gt_bottom))
inter_area = inter_width * inter_height
union_area = ((pred_left + pred_right) * (pred_top + pred_bottom) +
(gt_left + gt_right) * (gt_top + gt_bottom) -
inter_area)
iou = inter_area / (union_area + 1e-8)
mask_ = tf.cast(mask, tf.float32)
iou = tf.clip_by_value(iou, clip_value_min=1e-8, clip_value_max=1.0)
neg_log_iou = -tf.math.log(iou)
iou_loss = tf.reduce_sum(neg_log_iou * mask_)
iou_loss /= normalizer
return iou_loss
class FastrcnnClassLoss(object):
"""Fast R-CNN classification loss function."""
def __init__(self):
self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, class_outputs, class_targets):
"""Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the classification loss of the Fast-RCNN.
The classification loss is softmax on all RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
class_outputs: a float tensor representing the class prediction for each box
with a shape of [batch_size, num_boxes, num_classes].
class_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
Returns:
a scalar tensor representing total class loss.
"""
with tf.name_scope('fast_rcnn_loss'):
batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
class_targets = tf.cast(class_targets, dtype=tf.int32)
class_targets_one_hot = tf.one_hot(class_targets, num_classes)
return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
normalizer=batch_size * num_boxes / 2.0)
def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
normalizer):
"""Computes classification loss."""
with tf.name_scope('fast_rcnn_class_loss'):
class_loss = self._categorical_crossentropy(class_targets_one_hot,
class_outputs)
class_loss /= normalizer
return class_loss
class FastrcnnBoxLoss(object):
"""Fast R-CNN box regression loss function."""
def __init__(self, params):
logging.info('FastrcnnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
# The delta is typically around the mean value of regression target.
# for instances, the regression targets of 512x512 input with 6 anchors on
# P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf.keras.losses.Huber(
delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, class_targets, box_targets):
"""Computes the box loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the box regression loss of the Fast-RCNN. As the
`box_outputs` produces `num_classes` boxes for each RoI, the reference model
expands `box_targets` to match the shape of `box_outputs` and selects only
the target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
Instead, this function selects the `box_outputs` by the `class_targets` so
that it doesn't expand `box_targets`.
The box loss is smooth L1-loss on only positive samples of RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
box_outputs: a float tensor representing the box prediction for each box
with a shape of [batch_size, num_boxes, num_classes * 4].
class_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
box_targets: a float tensor representing the box label for each box
with a shape of [batch_size, num_boxes, 4].
Returns:
box_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('fast_rcnn_loss'):
class_targets = tf.cast(class_targets, dtype=tf.int32)
# Selects the box from `box_outputs` based on `class_targets`, with which
# the box has the maximum overlap.
(batch_size, num_rois,
num_class_specific_boxes) = box_outputs.get_shape().as_list()
num_classes = num_class_specific_boxes // 4
box_outputs = tf.reshape(box_outputs,
[batch_size, num_rois, num_classes, 4])
box_indices = tf.reshape(
class_targets + tf.tile(
tf.expand_dims(
tf.range(batch_size) * num_rois * num_classes, 1),
[1, num_rois]) + tf.tile(
tf.expand_dims(tf.range(num_rois) * num_classes, 0),
[batch_size, 1]), [-1])
box_outputs = tf.matmul(
tf.one_hot(
box_indices,
batch_size * num_rois * num_classes,
dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)
def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('fast_rcnn_box_loss'):
mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
[1, 1, 4])
mask = tf.cast(mask, dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
# The loss is normalized by the number of ones in mask,
# additianal normalizer provided by the user and using 0.01 here to avoid
# division by 0.
box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
return box_loss
class OlnBoxScoreLoss(object):
"""Object Localization Network Box-Iou scoring function."""
def __init__(self, params):
self._ignore_threshold = params.ignore_threshold
self._l1_loss = tf.keras.losses.MeanAbsoluteError(
reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, score_outputs, score_targets):
"""Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the classification loss of the Fast-RCNN.
The classification loss is softmax on all RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
score_outputs: a float tensor representing the class prediction for each box
with a shape of [batch_size, num_boxes, num_classes].
score_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
Returns:
a scalar tensor representing total score loss.
"""
with tf.name_scope('fast_rcnn_loss'):
score_outputs = tf.squeeze(score_outputs, -1)
mask = tf.greater(score_targets, self._ignore_threshold)
num_valid = tf.reduce_sum(tf.cast(mask, tf.float32))
score_targets = tf.maximum(score_targets, tf.zeros_like(score_targets))
score_outputs = tf.sigmoid(score_outputs)
score_targets = tf.expand_dims(score_targets, -1)
score_outputs = tf.expand_dims(score_outputs, -1)
mask = tf.cast(mask, dtype=tf.float32)
score_loss = self._l1_loss(score_targets, score_outputs,
sample_weight=mask)
score_loss /= (num_valid + 1e-10)
return score_loss
class MaskrcnnLoss(object):
"""Mask R-CNN instance segmentation mask loss function."""
def __init__(self):
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, mask_outputs, mask_targets, select_class_targets):
"""Computes the mask loss of Mask-RCNN.
This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
produces `num_classes` masks for each RoI, the reference model expands
`mask_targets` to match the shape of `mask_outputs` and selects only the
target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
Instead, this implementation selects the `mask_outputs` by the `class_targets`
so that it doesn't expand `mask_targets`. Note that the selection logic is
done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
Args:
mask_outputs: a float tensor representing the prediction for each mask,
with a shape of
[batch_size, num_masks, mask_height, mask_width].
mask_targets: a float tensor representing the binary mask of ground truth
labels for each mask with a shape of
[batch_size, num_masks, mask_height, mask_width].
select_class_targets: a tensor with a shape of [batch_size, num_masks],
representing the foreground mask targets.
Returns:
mask_loss: a float tensor representing total mask loss.
"""
with tf.name_scope('mask_rcnn_loss'):
(batch_size, num_masks, mask_height,
mask_width) = mask_outputs.get_shape().as_list()
weights = tf.tile(
tf.reshape(tf.greater(select_class_targets, 0),
[batch_size, num_masks, 1, 1]),
[1, 1, mask_height, mask_width])
weights = tf.cast(weights, dtype=tf.float32)
mask_targets = tf.expand_dims(mask_targets, axis=-1)
mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
sample_weight=weights)
# The loss is normalized by the number of 1's in weights and
# + 0.01 is used to avoid division by zero.
return mask_loss / (tf.reduce_sum(weights) + 0.01)
class RetinanetClassLoss(object):
"""RetinaNet class loss."""
def __init__(self, params, num_classes):
self._num_classes = num_classes
self._focal_loss_alpha = params.focal_loss_alpha
self._focal_loss_gamma = params.focal_loss_gamma
def __call__(self, cls_outputs, labels, num_positives):
"""Computes total detection loss.
Computes total detection loss including box and class loss from all levels.
Args:
cls_outputs: an OrderDict with keys representing levels and values
representing logits in [batch_size, height, width,
num_anchors * num_classes].
labels: the dictionary that returned from dataloader that includes
class groundturth targets.
num_positives: number of positive examples in the minibatch.
Returns:
an integar tensor representing total class loss.
"""
# Sums all positives in a batch for normalization and avoids zero
# num_positives_sum, which would lead to inf loss during training
num_positives_sum = tf.reduce_sum(input_tensor=num_positives) + 1.0
cls_losses = []
for level in cls_outputs.keys():
cls_losses.append(self.class_loss(
cls_outputs[level], labels[level], num_positives_sum))
# Sums per level losses to total loss.
return tf.add_n(cls_losses)
def class_loss(self, cls_outputs, cls_targets, num_positives,
ignore_label=-2):
"""Computes RetinaNet classification loss."""
# Onehot encoding for classification labels.
cls_targets_one_hot = tf.one_hot(cls_targets, self._num_classes)
bs, height, width, _, _ = cls_targets_one_hot.get_shape().as_list()
cls_targets_one_hot = tf.reshape(cls_targets_one_hot,
[bs, height, width, -1])
loss = focal_loss(tf.cast(cls_outputs, dtype=tf.float32),
tf.cast(cls_targets_one_hot, dtype=tf.float32),
self._focal_loss_alpha,
self._focal_loss_gamma,
num_positives)
ignore_loss = tf.where(
tf.equal(cls_targets, ignore_label),
tf.zeros_like(cls_targets, dtype=tf.float32),
tf.ones_like(cls_targets, dtype=tf.float32),
)
ignore_loss = tf.expand_dims(ignore_loss, -1)
ignore_loss = tf.tile(ignore_loss, [1, 1, 1, 1, self._num_classes])
ignore_loss = tf.reshape(ignore_loss, tf.shape(input=loss))
return tf.reduce_sum(input_tensor=ignore_loss * loss)
class RetinanetBoxLoss(object):
"""RetinaNet box loss."""
def __init__(self, params):
self._huber_loss = tf.keras.losses.Huber(
delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, labels, num_positives):
"""Computes box detection loss.
Computes total detection loss including box and class loss from all levels.
Args:
box_outputs: an OrderDict with keys representing levels and values
representing box regression targets in [batch_size, height, width,
num_anchors * 4].
labels: the dictionary that returned from dataloader that includes
box groundturth targets.
num_positives: number of positive examples in the minibatch.
Returns:
an integer tensor representing total box regression loss.
"""
# Sums all positives in a batch for normalization and avoids zero
# num_positives_sum, which would lead to inf loss during training
num_positives_sum = tf.reduce_sum(input_tensor=num_positives) + 1.0
box_losses = []
for level in box_outputs.keys():
box_targets_l = labels[level]
box_losses.append(
self.box_loss(box_outputs[level], box_targets_l, num_positives_sum))
# Sums per level losses to total loss.
return tf.add_n(box_losses)
def box_loss(self, box_outputs, box_targets, num_positives):
"""Computes RetinaNet box regression loss."""
# The delta is typically around the mean value of regression target.
# for instances, the regression targets of 512x512 input with 6 anchors on
# P3-P7 pyramid is about [0.1, 0.1, 0.2, 0.2].
normalizer = num_positives * 4.0
mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
box_loss /= normalizer
return box_loss
class ShapemaskMseLoss(object):
"""ShapeMask mask Mean Squared Error loss function wrapper."""
def __call__(self, probs, labels, valid_mask):
"""Compute instance segmentation loss.
Args:
probs: A Tensor of shape [batch_size * num_points, height, width,
num_classes]. The logits are not necessarily between 0 and 1.
labels: A float32/float16 Tensor of shape [batch_size, num_instances,
mask_size, mask_size], where mask_size =
mask_crop_size * gt_upsample_scale for fine mask, or mask_crop_size
for coarse masks and shape priors.
valid_mask: a binary mask indicating valid training masks.
Returns:
loss: an float tensor representing total mask classification loss.
"""
with tf.name_scope('shapemask_prior_loss'):
batch_size, num_instances = valid_mask.get_shape().as_list()[:2]
diff = (tf.cast(labels, dtype=tf.float32) -
tf.cast(probs, dtype=tf.float32))
diff *= tf.cast(
tf.reshape(valid_mask, [batch_size, num_instances, 1, 1]),
tf.float32)
# Adding 0.001 in the denominator to avoid division by zero.
loss = tf.nn.l2_loss(diff) / (tf.reduce_sum(labels) + 0.001)
return loss
class ShapemaskLoss(object):
"""ShapeMask mask loss function wrapper."""
def __init__(self):
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, logits, labels, valid_mask):
"""ShapeMask mask cross entropy loss function wrapper.
Args:
logits: A Tensor of shape [batch_size * num_instances, height, width,
num_classes]. The logits are not necessarily between 0 and 1.
labels: A float16/float32 Tensor of shape [batch_size, num_instances,
mask_size, mask_size], where mask_size =
mask_crop_size * gt_upsample_scale for fine mask, or mask_crop_size
for coarse masks and shape priors.
valid_mask: a binary mask of shape [batch_size, num_instances]
indicating valid training masks.
Returns:
loss: an float tensor representing total mask classification loss.
"""
with tf.name_scope('shapemask_loss'):
batch_size, num_instances = valid_mask.get_shape().as_list()[:2]
labels = tf.cast(labels, tf.float32)
logits = tf.cast(logits, tf.float32)
loss = self._binary_crossentropy(labels, logits)
loss *= tf.cast(tf.reshape(
valid_mask, [batch_size, num_instances, 1, 1]), loss.dtype)
# Adding 0.001 in the denominator to avoid division by zero.
loss = tf.reduce_sum(loss) / (tf.reduce_sum(labels) + 0.001)
return loss
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model defination for the Mask R-CNN Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.dataloader import anchor
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.evaluation import factory as eval_factory
from official.vision.detection.modeling import base_model
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.ops import postprocess_ops
from official.vision.detection.ops import roi_ops
from official.vision.detection.ops import spatial_transform_ops
from official.vision.detection.ops import target_ops
from official.vision.detection.utils import box_utils
class MaskrcnnModel(base_model.Model):
"""Mask R-CNN model function."""
def __init__(self, params):
super(MaskrcnnModel, self).__init__(params)
# For eval metrics.
self._params = params
self._keras_model = None
self._include_mask = params.architecture.include_mask
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._rpn_head_fn = factory.rpn_head_generator(params)
self._generate_rois_fn = roi_ops.ROIGenerator(params.roi_proposal)
self._sample_rois_fn = target_ops.ROISampler(params.roi_sampling)
self._sample_masks_fn = target_ops.MaskSampler(
params.architecture.mask_target_size,
params.mask_sampling.num_mask_samples_per_image)
self._frcnn_head_fn = factory.fast_rcnn_head_generator(params)
if self._include_mask:
self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params)
# Loss function.
self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
self._rpn_box_loss_fn = losses.RpnBoxLoss(params.rpn_box_loss)
self._frcnn_class_loss_fn = losses.FastrcnnClassLoss()
self._frcnn_box_loss_fn = losses.FastrcnnBoxLoss(params.frcnn_box_loss)
if self._include_mask:
self._mask_loss_fn = losses.MaskrcnnLoss()
self._generate_detections_fn = postprocess_ops.GenericDetectionGenerator(
params.postprocess)
self._transpose_input = params.train.transpose_input
assert not self._transpose_input, 'Transpose input is not supportted.'
def build_outputs(self, inputs, mode):
is_training = mode == mode_keys.TRAIN
model_outputs = {}
image = inputs['image']
_, image_height, image_width, _ = image.get_shape().as_list()
backbone_features = self._backbone_fn(image, is_training)
fpn_features = self._fpn_fn(backbone_features, is_training)
rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
fpn_features, is_training)
model_outputs.update({
'rpn_score_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
rpn_score_outputs),
'rpn_box_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
rpn_box_outputs),
})
input_anchor = anchor.Anchor(self._params.architecture.min_level,
self._params.architecture.max_level,
self._params.anchor.num_scales,
self._params.anchor.aspect_ratios,
self._params.anchor.anchor_size,
(image_height, image_width))
rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
input_anchor.multilevel_boxes,
inputs['image_info'][:, 1, :],
is_training)
if is_training:
rpn_rois = tf.stop_gradient(rpn_rois)
# Sample proposals.
rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
inputs['gt_classes']))
# Create bounding box training targets.
box_targets = box_utils.encode_boxes(
matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
# If the target is background, the box target is set to all 0s.
box_targets = tf.where(
tf.tile(
tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
[1, 1, 4]), tf.zeros_like(box_targets), box_targets)
model_outputs.update({
'class_targets': matched_gt_classes,
'box_targets': box_targets,
})
roi_features = spatial_transform_ops.multilevel_crop_and_resize(
fpn_features, rpn_rois, output_size=7)
class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)
model_outputs.update({
'class_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
class_outputs),
'box_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
box_outputs),
})
# Add this output to train to make the checkpoint loadable in predict mode.
# If we skip it in train mode, the heads will be out-of-order and checkpoint
# loading will fail.
boxes, scores, classes, valid_detections = self._generate_detections_fn(
box_outputs, class_outputs, rpn_rois, inputs['image_info'][:, 1:2, :])
model_outputs.update({
'num_detections': valid_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores,
})
if not self._include_mask:
return model_outputs
if is_training:
rpn_rois, classes, mask_targets = self._sample_masks_fn(
rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
inputs['gt_masks'])
mask_targets = tf.stop_gradient(mask_targets)
classes = tf.cast(classes, dtype=tf.int32)
model_outputs.update({
'mask_targets': mask_targets,
'sampled_class_targets': classes,
})
else:
rpn_rois = boxes
classes = tf.cast(classes, dtype=tf.int32)
mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
fpn_features, rpn_rois, output_size=14)
mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)
if is_training:
model_outputs.update({
'mask_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
mask_outputs),
})
else:
model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
rpn_score_loss = self._rpn_score_loss_fn(outputs['rpn_score_outputs'],
labels['rpn_score_targets'])
rpn_box_loss = self._rpn_box_loss_fn(outputs['rpn_box_outputs'],
labels['rpn_box_targets'])
frcnn_class_loss = self._frcnn_class_loss_fn(outputs['class_outputs'],
outputs['class_targets'])
frcnn_box_loss = self._frcnn_box_loss_fn(outputs['box_outputs'],
outputs['class_targets'],
outputs['box_targets'])
if self._include_mask:
mask_loss = self._mask_loss_fn(outputs['mask_outputs'],
outputs['mask_targets'],
outputs['sampled_class_targets'])
else:
mask_loss = 0.0
model_loss = (
rpn_score_loss + rpn_box_loss + frcnn_class_loss + frcnn_box_loss +
mask_loss)
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
return {
'total_loss': total_loss,
'loss': total_loss,
'fast_rcnn_class_loss': frcnn_class_loss,
'fast_rcnn_box_loss': frcnn_box_loss,
'mask_loss': mask_loss,
'model_loss': model_loss,
'l2_regularization_loss': l2_regularization_loss,
'rpn_score_loss': rpn_score_loss,
'rpn_box_loss': rpn_box_loss,
}
return _total_loss_fn
def build_input_layers(self, params, mode):
is_training = mode == mode_keys.TRAIN
input_shape = (
params.maskrcnn_parser.output_size +
[params.maskrcnn_parser.num_channels])
if is_training:
batch_size = params.train.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2],
batch_size=batch_size,
name='image_info',
),
'gt_boxes':
tf.keras.layers.Input(
shape=[params.maskrcnn_parser.max_num_instances, 4],
batch_size=batch_size,
name='gt_boxes'),
'gt_classes':
tf.keras.layers.Input(
shape=[params.maskrcnn_parser.max_num_instances],
batch_size=batch_size,
name='gt_classes',
dtype=tf.int64),
}
if self._include_mask:
input_layer['gt_masks'] = tf.keras.layers.Input(
shape=[
params.maskrcnn_parser.max_num_instances,
params.maskrcnn_parser.mask_crop_size,
params.maskrcnn_parser.mask_crop_size
],
batch_size=batch_size,
name='gt_masks')
else:
batch_size = params.eval.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2],
batch_size=batch_size,
name='image_info',
),
}
return input_layer
def build_model(self, params, mode):
if self._keras_model is None:
input_layers = self.build_input_layers(self._params, mode)
outputs = self.model_outputs(input_layers, mode)
model = tf.keras.models.Model(
inputs=input_layers, outputs=outputs, name='maskrcnn')
assert model is not None, 'Fail to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
required_output_fields = ['class_outputs', 'box_outputs']
for field in required_output_fields:
if field not in outputs:
raise ValueError('"%s" is missing in outputs, requried %s found %s' %
(field, required_output_fields, outputs.keys()))
predictions = {
'image_info': labels['image_info'],
'num_detections': outputs['num_detections'],
'detection_boxes': outputs['detection_boxes'],
'detection_classes': outputs['detection_classes'],
'detection_scores': outputs['detection_scores'],
}
if self._include_mask:
predictions.update({
'detection_masks': outputs['detection_masks'],
})
if 'groundtruths' in labels:
predictions['source_id'] = labels['groundtruths']['source_id']
predictions['gt_source_id'] = labels['groundtruths']['source_id']
predictions['gt_height'] = labels['groundtruths']['height']
predictions['gt_width'] = labels['groundtruths']['width']
predictions['gt_image_info'] = labels['image_info']
predictions['gt_num_detections'] = (
labels['groundtruths']['num_detections'])
predictions['gt_boxes'] = labels['groundtruths']['boxes']
predictions['gt_classes'] = labels['groundtruths']['classes']
predictions['gt_areas'] = labels['groundtruths']['areas']
predictions['gt_is_crowds'] = labels['groundtruths']['is_crowds']
return labels, predictions
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model defination for the Object Localization Network (OLN) Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.dataloader import anchor
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.modeling.maskrcnn_model import MaskrcnnModel
from official.vision.detection.ops import postprocess_ops
from official.vision.detection.ops import roi_ops
from official.vision.detection.ops import spatial_transform_ops
from official.vision.detection.ops import target_ops
from official.vision.detection.utils import box_utils
class OlnMaskModel(MaskrcnnModel):
"""OLN-Mask model function."""
def __init__(self, params):
super(OlnMaskModel, self).__init__(params)
self._params = params
# Different heads and layers.
self._include_rpn_class = params.architecture.include_rpn_class
self._include_mask = params.architecture.include_mask
self._include_frcnn_class = params.architecture.include_frcnn_class
self._include_frcnn_box = params.architecture.include_frcnn_box
self._include_centerness = params.rpn_head.has_centerness
self._include_box_score = (params.frcnn_head.has_scoring and
params.architecture.include_frcnn_box)
self._include_mask_score = (params.mrcnn_head.has_scoring and
params.architecture.include_mask)
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._rpn_head_fn = factory.rpn_head_generator(params)
if self._include_centerness:
self._rpn_head_fn = factory.oln_rpn_head_generator(params)
else:
self._rpn_head_fn = factory.rpn_head_generator(params)
self._generate_rois_fn = roi_ops.OlnROIGenerator(params.roi_proposal)
self._sample_rois_fn = target_ops.ROIScoreSampler(params.roi_sampling)
self._sample_masks_fn = target_ops.MaskSampler(
params.architecture.mask_target_size,
params.mask_sampling.num_mask_samples_per_image)
if self._include_box_score:
self._frcnn_head_fn = factory.oln_box_score_head_generator(params)
else:
self._frcnn_head_fn = factory.fast_rcnn_head_generator(params)
if self._include_mask:
if self._include_mask_score:
self._mrcnn_head_fn = factory.oln_mask_score_head_generator(params)
else:
self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params)
# Loss function.
self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
self._rpn_box_loss_fn = losses.RpnBoxLoss(params.rpn_box_loss)
if self._include_centerness:
self._rpn_iou_loss_fn = losses.OlnRpnIoULoss()
self._rpn_center_loss_fn = losses.OlnRpnCenterLoss()
self._frcnn_class_loss_fn = losses.FastrcnnClassLoss()
self._frcnn_box_loss_fn = losses.FastrcnnBoxLoss(params.frcnn_box_loss)
if self._include_box_score:
self._frcnn_box_score_loss_fn = losses.OlnBoxScoreLoss(
params.frcnn_box_score_loss)
if self._include_mask:
self._mask_loss_fn = losses.MaskrcnnLoss()
self._generate_detections_fn = postprocess_ops.OlnDetectionGenerator(
params.postprocess)
self._transpose_input = params.train.transpose_input
assert not self._transpose_input, 'Transpose input is not supportted.'
def build_outputs(self, inputs, mode):
is_training = mode == mode_keys.TRAIN
model_outputs = {}
image = inputs['image']
_, image_height, image_width, _ = image.get_shape().as_list()
backbone_features = self._backbone_fn(image, is_training)
fpn_features = self._fpn_fn(backbone_features, is_training)
# rpn_centerness.
if self._include_centerness:
rpn_score_outputs, rpn_box_outputs, rpn_center_outputs = (
self._rpn_head_fn(fpn_features, is_training))
model_outputs.update({
'rpn_center_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
rpn_center_outputs),
})
object_scores = rpn_center_outputs
else:
rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
fpn_features, is_training)
object_scores = None
model_outputs.update({
'rpn_score_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
rpn_score_outputs),
'rpn_box_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
rpn_box_outputs),
})
input_anchor = anchor.Anchor(self._params.architecture.min_level,
self._params.architecture.max_level,
self._params.anchor.num_scales,
self._params.anchor.aspect_ratios,
self._params.anchor.anchor_size,
(image_height, image_width))
rpn_rois, rpn_roi_scores = self._generate_rois_fn(
rpn_box_outputs,
rpn_score_outputs,
input_anchor.multilevel_boxes,
inputs['image_info'][:, 1, :],
is_training,
is_box_lrtb=self._include_centerness,
object_scores=object_scores,
)
if (not self._include_frcnn_class and
not self._include_frcnn_box and
not self._include_mask):
# if not is_training:
# For direct RPN detection,
# use dummy box_outputs = (dy,dx,dh,dw = 0,0,0,0)
box_outputs = tf.zeros_like(rpn_rois)
box_outputs = tf.concat([box_outputs, box_outputs], -1)
boxes, scores, classes, valid_detections = self._generate_detections_fn(
box_outputs, rpn_roi_scores, rpn_rois,
inputs['image_info'][:, 1:2, :],
is_single_fg_score=True, # if no_background, no softmax is applied.
keep_nms=True)
model_outputs.update({
'num_detections': valid_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores,
})
return model_outputs
# ---- OLN-Proposal finishes here. ----
if is_training:
rpn_rois = tf.stop_gradient(rpn_rois)
rpn_roi_scores = tf.stop_gradient(rpn_roi_scores)
# Sample proposals.
(rpn_rois, rpn_roi_scores, matched_gt_boxes, matched_gt_classes,
matched_gt_indices) = (
self._sample_rois_fn(rpn_rois, rpn_roi_scores, inputs['gt_boxes'],
inputs['gt_classes']))
# Create bounding box training targets.
box_targets = box_utils.encode_boxes(
matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
# If the target is background, the box target is set to all 0s.
box_targets = tf.where(
tf.tile(
tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
[1, 1, 4]), tf.zeros_like(box_targets), box_targets)
model_outputs.update({
'class_targets': matched_gt_classes,
'box_targets': box_targets,
})
# Create Box-IoU targets. {
box_ious = box_utils.bbox_overlap(
rpn_rois, inputs['gt_boxes'])
matched_box_ious = tf.reduce_max(box_ious, 2)
model_outputs.update({
'box_iou_targets': matched_box_ious,}) # }
roi_features = spatial_transform_ops.multilevel_crop_and_resize(
fpn_features, rpn_rois, output_size=7)
if not self._include_box_score:
class_outputs, box_outputs = self._frcnn_head_fn(
roi_features, is_training)
else:
class_outputs, box_outputs, score_outputs = self._frcnn_head_fn(
roi_features, is_training)
model_outputs.update({
'box_score_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
score_outputs),})
model_outputs.update({
'class_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
class_outputs),
'box_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
box_outputs),
})
# Add this output to train to make the checkpoint loadable in predict mode.
# If we skip it in train mode, the heads will be out-of-order and checkpoint
# loading will fail.
if not self._include_frcnn_box:
box_outputs = tf.zeros_like(box_outputs) # dummy zeros.
if self._include_box_score:
score_outputs = tf.cast(tf.squeeze(score_outputs, -1),
rpn_roi_scores.dtype)
# box-score = (rpn-centerness * box-iou)^(1/2)
# TR: rpn_roi_scores: b,1000, score_outputs: b,512
# TS: rpn_roi_scores: b,1000, score_outputs: b,1000
box_scores = tf.pow(
rpn_roi_scores * tf.sigmoid(score_outputs), 1/2.)
if not self._include_frcnn_class:
boxes, scores, classes, valid_detections = self._generate_detections_fn(
box_outputs,
box_scores,
rpn_rois,
inputs['image_info'][:, 1:2, :],
is_single_fg_score=True,
keep_nms=True,)
else:
boxes, scores, classes, valid_detections = self._generate_detections_fn(
box_outputs, class_outputs, rpn_rois,
inputs['image_info'][:, 1:2, :],
keep_nms=True,)
model_outputs.update({
'num_detections': valid_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores,
})
# ---- OLN-Box finishes here. ----
if not self._include_mask:
return model_outputs
if is_training:
rpn_rois, classes, mask_targets = self._sample_masks_fn(
rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
inputs['gt_masks'])
mask_targets = tf.stop_gradient(mask_targets)
classes = tf.cast(classes, dtype=tf.int32)
model_outputs.update({
'mask_targets': mask_targets,
'sampled_class_targets': classes,
})
else:
rpn_rois = boxes
classes = tf.cast(classes, dtype=tf.int32)
mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
fpn_features, rpn_rois, output_size=14)
mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)
if is_training:
model_outputs.update({
'mask_outputs':
tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
mask_outputs),
})
else:
model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
if self._include_rpn_class:
rpn_score_loss = self._rpn_score_loss_fn(outputs['rpn_score_outputs'],
labels['rpn_score_targets'])
else:
rpn_score_loss = 0.0
if self._include_centerness:
rpn_center_loss = self._rpn_center_loss_fn(
outputs['rpn_center_outputs'], labels['rpn_center_targets'])
rpn_box_loss = self._rpn_iou_loss_fn(
outputs['rpn_box_outputs'], labels['rpn_box_targets'],
labels['rpn_center_targets'])
else:
rpn_center_loss = 0.0
rpn_box_loss = self._rpn_box_loss_fn(
outputs['rpn_box_outputs'], labels['rpn_box_targets'])
if self._include_frcnn_class:
frcnn_class_loss = self._frcnn_class_loss_fn(
outputs['class_outputs'], outputs['class_targets'])
else:
frcnn_class_loss = 0.0
if self._include_frcnn_box:
frcnn_box_loss = self._frcnn_box_loss_fn(
outputs['box_outputs'], outputs['class_targets'],
outputs['box_targets'])
else:
frcnn_box_loss = 0.0
if self._include_box_score:
box_score_loss = self._frcnn_box_score_loss_fn(
outputs['box_score_outputs'], outputs['box_iou_targets'])
else:
box_score_loss = 0.0
if self._include_mask:
mask_loss = self._mask_loss_fn(outputs['mask_outputs'],
outputs['mask_targets'],
outputs['sampled_class_targets'])
else:
mask_loss = 0.0
model_loss = (
rpn_score_loss + rpn_box_loss + rpn_center_loss +
frcnn_class_loss + frcnn_box_loss + box_score_loss +
mask_loss)
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
return {
'total_loss': total_loss,
'loss': total_loss,
'fast_rcnn_class_loss': frcnn_class_loss,
'fast_rcnn_box_loss': frcnn_box_loss,
'fast_rcnn_box_score_loss': box_score_loss,
'mask_loss': mask_loss,
'model_loss': model_loss,
'l2_regularization_loss': l2_regularization_loss,
'rpn_score_loss': rpn_score_loss,
'rpn_box_loss': rpn_box_loss,
'rpn_center_loss': rpn_center_loss,
}
return _total_loss_fn
def build_input_layers(self, params, mode):
is_training = mode == mode_keys.TRAIN
input_shape = (
params.olnmask_parser.output_size +
[params.olnmask_parser.num_channels])
if is_training:
batch_size = params.train.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2],
batch_size=batch_size,
name='image_info',
),
'gt_boxes':
tf.keras.layers.Input(
shape=[params.olnmask_parser.max_num_instances, 4],
batch_size=batch_size,
name='gt_boxes'),
'gt_classes':
tf.keras.layers.Input(
shape=[params.olnmask_parser.max_num_instances],
batch_size=batch_size,
name='gt_classes',
dtype=tf.int64),
}
if self._include_mask:
input_layer['gt_masks'] = tf.keras.layers.Input(
shape=[
params.olnmask_parser.max_num_instances,
params.olnmask_parser.mask_crop_size,
params.olnmask_parser.mask_crop_size
],
batch_size=batch_size,
name='gt_masks')
else:
batch_size = params.eval.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2],
batch_size=batch_size,
name='image_info',
),
}
return input_layer
def build_model(self, params, mode):
if self._keras_model is None:
input_layers = self.build_input_layers(self._params, mode)
outputs = self.model_outputs(input_layers, mode)
model = tf.keras.models.Model(
inputs=input_layers, outputs=outputs, name='olnmask')
assert model is not None, 'Fail to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf
class OptimizerFactory(object):
"""Class to generate optimizer function."""
def __init__(self, params):
"""Creates optimized based on the specified flags."""
if params.type == 'momentum':
self._optimizer = functools.partial(
tf.keras.optimizers.SGD,
momentum=params.momentum,
nesterov=params.nesterov)
elif params.type == 'adam':
self._optimizer = tf.keras.optimizers.Adam
elif params.type == 'adadelta':
self._optimizer = tf.keras.optimizers.Adadelta
elif params.type == 'adagrad':
self._optimizer = tf.keras.optimizers.Adagrad
elif params.type == 'rmsprop':
self._optimizer = functools.partial(
tf.keras.optimizers.RMSprop, momentum=params.momentum)
else:
raise ValueError('Unsupported optimizer type `{}`.'.format(params.type))
def __call__(self, learning_rate):
return self._optimizer(learning_rate=learning_rate)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model defination for the RetinaNet Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.evaluation import factory as eval_factory
from official.vision.detection.modeling import base_model
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.ops import postprocess_ops
class RetinanetModel(base_model.Model):
"""RetinaNet model function."""
def __init__(self, params):
super(RetinanetModel, self).__init__(params)
# For eval metrics.
self._params = params
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._head_fn = factory.retinanet_head_generator(params)
# Loss function.
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
self._keras_model = None
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level, params.architecture.max_level,
params.postprocess)
self._transpose_input = params.train.transpose_input
assert not self._transpose_input, 'Transpose input is not supported.'
# Input layer.
self._input_layer = tf.keras.layers.Input(
shape=(None, None, params.retinanet_parser.num_channels),
name='',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32)
def build_outputs(self, inputs, mode):
# If the input image is transposed (from NHWC to HWCN), we need to revert it
# back to the original shape before it's used in the computation.
if self._transpose_input:
inputs = tf.transpose(inputs, [3, 0, 1, 2])
backbone_features = self._backbone_fn(
inputs, is_training=(mode == mode_keys.TRAIN))
fpn_features = self._fpn_fn(
backbone_features, is_training=(mode == mode_keys.TRAIN))
cls_outputs, box_outputs = self._head_fn(
fpn_features, is_training=(mode == mode_keys.TRAIN))
if self._use_bfloat16:
levels = cls_outputs.keys()
for level in levels:
cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
model_outputs = {
'cls_outputs': cls_outputs,
'box_outputs': box_outputs,
}
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
labels['cls_targets'],
labels['num_positives'])
box_loss = self._box_loss_fn(outputs['box_outputs'],
labels['box_targets'],
labels['num_positives'])
model_loss = cls_loss + self._box_loss_weight * box_loss
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
return {
'total_loss': total_loss,
'cls_loss': cls_loss,
'box_loss': box_loss,
'model_loss': model_loss,
'l2_regularization_loss': l2_regularization_loss,
}
return _total_loss_fn
def build_model(self, params, mode=None):
if self._keras_model is None:
outputs = self.model_outputs(self._input_layer, mode)
model = tf.keras.models.Model(
inputs=self._input_layer, outputs=outputs, name='retinanet')
assert model is not None, 'Fail to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
# TODO(yeqing): Moves the output related part into build_outputs.
required_output_fields = ['cls_outputs', 'box_outputs']
for field in required_output_fields:
if field not in outputs:
raise ValueError('"%s" is missing in outputs, requried %s found %s',
field, required_output_fields, outputs.keys())
required_label_fields = ['image_info', 'groundtruths']
for field in required_label_fields:
if field not in labels:
raise ValueError('"%s" is missing in outputs, requried %s found %s',
field, required_label_fields, labels.keys())
boxes, scores, classes, valid_detections = self._generate_detections_fn(
outputs['box_outputs'], outputs['cls_outputs'], labels['anchor_boxes'],
labels['image_info'][:, 1:2, :])
# Discards the old output tensors to save memory. The `cls_outputs` and
# `box_outputs` are pretty big and could potentiall lead to memory issue.
outputs = {
'source_id': labels['groundtruths']['source_id'],
'image_info': labels['image_info'],
'num_detections': valid_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores,
}
if 'groundtruths' in labels:
labels['source_id'] = labels['groundtruths']['source_id']
labels['boxes'] = labels['groundtruths']['boxes']
labels['classes'] = labels['groundtruths']['classes']
labels['areas'] = labels['groundtruths']['areas']
labels['is_crowds'] = labels['groundtruths']['is_crowds']
return labels, outputs
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model definition for the ShapeMask Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.dataloader import anchor
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.evaluation import factory as eval_factory
from official.vision.detection.modeling import base_model
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.ops import postprocess_ops
from official.vision.detection.utils import box_utils
class ShapeMaskModel(base_model.Model):
"""ShapeMask model function."""
def __init__(self, params):
super(ShapeMaskModel, self).__init__(params)
self._params = params
self._keras_model = None
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._retinanet_head_fn = factory.retinanet_head_generator(params)
self._shape_prior_head_fn = factory.shapeprior_head_generator(params)
self._coarse_mask_fn = factory.coarsemask_head_generator(params)
self._fine_mask_fn = factory.finemask_head_generator(params)
# Loss functions.
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
# Mask loss function.
self._shapemask_prior_loss_fn = losses.ShapemaskMseLoss()
self._shapemask_loss_fn = losses.ShapemaskLoss()
self._shape_prior_loss_weight = (
params.shapemask_loss.shape_prior_loss_weight)
self._coarse_mask_loss_weight = (
params.shapemask_loss.coarse_mask_loss_weight)
self._fine_mask_loss_weight = (params.shapemask_loss.fine_mask_loss_weight)
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level, params.architecture.max_level,
params.postprocess)
def build_outputs(self, inputs, mode):
is_training = mode == mode_keys.TRAIN
images = inputs['image']
if 'anchor_boxes' in inputs:
anchor_boxes = inputs['anchor_boxes']
else:
anchor_boxes = anchor.Anchor(
self._params.architecture.min_level,
self._params.architecture.max_level, self._params.anchor.num_scales,
self._params.anchor.aspect_ratios, self._params.anchor.anchor_size,
images.get_shape().as_list()[1:3]).multilevel_boxes
batch_size = tf.shape(images)[0]
for level in anchor_boxes:
anchor_boxes[level] = tf.tile(
tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1, 1])
backbone_features = self._backbone_fn(images, is_training=is_training)
fpn_features = self._fpn_fn(backbone_features, is_training=is_training)
cls_outputs, box_outputs = self._retinanet_head_fn(
fpn_features, is_training=is_training)
valid_boxes, valid_scores, valid_classes, valid_detections = (
self._generate_detections_fn(box_outputs, cls_outputs, anchor_boxes,
inputs['image_info'][:, 1:2, :]))
image_size = images.get_shape().as_list()[1:3]
valid_outer_boxes = box_utils.compute_outer_boxes(
tf.reshape(valid_boxes, [-1, 4]),
image_size,
scale=self._params.shapemask_parser.outer_box_scale)
valid_outer_boxes = tf.reshape(valid_outer_boxes, tf.shape(valid_boxes))
# Wrapping if else code paths into a layer to make the checkpoint loadable
# in prediction mode.
class SampledBoxesLayer(tf.keras.layers.Layer):
"""ShapeMask model function."""
def call(self, inputs, val_boxes, val_classes, val_outer_boxes, training):
if training:
boxes = inputs['mask_boxes']
outer_boxes = inputs['mask_outer_boxes']
classes = inputs['mask_classes']
else:
boxes = val_boxes
classes = val_classes
outer_boxes = val_outer_boxes
return boxes, classes, outer_boxes
boxes, classes, outer_boxes = SampledBoxesLayer()(
inputs,
valid_boxes,
valid_classes,
valid_outer_boxes,
training=is_training)
instance_features, prior_masks = self._shape_prior_head_fn(
fpn_features, boxes, outer_boxes, classes, is_training)
coarse_mask_logits = self._coarse_mask_fn(instance_features, prior_masks,
classes, is_training)
fine_mask_logits = self._fine_mask_fn(instance_features, coarse_mask_logits,
classes, is_training)
model_outputs = {
'cls_outputs': cls_outputs,
'box_outputs': box_outputs,
'fine_mask_logits': fine_mask_logits,
'coarse_mask_logits': coarse_mask_logits,
'prior_masks': prior_masks,
}
if not is_training:
model_outputs.update({
'num_detections': valid_detections,
'detection_boxes': valid_boxes,
'detection_outer_boxes': valid_outer_boxes,
'detection_masks': fine_mask_logits,
'detection_classes': valid_classes,
'detection_scores': valid_scores,
})
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
labels['cls_targets'],
labels['num_positives'])
box_loss = self._box_loss_fn(outputs['box_outputs'],
labels['box_targets'],
labels['num_positives'])
# Adds Shapemask model losses.
shape_prior_loss = self._shapemask_prior_loss_fn(outputs['prior_masks'],
labels['mask_targets'],
labels['mask_is_valid'])
coarse_mask_loss = self._shapemask_loss_fn(outputs['coarse_mask_logits'],
labels['mask_targets'],
labels['mask_is_valid'])
fine_mask_loss = self._shapemask_loss_fn(outputs['fine_mask_logits'],
labels['fine_mask_targets'],
labels['mask_is_valid'])
model_loss = (
cls_loss + self._box_loss_weight * box_loss +
shape_prior_loss * self._shape_prior_loss_weight +
coarse_mask_loss * self._coarse_mask_loss_weight +
fine_mask_loss * self._fine_mask_loss_weight)
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
shapemask_losses = {
'total_loss': total_loss,
'loss': total_loss,
'retinanet_cls_loss': cls_loss,
'l2_regularization_loss': l2_regularization_loss,
'retinanet_box_loss': box_loss,
'shapemask_prior_loss': shape_prior_loss,
'shapemask_coarse_mask_loss': coarse_mask_loss,
'shapemask_fine_mask_loss': fine_mask_loss,
'model_loss': model_loss,
}
return shapemask_losses
return _total_loss_fn
def build_input_layers(self, params, mode):
is_training = mode == mode_keys.TRAIN
input_shape = (
params.shapemask_parser.output_size +
[params.shapemask_parser.num_channels])
if is_training:
batch_size = params.train.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2], batch_size=batch_size, name='image_info'),
'mask_classes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks],
batch_size=batch_size,
name='mask_classes',
dtype=tf.int64),
'mask_outer_boxes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks, 4],
batch_size=batch_size,
name='mask_outer_boxes',
dtype=tf.float32),
'mask_boxes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks, 4],
batch_size=batch_size,
name='mask_boxes',
dtype=tf.float32),
}
else:
batch_size = params.eval.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2], batch_size=batch_size, name='image_info'),
}
return input_layer
def build_model(self, params, mode):
if self._keras_model is None:
input_layers = self.build_input_layers(self._params, mode)
outputs = self.model_outputs(input_layers, mode)
model = tf.keras.models.Model(
inputs=input_layers, outputs=outputs, name='shapemask')
assert model is not None, 'Fail to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
required_output_fields = [
'num_detections', 'detection_boxes', 'detection_classes',
'detection_masks', 'detection_scores'
]
for field in required_output_fields:
if field not in outputs:
raise ValueError(
'"{}" is missing in outputs, requried {} found {}'.format(
field, required_output_fields, outputs.keys()))
required_label_fields = ['image_info']
for field in required_label_fields:
if field not in labels:
raise ValueError(
'"{}" is missing in labels, requried {} found {}'.format(
field, required_label_fields, labels.keys()))
predictions = {
'image_info': labels['image_info'],
'num_detections': outputs['num_detections'],
'detection_boxes': outputs['detection_boxes'],
'detection_outer_boxes': outputs['detection_outer_boxes'],
'detection_classes': outputs['detection_classes'],
'detection_scores': outputs['detection_scores'],
'detection_masks': outputs['detection_masks'],
}
if 'groundtruths' in labels:
predictions['source_id'] = labels['groundtruths']['source_id']
labels = labels['groundtruths']
return labels, predictions
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation of non max suppression."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.utils import box_utils
NMS_TILE_SIZE = 512
def _self_suppression(iou, _, iou_sum):
batch_size = tf.shape(iou)[0]
can_suppress_others = tf.cast(
tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
iou_suppressed = tf.reshape(
tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
[batch_size, -1, 1]) * iou
iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
return [
iou_suppressed,
tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
]
def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
batch_size = tf.shape(boxes)[0]
new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
iou = box_utils.bbox_overlap(new_slice, box_slice)
ret_slice = tf.expand_dims(
tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
2) * box_slice
return boxes, ret_slice, iou_threshold, inner_idx + 1
def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
"""Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
Args:
boxes: a tensor with a shape of [batch_size, anchors, 4].
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
output_size: an int32 tensor of size [batch_size]. Representing the number
of selected boxes for each batch.
idx: an integer scalar representing induction variable.
Returns:
boxes: updated boxes.
iou_threshold: pass down iou_threshold to the next iteration.
output_size: the updated output_size.
idx: the updated induction variable.
"""
num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
batch_size = tf.shape(boxes)[0]
# Iterates over tiles that can possibly suppress the current tile.
box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
_, box_slice, _, _ = tf.while_loop(
lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
_cross_suppression, [boxes, box_slice, iou_threshold,
tf.constant(0)])
# Iterates over the current tile to compute self-suppression.
iou = box_utils.bbox_overlap(box_slice, box_slice)
mask = tf.expand_dims(
tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
suppressed_iou, _, _ = tf.while_loop(
lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
[iou, tf.constant(True),
tf.reduce_sum(iou, [1, 2])])
suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
# Uses box_slice to update the input boxes.
mask = tf.reshape(
tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
boxes = tf.tile(tf.expand_dims(
box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
boxes = tf.reshape(boxes, [batch_size, -1, 4])
# Updates output_size.
output_size += tf.reduce_sum(
tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
return boxes, iou_threshold, output_size, idx + 1
def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
iou_threshold):
"""A wrapper that handles non-maximum suppression.
Assumption:
* The boxes are sorted by scores unless the box is a dot (all coordinates
are zero).
* Boxes with higher scores can be used to suppress boxes with lower scores.
The overal design of the algorithm is to handle boxes tile-by-tile:
boxes = boxes.pad_to_multiply_of(tile_size)
num_tiles = len(boxes) // tile_size
output_boxes = []
for i in range(num_tiles):
box_tile = boxes[i*tile_size : (i+1)*tile_size]
for j in range(i - 1):
suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
iou = bbox_overlap(box_tile, suppressing_tile)
# if the box is suppressed in iou, clear it to a dot
box_tile *= _update_boxes(iou)
# Iteratively handle the diagnal tile.
iou = _box_overlap(box_tile, box_tile)
iou_changed = True
while iou_changed:
# boxes that are not suppressed by anything else
suppressing_boxes = _get_suppressing_boxes(iou)
# boxes that are suppressed by suppressing_boxes
suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
# clear iou to 0 for boxes that are suppressed, as they cannot be used
# to suppress other boxes any more
new_iou = _clear_iou(iou, suppressed_boxes)
iou_changed = (new_iou != iou)
iou = new_iou
# remaining boxes that can still suppress others, are selected boxes.
output_boxes.append(_get_suppressing_boxes(iou))
if len(output_boxes) >= max_output_size:
break
Args:
scores: a tensor with a shape of [batch_size, anchors].
boxes: a tensor with a shape of [batch_size, anchors, 4].
max_output_size: a scalar integer `Tensor` representing the maximum number
of boxes to be selected by non max suppression.
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
Returns:
nms_scores: a tensor with a shape of [batch_size, anchors]. It has same
dtype as input scores.
nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has
same dtype as input boxes.
"""
batch_size = tf.shape(boxes)[0]
num_boxes = tf.shape(boxes)[1]
pad = tf.cast(
tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
tf.int32) * NMS_TILE_SIZE - num_boxes
boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
scores = tf.pad(
tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
num_boxes += pad
def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
return tf.logical_and(
tf.reduce_min(output_size) < max_output_size,
idx < num_boxes // NMS_TILE_SIZE)
selected_boxes, _, output_size, _ = tf.while_loop(
_loop_cond, _suppression_loop_body,
[boxes, iou_threshold,
tf.zeros([batch_size], tf.int32),
tf.constant(0)])
idx = num_boxes - tf.cast(
tf.nn.top_k(
tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
tf.int32)
idx = tf.minimum(idx, num_boxes - 1)
idx = tf.reshape(idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]),
[-1])
boxes = tf.reshape(
tf.gather(tf.reshape(boxes, [-1, 4]), idx),
[batch_size, max_output_size, 4])
boxes = boxes * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
output_size, [-1, 1, 1]), boxes.dtype)
scores = tf.reshape(
tf.gather(tf.reshape(scores, [-1, 1]), idx),
[batch_size, max_output_size])
scores = scores * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
output_size, [-1, 1]), scores.dtype)
return scores, boxes
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Post-processing model outputs to generate detection."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from official.vision.detection.ops import nms
from official.vision.detection.utils import box_utils
def generate_detections_factory(params):
"""Factory to select function to generate detection."""
if params.use_batched_nms:
func = functools.partial(
_generate_detections_batched,
max_total_size=params.max_total_size,
nms_iou_threshold=params.nms_iou_threshold,
score_threshold=params.score_threshold)
else:
func = functools.partial(
_generate_detections,
max_total_size=params.max_total_size,
nms_iou_threshold=params.nms_iou_threshold,
score_threshold=params.score_threshold,
pre_nms_num_boxes=params.pre_nms_num_boxes)
return func
def _select_top_k_scores(scores_in, pre_nms_num_detections):
"""Select top_k scores and indices for each class.
Args:
scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
class logit outputs on all feature levels. The N is the number of total
anchors on all levels. The num_classes is the number of classes predicted
by the model.
pre_nms_num_detections: Number of candidates before NMS.
Returns:
scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
num_classes].
"""
batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
scores_trans = tf.reshape(scores_trans, [-1, num_anchors])
top_k_scores, top_k_indices = tf.nn.top_k(
scores_trans, k=pre_nms_num_detections, sorted=True)
top_k_scores = tf.reshape(top_k_scores,
[batch_size, num_class, pre_nms_num_detections])
top_k_indices = tf.reshape(top_k_indices,
[batch_size, num_class, pre_nms_num_detections])
return tf.transpose(top_k_scores,
[0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
def _generate_detections(boxes,
scores,
max_total_size=100,
nms_iou_threshold=0.3,
score_threshold=0.05,
pre_nms_num_boxes=5000):
"""Generate the final detections given the model outputs.
This uses classes unrolling with while loop based NMS, could be parralled
at batch dimension.
Args:
boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
N, 1, 4], which box predictions on all feature levels. The N is the number
of total anchors on all levels.
scores: a tensor with shape [batch_size, N, num_classes], which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
pre_nms_num_boxes: an int number of top candidate detections per class
before NMS.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size] only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
nmsed_boxes = []
nmsed_classes = []
nmsed_scores = []
valid_detections = []
batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
_, total_anchors, num_classes = scores.get_shape().as_list()
# Selects top pre_nms_num scores and indices before NMS.
scores, indices = _select_top_k_scores(
scores, min(total_anchors, pre_nms_num_boxes))
for i in range(num_classes):
boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
scores_i = scores[:, :, i]
# Obtains pre_nms_num_boxes before running NMS.
boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
# Filter out scores.
boxes_i, scores_i = box_utils.filter_boxes_by_scores(
boxes_i, scores_i, min_score_threshold=score_threshold)
(nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
tf.cast(scores_i, tf.float32),
tf.cast(boxes_i, tf.float32),
max_total_size,
iou_threshold=nms_iou_threshold)
nmsed_classes_i = tf.fill([batch_size, max_total_size], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
nmsed_scores = tf.concat(nmsed_scores, axis=1)
nmsed_classes = tf.concat(nmsed_classes, axis=1)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_total_size, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32), axis=1)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_per_image(boxes,
scores,
max_total_size=100,
nms_iou_threshold=0.3,
score_threshold=0.05,
pre_nms_num_boxes=5000):
"""Generate the final detections per image given the model outputs.
Args:
boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which box
predictions on all feature levels. The N is the number of total anchors on
all levels.
scores: a tensor with shape [N, num_classes], which stacks class probability
on all feature levels. The N is the number of total anchors on all levels.
The num_classes is the number of classes predicted by the model. Note that
the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
pre_nms_num_boxes: an int number of top candidate detections per class
before NMS.
Returns:
nms_boxes: `float` Tensor of shape [max_total_size, 4] representing top
detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [max_total_size] representing sorted
confidence scores for detected boxes. The values are between [0, 1].
nms_classes: `int` Tensor of shape [max_total_size] representing classes for
detected boxes.
valid_detections: `int` Tensor of shape [1] only the top `valid_detections`
boxes are valid detections.
"""
nmsed_boxes = []
nmsed_scores = []
nmsed_classes = []
num_classes_for_box = boxes.get_shape().as_list()[1]
num_classes = scores.get_shape().as_list()[1]
for i in range(num_classes):
boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
scores_i = scores[:, i]
# Obtains pre_nms_num_boxes before running NMS.
scores_i, indices = tf.nn.top_k(
scores_i, k=tf.minimum(tf.shape(input=scores_i)[-1], pre_nms_num_boxes))
boxes_i = tf.gather(boxes_i, indices)
(nmsed_indices_i, nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
tf.cast(boxes_i, tf.float32),
tf.cast(scores_i, tf.float32),
max_total_size,
iou_threshold=nms_iou_threshold,
score_threshold=score_threshold,
pad_to_max_output_size=True,
name='nms_detections_' + str(i))
nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
# Sets scores of invalid boxes to -1.
nmsed_scores_i = tf.where(
tf.less(tf.range(max_total_size), [nmsed_num_valid_i]), nmsed_scores_i,
-tf.ones_like(nmsed_scores_i))
nmsed_classes_i = tf.fill([max_total_size], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
# Concats results from all classes and sort them.
nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
nmsed_scores = tf.concat(nmsed_scores, axis=0)
nmsed_classes = tf.concat(nmsed_classes, axis=0)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_total_size, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices)
nmsed_classes = tf.gather(nmsed_classes, indices)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_batched(boxes, scores, max_total_size,
nms_iou_threshold, score_threshold):
"""Generates detected boxes with scores and classes for one-stage detector.
The function takes output of multi-level ConvNets and anchor boxes and
generates detected boxes. Note that this used batched nms, which is not
supported on TPU currently.
Args:
boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
N, 1, 4], which box predictions on all feature levels. The N is the number
of total anchors on all levels.
scores: a tensor with shape [batch_size, N, num_classes], which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size] only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
# TODO(tsungyi): Removes normalization/denomalization once the
# tf.image.combined_non_max_suppression is coordinate system agnostic.
# Normalizes maximum box cooridinates to 1.
normalizer = tf.reduce_max(boxes)
boxes /= normalizer
(nmsed_boxes, nmsed_scores, nmsed_classes,
valid_detections) = tf.image.combined_non_max_suppression(
boxes,
scores,
max_output_size_per_class=max_total_size,
max_total_size=max_total_size,
iou_threshold=nms_iou_threshold,
score_threshold=score_threshold,
pad_per_class=False,
)
# De-normalizes box cooridinates.
nmsed_boxes *= normalizer
nmsed_classes = tf.cast(nmsed_classes, tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class MultilevelDetectionGenerator(tf.keras.layers.Layer):
"""Generates detected boxes with scores and classes for one-stage detector."""
def __init__(self, min_level, max_level, params):
self._min_level = min_level
self._max_level = max_level
self._generate_detections = generate_detections_factory(params)
super(MultilevelDetectionGenerator, self).__init__(autocast=False)
def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
# Collects outputs from all levels into a list.
boxes = []
scores = []
for i in range(self._min_level, self._max_level + 1):
box_outputs_i_shape = tf.shape(box_outputs[i])
batch_size = box_outputs_i_shape[0]
num_anchors_per_locations = box_outputs_i_shape[-1] // 4
num_classes = tf.shape(class_outputs[i])[-1] // num_anchors_per_locations
# Applies score transformation and remove the implicit background class.
scores_i = tf.sigmoid(
tf.reshape(class_outputs[i], [batch_size, -1, num_classes]))
scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
# Box decoding.
# The anchor boxes are shared for all data in a batch.
# One stage detector only supports class agnostic box regression.
anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
box_outputs_i = tf.reshape(box_outputs[i], [batch_size, -1, 4])
boxes_i = box_utils.decode_boxes(box_outputs_i, anchor_boxes_i)
# Box clipping.
boxes_i = box_utils.clip_boxes(boxes_i, image_shape)
boxes.append(boxes_i)
scores.append(scores_i)
boxes = tf.concat(boxes, axis=1)
scores = tf.concat(scores, axis=1)
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(tf.expand_dims(boxes, axis=2), scores))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class GenericDetectionGenerator(tf.keras.layers.Layer):
"""Generates the final detected boxes with scores and classes."""
def __init__(self, params):
super(GenericDetectionGenerator, self).__init__(autocast=False)
self._generate_detections = generate_detections_factory(params)
def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
"""Generate final detections.
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activiation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size] only the top
`valid_detections` boxes are valid detections.
"""
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class OlnDetectionGenerator(GenericDetectionGenerator):
"""Generates the final detected boxes with scores and classes."""
def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape,
is_single_fg_score=False, keep_nms=True):
"""Generate final detections for Object Localization Network (OLN).
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activiation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
is_single_fg_score: a Bool indicator of whether class_outputs includes the
background scores concatenated or not. By default, class_outputs is a
concatenation of both scores for the foreground and background. That is,
scores_without_bg=False.
keep_nms: a Bool indicator of whether to perform NMS or not.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size] only the top
`valid_detections` boxes are valid detections.
"""
if is_single_fg_score:
# Concatenates dummy background scores.
dummy_bg_scores = tf.zeros_like(class_outputs)
class_outputs = tf.stack([dummy_bg_scores, class_outputs], -1)
else:
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding. For RPN outputs, box_outputs are all zeros.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
if keep_nms:
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
else:
nmsed_boxes = decoded_boxes[:, :, 0, :]
nmsed_scores = class_outputs[:, :, 0]
nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
valid_detections = tf.cast(
tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ROI-related ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.ops import nms
from official.vision.detection.utils import box_utils
def multilevel_propose_rois(rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter small boxes and those fall outside image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values representing
box tenors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension are
[height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimal box size in each
side (w.r.t. the scaled image) to keep before applying NMS. This is often
used as a pre-filtering step for better performance. If 0, no filtering is
applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before appliying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available in
CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k, 1],
representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
this_level_boxes = box_utils.decode_boxes(this_level_boxes,
this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_rois, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
class ROIGenerator(tf.keras.layers.Layer):
"""Proposes RoIs for the second stage processing."""
def __init__(self, params):
self._rpn_pre_nms_top_k = params.rpn_pre_nms_top_k
self._rpn_post_nms_top_k = params.rpn_post_nms_top_k
self._rpn_nms_threshold = params.rpn_nms_threshold
self._rpn_score_threshold = params.rpn_score_threshold
self._rpn_min_size_threshold = params.rpn_min_size_threshold
self._test_rpn_pre_nms_top_k = params.test_rpn_pre_nms_top_k
self._test_rpn_post_nms_top_k = params.test_rpn_post_nms_top_k
self._test_rpn_nms_threshold = params.test_rpn_nms_threshold
self._test_rpn_score_threshold = params.test_rpn_score_threshold
self._test_rpn_min_size_threshold = params.test_rpn_min_size_threshold
self._use_batched_nms = params.use_batched_nms
super(ROIGenerator, self).__init__(autocast=False)
def call(self, boxes, scores, anchor_boxes, image_shape, is_training):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tenors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True)
return proposed_rois, proposed_roi_scores
class OlnROIGenerator(ROIGenerator):
"""Proposes RoIs for the second stage processing."""
def __call__(self, boxes, scores, anchor_boxes, image_shape, is_training,
is_box_lrtb=False, object_scores=None):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tenors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
bottom) format.
object_scores: another objectness score (e.g., centerness). In OLN, we use
object_scores=centerness as a replacement of the scores at each level.
A dict with keys representing FPN levels and values representing logit
tensors of shape [batch_size, feature_h, feature_w, num_anchors].
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = self.oln_multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True,
is_box_lrtb=is_box_lrtb,
rpn_object_scores=object_scores,)
return proposed_rois, proposed_roi_scores
def oln_multilevel_propose_rois(self,
rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True,
is_box_lrtb=False,
rpn_object_scores=None,):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Adjust scores for each level if specified by rpn_object_scores.
b. Apply sigmoid transform if specified.
c. Decode boxes (either of xyhw or left-right-top-bottom format) if
specified.
d. Clip boxes if specified.
e. Filter small boxes and those fall outside image if specified.
f. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
g. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values
representing box tenors of shape [batch_size, feature_h, feature_w,
num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimal box size in each
side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. If 0, no
filtering is applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before appliying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch
using `tf.image.combined_non_max_suppression`. Currently only available
in CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
bottom) format.
rpn_object_scores: a predicted objectness score (e.g., centerness). In
OLN, we use object_scores=centerness as a replacement of the scores at
each level. A dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k,
1],representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if rpn_object_scores:
this_level_object_scores = rpn_object_scores[level]
this_level_object_scores = tf.reshape(this_level_object_scores,
[-1, num_boxes])
this_level_object_scores = tf.cast(this_level_object_scores,
this_level_scores.dtype)
this_level_scores = this_level_object_scores
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
if is_box_lrtb: # Box in left-right-top-bottom format.
this_level_boxes = box_utils.decode_boxes_lrtb(
this_level_boxes, this_level_anchors)
else: # Box in standard x-y-h-w format.
this_level_boxes = box_utils.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(
this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores,
k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_rois, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to performa spatial transformation for Tensor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_EPSILON = 1e-8
def nearest_upsampling(data, scale):
"""Nearest neighbor upsampling implementation.
Args:
data: A tensor with a shape of [batch, height_in, width_in, channels].
scale: An integer multiple to scale resolution of input data.
Returns:
data_up: A tensor with a shape of
[batch, height_in*scale, width_in*scale, channels]. Same dtype as input
data.
"""
with tf.name_scope('nearest_upsampling'):
bs, _, _, c = data.get_shape().as_list()
shape = tf.shape(input=data)
h = shape[1]
w = shape[2]
bs = -1 if bs is None else bs
# Uses reshape to quickly upsample the input. The nearest pixel is selected
# implicitly via broadcasting.
data = tf.reshape(data, [bs, h, 1, w, 1, c]) * tf.ones(
[1, 1, scale, 1, scale, 1], dtype=data.dtype)
return tf.reshape(data, [bs, h * scale, w * scale, c])
def feature_bilinear_interpolation(features, kernel_y, kernel_x):
"""Feature bilinear interpolation.
The RoIAlign feature f can be computed by bilinear interpolation
of four neighboring feature points f0, f1, f2, and f3.
f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
[f10, f11]]
f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
kernel_y = [hy, ly]
kernel_x = [hx, lx]
Args:
features: The features are in shape of [batch_size, num_boxes, output_size *
2, output_size * 2, num_filters].
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
(batch_size, num_boxes, output_size, _,
num_filters) = features.get_shape().as_list()
output_size = output_size // 2
kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1])
kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2])
# Use implicit broadcast to generate the interpolation kernel. The
# multiplier `4` is for avg pooling.
interpolation_kernel = kernel_y * kernel_x * 4
# Interpolate the gathered features with computed interpolation kernels.
features *= tf.cast(
tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype)
features = tf.reshape(
features,
[batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
features = tf.reshape(
features, [batch_size, num_boxes, output_size, output_size, num_filters])
return features
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
"""Compute the grid position w.r.t.
the corresponding feature map.
Args:
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the bounary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicates the subpixel sample offset
from grid point.
Returns:
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
"""
batch_size, num_boxes, _ = boxes.get_shape().as_list()
box_grid_x = []
box_grid_y = []
for i in range(output_size):
box_grid_x.append(boxes[:, :, 1] +
(i + sample_offset) * boxes[:, :, 3] / output_size)
box_grid_y.append(boxes[:, :, 0] +
(i + sample_offset) * boxes[:, :, 2] / output_size)
box_grid_x = tf.stack(box_grid_x, axis=2)
box_grid_y = tf.stack(box_grid_y, axis=2)
box_grid_y0 = tf.floor(box_grid_y)
box_grid_x0 = tf.floor(box_grid_x)
box_grid_x0 = tf.maximum(0., box_grid_x0)
box_grid_y0 = tf.maximum(0., box_grid_y0)
box_grid_x0 = tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_x1 = tf.minimum(box_grid_x0 + 1,
tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_y0 = tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1))
box_grid_y1 = tf.minimum(box_grid_y0 + 1,
tf.expand_dims(boundaries[:, :, 0], -1))
box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)
# The RoIAlign feature f can be computed by bilinear interpolation of four
# neighboring feature points f0, f1, f2, and f3.
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
# f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
ly = box_grid_y - box_grid_y0
lx = box_grid_x - box_grid_x0
hy = 1.0 - ly
hx = 1.0 - lx
kernel_y = tf.reshape(
tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1])
kernel_x = tf.reshape(
tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1])
return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
def get_grid_one_hot(box_gridy0y1, box_gridx0x1, feature_height, feature_width):
"""Get grid_one_hot from indices and feature_size."""
(batch_size, num_boxes, output_size, _) = box_gridx0x1.get_shape().as_list()
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
# shape is [batch_size, num_boxes, output_size, 2, height]
grid_y_one_hot = tf.one_hot(tf.cast(y_indices, tf.int32), feature_height)
# shape is [batch_size, num_boxes, output_size, 2, width]
grid_x_one_hot = tf.one_hot(tf.cast(x_indices, tf.int32), feature_width)
return grid_y_one_hot, grid_x_one_hot
def selective_crop_and_resize(features,
boxes,
box_levels,
boundaries,
output_size=7,
sample_offset=0.5,
use_einsum_gather=False):
"""Crop and resize boxes on a set of feature maps.
Given multiple features maps indexed by different levels, and a set of boxes
where each box is mapped to a certain level, it selectively crops and resizes
boxes from the corresponding feature maps to generate the box features.
We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
figure 3 for reference). Specifically, for each feature map, we select an
(output_size, output_size) set of pixels corresponding to the box location,
and then use bilinear interpolation to select the feature value for each
pixel.
For performance, we perform the gather and interpolation on all layers as a
single operation. In this op the multi-level features are first stacked and
gathered into [2*output_size, 2*output_size] feature points. Then bilinear
interpolation is performed on the gathered feature points to generate
[output_size, output_size] RoIAlign feature map.
Here is the step-by-step algorithm:
1. The multi-level features are gathered into a
[batch_size, num_boxes, output_size*2, output_size*2, num_filters]
Tensor. The Tensor contains four neighboring feature points for each
vertice in the output grid.
2. Compute the interpolation kernel of shape
[batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
can be seen as stacking 2x2 interpolation kernels for all vertices in the
output grid.
3. Element-wise multiply the gathered features and interpolation kernel.
Then apply 2x2 average pooling to reduce spatial dimension to
output_size.
Args:
features: a 5-D tensor of shape [batch_size, num_levels, max_height,
max_width, num_filters] where cropping and resizing are based.
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
the 0-based corresponding feature level index of each box.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the bounary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicates the subpixel sample offset
from grid point.
use_einsum_gather: use einsum to replace gather or not. Replacing einsum
with gather can improve performance when feature size is not large, einsum
is friendly with model partition as well. Gather's performance is better
when feature size is very large and there are multiple box levels.
Returns:
features_per_box: a 5-D tensor of shape
[batch_size, num_boxes, output_size, output_size, num_filters]
representing the cropped features.
"""
(batch_size, num_levels, max_feature_height, max_feature_width,
num_filters) = features.get_shape().as_list()
_, num_boxes, _ = boxes.get_shape().as_list()
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundaries, output_size, sample_offset)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
if use_einsum_gather:
# Blinear interpolation is done during the last two gathers:
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# [[f00, f01],
# [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
# where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.
# shape is [batch_size, boxes, output_size, 2, 1]
grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(box_gridy0y1,
box_gridx0x1,
max_feature_height,
max_feature_width)
# shape is [batch_size, num_boxes, output_size, height]
grid_y_weight = tf.reduce_sum(
tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
# shape is [batch_size, num_boxes, output_size, width]
grid_x_weight = tf.reduce_sum(
tf.multiply(grid_x_one_hot, kernel_x), axis=-2)
# Gather for y_axis.
# shape is [batch_size, num_boxes, output_size, width, features]
features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
tf.cast(grid_y_weight, features.dtype))
# Gather for x_axis.
# shape is [batch_size, num_boxes, output_size, output_size, features]
features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
tf.cast(grid_x_weight, features.dtype))
else:
height_dim_offset = max_feature_width
level_dim_offset = max_feature_height * height_dim_offset
batch_dim_offset = num_levels * level_dim_offset
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
box_levels_offset = tf.tile(
tf.reshape(box_levels * level_dim_offset,
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(y_indices * height_dim_offset,
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + box_levels_offset + y_indices_offset +
x_indices_offset, [-1])
features = tf.reshape(features, [-1, num_filters])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
def multilevel_crop_and_resize(features, boxes, output_size=7):
"""Crop and resize on multilevel feature pyramid.
Generate the (output_size, output_size) set of pixels for each input box
by first locating the box into the correct feature level, and then cropping
and resizing it using the correspoding feature map of that level.
Args:
features: A dictionary with key as pyramid level and value as features. The
features are in shape of [batch_size, height_l, width_l, num_filters].
boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
a box with [y1, x1, y2, x2] in un-normalized coordinates.
output_size: A scalar to indicate the output crop size.
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
with tf.name_scope('multilevel_crop_and_resize'):
levels = list(features.keys())
min_level = min(levels)
max_level = max(levels)
batch_size, max_feature_height, max_feature_width, num_filters = (
features[min_level].get_shape().as_list())
_, num_boxes, _ = boxes.get_shape().as_list()
# Stack feature pyramid into a features_all of shape
# [batch_size, levels, height, width, num_filters].
features_all = []
feature_heights = []
feature_widths = []
for level in range(min_level, max_level + 1):
shape = features[level].get_shape().as_list()
feature_heights.append(shape[1])
feature_widths.append(shape[2])
# Concat tensor of [batch_size, height_l * width_l, num_filters] for each
# levels.
features_all.append(
tf.reshape(features[level], [batch_size, -1, num_filters]))
features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])
# Calculate height_l * width_l for each level.
level_dim_sizes = [
feature_widths[i] * feature_heights[i]
for i in range(len(feature_widths))
]
# level_dim_offsets is accumulated sum of level_dim_size.
level_dim_offsets = [0]
for i in range(len(feature_widths) - 1):
level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
height_dim_sizes = tf.constant(feature_widths, tf.int32)
# Assigns boxes to the right level.
box_width = boxes[:, :, 3] - boxes[:, :, 1]
box_height = boxes[:, :, 2] - boxes[:, :, 0]
areas_sqrt = tf.sqrt(box_height * box_width)
levels = tf.cast(
tf.math.floordiv(
tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) + 4.0,
dtype=tf.int32)
# Maps levels between [min_level, max_level].
levels = tf.minimum(max_level, tf.maximum(levels, min_level))
# Projects box location and sizes to corresponding feature levels.
scale_to_level = tf.cast(
tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
dtype=boxes.dtype)
boxes /= tf.expand_dims(scale_to_level, axis=2)
box_width /= scale_to_level
box_height /= scale_to_level
boxes = tf.concat([
boxes[:, :, 0:2],
tf.expand_dims(box_height, -1),
tf.expand_dims(box_width, -1)
],
axis=-1)
# Maps levels to [0, max_level-min_level].
levels -= min_level
level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
boundary = tf.cast(
tf.concat([
tf.expand_dims(
[[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
axis=-1),
tf.expand_dims(
[[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
axis=-1),
],
axis=-1), boxes.dtype)
# Compute grid positions.
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundary, output_size, sample_offset=0.5)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
# Get level offset for each box. Each box belongs to one level.
levels_offset = tf.tile(
tf.reshape(
tf.gather(level_dim_offsets, levels),
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(
y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
[-1])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features_r2, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
# Bilinear interpolation.
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
min_mask_level, mask_crop_size):
"""Crop the FPN features at the appropriate levels for each detection.
Args:
features: a float tensor of shape [batch_size, num_levels, max_feature_size,
max_feature_size, num_downsample_channels].
level_boxes: a float Tensor of the level boxes to crop from. [batch_size,
num_instances, 4].
detection_prior_levels: an int Tensor of instance assigned level of shape
[batch_size, num_instances].
min_mask_level: minimum FPN level to crop mask feature from.
mask_crop_size: an int of mask crop size.
Returns:
crop_features: a float Tensor of shape [batch_size * num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
"""
(batch_size, num_levels, max_feature_size, _,
num_downsample_channels) = features.get_shape().as_list()
_, num_of_instances, _ = level_boxes.get_shape().as_list()
level_boxes = tf.cast(level_boxes, tf.int32)
assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]
x_start_indices = level_boxes[:, :, 1]
y_start_indices = level_boxes[:, :, 0]
# generate the full indices (not just the starting index)
x_idx_list = []
y_idx_list = []
for i in range(mask_crop_size):
x_idx_list.append(x_start_indices + i)
y_idx_list.append(y_start_indices + i)
x_indices = tf.stack(x_idx_list, axis=2)
y_indices = tf.stack(y_idx_list, axis=2)
levels = detection_prior_levels - min_mask_level
height_dim_size = max_feature_size
level_dim_size = max_feature_size * height_dim_size
batch_dim_size = num_levels * level_dim_size
# TODO(weicheng) change this to gather_nd for better readability.
indices = tf.reshape(
tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_of_instances, mask_crop_size, mask_crop_size]) + tf.tile(
tf.reshape(levels * level_dim_size,
[batch_size, num_of_instances, 1, 1]),
[1, 1, mask_crop_size, mask_crop_size]) + tf.tile(
tf.reshape(y_indices * height_dim_size,
[batch_size, num_of_instances, mask_crop_size, 1]),
[1, 1, 1, mask_crop_size]) +
tf.tile(
tf.reshape(x_indices,
[batch_size, num_of_instances, 1, mask_crop_size]),
[1, 1, mask_crop_size, 1]), [-1])
features_r2 = tf.reshape(features, [-1, num_downsample_channels])
crop_features = tf.reshape(
tf.gather(features_r2, indices), [
batch_size * num_of_instances, mask_crop_size, mask_crop_size,
num_downsample_channels
])
return crop_features
def crop_mask_in_target_box(masks,
boxes,
target_boxes,
output_size,
sample_offset=0,
use_einsum=True):
"""Crop masks in target boxes.
Args:
masks: A tensor with a shape of [batch_size, num_masks, height, width].
boxes: a float tensor representing box cooridnates that tightly enclose
masks with a shape of [batch_size, num_masks, 4] in un-normalized
coordinates. A box is represented by [ymin, xmin, ymax, xmax].
target_boxes: a float tensor representing target box cooridnates for masks
with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A
box is represented by [ymin, xmin, ymax, xmax].
output_size: A scalar to indicate the output crop size. It currently only
supports to output a square shape outputs.
sample_offset: a float number in [0, 1] indicates the subpixel sample offset
from grid point.
use_einsum: Use einsum to replace gather in selective_crop_and_resize.
Returns:
A 4-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size].
"""
with tf.name_scope('crop_mask_in_target_box'):
batch_size, num_masks, height, width = masks.get_shape().as_list()
masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
# Pad zeros on the boundary of masks.
masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
masks = tf.reshape(masks, [batch_size, num_masks, height + 4, width + 4, 1])
# Projects target box locations and sizes to corresponding cropped
# mask coordinates.
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=target_boxes, num_or_size_splits=4, axis=2)
y_transform = (bb_y_min - gt_y_min) * height / (gt_y_max - gt_y_min +
_EPSILON) + 2
x_transform = (bb_x_min - gt_x_min) * height / (gt_x_max - gt_x_min +
_EPSILON) + 2
h_transform = (bb_y_max - bb_y_min) * width / (
gt_y_max - gt_y_min + _EPSILON)
w_transform = (bb_x_max - bb_x_min) * width / (
gt_x_max - gt_x_min + _EPSILON)
boundaries = tf.concat([
tf.cast(
tf.ones_like(y_transform) * ((height + 4) - 1), dtype=tf.float32),
tf.cast(
tf.ones_like(x_transform) * ((width + 4) - 1), dtype=tf.float32)
],
axis=-1)
# Reshape tensors to have the right shape for selective_crop_and_resize.
trasnformed_boxes = tf.concat(
[y_transform, x_transform, h_transform, w_transform], -1)
levels = tf.tile(
tf.reshape(tf.range(num_masks), [1, num_masks]), [batch_size, 1])
cropped_masks = selective_crop_and_resize(
masks,
trasnformed_boxes,
levels,
boundaries,
output_size,
sample_offset=sample_offset,
use_einsum_gather=use_einsum)
cropped_masks = tf.squeeze(cropped_masks, axis=-1)
return cropped_masks
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Target and sampling related ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.vision.detection.ops import spatial_transform_ops
from official.vision.detection.utils import box_utils
from official.vision.utils.object_detection import balanced_positive_negative_sampler
def box_matching(boxes, gt_boxes, gt_classes):
"""Match boxes to groundtruth boxes.
Given the proposal boxes and the groundtruth boxes and classes, perform the
groundtruth matching by taking the argmax of the IoU between boxes and
groundtruth boxes.
Args:
boxes: a tensor of shape of [batch_size, N, 4] representing the box
coordiantes to be matched to groundtruth boxes.
gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
the groundtruth box coordinates. It is padded with -1s to indicate the
invalid boxes.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
Returns:
matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
the matched groundtruth box coordinates for each input box. If the box
does not overlap with any groundtruth boxes, the matched boxes of it
will be set to all 0s.
matched_gt_classes: a tensor of shape of [batch_size, N], representing
the matched groundtruth classes for each input box. If the box does not
overlap with any groundtruth boxes, the matched box classes of it will
be set to 0, which corresponds to the background class.
matched_gt_indices: a tensor of shape of [batch_size, N], representing
the indices of the matched groundtruth boxes in the original gt_boxes
tensor. If the box does not overlap with any groundtruth boxes, the
index of the matched groundtruth will be set to -1.
matched_iou: a tensor of shape of [batch_size, N], representing the IoU
between the box and its matched groundtruth box. The matched IoU is the
maximum IoU of the box and all the groundtruth boxes.
iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
between boxes and the groundtruth boxes. The IoU between a box and the
invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
"""
# Compute IoU between boxes and gt_boxes.
# iou <- [batch_size, N, K]
iou = box_utils.bbox_overlap(boxes, gt_boxes)
# max_iou <- [batch_size, N]
# 0.0 -> no match to gt, or -1.0 match to no gt
matched_iou = tf.reduce_max(iou, axis=-1)
# background_box_mask <- bool, [batch_size, N]
background_box_mask = tf.less_equal(matched_iou, 0.0)
argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)
argmax_iou_indices_shape = tf.shape(argmax_iou_indices)
batch_indices = (
tf.expand_dims(tf.range(argmax_iou_indices_shape[0]), axis=-1) *
tf.ones([1, argmax_iou_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, argmax_iou_indices], axis=-1)
matched_gt_boxes = tf.gather_nd(gt_boxes, gather_nd_indices)
matched_gt_boxes = tf.where(
tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
matched_gt_boxes)
matched_gt_classes = tf.gather_nd(gt_classes, gather_nd_indices)
matched_gt_classes = tf.where(background_box_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(background_box_mask,
-tf.ones_like(argmax_iou_indices),
argmax_iou_indices)
return (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
iou)
def assign_and_sample_proposals(proposed_boxes,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled image.
This tensor might have padding of values -1 indicating the invalid box
coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: a integer represents RoI minibatch size per image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes before
sampling proposals.
fg_fraction: a float represents the target fraction of RoI minibatch that is
labeled foreground (i.e., class > 0).
fg_iou_thresh: a float represents the IoU overlap threshold for an RoI to be
considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float represents the IoU overlap threshold for an RoI to
be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float represents the IoU overlap threshold for an RoI to
be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the samples RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groudntruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
else:
boxes = proposed_boxes
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
def sample_and_crop_foreground_masks(candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks,
num_mask_samples_per_image=128,
mask_target_size=28):
"""Samples and creates cropped foreground masks for training.
Args:
candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image` positive
RoIs will be sampled to create mask training targets.
candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: a tensor of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and
gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is
the superset of candidate_gt_boxes.
gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
containing all the groundtruth masks which sample masks are drawn from.
num_mask_samples_per_image: an integer which specifies the number of masks
to sample.
mask_target_size: an integer which specifies the final cropped mask size
after sampling. The output masks are resized w.r.t the sampled RoIs.
Returns:
foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: a tensor of shape of [batch_size, K] storing the classes
corresponding to the sampled foreground masks.
cropoped_foreground_masks: a tensor of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the cropped
foreground masks used for training.
"""
with tf.name_scope('sample_and_crop_foreground_masks'):
_, fg_instance_indices = tf.nn.top_k(
tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32),
k=num_mask_samples_per_image)
fg_instance_indices_shape = tf.shape(fg_instance_indices)
batch_indices = (
tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
gather_nd_instance_indices = tf.stack([batch_indices, fg_instance_indices],
axis=-1)
foreground_rois = tf.gather_nd(candidate_rois, gather_nd_instance_indices)
foreground_boxes = tf.gather_nd(candidate_gt_boxes,
gather_nd_instance_indices)
foreground_classes = tf.gather_nd(candidate_gt_classes,
gather_nd_instance_indices)
foreground_gt_indices = tf.gather_nd(candidate_gt_indices,
gather_nd_instance_indices)
foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
batch_indices = (
tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
gather_nd_gt_indices = tf.stack([batch_indices, foreground_gt_indices],
axis=-1)
foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)
cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
foreground_masks,
foreground_boxes,
foreground_rois,
mask_target_size,
sample_offset=0.5)
return foreground_rois, foreground_classes, cropped_foreground_masks
class ROISampler(tf.keras.layers.Layer):
"""Samples RoIs and creates training targets."""
def __init__(self, params):
self._num_samples_per_image = params.num_samples_per_image
self._fg_fraction = params.fg_fraction
self._fg_iou_thresh = params.fg_iou_thresh
self._bg_iou_thresh_hi = params.bg_iou_thresh_hi
self._bg_iou_thresh_lo = params.bg_iou_thresh_lo
self._mix_gt_boxes = params.mix_gt_boxes
super(ROISampler, self).__init__(autocast=False)
def call(self, rois, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the samples RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
assign_and_sample_proposals(
rois,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
class ROIScoreSampler(ROISampler):
"""Samples RoIs, RoI-scores and creates training targets."""
def __call__(self, rois, roi_scores, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
roi_scores:
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_roi_scores:
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the samples RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
(sampled_rois, sampled_roi_scores, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices) = (
self.assign_and_sample_proposals_and_scores(
rois,
roi_scores,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
def assign_and_sample_proposals_and_scores(self,
proposed_boxes,
proposed_scores,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
of proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
proposed_scores: a tensor of shape of [batch_size, N]. N is the number of
proposals before groundtruth assignment. It is the rpn scores for all
proposed boxes which can be either their classification or centerness
scores.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: a integer represents RoI minibatch size per image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
before sampling proposals.
fg_fraction: a float represents the target fraction of RoI minibatch that
is labeled foreground (i.e., class > 0).
fg_iou_thresh: a float represents the IoU overlap threshold for an RoI to
be considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float represents the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float represents the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_scores: a tensor of shape of [batch_size, K], representing the
confidence score of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the samples RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groudntruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] =
sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals_and_scores'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
gt_scores = tf.ones_like(gt_boxes[:, :, 0])
scores = tf.concat([proposed_scores, gt_scores], axis=1)
else:
boxes = proposed_boxes
scores = proposed_scores
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_roi_scores = tf.gather_nd(scores, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
class MaskSampler(tf.keras.layers.Layer):
"""Samples and creates mask training targets."""
def __init__(self, mask_target_size, num_mask_samples_per_image):
self._mask_target_size = mask_target_size
self._num_mask_samples_per_image = num_mask_samples_per_image
super(MaskSampler, self).__init__(autocast=False)
def call(self,
candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks):
"""Sample and create mask targets for training.
Args:
candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image`
positive RoIs will be sampled to create mask training targets.
candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the
tensor corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: a tensor of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >=
N, is the superset of candidate_gt_boxes.
gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
containing all the groundtruth masks which sample masks are drawn from.
after sampling. The output masks are resized w.r.t the sampled RoIs.
Returns:
foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: a tensor of shape of [batch_size, K] storing the
classes corresponding to the sampled foreground masks.
cropoped_foreground_masks: a tensor of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the
cropped foreground masks used for training.
"""
foreground_rois, foreground_classes, cropped_foreground_masks = (
sample_and_crop_foreground_masks(candidate_rois, candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices, gt_masks,
self._num_mask_samples_per_image,
self._mask_target_size))
return foreground_rois, foreground_classes, cropped_foreground_masks
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for bounding box processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def visualize_images_with_bounding_boxes(images, box_outputs, step,
summary_writer):
"""Records subset of evaluation images with bounding boxes."""
image_shape = tf.shape(images[0])
image_height = tf.cast(image_shape[0], tf.float32)
image_width = tf.cast(image_shape[1], tf.float32)
normalized_boxes = normalize_boxes(box_outputs, [image_height, image_width])
bounding_box_color = tf.constant([[1.0, 1.0, 0.0, 1.0]])
image_summary = tf.image.draw_bounding_boxes(images, normalized_boxes,
bounding_box_color)
with summary_writer.as_default():
tf.summary.image('bounding_box_summary', image_summary, step=step)
summary_writer.flush()
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack([boxes_xmin, boxes_ymin, boxes_width, boxes_height],
axis=-1)
return new_boxes
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float which specifies the magnitude of noise. The rule
of thumb is to set this between (0, 0.1]. The default value is found to
mimic the noisy detections best empirically.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(boxes.get_shape(), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat([
new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5
],
axis=-1)
return jittered_boxes
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
normalized_boxes: a tensor whose shape is the same as `boxes` representing
the normalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
ymin = boxes[..., 0:1] / height
xmin = boxes[..., 1:2] / width
ymax = boxes[..., 2:3] / height
xmax = boxes[..., 3:4] / width
normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return normalized_boxes
def denormalize_boxes(boxes, image_shape):
"""Converts boxes normalized by [height, width] to pixel coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
the denormalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
with tf.name_scope('denormalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.split(image_shape, 2, axis=-1)
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
def clip_boxes(boxes, image_shape):
"""Clips boxes to image boundaries.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
clipped boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height - 1.0, width - 1.0, height - 1.0, width - 1.0]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack(
[height - 1.0, width - 1.0, height - 1.0, width - 1.0], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Compute outer box encloses an object with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
"""
if scale < 1.0:
raise ValueError(
'scale is {}, but outer box scale must be greater than 1.0.'.format(
scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack([
centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0
],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
encoded box targets.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
box_h = ymax - ymin + 1.0
box_w = xmax - xmin + 1.0
box_yc = ymin + 0.5 * box_h
box_xc = xmin + 0.5 * box_w
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin + 1.0
anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
encoded_dw = tf.math.log(box_w / anchor_w)
if weights:
encoded_dy *= weights[0]
encoded_dx *= weights[1]
encoded_dh *= weights[2]
encoded_dw *= weights[3]
encoded_boxes = tf.concat([encoded_dy, encoded_dx, encoded_dh, encoded_dw],
axis=-1)
return encoded_boxes
def decode_boxes(encoded_boxes, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
decoded box targets.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
dx = encoded_boxes[..., 1:2]
dh = encoded_boxes[..., 2:3]
dw = encoded_boxes[..., 3:4]
if weights:
dy /= weights[0]
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin + 1.0
anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h - 1.0
decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w - 1.0
decoded_boxes = tf.concat([
decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax,
decoded_boxes_xmax
],
axis=-1)
return decoded_boxes
def encode_boxes_lrtb(boxes, anchors, weights=None):
"""Encode boxes to targets on lrtb (=left,right,top,bottom) format.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes_lrtb: a tensor whose shape is the same as `boxes` representing
the encoded box targets. The box targets encode the left, right, top,
bottom distances from an anchor location to the four borders of the
matched groundtruth bounding box.
center_targets: centerness targets defined by the left, right, top, and
bottom distance targets. The centerness is defined as the deviation of the
anchor location from the groundtruth object center. Formally, centerness =
sqrt(min(left, right)/max(left, right)*min(top, bottom)/max(top, bottom)).
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('encode_boxes_lrtb'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
# box_h = ymax - ymin + 1.0
# box_w = xmax - xmin + 1.0
box_h = ymax - ymin
box_w = xmax - xmin
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
# anchor_h = anchor_ymax - anchor_ymin + 1.0
# anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
box_h += EPSILON
box_w += EPSILON
anchor_h += EPSILON
anchor_w += EPSILON
left = (anchor_xc - xmin) / anchor_w
right = (xmax - anchor_xc) / anchor_w
top = (anchor_yc - ymin) / anchor_h
bottom = (ymax - anchor_yc) / anchor_h
# Create centerness target. {
lrtb_targets = tf.concat([left, right, top, bottom], axis=-1)
valid_match = tf.greater(tf.reduce_min(lrtb_targets, -1), 0.0)
# Centerness score.
left_right = tf.concat([left, right], axis=-1)
left_right = tf.where(tf.stack([valid_match, valid_match], -1),
left_right, tf.zeros_like(left_right))
top_bottom = tf.concat([top, bottom], axis=-1)
top_bottom = tf.where(tf.stack([valid_match, valid_match], -1),
top_bottom, tf.zeros_like(top_bottom))
center_targets = tf.sqrt(
(tf.reduce_min(left_right, -1) /
(tf.reduce_max(left_right, -1) + EPSILON)) *
(tf.reduce_min(top_bottom, -1) /
(tf.reduce_max(top_bottom, -1) + EPSILON)))
center_targets = tf.where(valid_match,
center_targets,
tf.zeros_like(center_targets))
if weights:
left *= weights[0]
right *= weights[1]
top *= weights[2]
bottom *= weights[3]
encoded_boxes_lrtb = tf.concat(
[left, right, top, bottom],
axis=-1)
return encoded_boxes_lrtb, center_targets
def decode_boxes_lrtb(encoded_boxes_lrtb, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes_lrtb: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in left, right, top, bottom order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
decoded_boxes_lrtb: a tensor whose shape is the same as `boxes` representing
the decoded box targets in lrtb (=left,right,top,bottom) format. The box
decoded box coordinates represent the left, right, top, and bottom
distances from an anchor location to the four borders of the matched
groundtruth bounding box.
"""
if encoded_boxes_lrtb.shape[-1] != 4:
raise ValueError(
'encoded_boxes_lrtb.shape[-1] is {:d}, but must be 4.'
.format(encoded_boxes_lrtb.shape[-1]))
with tf.name_scope('decode_boxes_lrtb'):
encoded_boxes_lrtb = tf.cast(encoded_boxes_lrtb, dtype=anchors.dtype)
left = encoded_boxes_lrtb[..., 0:1]
right = encoded_boxes_lrtb[..., 1:2]
top = encoded_boxes_lrtb[..., 2:3]
bottom = encoded_boxes_lrtb[..., 3:4]
if weights:
left /= weights[0]
right /= weights[1]
top /= weights[2]
bottom /= weights[3]
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
anchor_h += EPSILON
anchor_w += EPSILON
decoded_boxes_ymin = anchor_yc - top * anchor_h
decoded_boxes_xmin = anchor_xc - left * anchor_w
decoded_boxes_ymax = anchor_yc + bottom * anchor_h
decoded_boxes_xmax = anchor_xc + right * anchor_w
decoded_boxes_lrtb = tf.concat(
[decoded_boxes_ymin, decoded_boxes_xmin,
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes_lrtb
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the position of the filtered boxes are filled with 0.
filtered_scores: a tensor whose shape is the same as 'scores' but with
the positinon of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin + 1.0
w = xmax - xmin + 1.0
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 1.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(filtered_size_mask,
filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose score are smaller than it will be filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the position of the filtered boxes are filled with -1.
filtered_scores: a tensor whose shape is the same as 'scores' but with
the
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
boxes: a tensor of shape [batch_size, N, 4] representing the coordiante of
the boxes. N is the number of boxes per image.
scores: a tensor of shsape [batch_size, N] representing the socre of the
boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
batch_size, _ = scores.get_shape().as_list()
if batch_size == 1:
selected_boxes = tf.squeeze(
tf.gather(boxes, top_k_indices, axis=1), axis=1)
else:
top_k_indices_shape = tf.shape(top_k_indices)
batch_indices = (
tf.expand_dims(tf.range(top_k_indices_shape[0]), axis=-1) *
tf.ones([1, top_k_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, top_k_indices], axis=-1)
selected_boxes = tf.gather_nd(boxes, gather_nd_indices)
return selected_boxes, selected_scores
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `gt_boxes` may have been padded. The returned `iou` tensor for these
boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
iou: a tensor with as a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = tf.math.maximum((i_xmax - i_xmin), 0) * tf.math.maximum(
(i_ymax - i_ymin), 0)
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries between the padded ground truth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
return iou
def get_non_empty_box_indices(boxes):
"""Get indices for non-empty boxes."""
# Selects indices if box height or width is 0.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(
tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
return indices[:, 0]
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for handling dataset object categories."""
def coco_split_class_ids(split_name):
"""Return the COCO class split ids based on split name and training mode.
Args:
split_name: The name of dataset split.
Returns:
class_ids: a python list of integer.
"""
if split_name == 'all':
return []
elif split_name == 'voc':
return [
1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72
]
elif split_name == 'nonvoc':
return [
8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
85, 86, 87, 88, 89, 90
]
else:
raise ValueError('Invalid split name {}!!!'.format(split_name))
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for dataloader."""
import tensorflow as tf
from official.vision.detection.utils import input_utils
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int64),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(gt, n):
"""Pads the first dimension of groundtruths labels to the fixed size."""
gt['boxes'] = input_utils.pad_to_fixed_size(gt['boxes'], n, -1)
gt['is_crowds'] = input_utils.pad_to_fixed_size(gt['is_crowds'], n, 0)
gt['areas'] = input_utils.pad_to_fixed_size(gt['areas'], n, -1)
gt['classes'] = input_utils.pad_to_fixed_size(gt['classes'], n, -1)
return gt
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for input processing."""
import math
import tensorflow as tf
from official.vision.detection.utils import box_utils
from official.vision.utils.object_detection import preprocessor
def pad_to_fixed_size(input_tensor, size, constant_values=0):
"""Pads data to a fixed length at the first dimension.
Args:
input_tensor: `Tensor` with any dimension.
size: `int` number for the first dimension of output Tensor.
constant_values: `int` value assigned to the paddings.
Returns:
`Tensor` with the first dimension padded to `size`.
"""
input_shape = input_tensor.get_shape().as_list()
padding_shape = []
# Computes the padding length on the first dimension.
padding_length = tf.maximum(0, size - tf.shape(input_tensor)[0])
assert_length = tf.Assert(
tf.greater_equal(padding_length, 0), [padding_length])
with tf.control_dependencies([assert_length]):
padding_shape.append(padding_length)
# Copies shapes of the rest of input shape dimensions.
for i in range(1, len(input_shape)):
padding_shape.append(tf.shape(input=input_tensor)[i])
# Pads input tensor to the fixed first dimension.
paddings = tf.cast(constant_values * tf.ones(padding_shape),
input_tensor.dtype)
padded_tensor = tf.concat([input_tensor, paddings], axis=0)
output_shape = input_shape
output_shape[0] = size
padded_tensor.set_shape(output_shape)
return padded_tensor
def normalize_image(image,
offset=(0.485, 0.456, 0.406),
scale=(0.229, 0.224, 0.225)):
"""Normalizes the image to zero mean and unit variance."""
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
offset = tf.constant(offset)
offset = tf.expand_dims(offset, axis=0)
offset = tf.expand_dims(offset, axis=0)
image -= offset
scale = tf.constant(scale)
scale = tf.expand_dims(scale, axis=0)
scale = tf.expand_dims(scale, axis=0)
image /= scale
return image
def compute_padded_size(desired_size, stride):
"""Compute the padded size given the desired size and the stride.
The padded size will be the smallest rectangle, such that each dimension is
the smallest multiple of the stride which is larger than the desired
dimension. For example, if desired_size = (100, 200) and stride = 32,
the output padded_size = (128, 224).
Args:
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the target output image size.
stride: an integer, the stride of the backbone network.
Returns:
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size.
"""
if isinstance(desired_size, list) or isinstance(desired_size, tuple):
padded_size = [
int(math.ceil(d * 1.0 / stride) * stride) for d in desired_size
]
else:
padded_size = tf.cast(
tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32)
return padded_size
def resize_and_crop_image(image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size.
Resize and pad images given the desired output size of the image and
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and rescale the image to make it
the largest rectangle to be bounded by the rectangle specified by the
`desired_size`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the desired actual output image size.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
output_image: `Tensor` of shape [height, width, 3] where [height, width]
equals to `output_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desireed_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factory, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image'):
image_size = tf.cast(tf.shape(input=image)[0:2], tf.float32)
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(scaled_size[0] / image_size[0],
scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32), image_scale,
tf.cast(offset, tf.float32)
])
return output_image, image_info
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resize and pad images given the specified short / long side length and the
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and first try to rescale the short
side of the original image to `short_side`.
2. If the scaled image after 1 has a long side that exceeds `long_side`, keep
the aspect ratio and rescal the long side of the image to `long_side`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
output_image: `Tensor` of shape [height, width, 3] where [height, width]
equals to `output_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side), scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32), image_scale,
tf.cast(offset, tf.float32)
])
return output_image, image_info
def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
"""Resizes boxes to output size with scale and offset.
Args:
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
"""
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = box_utils.clip_boxes(boxes, output_size)
return boxes
def resize_and_crop_masks(masks, image_scale, output_size, offset):
"""Resizes boxes to output size with scale and offset.
Args:
masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
"""
mask_size = tf.shape(input=masks)[1:3]
scaled_size = tf.cast(image_scale * tf.cast(mask_size, image_scale.dtype),
tf.int32)
scaled_masks = tf.image.resize(
masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
offset = tf.cast(offset, tf.int32)
scaled_masks = scaled_masks[:, offset[0]:offset[0] + output_size[0],
offset[1]:offset[1] + output_size[1], :]
output_masks = tf.image.pad_to_bounding_box(scaled_masks, 0, 0,
output_size[0], output_size[1])
return output_masks
def random_horizontal_flip(image, boxes=None, masks=None):
"""Randomly flips input image and bounding boxes."""
return preprocessor.random_horizontal_flip(image, boxes, masks)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for segmentations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import cv2
def paste_instance_masks(masks, detected_boxes, image_height, image_width):
"""Paste instance masks to generate the image segmentation results.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = min(max(ref_box[0], 0), image_width)
x_1 = min(max(ref_box[2] + 1, 0), image_width)
y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
"""Paste instance masks to generate the image segmentation (v2).
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
_, mask_height, mask_width = masks.shape
segms = []
for i, mask in enumerate(masks):
box = detected_boxes[i, :]
xmin = box[0]
ymin = box[1]
xmax = xmin + box[2]
ymax = ymin + box[3]
# Sample points of the cropped mask w.r.t. the image grid.
# Note that these coordinates may fall beyond the image.
# Pixel clipping will happen after warping.
xmin_int = int(math.floor(xmin))
xmax_int = int(math.ceil(xmax))
ymin_int = int(math.floor(ymin))
ymax_int = int(math.ceil(ymax))
alpha = box[2] / (1.0 * mask_width)
beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array([[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array(
[[1, 0, -xmin_int], [0, 1, -ymin_int], [0, 0, 1]], dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot(
np.dot(
np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32), M),
np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32))
# pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M, (xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width))
x0 = max(min(xmin_int, image_width), 0)
x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[(y0 - ymin_int):(y1 - ymin_int),
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask)
segms = np.array(segms)
return segms
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment