Commit bcb231f0 authored by Yeqing Li's avatar Yeqing Li Committed by A. Unique TensorFlower
Browse files

Move retinanet keras model to tensorflow_models/official

PiperOrigin-RevId: 274010788
parent 04ce9636
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range
[0,1]. The class presumes there are many more negatives than positive examples:
if the desired batch_size cannot be achieved with the pre-specified positive
fraction, it fills the rest with negative examples. If this is not sufficient
for obtaining the desired batch_size, it returns fewer examples.
The main function to call is Subsample(self, indicator, labels). For convenience
one can also call SubsampleWeights(self, weights, labels) which is defined in
the minibatch_sampler base class.
When is_static is True, it implements a method that guarantees static shapes.
It also ensures the length of output of the subsample is always batch_size, even
when number of examples set to True in indicator is less than batch_size.
This is originally implemented in TensorFlow Object Detection API.
"""
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import minibatch_sampler
from official.vision.detection.utils.object_detection import ops
class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler):
"""Subsamples minibatches to a desired balance of positives and negatives."""
def __init__(self, positive_fraction=0.5, is_static=False):
"""Constructs a minibatch sampler.
Args:
positive_fraction: desired fraction of positive examples (scalar in [0,1])
in the batch.
is_static: If True, uses an implementation with static shape guarantees.
Raises:
ValueError: if positive_fraction < 0, or positive_fraction > 1
"""
if positive_fraction < 0 or positive_fraction > 1:
raise ValueError('positive_fraction should be in range [0,1]. '
'Received: %s.' % positive_fraction)
self._positive_fraction = positive_fraction
self._is_static = is_static
def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size):
"""Counts the number of positives and negatives numbers to be sampled.
Args:
sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains
the signed indices of the examples where the sign is based on the label
value. The examples that cannot be sampled are set to 0. It samples
atmost sample_size*positive_fraction positive examples and remaining
from negative examples.
sample_size: Size of subsamples.
Returns:
A tuple containing the number of positive and negative labels in the
subsample.
"""
input_length = tf.shape(input=sorted_indices_tensor)[0]
valid_positive_index = tf.greater(sorted_indices_tensor,
tf.zeros(input_length, tf.int32))
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(valid_positive_index, tf.int32))
max_num_positive_samples = tf.constant(
int(sample_size * self._positive_fraction), tf.int32)
num_positive_samples = tf.minimum(max_num_positive_samples, num_sampled_pos)
num_negative_samples = tf.constant(sample_size,
tf.int32) - num_positive_samples
return num_positive_samples, num_negative_samples
def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
num_end_samples, total_num_samples):
"""slices num_start_samples and last num_end_samples from input_tensor.
Args:
input_tensor: An int32 tensor of shape [N] to be sliced.
num_start_samples: Number of examples to be sliced from the beginning
of the input tensor.
num_end_samples: Number of examples to be sliced from the end of the
input tensor.
total_num_samples: Sum of is num_start_samples and num_end_samples. This
should be a scalar.
Returns:
A tensor containing the first num_start_samples and last num_end_samples
from input_tensor.
"""
input_length = tf.shape(input=input_tensor)[0]
start_positions = tf.less(tf.range(input_length), num_start_samples)
end_positions = tf.greater_equal(
tf.range(input_length), input_length - num_end_samples)
selected_positions = tf.logical_or(start_positions, end_positions)
selected_positions = tf.cast(selected_positions, tf.float32)
indexed_positions = tf.multiply(tf.cumsum(selected_positions),
selected_positions)
one_hot_selector = tf.one_hot(tf.cast(indexed_positions, tf.int32) - 1,
total_num_samples,
dtype=tf.float32)
return tf.cast(tf.tensordot(tf.cast(input_tensor, tf.float32),
one_hot_selector, axes=[0, 0]), tf.int32)
def _static_subsample(self, indicator, batch_size, labels):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
N should be a complie time constant.
batch_size: desired batch size. This scalar cannot be None.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples. N should be a complie time constant.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled. It ensures the length of output of the subsample is always
batch_size, even when number of examples set to True in indicator is
less than batch_size.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
# Check if indicator and labels have a static size.
if not indicator.shape.is_fully_defined():
raise ValueError('indicator must be static in shape when is_static is'
'True')
if not labels.shape.is_fully_defined():
raise ValueError('labels must be static in shape when is_static is'
'True')
if not isinstance(batch_size, int):
raise ValueError('batch_size has to be an integer when is_static is'
'True.')
input_length = tf.shape(input=indicator)[0]
# Set the number of examples set True in indicator to be at least
# batch_size.
num_true_sampled = tf.reduce_sum(
input_tensor=tf.cast(indicator, tf.float32))
additional_false_sample = tf.less_equal(
tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
batch_size - num_true_sampled)
indicator = tf.logical_or(indicator, additional_false_sample)
# Shuffle indicator and label. Need to store the permutation to restore the
# order post sampling.
permutation = tf.random.shuffle(tf.range(input_length))
indicator = ops.matmul_gather_on_zeroth_axis(
tf.cast(indicator, tf.float32), permutation)
labels = ops.matmul_gather_on_zeroth_axis(
tf.cast(labels, tf.float32), permutation)
# index (starting from 1) when indicator is True, 0 when False
indicator_idx = tf.where(
tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
tf.zeros(input_length, tf.int32))
# Replace -1 for negative, +1 for positive labels
signed_label = tf.where(
tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
# negative of index for negative label, positive index for positive label,
# 0 when indicator is False.
signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
sorted_signed_indicator_idx = tf.nn.top_k(
signed_indicator_idx, input_length, sorted=True).values
[num_positive_samples,
num_negative_samples] = self._get_num_pos_neg_samples(
sorted_signed_indicator_idx, batch_size)
sampled_idx = self._get_values_from_start_and_end(
sorted_signed_indicator_idx, num_positive_samples,
num_negative_samples, batch_size)
# Shift the indices to start from 0 and remove any samples that are set as
# False.
sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
sampled_idx = tf.multiply(
tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
sampled_idx)
sampled_idx_indicator = tf.cast(
tf.reduce_sum(
input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
tf.bool)
# project back the order based on stored permutations
reprojections = tf.one_hot(permutation, depth=input_length,
dtype=tf.float32)
return tf.cast(tf.tensordot(
tf.cast(sampled_idx_indicator, tf.float32),
reprojections, axes=[0, 0]), tf.bool)
def subsample(self, indicator, batch_size, labels, scope=None):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
batch_size: desired batch size. If None, keeps all positive samples and
randomly selects negative samples so that the positive sample fraction
matches self._positive_fraction. It cannot be None is is_static is True.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples.
scope: name scope.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
if len(indicator.get_shape().as_list()) != 1:
raise ValueError('indicator must be 1 dimensional, got a tensor of '
'shape %s' % indicator.get_shape())
if len(labels.get_shape().as_list()) != 1:
raise ValueError('labels must be 1 dimensional, got a tensor of '
'shape %s' % labels.get_shape())
if labels.dtype != tf.bool:
raise ValueError('labels should be of type bool. Received: %s' %
labels.dtype)
if indicator.dtype != tf.bool:
raise ValueError('indicator should be of type bool. Received: %s' %
indicator.dtype)
scope = scope or 'BalancedPositiveNegativeSampler'
with tf.name_scope(scope):
if self._is_static:
return self._static_subsample(indicator, batch_size, labels)
else:
# Only sample from indicated samples
negative_idx = tf.logical_not(labels)
positive_idx = tf.logical_and(labels, indicator)
negative_idx = tf.logical_and(negative_idx, indicator)
# Sample positive and negative samples separately
if batch_size is None:
max_num_pos = tf.reduce_sum(
input_tensor=tf.cast(positive_idx, dtype=tf.int32))
else:
max_num_pos = int(self._positive_fraction * batch_size)
sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos)
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(sampled_pos_idx, tf.int32))
if batch_size is None:
negative_positive_ratio = (
1 - self._positive_fraction) / self._positive_fraction
max_num_neg = tf.cast(
negative_positive_ratio *
tf.cast(num_sampled_pos, dtype=tf.float32),
dtype=tf.int32)
else:
max_num_neg = batch_size - num_sampled_pos
sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg)
return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base box coder.
Box coders convert between coordinate frames, namely image-centric
(with (0,0) on the top left of image) and anchor-centric (with (0,0) being
defined by a specific anchor).
Users of a BoxCoder can call two methods:
encode: which encodes a box with respect to a given anchor
(or rather, a tensor of boxes wrt a corresponding tensor of anchors) and
decode: which inverts this encoding with a decode operation.
In both cases, the arguments are assumed to be in 1-1 correspondence already;
it is not the job of a BoxCoder to perform matching.
"""
from abc import ABCMeta
from abc import abstractmethod
from abc import abstractproperty
import tensorflow.compat.v2 as tf
# Box coder types.
FASTER_RCNN = 'faster_rcnn'
KEYPOINT = 'keypoint'
MEAN_STDDEV = 'mean_stddev'
SQUARE = 'square'
class BoxCoder(object):
"""Abstract base class for box coder."""
__metaclass__ = ABCMeta
@abstractproperty
def code_size(self):
"""Return the size of each code.
This number is a constant and should agree with the output of the `encode`
op (e.g. if rel_codes is the output of self.encode(...), then it should have
shape [N, code_size()]). This abstractproperty should be overridden by
implementations.
Returns:
an integer constant
"""
pass
def encode(self, boxes, anchors):
"""Encode a box list relative to an anchor collection.
Args:
boxes: BoxList holding N boxes to be encoded
anchors: BoxList of N anchors
Returns:
a tensor representing N relative-encoded boxes
"""
with tf.name_scope('Encode'):
return self._encode(boxes, anchors)
def decode(self, rel_codes, anchors):
"""Decode boxes that are encoded relative to an anchor collection.
Args:
rel_codes: a tensor representing N relative-encoded boxes
anchors: BoxList of anchors
Returns:
boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
with corners y_min, x_min, y_max, x_max)
"""
with tf.name_scope('Decode'):
return self._decode(rel_codes, anchors)
@abstractmethod
def _encode(self, boxes, anchors):
"""Method to be overriden by implementations.
Args:
boxes: BoxList holding N boxes to be encoded
anchors: BoxList of N anchors
Returns:
a tensor representing N relative-encoded boxes
"""
pass
@abstractmethod
def _decode(self, rel_codes, anchors):
"""Method to be overriden by implementations.
Args:
rel_codes: a tensor representing N relative-encoded boxes
anchors: BoxList of anchors
Returns:
boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
with corners y_min, x_min, y_max, x_max)
"""
pass
def batch_decode(encoded_boxes, box_coder, anchors):
"""Decode a batch of encoded boxes.
This op takes a batch of encoded bounding boxes and transforms
them to a batch of bounding boxes specified by their corners in
the order of [y_min, x_min, y_max, x_max].
Args:
encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
code_size] representing the location of the objects.
box_coder: a BoxCoder object.
anchors: a BoxList of anchors used to encode `encoded_boxes`.
Returns:
decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
coder_size] representing the corners of the objects in the order
of [y_min, x_min, y_max, x_max].
Raises:
ValueError: if batch sizes of the inputs are inconsistent, or if
the number of anchors inferred from encoded_boxes and anchors are
inconsistent.
"""
encoded_boxes.get_shape().assert_has_rank(3)
if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static():
raise ValueError('The number of anchors inferred from encoded_boxes'
' and anchors are inconsistent: shape[1] of encoded_boxes'
' %s should be equal to the number of anchors: %s.' %
(encoded_boxes.get_shape()[1].value,
anchors.num_boxes_static()))
decoded_boxes = tf.stack([
box_coder.decode(boxes, anchors).get()
for boxes in tf.unstack(encoded_boxes)
])
return decoded_boxes
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bounding Box List definition.
BoxList represents a list of bounding boxes as tensorflow
tensors, where each bounding box is represented as a row of 4 numbers,
[y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes
within a given list correspond to a single image. See also
box_list_ops.py for common box related operations (such as area, iou, etc).
Optionally, users can add additional related fields (such as weights).
We assume the following things to be true about fields:
* they correspond to boxes in the box_list along the 0th dimension
* they have inferrable rank at graph construction time
* all dimensions except for possibly the 0th can be inferred
(i.e., not None) at graph construction time.
Some other notes:
* Following tensorflow conventions, we use height, width ordering,
and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering
* Tensors are always provided as (flat) [N, 4] tensors.
"""
import tensorflow.compat.v2 as tf
class BoxList(object):
"""Box collection."""
def __init__(self, boxes):
"""Constructs box collection.
Args:
boxes: a tensor of shape [N, 4] representing box corners
Raises:
ValueError: if invalid dimensions for bbox data or if bbox data is not in
float32 format.
"""
if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4:
raise ValueError('Invalid dimensions for box data.')
if boxes.dtype != tf.float32:
raise ValueError('Invalid tensor type: should be tf.float32')
self.data = {'boxes': boxes}
def num_boxes(self):
"""Returns number of boxes held in collection.
Returns:
a tensor representing the number of boxes held in the collection.
"""
return tf.shape(input=self.data['boxes'])[0]
def num_boxes_static(self):
"""Returns number of boxes held in collection.
This number is inferred at graph construction time rather than run-time.
Returns:
Number of boxes held in collection (integer) or None if this is not
inferrable at graph construction time.
"""
return self.data['boxes'].get_shape().dims[0].value
def get_all_fields(self):
"""Returns all fields."""
return self.data.keys()
def get_extra_fields(self):
"""Returns all non-box fields (i.e., everything not named 'boxes')."""
return [k for k in self.data.keys() if k != 'boxes']
def add_field(self, field, field_data):
"""Add field to box list.
This method can be used to add related box data such as
weights/labels, etc.
Args:
field: a string key to access the data via `get`
field_data: a tensor containing the data to store in the BoxList
"""
self.data[field] = field_data
def has_field(self, field):
return field in self.data
def get(self):
"""Convenience function for accessing box coordinates.
Returns:
a tensor with shape [N, 4] representing box coordinates.
"""
return self.get_field('boxes')
def set(self, boxes):
"""Convenience function for setting box coordinates.
Args:
boxes: a tensor of shape [N, 4] representing box corners
Raises:
ValueError: if invalid dimensions for bbox data
"""
if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4:
raise ValueError('Invalid dimensions for box data.')
self.data['boxes'] = boxes
def get_field(self, field):
"""Accesses a box collection and associated fields.
This function returns specified field with object; if no field is specified,
it returns the box coordinates.
Args:
field: this optional string parameter can be used to specify
a related field to be accessed.
Returns:
a tensor representing the box collection or an associated field.
Raises:
ValueError: if invalid field
"""
if not self.has_field(field):
raise ValueError('field ' + str(field) + ' does not exist')
return self.data[field]
def set_field(self, field, value):
"""Sets the value of a field.
Updates the field of a box_list with a given value.
Args:
field: (string) name of the field to set value.
value: the value to assign to the field.
Raises:
ValueError: if the box_list does not have specified field.
"""
if not self.has_field(field):
raise ValueError('field %s does not exist' % field)
self.data[field] = value
def get_center_coordinates_and_sizes(self, scope=None):
"""Computes the center coordinates, height and width of the boxes.
Args:
scope: name scope of the function.
Returns:
a list of 4 1-D tensors [ycenter, xcenter, height, width].
"""
if not scope:
scope = 'get_center_coordinates_and_sizes'
with tf.name_scope(scope):
box_corners = self.get()
ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(a=box_corners))
width = xmax - xmin
height = ymax - ymin
ycenter = ymin + height / 2.
xcenter = xmin + width / 2.
return [ycenter, xcenter, height, width]
def transpose_coordinates(self, scope=None):
"""Transpose the coordinate representation in a boxlist.
Args:
scope: name scope of the function.
"""
if not scope:
scope = 'transpose_coordinates'
with tf.name_scope(scope):
y_min, x_min, y_max, x_max = tf.split(
value=self.get(), num_or_size_splits=4, axis=1)
self.set(tf.concat([x_min, y_min, x_max, y_max], 1))
def as_tensor_dict(self, fields=None):
"""Retrieves specified fields as a dictionary of tensors.
Args:
fields: (optional) list of fields to return in the dictionary.
If None (default), all fields are returned.
Returns:
tensor_dict: A dictionary of tensors specified by fields.
Raises:
ValueError: if specified field is not contained in boxlist.
"""
tensor_dict = {}
if fields is None:
fields = self.get_all_fields()
for field in fields:
if not self.has_field(field):
raise ValueError('boxlist must contain all specified fields')
tensor_dict[field] = self.get_field(field)
return tensor_dict
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bounding Box List operations.
Example box operations that are supported:
* areas: compute bounding box areas
* iou: pairwise intersection-over-union scores
* sq_dist: pairwise distances between bounding boxes
Whenever box_list_ops functions output a BoxList, the fields of the incoming
BoxList are retained unless documented otherwise.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import range
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import box_list
from official.vision.detection.utils.object_detection import ops
class SortOrder(object):
"""Enum class for sort order.
Attributes:
ascend: ascend order.
descend: descend order.
"""
ascend = 1
descend = 2
def area(boxlist, scope=None):
"""Computes area of boxes.
Args:
boxlist: BoxList holding N boxes
scope: name scope.
Returns:
a tensor with shape [N] representing box areas.
"""
with tf.name_scope(scope, 'Area'):
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
def height_width(boxlist, scope=None):
"""Computes height and width of boxes in boxlist.
Args:
boxlist: BoxList holding N boxes
scope: name scope.
Returns:
Height: A tensor with shape [N] representing box heights.
Width: A tensor with shape [N] representing box widths.
"""
with tf.name_scope(scope, 'HeightWidth'):
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
return tf.squeeze(y_max - y_min, [1]), tf.squeeze(x_max - x_min, [1])
def scale(boxlist, y_scale, x_scale, scope=None):
"""scale box coordinates in x and y dimensions.
Args:
boxlist: BoxList holding N boxes
y_scale: (float) scalar tensor
x_scale: (float) scalar tensor
scope: name scope.
Returns:
boxlist: BoxList holding N boxes
"""
with tf.name_scope(scope, 'Scale'):
y_scale = tf.cast(y_scale, tf.float32)
x_scale = tf.cast(x_scale, tf.float32)
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
y_min = y_scale * y_min
y_max = y_scale * y_max
x_min = x_scale * x_min
x_max = x_scale * x_max
scaled_boxlist = box_list.BoxList(
tf.concat([y_min, x_min, y_max, x_max], 1))
return _copy_extra_fields(scaled_boxlist, boxlist)
def clip_to_window(boxlist, window, filter_nonoverlapping=True, scope=None):
"""Clip bounding boxes to a window.
This op clips any input bounding boxes (represented by bounding box
corners) to a window, optionally filtering out boxes that do not
overlap at all with the window.
Args:
boxlist: BoxList holding M_in boxes
window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
window to which the op should clip boxes.
filter_nonoverlapping: whether to filter out boxes that do not overlap at
all with the window.
scope: name scope.
Returns:
a BoxList holding M_out boxes where M_out <= M_in
"""
with tf.name_scope(scope, 'ClipToWindow'):
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
y_min_clipped = tf.maximum(tf.minimum(y_min, win_y_max), win_y_min)
y_max_clipped = tf.maximum(tf.minimum(y_max, win_y_max), win_y_min)
x_min_clipped = tf.maximum(tf.minimum(x_min, win_x_max), win_x_min)
x_max_clipped = tf.maximum(tf.minimum(x_max, win_x_max), win_x_min)
clipped = box_list.BoxList(
tf.concat([y_min_clipped, x_min_clipped, y_max_clipped, x_max_clipped],
1))
clipped = _copy_extra_fields(clipped, boxlist)
if filter_nonoverlapping:
areas = area(clipped)
nonzero_area_indices = tf.cast(
tf.reshape(tf.where(tf.greater(areas, 0.0)), [-1]), tf.int32)
clipped = gather(clipped, nonzero_area_indices)
return clipped
def prune_outside_window(boxlist, window, scope=None):
"""Prunes bounding boxes that fall outside a given window.
This function prunes bounding boxes that even partially fall outside the given
window. See also clip_to_window which only prunes bounding boxes that fall
completely outside the window, and clips any bounding boxes that partially
overflow.
Args:
boxlist: a BoxList holding M_in boxes.
window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax]
of the window
scope: name scope.
Returns:
pruned_corners: a tensor with shape [M_out, 4] where M_out <= M_in
valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes
in the input tensor.
"""
with tf.name_scope(scope, 'PruneOutsideWindow'):
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
coordinate_violations = tf.concat([
tf.less(y_min, win_y_min), tf.less(x_min, win_x_min),
tf.greater(y_max, win_y_max), tf.greater(x_max, win_x_max)
], 1)
valid_indices = tf.reshape(
tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1])
return gather(boxlist, valid_indices), valid_indices
def prune_completely_outside_window(boxlist, window, scope=None):
"""Prunes bounding boxes that fall completely outside of the given window.
The function clip_to_window prunes bounding boxes that fall
completely outside the window, but also clips any bounding boxes that
partially overflow. This function does not clip partially overflowing boxes.
Args:
boxlist: a BoxList holding M_in boxes.
window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax]
of the window
scope: name scope.
Returns:
pruned_boxlist: a new BoxList with all bounding boxes partially or fully in
the window.
valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes
in the input tensor.
"""
with tf.name_scope(scope, 'PruneCompleteleyOutsideWindow'):
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
coordinate_violations = tf.concat([
tf.greater_equal(y_min, win_y_max), tf.greater_equal(x_min, win_x_max),
tf.less_equal(y_max, win_y_min), tf.less_equal(x_max, win_x_min)
], 1)
valid_indices = tf.reshape(
tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1])
return gather(boxlist, valid_indices), valid_indices
def intersection(boxlist1, boxlist2, scope=None):
"""Compute pairwise intersection areas between boxes.
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding M boxes
scope: name scope.
Returns:
a tensor with shape [N, M] representing pairwise intersections
"""
with tf.name_scope(scope, 'Intersection'):
y_min1, x_min1, y_max1, x_max1 = tf.split(
value=boxlist1.get(), num_or_size_splits=4, axis=1)
y_min2, x_min2, y_max2, x_max2 = tf.split(
value=boxlist2.get(), num_or_size_splits=4, axis=1)
all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
def matched_intersection(boxlist1, boxlist2, scope=None):
"""Compute intersection areas between corresponding boxes in two boxlists.
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding N boxes
scope: name scope.
Returns:
a tensor with shape [N] representing pairwise intersections
"""
with tf.name_scope(scope, 'MatchedIntersection'):
y_min1, x_min1, y_max1, x_max1 = tf.split(
value=boxlist1.get(), num_or_size_splits=4, axis=1)
y_min2, x_min2, y_max2, x_max2 = tf.split(
value=boxlist2.get(), num_or_size_splits=4, axis=1)
min_ymax = tf.minimum(y_max1, y_max2)
max_ymin = tf.maximum(y_min1, y_min2)
intersect_heights = tf.maximum(0.0, min_ymax - max_ymin)
min_xmax = tf.minimum(x_max1, x_max2)
max_xmin = tf.maximum(x_min1, x_min2)
intersect_widths = tf.maximum(0.0, min_xmax - max_xmin)
return tf.reshape(intersect_heights * intersect_widths, [-1])
def iou(boxlist1, boxlist2, scope=None):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding M boxes
scope: name scope.
Returns:
a tensor with shape [N, M] representing pairwise iou scores.
"""
with tf.name_scope(scope, 'IOU'):
intersections = intersection(boxlist1, boxlist2)
areas1 = area(boxlist1)
areas2 = area(boxlist2)
unions = (
tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
return tf.where(
tf.equal(intersections, 0.0),
tf.zeros_like(intersections), tf.truediv(intersections, unions))
def matched_iou(boxlist1, boxlist2, scope=None):
"""Compute intersection-over-union between corresponding boxes in boxlists.
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding N boxes
scope: name scope.
Returns:
a tensor with shape [N] representing pairwise iou scores.
"""
with tf.name_scope(scope, 'MatchedIOU'):
intersections = matched_intersection(boxlist1, boxlist2)
areas1 = area(boxlist1)
areas2 = area(boxlist2)
unions = areas1 + areas2 - intersections
return tf.where(
tf.equal(intersections, 0.0),
tf.zeros_like(intersections), tf.truediv(intersections, unions))
def ioa(boxlist1, boxlist2, scope=None):
"""Computes pairwise intersection-over-area between box collections.
intersection-over-area (IOA) between two boxes box1 and box2 is defined as
their intersection area over box2's area. Note that ioa is not symmetric,
that is, ioa(box1, box2) != ioa(box2, box1).
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding M boxes
scope: name scope.
Returns:
a tensor with shape [N, M] representing pairwise ioa scores.
"""
with tf.name_scope(scope, 'IOA'):
intersections = intersection(boxlist1, boxlist2)
areas = tf.expand_dims(area(boxlist2), 0)
return tf.truediv(intersections, areas)
def prune_non_overlapping_boxes(
boxlist1, boxlist2, min_overlap=0.0, scope=None):
"""Prunes the boxes in boxlist1 that overlap less than thresh with boxlist2.
For each box in boxlist1, we want its IOA to be more than minoverlap with
at least one of the boxes in boxlist2. If it does not, we remove it.
Args:
boxlist1: BoxList holding N boxes.
boxlist2: BoxList holding M boxes.
min_overlap: Minimum required overlap between boxes, to count them as
overlapping.
scope: name scope.
Returns:
new_boxlist1: A pruned boxlist with size [N', 4].
keep_inds: A tensor with shape [N'] indexing kept bounding boxes in the
first input BoxList `boxlist1`.
"""
with tf.name_scope(scope, 'PruneNonOverlappingBoxes'):
ioa_ = ioa(boxlist2, boxlist1) # [M, N] tensor
ioa_ = tf.reduce_max(ioa_, reduction_indices=[0]) # [N] tensor
keep_bool = tf.greater_equal(ioa_, tf.constant(min_overlap))
keep_inds = tf.squeeze(tf.where(keep_bool), axis=[1])
new_boxlist1 = gather(boxlist1, keep_inds)
return new_boxlist1, keep_inds
def prune_small_boxes(boxlist, min_side, scope=None):
"""Prunes small boxes in the boxlist which have a side smaller than min_side.
Args:
boxlist: BoxList holding N boxes.
min_side: Minimum width AND height of box to survive pruning.
scope: name scope.
Returns:
A pruned boxlist.
"""
with tf.name_scope(scope, 'PruneSmallBoxes'):
height, width = height_width(boxlist)
is_valid = tf.logical_and(tf.greater_equal(width, min_side),
tf.greater_equal(height, min_side))
return gather(boxlist, tf.reshape(tf.where(is_valid), [-1]))
def change_coordinate_frame(boxlist, window, scope=None):
"""Change coordinate frame of the boxlist to be relative to window's frame.
Given a window of the form [ymin, xmin, ymax, xmax],
changes bounding box coordinates from boxlist to be relative to this window
(e.g., the min corner maps to (0,0) and the max corner maps to (1,1)).
An example use case is data augmentation: where we are given groundtruth
boxes (boxlist) and would like to randomly crop the image to some
window (window). In this case we need to change the coordinate frame of
each groundtruth box to be relative to this new window.
Args:
boxlist: A BoxList object holding N boxes.
window: A rank 1 tensor [4].
scope: name scope.
Returns:
Returns a BoxList object with N boxes.
"""
with tf.name_scope(scope, 'ChangeCoordinateFrame'):
win_height = window[2] - window[0]
win_width = window[3] - window[1]
boxlist_new = scale(box_list.BoxList(
boxlist.get() - [window[0], window[1], window[0], window[1]]),
1.0 / win_height, 1.0 / win_width)
boxlist_new = _copy_extra_fields(boxlist_new, boxlist)
return boxlist_new
def sq_dist(boxlist1, boxlist2, scope=None):
"""Computes the pairwise squared distances between box corners.
This op treats each box as if it were a point in a 4d Euclidean space and
computes pairwise squared distances.
Mathematically, we are given two matrices of box coordinates X and Y,
where X(i,:) is the i'th row of X, containing the 4 numbers defining the
corners of the i'th box in boxlist1. Similarly Y(j,:) corresponds to
boxlist2. We compute
Z(i,j) = ||X(i,:) - Y(j,:)||^2
= ||X(i,:)||^2 + ||Y(j,:)||^2 - 2 X(i,:)' * Y(j,:),
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding M boxes
scope: name scope.
Returns:
a tensor with shape [N, M] representing pairwise distances
"""
with tf.name_scope(scope, 'SqDist'):
sqnorm1 = tf.reduce_sum(tf.square(boxlist1.get()), 1, keep_dims=True)
sqnorm2 = tf.reduce_sum(tf.square(boxlist2.get()), 1, keep_dims=True)
innerprod = tf.matmul(boxlist1.get(), boxlist2.get(),
transpose_a=False, transpose_b=True)
return sqnorm1 + tf.transpose(sqnorm2) - 2.0 * innerprod
def boolean_mask(boxlist, indicator, fields=None, scope=None,
use_static_shapes=False, indicator_sum=None):
"""Select boxes from BoxList according to indicator and return new BoxList.
`boolean_mask` returns the subset of boxes that are marked as "True" by the
indicator tensor. By default, `boolean_mask` returns boxes corresponding to
the input index list, as well as all additional fields stored in the boxlist
(indexing into the first dimension). However one can optionally only draw
from a subset of fields.
Args:
boxlist: BoxList holding N boxes
indicator: a rank-1 boolean tensor
fields: (optional) list of fields to also gather from. If None (default),
all fields are gathered from. Pass an empty fields list to only gather
the box coordinates.
scope: name scope.
use_static_shapes: Whether to use an implementation with static shape
gurantees.
indicator_sum: An integer containing the sum of `indicator` vector. Only
required if `use_static_shape` is True.
Returns:
subboxlist: a BoxList corresponding to the subset of the input BoxList
specified by indicator
Raises:
ValueError: if `indicator` is not a rank-1 boolean tensor.
"""
with tf.name_scope(scope, 'BooleanMask'):
if indicator.shape.ndims != 1:
raise ValueError('indicator should have rank 1')
if indicator.dtype != tf.bool:
raise ValueError('indicator should be a boolean tensor')
if use_static_shapes:
if not (indicator_sum and isinstance(indicator_sum, int)):
raise ValueError('`indicator_sum` must be a of type int')
selected_positions = tf.cast(indicator, dtype=tf.float32)
indexed_positions = tf.cast(
tf.multiply(
tf.cumsum(selected_positions), selected_positions),
dtype=tf.int32)
one_hot_selector = tf.one_hot(
indexed_positions - 1, indicator_sum, dtype=tf.float32)
sampled_indices = tf.cast(
tf.tensordot(
tf.cast(tf.range(tf.shape(indicator)[0]), dtype=tf.float32),
one_hot_selector,
axes=[0, 0]),
dtype=tf.int32)
return gather(boxlist, sampled_indices, use_static_shapes=True)
else:
subboxlist = box_list.BoxList(tf.boolean_mask(boxlist.get(), indicator))
if fields is None:
fields = boxlist.get_extra_fields()
for field in fields:
if not boxlist.has_field(field):
raise ValueError('boxlist must contain all specified fields')
subfieldlist = tf.boolean_mask(boxlist.get_field(field), indicator)
subboxlist.add_field(field, subfieldlist)
return subboxlist
def gather(boxlist, indices, fields=None, scope=None, use_static_shapes=False):
"""Gather boxes from BoxList according to indices and return new BoxList.
By default, `gather` returns boxes corresponding to the input index list, as
well as all additional fields stored in the boxlist (indexing into the
first dimension). However one can optionally only gather from a
subset of fields.
Args:
boxlist: BoxList holding N boxes
indices: a rank-1 tensor of type int32 / int64
fields: (optional) list of fields to also gather from. If None (default),
all fields are gathered from. Pass an empty fields list to only gather
the box coordinates.
scope: name scope.
use_static_shapes: Whether to use an implementation with static shape
gurantees.
Returns:
subboxlist: a BoxList corresponding to the subset of the input BoxList
specified by indices
Raises:
ValueError: if specified field is not contained in boxlist or if the
indices are not of type int32
"""
with tf.name_scope(scope, 'Gather'):
if len(indices.shape.as_list()) != 1:
raise ValueError('indices should have rank 1')
if indices.dtype != tf.int32 and indices.dtype != tf.int64:
raise ValueError('indices should be an int32 / int64 tensor')
gather_op = tf.gather
if use_static_shapes:
gather_op = ops.matmul_gather_on_zeroth_axis
subboxlist = box_list.BoxList(gather_op(boxlist.get(), indices))
if fields is None:
fields = boxlist.get_extra_fields()
fields += ['boxes']
for field in fields:
if not boxlist.has_field(field):
raise ValueError('boxlist must contain all specified fields')
subfieldlist = gather_op(boxlist.get_field(field), indices)
subboxlist.add_field(field, subfieldlist)
return subboxlist
def concatenate(boxlists, fields=None, scope=None):
"""Concatenate list of BoxLists.
This op concatenates a list of input BoxLists into a larger BoxList. It also
handles concatenation of BoxList fields as long as the field tensor shapes
are equal except for the first dimension.
Args:
boxlists: list of BoxList objects
fields: optional list of fields to also concatenate. By default, all
fields from the first BoxList in the list are included in the
concatenation.
scope: name scope.
Returns:
a BoxList with number of boxes equal to
sum([boxlist.num_boxes() for boxlist in BoxList])
Raises:
ValueError: if boxlists is invalid (i.e., is not a list, is empty, or
contains non BoxList objects), or if requested fields are not contained in
all boxlists
"""
with tf.name_scope(scope, 'Concatenate'):
if not isinstance(boxlists, list):
raise ValueError('boxlists should be a list')
if not boxlists:
raise ValueError('boxlists should have nonzero length')
for boxlist in boxlists:
if not isinstance(boxlist, box_list.BoxList):
raise ValueError('all elements of boxlists should be BoxList objects')
concatenated = box_list.BoxList(
tf.concat([boxlist.get() for boxlist in boxlists], 0))
if fields is None:
fields = boxlists[0].get_extra_fields()
for field in fields:
first_field_shape = boxlists[0].get_field(field).get_shape().as_list()
first_field_shape[0] = -1
if None in first_field_shape:
raise ValueError('field %s must have fully defined shape except for the'
' 0th dimension.' % field)
for boxlist in boxlists:
if not boxlist.has_field(field):
raise ValueError('boxlist must contain all requested fields')
field_shape = boxlist.get_field(field).get_shape().as_list()
field_shape[0] = -1
if field_shape != first_field_shape:
raise ValueError('field %s must have same shape for all boxlists '
'except for the 0th dimension.' % field)
concatenated_field = tf.concat(
[boxlist.get_field(field) for boxlist in boxlists], 0)
concatenated.add_field(field, concatenated_field)
return concatenated
def sort_by_field(boxlist, field, order=SortOrder.descend, scope=None):
"""Sort boxes and associated fields according to a scalar field.
A common use case is reordering the boxes according to descending scores.
Args:
boxlist: BoxList holding N boxes.
field: A BoxList field for sorting and reordering the BoxList.
order: (Optional) descend or ascend. Default is descend.
scope: name scope.
Returns:
sorted_boxlist: A sorted BoxList with the field in the specified order.
Raises:
ValueError: if specified field does not exist
ValueError: if the order is not either descend or ascend
"""
with tf.name_scope(scope, 'SortByField'):
if order != SortOrder.descend and order != SortOrder.ascend:
raise ValueError('Invalid sort order')
field_to_sort = boxlist.get_field(field)
if len(field_to_sort.shape.as_list()) != 1:
raise ValueError('Field should have rank 1')
num_boxes = boxlist.num_boxes()
num_entries = tf.size(field_to_sort)
length_assert = tf.Assert(
tf.equal(num_boxes, num_entries),
['Incorrect field size: actual vs expected.', num_entries, num_boxes])
with tf.control_dependencies([length_assert]):
_, sorted_indices = tf.nn.top_k(field_to_sort, num_boxes, sorted=True)
if order == SortOrder.ascend:
sorted_indices = tf.reverse_v2(sorted_indices, [0])
return gather(boxlist, sorted_indices)
def visualize_boxes_in_image(image, boxlist, normalized=False, scope=None):
"""Overlay bounding box list on image.
Currently this visualization plots a 1 pixel thick red bounding box on top
of the image. Note that tf.image.draw_bounding_boxes essentially is
1 indexed.
Args:
image: an image tensor with shape [height, width, 3]
boxlist: a BoxList
normalized: (boolean) specify whether corners are to be interpreted
as absolute coordinates in image space or normalized with respect to the
image size.
scope: name scope.
Returns:
image_and_boxes: an image tensor with shape [height, width, 3]
"""
with tf.name_scope(scope, 'VisualizeBoxesInImage'):
if not normalized:
height, width, _ = tf.unstack(tf.shape(image))
boxlist = scale(boxlist,
1.0 / tf.cast(height, tf.float32),
1.0 / tf.cast(width, tf.float32))
corners = tf.expand_dims(boxlist.get(), 0)
image = tf.expand_dims(image, 0)
return tf.squeeze(tf.image.draw_bounding_boxes(image, corners), [0])
def filter_field_value_equals(boxlist, field, value, scope=None):
"""Filter to keep only boxes with field entries equal to the given value.
Args:
boxlist: BoxList holding N boxes.
field: field name for filtering.
value: scalar value.
scope: name scope.
Returns:
a BoxList holding M boxes where M <= N
Raises:
ValueError: if boxlist not a BoxList object or if it does not have
the specified field.
"""
with tf.name_scope(scope, 'FilterFieldValueEquals'):
if not isinstance(boxlist, box_list.BoxList):
raise ValueError('boxlist must be a BoxList')
if not boxlist.has_field(field):
raise ValueError('boxlist must contain the specified field')
filter_field = boxlist.get_field(field)
gather_index = tf.reshape(tf.where(tf.equal(filter_field, value)), [-1])
return gather(boxlist, gather_index)
def filter_greater_than(boxlist, thresh, scope=None):
"""Filter to keep only boxes with score exceeding a given threshold.
This op keeps the collection of boxes whose corresponding scores are
greater than the input threshold.
TODO(jonathanhuang): Change function name to filter_scores_greater_than
Args:
boxlist: BoxList holding N boxes. Must contain a 'scores' field
representing detection scores.
thresh: scalar threshold
scope: name scope.
Returns:
a BoxList holding M boxes where M <= N
Raises:
ValueError: if boxlist not a BoxList object or if it does not
have a scores field
"""
with tf.name_scope(scope, 'FilterGreaterThan'):
if not isinstance(boxlist, box_list.BoxList):
raise ValueError('boxlist must be a BoxList')
if not boxlist.has_field('scores'):
raise ValueError('input boxlist must have \'scores\' field')
scores = boxlist.get_field('scores')
if len(scores.shape.as_list()) > 2:
raise ValueError('Scores should have rank 1 or 2')
if len(scores.shape.as_list()) == 2 and scores.shape.as_list()[1] != 1:
raise ValueError('Scores should have rank 1 or have shape '
'consistent with [None, 1]')
high_score_indices = tf.cast(tf.reshape(
tf.where(tf.greater(scores, thresh)),
[-1]), tf.int32)
return gather(boxlist, high_score_indices)
def non_max_suppression(boxlist, thresh, max_output_size, scope=None):
"""Non maximum suppression.
This op greedily selects a subset of detection bounding boxes, pruning
away boxes that have high IOU (intersection over union) overlap (> thresh)
with already selected boxes. Note that this only works for a single class ---
to apply NMS to multi-class predictions, use MultiClassNonMaxSuppression.
Args:
boxlist: BoxList holding N boxes. Must contain a 'scores' field
representing detection scores.
thresh: scalar threshold
max_output_size: maximum number of retained boxes
scope: name scope.
Returns:
a BoxList holding M boxes where M <= max_output_size
Raises:
ValueError: if thresh is not in [0, 1]
"""
with tf.name_scope(scope, 'NonMaxSuppression'):
if not 0 <= thresh <= 1.0:
raise ValueError('thresh must be between 0 and 1')
if not isinstance(boxlist, box_list.BoxList):
raise ValueError('boxlist must be a BoxList')
if not boxlist.has_field('scores'):
raise ValueError('input boxlist must have \'scores\' field')
selected_indices = tf.image.non_max_suppression(
boxlist.get(), boxlist.get_field('scores'),
max_output_size, iou_threshold=thresh)
return gather(boxlist, selected_indices)
def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from):
"""Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to.
Args:
boxlist_to_copy_to: BoxList to which extra fields are copied.
boxlist_to_copy_from: BoxList from which fields are copied.
Returns:
boxlist_to_copy_to with extra fields.
"""
for field in boxlist_to_copy_from.get_extra_fields():
boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field))
return boxlist_to_copy_to
def to_normalized_coordinates(boxlist, height, width,
check_range=True, scope=None):
"""Converts absolute box coordinates to normalized coordinates in [0, 1].
Usually one uses the dynamic shape of the image or conv-layer tensor:
boxlist = box_list_ops.to_normalized_coordinates(boxlist,
tf.shape(images)[1],
tf.shape(images)[2]),
This function raises an assertion failed error at graph execution time when
the maximum coordinate is smaller than 1.01 (which means that coordinates are
already normalized). The value 1.01 is to deal with small rounding errors.
Args:
boxlist: BoxList with coordinates in terms of pixel-locations.
height: Maximum value for height of absolute box coordinates.
width: Maximum value for width of absolute box coordinates.
check_range: If True, checks if the coordinates are normalized or not.
scope: name scope.
Returns:
boxlist with normalized coordinates in [0, 1].
"""
with tf.name_scope(scope, 'ToNormalizedCoordinates'):
height = tf.cast(height, tf.float32)
width = tf.cast(width, tf.float32)
if check_range:
max_val = tf.reduce_max(boxlist.get())
max_assert = tf.Assert(tf.greater(max_val, 1.01),
['max value is lower than 1.01: ', max_val])
with tf.control_dependencies([max_assert]):
width = tf.identity(width)
return scale(boxlist, 1 / height, 1 / width)
def to_absolute_coordinates(boxlist,
height,
width,
check_range=True,
maximum_normalized_coordinate=1.1,
scope=None):
"""Converts normalized box coordinates to absolute pixel coordinates.
This function raises an assertion failed error when the maximum box coordinate
value is larger than maximum_normalized_coordinate (in which case coordinates
are already absolute).
Args:
boxlist: BoxList with coordinates in range [0, 1].
height: Maximum value for height of absolute box coordinates.
width: Maximum value for width of absolute box coordinates.
check_range: If True, checks if the coordinates are normalized or not.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized, default to 1.1.
scope: name scope.
Returns:
boxlist with absolute coordinates in terms of the image size.
"""
with tf.name_scope(scope, 'ToAbsoluteCoordinates'):
height = tf.cast(height, tf.float32)
width = tf.cast(width, tf.float32)
# Ensure range of input boxes is correct.
if check_range:
box_maximum = tf.reduce_max(boxlist.get())
max_assert = tf.Assert(
tf.greater_equal(maximum_normalized_coordinate, box_maximum),
['maximum box coordinate value is larger '
'than %f: ' % maximum_normalized_coordinate, box_maximum])
with tf.control_dependencies([max_assert]):
width = tf.identity(width)
return scale(boxlist, height, width)
def refine_boxes_multi_class(pool_boxes,
num_classes,
nms_iou_thresh,
nms_max_detections,
voting_iou_thresh=0.5):
"""Refines a pool of boxes using non max suppression and box voting.
Box refinement is done independently for each class.
Args:
pool_boxes: (BoxList) A collection of boxes to be refined. pool_boxes must
have a rank 1 'scores' field and a rank 1 'classes' field.
num_classes: (int scalar) Number of classes.
nms_iou_thresh: (float scalar) iou threshold for non max suppression (NMS).
nms_max_detections: (int scalar) maximum output size for NMS.
voting_iou_thresh: (float scalar) iou threshold for box voting.
Returns:
BoxList of refined boxes.
Raises:
ValueError: if
a) nms_iou_thresh or voting_iou_thresh is not in [0, 1].
b) pool_boxes is not a BoxList.
c) pool_boxes does not have a scores and classes field.
"""
if not 0.0 <= nms_iou_thresh <= 1.0:
raise ValueError('nms_iou_thresh must be between 0 and 1')
if not 0.0 <= voting_iou_thresh <= 1.0:
raise ValueError('voting_iou_thresh must be between 0 and 1')
if not isinstance(pool_boxes, box_list.BoxList):
raise ValueError('pool_boxes must be a BoxList')
if not pool_boxes.has_field('scores'):
raise ValueError('pool_boxes must have a \'scores\' field')
if not pool_boxes.has_field('classes'):
raise ValueError('pool_boxes must have a \'classes\' field')
refined_boxes = []
for i in range(num_classes):
boxes_class = filter_field_value_equals(pool_boxes, 'classes', i)
refined_boxes_class = refine_boxes(boxes_class, nms_iou_thresh,
nms_max_detections, voting_iou_thresh)
refined_boxes.append(refined_boxes_class)
return sort_by_field(concatenate(refined_boxes), 'scores')
def refine_boxes(pool_boxes,
nms_iou_thresh,
nms_max_detections,
voting_iou_thresh=0.5):
"""Refines a pool of boxes using non max suppression and box voting.
Args:
pool_boxes: (BoxList) A collection of boxes to be refined. pool_boxes must
have a rank 1 'scores' field.
nms_iou_thresh: (float scalar) iou threshold for non max suppression (NMS).
nms_max_detections: (int scalar) maximum output size for NMS.
voting_iou_thresh: (float scalar) iou threshold for box voting.
Returns:
BoxList of refined boxes.
Raises:
ValueError: if
a) nms_iou_thresh or voting_iou_thresh is not in [0, 1].
b) pool_boxes is not a BoxList.
c) pool_boxes does not have a scores field.
"""
if not 0.0 <= nms_iou_thresh <= 1.0:
raise ValueError('nms_iou_thresh must be between 0 and 1')
if not 0.0 <= voting_iou_thresh <= 1.0:
raise ValueError('voting_iou_thresh must be between 0 and 1')
if not isinstance(pool_boxes, box_list.BoxList):
raise ValueError('pool_boxes must be a BoxList')
if not pool_boxes.has_field('scores'):
raise ValueError('pool_boxes must have a \'scores\' field')
nms_boxes = non_max_suppression(
pool_boxes, nms_iou_thresh, nms_max_detections)
return box_voting(nms_boxes, pool_boxes, voting_iou_thresh)
def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
"""Performs box voting as described in S. Gidaris and N. Komodakis, ICCV 2015.
Performs box voting as described in 'Object detection via a multi-region &
semantic segmentation-aware CNN model', Gidaris and Komodakis, ICCV 2015. For
each box 'B' in selected_boxes, we find the set 'S' of boxes in pool_boxes
with iou overlap >= iou_thresh. The location of B is set to the weighted
average location of boxes in S (scores are used for weighting). And the score
of B is set to the average score of boxes in S.
Args:
selected_boxes: BoxList containing a subset of boxes in pool_boxes. These
boxes are usually selected from pool_boxes using non max suppression.
pool_boxes: BoxList containing a set of (possibly redundant) boxes.
iou_thresh: (float scalar) iou threshold for matching boxes in
selected_boxes and pool_boxes.
Returns:
BoxList containing averaged locations and scores for each box in
selected_boxes.
Raises:
ValueError: if
a) selected_boxes or pool_boxes is not a BoxList.
b) if iou_thresh is not in [0, 1].
c) pool_boxes does not have a scores field.
"""
if not 0.0 <= iou_thresh <= 1.0:
raise ValueError('iou_thresh must be between 0 and 1')
if not isinstance(selected_boxes, box_list.BoxList):
raise ValueError('selected_boxes must be a BoxList')
if not isinstance(pool_boxes, box_list.BoxList):
raise ValueError('pool_boxes must be a BoxList')
if not pool_boxes.has_field('scores'):
raise ValueError('pool_boxes must have a \'scores\' field')
iou_ = iou(selected_boxes, pool_boxes)
match_indicator = tf.cast(tf.greater(iou_, iou_thresh), dtype=tf.float32)
num_matches = tf.reduce_sum(match_indicator, 1)
# TODO(kbanoop): Handle the case where some boxes in selected_boxes do not
# match to any boxes in pool_boxes. For such boxes without any matches, we
# should return the original boxes without voting.
match_assert = tf.Assert(
tf.reduce_all(tf.greater(num_matches, 0)),
['Each box in selected_boxes must match with at least one box '
'in pool_boxes.'])
scores = tf.expand_dims(pool_boxes.get_field('scores'), 1)
scores_assert = tf.Assert(
tf.reduce_all(tf.greater_equal(scores, 0)),
['Scores must be non negative.'])
with tf.control_dependencies([scores_assert, match_assert]):
sum_scores = tf.matmul(match_indicator, scores)
averaged_scores = tf.reshape(sum_scores, [-1]) / num_matches
box_locations = tf.matmul(match_indicator,
pool_boxes.get() * scores) / sum_scores
averaged_boxes = box_list.BoxList(box_locations)
_copy_extra_fields(averaged_boxes, selected_boxes)
averaged_boxes.add_field('scores', averaged_scores)
return averaged_boxes
def get_minimal_coverage_box(boxlist,
default_box=None,
scope=None):
"""Creates a single bounding box which covers all boxes in the boxlist.
Args:
boxlist: A Boxlist.
default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
this default box will be returned. If None, will use a default box of
[[0., 0., 1., 1.]].
scope: Name scope.
Returns:
A [1, 4] float32 tensor with a bounding box that tightly covers all the
boxes in the box list. If the boxlist does not contain any boxes, the
default box is returned.
"""
with tf.name_scope(scope, 'CreateCoverageBox'):
num_boxes = boxlist.num_boxes()
def coverage_box(bboxes):
y_min, x_min, y_max, x_max = tf.split(
value=bboxes, num_or_size_splits=4, axis=1)
y_min_coverage = tf.reduce_min(y_min, axis=0)
x_min_coverage = tf.reduce_min(x_min, axis=0)
y_max_coverage = tf.reduce_max(y_max, axis=0)
x_max_coverage = tf.reduce_max(x_max, axis=0)
return tf.stack(
[y_min_coverage, x_min_coverage, y_max_coverage, x_max_coverage],
axis=1)
default_box = default_box or tf.constant([[0., 0., 1., 1.]])
return tf.cond(
tf.greater_equal(num_boxes, 1),
true_fn=lambda: coverage_box(boxlist.get()),
false_fn=lambda: default_box)
def sample_boxes_by_jittering(boxlist,
num_boxes_to_sample,
stddev=0.1,
scope=None):
"""Samples num_boxes_to_sample boxes by jittering around boxlist boxes.
It is possible that this function might generate boxes with size 0. The larger
the stddev, this is more probable. For a small stddev of 0.1 this probability
is very small.
Args:
boxlist: A boxlist containing N boxes in normalized coordinates.
num_boxes_to_sample: A positive integer containing the number of boxes to
sample.
stddev: Standard deviation. This is used to draw random offsets for the
box corners from a normal distribution. The offset is multiplied by the
box size so will be larger in terms of pixels for larger boxes.
scope: Name scope.
Returns:
sampled_boxlist: A boxlist containing num_boxes_to_sample boxes in
normalized coordinates.
"""
with tf.name_scope(scope, 'SampleBoxesByJittering'):
num_boxes = boxlist.num_boxes()
box_indices = tf.random_uniform(
[num_boxes_to_sample],
minval=0,
maxval=num_boxes,
dtype=tf.int32)
sampled_boxes = tf.gather(boxlist.get(), box_indices)
sampled_boxes_height = sampled_boxes[:, 2] - sampled_boxes[:, 0]
sampled_boxes_width = sampled_boxes[:, 3] - sampled_boxes[:, 1]
rand_miny_gaussian = tf.random_normal([num_boxes_to_sample], stddev=stddev)
rand_minx_gaussian = tf.random_normal([num_boxes_to_sample], stddev=stddev)
rand_maxy_gaussian = tf.random_normal([num_boxes_to_sample], stddev=stddev)
rand_maxx_gaussian = tf.random_normal([num_boxes_to_sample], stddev=stddev)
miny = rand_miny_gaussian * sampled_boxes_height + sampled_boxes[:, 0]
minx = rand_minx_gaussian * sampled_boxes_width + sampled_boxes[:, 1]
maxy = rand_maxy_gaussian * sampled_boxes_height + sampled_boxes[:, 2]
maxx = rand_maxx_gaussian * sampled_boxes_width + sampled_boxes[:, 3]
maxy = tf.maximum(miny, maxy)
maxx = tf.maximum(minx, maxx)
sampled_boxes = tf.stack([miny, minx, maxy, maxx], axis=1)
sampled_boxes = tf.maximum(tf.minimum(sampled_boxes, 1.0), 0.0)
return box_list.BoxList(sampled_boxes)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Faster RCNN box coder.
Faster RCNN box coder follows the coding schema described below:
ty = (y - ya) / ha
tx = (x - xa) / wa
th = log(h / ha)
tw = log(w / wa)
where x, y, w, h denote the box's center coordinates, width and height
respectively. Similarly, xa, ya, wa, ha denote the anchor's center
coordinates, width and height. tx, ty, tw and th denote the anchor-encoded
center, width and height respectively.
See http://arxiv.org/abs/1506.01497 for details.
"""
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import box_coder
from official.vision.detection.utils.object_detection import box_list
EPSILON = 1e-8
class FasterRcnnBoxCoder(box_coder.BoxCoder):
"""Faster RCNN box coder."""
def __init__(self, scale_factors=None):
"""Constructor for FasterRcnnBoxCoder.
Args:
scale_factors: List of 4 positive scalars to scale ty, tx, th and tw.
If set to None, does not perform scaling. For Faster RCNN,
the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0].
"""
if scale_factors:
assert len(scale_factors) == 4
for scalar in scale_factors:
assert scalar > 0
self._scale_factors = scale_factors
@property
def code_size(self):
return 4
def _encode(self, boxes, anchors):
"""Encode a box collection with respect to anchor collection.
Args:
boxes: BoxList holding N boxes to be encoded.
anchors: BoxList of anchors.
Returns:
a tensor representing N anchor-encoded boxes of the format
[ty, tx, th, tw].
"""
# Convert anchors to the center coordinate representation.
ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes()
# Avoid NaN in division and log below.
ha += EPSILON
wa += EPSILON
h += EPSILON
w += EPSILON
tx = (xcenter - xcenter_a) / wa
ty = (ycenter - ycenter_a) / ha
tw = tf.math.log(w / wa)
th = tf.math.log(h / ha)
# Scales location targets as used in paper for joint training.
if self._scale_factors:
ty *= self._scale_factors[0]
tx *= self._scale_factors[1]
th *= self._scale_factors[2]
tw *= self._scale_factors[3]
return tf.transpose(a=tf.stack([ty, tx, th, tw]))
def _decode(self, rel_codes, anchors):
"""Decode relative codes to boxes.
Args:
rel_codes: a tensor representing N anchor-encoded boxes.
anchors: BoxList of anchors.
Returns:
boxes: BoxList holding N bounding boxes.
"""
ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
ty, tx, th, tw = tf.unstack(tf.transpose(a=rel_codes))
if self._scale_factors:
ty /= self._scale_factors[0]
tx /= self._scale_factors[1]
th /= self._scale_factors[2]
tw /= self._scale_factors[3]
w = tf.exp(tw) * wa
h = tf.exp(th) * ha
ycenter = ty * ha + ycenter_a
xcenter = tx * wa + xcenter_a
ymin = ycenter - h / 2.
xmin = xcenter - w / 2.
ymax = ycenter + h / 2.
xmax = xcenter + w / 2.
return box_list.BoxList(tf.transpose(a=tf.stack([ymin, xmin, ymax, xmax])))
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Matcher interface and Match class.
This module defines the Matcher interface and the Match object. The job of the
matcher is to match row and column indices based on the similarity matrix and
other optional parameters. Each column is matched to at most one row. There
are three possibilities for the matching:
1) match: A column matches a row.
2) no_match: A column does not match any row.
3) ignore: A column that is neither 'match' nor no_match.
The ignore case is regularly encountered in object detection: when an anchor has
a relatively small overlap with a ground-truth box, one neither wants to
consider this box a positive example (match) nor a negative example (no match).
The Match class is used to store the match results and it provides simple apis
to query the results.
"""
from abc import ABCMeta
from abc import abstractmethod
import tensorflow.compat.v2 as tf
class Match(object):
"""Class to store results from the matcher.
This class is used to store the results from the matcher. It provides
convenient methods to query the matching results.
"""
def __init__(self, match_results):
"""Constructs a Match object.
Args:
match_results: Integer tensor of shape [N] with (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Raises:
ValueError: if match_results does not have rank 1 or is not an
integer int32 scalar tensor
"""
if match_results.shape.ndims != 1:
raise ValueError('match_results should have rank 1')
if match_results.dtype != tf.int32:
raise ValueError('match_results should be an int32 or int64 scalar '
'tensor')
self._match_results = match_results
@property
def match_results(self):
"""The accessor for match results.
Returns:
the tensor which encodes the match results.
"""
return self._match_results
def matched_column_indices(self):
"""Returns column indices that match to some row.
The indices returned by this op are always sorted in increasing order.
Returns:
column_indices: int32 tensor of shape [K] with column indices.
"""
return self._reshape_and_cast(tf.where(tf.greater(self._match_results, -1)))
def matched_column_indicator(self):
"""Returns column indices that are matched.
Returns:
column_indices: int32 tensor of shape [K] with column indices.
"""
return tf.greater_equal(self._match_results, 0)
def num_matched_columns(self):
"""Returns number (int32 scalar tensor) of matched columns."""
return tf.size(input=self.matched_column_indices())
def unmatched_column_indices(self):
"""Returns column indices that do not match any row.
The indices returned by this op are always sorted in increasing order.
Returns:
column_indices: int32 tensor of shape [K] with column indices.
"""
return self._reshape_and_cast(tf.where(tf.equal(self._match_results, -1)))
def unmatched_column_indicator(self):
"""Returns column indices that are unmatched.
Returns:
column_indices: int32 tensor of shape [K] with column indices.
"""
return tf.equal(self._match_results, -1)
def num_unmatched_columns(self):
"""Returns number (int32 scalar tensor) of unmatched columns."""
return tf.size(input=self.unmatched_column_indices())
def ignored_column_indices(self):
"""Returns column indices that are ignored (neither Matched nor Unmatched).
The indices returned by this op are always sorted in increasing order.
Returns:
column_indices: int32 tensor of shape [K] with column indices.
"""
return self._reshape_and_cast(tf.where(self.ignored_column_indicator()))
def ignored_column_indicator(self):
"""Returns boolean column indicator where True means the colum is ignored.
Returns:
column_indicator: boolean vector which is True for all ignored column
indices.
"""
return tf.equal(self._match_results, -2)
def num_ignored_columns(self):
"""Returns number (int32 scalar tensor) of matched columns."""
return tf.size(input=self.ignored_column_indices())
def unmatched_or_ignored_column_indices(self):
"""Returns column indices that are unmatched or ignored.
The indices returned by this op are always sorted in increasing order.
Returns:
column_indices: int32 tensor of shape [K] with column indices.
"""
return self._reshape_and_cast(tf.where(tf.greater(0, self._match_results)))
def matched_row_indices(self):
"""Returns row indices that match some column.
The indices returned by this op are ordered so as to be in correspondence
with the output of matched_column_indicator(). For example if
self.matched_column_indicator() is [0,2], and self.matched_row_indices() is
[7, 3], then we know that column 0 was matched to row 7 and column 2 was
matched to row 3.
Returns:
row_indices: int32 tensor of shape [K] with row indices.
"""
return self._reshape_and_cast(
tf.gather(self._match_results, self.matched_column_indices()))
def _reshape_and_cast(self, t):
return tf.cast(tf.reshape(t, [-1]), tf.int32)
def gather_based_on_match(self, input_tensor, unmatched_value,
ignored_value):
"""Gathers elements from `input_tensor` based on match results.
For columns that are matched to a row, gathered_tensor[col] is set to
input_tensor[match_results[col]]. For columns that are unmatched,
gathered_tensor[col] is set to unmatched_value. Finally, for columns that
are ignored gathered_tensor[col] is set to ignored_value.
Note that the input_tensor.shape[1:] must match with unmatched_value.shape
and ignored_value.shape
Args:
input_tensor: Tensor to gather values from.
unmatched_value: Constant tensor value for unmatched columns.
ignored_value: Constant tensor value for ignored columns.
Returns:
gathered_tensor: A tensor containing values gathered from input_tensor.
The shape of the gathered tensor is [match_results.shape[0]] +
input_tensor.shape[1:].
"""
input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]),
input_tensor], axis=0)
gather_indices = tf.maximum(self.match_results + 2, 0)
gathered_tensor = tf.gather(input_tensor, gather_indices)
return gathered_tensor
class Matcher(object):
"""Abstract base class for matcher.
"""
__metaclass__ = ABCMeta
def match(self, similarity_matrix, scope=None, **params):
"""Computes matches among row and column indices and returns the result.
Computes matches among the row and column indices based on the similarity
matrix and optional arguments.
Args:
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
where higher value means more similar.
scope: Op scope name. Defaults to 'Match' if None.
**params: Additional keyword arguments for specific implementations of
the Matcher.
Returns:
A Match object with the results of matching.
"""
if not scope:
scope = 'Match'
with tf.name_scope(scope) as scope:
return Match(self._match(similarity_matrix, **params))
@abstractmethod
def _match(self, similarity_matrix, **params):
"""Method to be overridden by implementations.
Args:
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
where higher value means more similar.
**params: Additional keyword arguments for specific implementations of
the Matcher.
Returns:
match_results: Integer tensor of shape [M]: match_results[i]>=0 means
that column i is matched to row match_results[i], match_results[i]=-1
means that the column is not matched. match_results[i]=-2 means that
the column is ignored (usually this happens when there is a very weak
match which one neither wants as positive nor negative example).
"""
pass
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base minibatch sampler module.
The job of the minibatch_sampler is to subsample a minibatch based on some
criterion.
The main function call is:
subsample(indicator, batch_size, **params).
Indicator is a 1d boolean tensor where True denotes which examples can be
sampled. It returns a boolean indicator where True denotes an example has been
sampled..
Subclasses should implement the Subsample function and can make use of the
@staticmethod SubsampleIndicator.
This is originally implemented in TensorFlow Object Detection API.
"""
from abc import ABCMeta
from abc import abstractmethod
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import ops
class MinibatchSampler(object):
"""Abstract base class for subsampling minibatches."""
__metaclass__ = ABCMeta
def __init__(self):
"""Constructs a minibatch sampler."""
pass
@abstractmethod
def subsample(self, indicator, batch_size, **params):
"""Returns subsample of entries in indicator.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
batch_size: desired batch size.
**params: additional keyword arguments for specific implementations of
the MinibatchSampler.
Returns:
sample_indicator: boolean tensor of shape [N] whose True entries have been
sampled. If sum(indicator) >= batch_size, sum(is_sampled) = batch_size
"""
pass
@staticmethod
def subsample_indicator(indicator, num_samples):
"""Subsample indicator vector.
Given a boolean indicator vector with M elements set to `True`, the function
assigns all but `num_samples` of these previously `True` elements to
`False`. If `num_samples` is greater than M, the original indicator vector
is returned.
Args:
indicator: a 1-dimensional boolean tensor indicating which elements
are allowed to be sampled and which are not.
num_samples: int32 scalar tensor
Returns:
a boolean tensor with the same shape as input (indicator) tensor
"""
indices = tf.where(indicator)
indices = tf.random.shuffle(indices)
indices = tf.reshape(indices, [-1])
num_samples = tf.minimum(tf.size(input=indices), num_samples)
selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))
selected_indicator = ops.indices_to_dense_vector(
selected_indices,
tf.shape(input=indicator)[0])
return tf.equal(selected_indicator, 1)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A module for helper tensorflow ops.
This is originally implemented in TensorFlow Object Detection API.
"""
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import shape_utils
def indices_to_dense_vector(indices,
size,
indices_value=1.,
default_value=0,
dtype=tf.float32):
"""Creates dense vector with indices set to specific value and rest to zeros.
This function exists because it is unclear if it is safe to use
tf.sparse_to_dense(indices, [size], 1, validate_indices=False)
with indices which are not ordered.
This function accepts a dynamic size (e.g. tf.shape(tensor)[0])
Args:
indices: 1d Tensor with integer indices which are to be set to
indices_values.
size: scalar with size (integer) of output Tensor.
indices_value: values of elements specified by indices in the output vector
default_value: values of other elements in the output vector.
dtype: data type.
Returns:
dense 1D Tensor of shape [size] with indices set to indices_values and the
rest set to default_value.
"""
size = tf.cast(size, dtype=tf.int32)
zeros = tf.ones([size], dtype=dtype) * default_value
values = tf.ones_like(indices, dtype=dtype) * indices_value
return tf.dynamic_stitch(
[tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values])
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
"""Matrix multiplication based implementation of tf.gather on zeroth axis.
TODO(rathodv, jonathanhuang): enable sparse matmul option.
Args:
params: A float32 Tensor. The tensor from which to gather values.
Must be at least rank 1.
indices: A Tensor. Must be one of the following types: int32, int64.
Must be in range [0, params.shape[0])
scope: A name for the operation (optional).
Returns:
A Tensor. Has the same type as params. Values from params gathered
from indices given by indices, with shape indices.shape + params.shape[1:].
"""
with tf.name_scope(scope, 'MatMulGather'):
params_shape = shape_utils.combined_static_and_dynamic_shape(params)
indices_shape = shape_utils.combined_static_and_dynamic_shape(indices)
params2d = tf.reshape(params, [params_shape[0], -1])
indicator_matrix = tf.one_hot(indices, params_shape[0])
gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
return tf.reshape(gathered_result_flattened,
tf.stack(indices_shape + params_shape[1:]))
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preprocess images and bounding boxes for detection.
We perform two sets of operations in preprocessing stage:
(a) operations that are applied to both training and testing data,
(b) operations that are applied only to training data for the purpose of
data augmentation.
A preprocessing function receives a set of inputs,
e.g. an image and bounding boxes,
performs an operation on them, and returns them.
Some examples are: randomly cropping the image, randomly mirroring the image,
randomly changing the brightness, contrast, hue and
randomly jittering the bounding boxes.
The image is a rank 4 tensor: [1, height, width, channels] with
dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where
in each row there is a box with [ymin xmin ymax xmax].
Boxes are in normalized coordinates meaning
their coordinate values range in [0, 1]
Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing
functions receive a rank 3 tensor for processing the image. Thus, inside the
preprocess function we squeeze the image to become a rank 3 tensor and then
we pass it to the functions. At the end of the preprocess we expand the image
back to rank 4.
"""
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import box_list
def _flip_boxes_left_right(boxes):
"""Left-right flip the boxes.
Args:
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
Boxes are in normalized form meaning their coordinates vary
between [0, 1].
Each row is in the form of [ymin, xmin, ymax, xmax].
Returns:
Flipped boxes.
"""
ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1)
flipped_xmin = tf.subtract(1.0, xmax)
flipped_xmax = tf.subtract(1.0, xmin)
flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
return flipped_boxes
def _flip_masks_left_right(masks):
"""Left-right flip masks.
Args:
masks: rank 3 float32 tensor with shape
[num_instances, height, width] representing instance masks.
Returns:
flipped masks: rank 3 float32 tensor with shape
[num_instances, height, width] representing instance masks.
"""
return masks[:, :, ::-1]
def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation,
scope=None):
"""Flips the keypoints horizontally around the flip_point.
This operation flips the x coordinate for each keypoint around the flip_point
and also permutes the keypoints in a manner specified by flip_permutation.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
flip_point: (float) scalar tensor representing the x coordinate to flip the
keypoints around.
flip_permutation: rank 1 int32 tensor containing the keypoint flip
permutation. This specifies the mapping from original keypoint indices
to the flipped keypoint indices. This is used primarily for keypoints
that are not reflection invariant. E.g. Suppose there are 3 keypoints
representing ['head', 'right_eye', 'left_eye'], then a logical choice for
flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye'
and 'right_eye' after a horizontal flip.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'FlipHorizontal'):
keypoints = tf.transpose(a=keypoints, perm=[1, 0, 2])
keypoints = tf.gather(keypoints, flip_permutation)
v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
u = flip_point * 2.0 - u
new_keypoints = tf.concat([v, u], 2)
new_keypoints = tf.transpose(a=new_keypoints, perm=[1, 0, 2])
return new_keypoints
def random_horizontal_flip(image,
boxes=None,
masks=None,
keypoints=None,
keypoint_flip_permutation=None,
seed=None):
"""Randomly flips the image and detections horizontally.
The probability of flipping the image is 50%.
Args:
image: rank 3 float32 tensor with shape [height, width, channels].
boxes: (optional) rank 2 float32 tensor with shape [N, 4]
containing the bounding boxes.
Boxes are in normalized form meaning their coordinates vary
between [0, 1].
Each row is in the form of [ymin, xmin, ymax, xmax].
masks: (optional) rank 3 float32 tensor with shape
[num_instances, height, width] containing instance masks. The masks
are of the same height, width as the input `image`.
keypoints: (optional) rank 3 float32 tensor with shape
[num_instances, num_keypoints, 2]. The keypoints are in y-x
normalized coordinates.
keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip
permutation.
seed: random seed
Returns:
image: image which is the same shape as input image.
If boxes, masks, keypoints, and keypoint_flip_permutation are not None,
the function also returns the following tensors.
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
Boxes are in normalized form meaning their coordinates vary
between [0, 1].
masks: rank 3 float32 tensor with shape [num_instances, height, width]
containing instance masks.
keypoints: rank 3 float32 tensor with shape
[num_instances, num_keypoints, 2]
Raises:
ValueError: if keypoints are provided but keypoint_flip_permutation is not.
"""
def _flip_image(image):
# flip image
image_flipped = tf.image.flip_left_right(image)
return image_flipped
if keypoints is not None and keypoint_flip_permutation is None:
raise ValueError(
'keypoints are provided but keypoints_flip_permutation is not provided')
with tf.name_scope('RandomHorizontalFlip'):
result = []
# random variable defining whether to do flip or not
do_a_flip_random = tf.greater(tf.random.uniform([], seed=seed), 0.5)
# flip image
image = tf.cond(
pred=do_a_flip_random,
true_fn=lambda: _flip_image(image),
false_fn=lambda: image)
result.append(image)
# flip boxes
if boxes is not None:
boxes = tf.cond(
pred=do_a_flip_random,
true_fn=lambda: _flip_boxes_left_right(boxes),
false_fn=lambda: boxes)
result.append(boxes)
# flip masks
if masks is not None:
masks = tf.cond(
pred=do_a_flip_random,
true_fn=lambda: _flip_masks_left_right(masks),
false_fn=lambda: masks)
result.append(masks)
# flip keypoints
if keypoints is not None and keypoint_flip_permutation is not None:
permutation = keypoint_flip_permutation
keypoints = tf.cond(
pred=do_a_flip_random,
true_fn=lambda: keypoint_flip_horizontal(keypoints, 0.5, permutation),
false_fn=lambda: keypoints)
result.append(keypoints)
return tuple(result)
def _compute_new_static_size(image, min_dimension, max_dimension):
"""Compute new static shape for resize_to_range method."""
image_shape = image.get_shape().as_list()
orig_height = image_shape[0]
orig_width = image_shape[1]
num_channels = image_shape[2]
orig_min_dim = min(orig_height, orig_width)
# Calculates the larger of the possible sizes
large_scale_factor = min_dimension / float(orig_min_dim)
# Scaling orig_(height|width) by large_scale_factor will make the smaller
# dimension equal to min_dimension, save for floating point rounding errors.
# For reasonably-sized images, taking the nearest integer will reliably
# eliminate this error.
large_height = int(round(orig_height * large_scale_factor))
large_width = int(round(orig_width * large_scale_factor))
large_size = [large_height, large_width]
if max_dimension:
# Calculates the smaller of the possible sizes, use that if the larger
# is too big.
orig_max_dim = max(orig_height, orig_width)
small_scale_factor = max_dimension / float(orig_max_dim)
# Scaling orig_(height|width) by small_scale_factor will make the larger
# dimension equal to max_dimension, save for floating point rounding
# errors. For reasonably-sized images, taking the nearest integer will
# reliably eliminate this error.
small_height = int(round(orig_height * small_scale_factor))
small_width = int(round(orig_width * small_scale_factor))
small_size = [small_height, small_width]
new_size = large_size
if max(large_size) > max_dimension:
new_size = small_size
else:
new_size = large_size
return tf.constant(new_size + [num_channels])
def _compute_new_dynamic_size(image, min_dimension, max_dimension):
"""Compute new dynamic shape for resize_to_range method."""
image_shape = tf.shape(input=image)
orig_height = tf.cast(image_shape[0], dtype=tf.float32)
orig_width = tf.cast(image_shape[1], dtype=tf.float32)
num_channels = image_shape[2]
orig_min_dim = tf.minimum(orig_height, orig_width)
# Calculates the larger of the possible sizes
min_dimension = tf.constant(min_dimension, dtype=tf.float32)
large_scale_factor = min_dimension / orig_min_dim
# Scaling orig_(height|width) by large_scale_factor will make the smaller
# dimension equal to min_dimension, save for floating point rounding errors.
# For reasonably-sized images, taking the nearest integer will reliably
# eliminate this error.
large_height = tf.cast(
tf.round(orig_height * large_scale_factor), dtype=tf.int32)
large_width = tf.cast(
tf.round(orig_width * large_scale_factor), dtype=tf.int32)
large_size = tf.stack([large_height, large_width])
if max_dimension:
# Calculates the smaller of the possible sizes, use that if the larger
# is too big.
orig_max_dim = tf.maximum(orig_height, orig_width)
max_dimension = tf.constant(max_dimension, dtype=tf.float32)
small_scale_factor = max_dimension / orig_max_dim
# Scaling orig_(height|width) by small_scale_factor will make the larger
# dimension equal to max_dimension, save for floating point rounding
# errors. For reasonably-sized images, taking the nearest integer will
# reliably eliminate this error.
small_height = tf.cast(
tf.round(orig_height * small_scale_factor), dtype=tf.int32)
small_width = tf.cast(
tf.round(orig_width * small_scale_factor), dtype=tf.int32)
small_size = tf.stack([small_height, small_width])
new_size = tf.cond(
pred=tf.cast(tf.reduce_max(input_tensor=large_size), dtype=tf.float32) >
max_dimension,
true_fn=lambda: small_size,
false_fn=lambda: large_size)
else:
new_size = large_size
return tf.stack(tf.unstack(new_size) + [num_channels])
def resize_to_range(image,
masks=None,
min_dimension=None,
max_dimension=None,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False,
pad_to_max_dimension=False):
"""Resizes an image so its dimensions are within the provided value.
The output size can be described by two cases:
1. If the image can be rescaled so its minimum dimension is equal to the
provided value without the other dimension exceeding max_dimension,
then do so.
2. Otherwise, resize so the largest dimension is equal to max_dimension.
Args:
image: A 3D tensor of shape [height, width, channels]
masks: (optional) rank 3 float32 tensor with shape
[num_instances, height, width] containing instance masks.
min_dimension: (optional) (scalar) desired size of the smaller image
dimension.
max_dimension: (optional) (scalar) maximum allowed size
of the larger image dimension.
method: (optional) interpolation method used in resizing. Defaults to
BILINEAR.
align_corners: bool. If true, exactly align all 4 corners of the input
and output. Defaults to False.
pad_to_max_dimension: Whether to resize the image and pad it with zeros
so the resulting image is of the spatial size
[max_dimension, max_dimension]. If masks are included they are padded
similarly.
Returns:
Note that the position of the resized_image_shape changes based on whether
masks are present.
resized_image: A 3D tensor of shape [new_height, new_width, channels],
where the image has been resized (with bilinear interpolation) so that
min(new_height, new_width) == min_dimension or
max(new_height, new_width) == max_dimension.
resized_masks: If masks is not None, also outputs masks. A 3D tensor of
shape [num_instances, new_height, new_width].
resized_image_shape: A 1D tensor of shape [3] containing shape of the
resized image.
Raises:
ValueError: if the image is not a 3D tensor.
"""
if len(image.get_shape()) != 3:
raise ValueError('Image should be 3D tensor')
with tf.name_scope('ResizeToRange', values=[image, min_dimension]):
if image.get_shape().is_fully_defined():
new_size = _compute_new_static_size(image, min_dimension, max_dimension)
else:
new_size = _compute_new_dynamic_size(image, min_dimension, max_dimension)
new_image = tf.image.resize(image, new_size[:-1], method=method)
if pad_to_max_dimension:
new_image = tf.image.pad_to_bounding_box(
new_image, 0, 0, max_dimension, max_dimension)
result = [new_image]
if masks is not None:
new_masks = tf.expand_dims(masks, 3)
new_masks = tf.image.resize(
new_masks,
new_size[:-1],
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
new_masks = tf.squeeze(new_masks, 3)
if pad_to_max_dimension:
new_masks = tf.image.pad_to_bounding_box(
new_masks, 0, 0, max_dimension, max_dimension)
result.append(new_masks)
result.append(new_size)
return result
def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from):
"""Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to.
Args:
boxlist_to_copy_to: BoxList to which extra fields are copied.
boxlist_to_copy_from: BoxList from which fields are copied.
Returns:
boxlist_to_copy_to with extra fields.
"""
for field in boxlist_to_copy_from.get_extra_fields():
boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field))
return boxlist_to_copy_to
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
"""scale box coordinates in x and y dimensions.
Args:
boxlist: BoxList holding N boxes
y_scale: (float) scalar tensor
x_scale: (float) scalar tensor
scope: name scope.
Returns:
boxlist: BoxList holding N boxes
"""
with tf.name_scope(scope, 'Scale'):
y_scale = tf.cast(y_scale, tf.float32)
x_scale = tf.cast(x_scale, tf.float32)
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
y_min = y_scale * y_min
y_max = y_scale * y_max
x_min = x_scale * x_min
x_max = x_scale * x_max
scaled_boxlist = box_list.BoxList(
tf.concat([y_min, x_min, y_max, x_max], 1))
return _copy_extra_fields(scaled_boxlist, boxlist)
def keypoint_scale(keypoints, y_scale, x_scale, scope=None):
"""Scales keypoint coordinates in x and y dimensions.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
y_scale: (float) scalar tensor
x_scale: (float) scalar tensor
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'Scale'):
y_scale = tf.cast(y_scale, tf.float32)
x_scale = tf.cast(x_scale, tf.float32)
new_keypoints = keypoints * [[[y_scale, x_scale]]]
return new_keypoints
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
"""Scales boxes from normalized to pixel coordinates.
Args:
image: A 3D float32 tensor of shape [height, width, channels].
boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
boxes in normalized coordinates. Each row is of the form
[ymin, xmin, ymax, xmax].
keypoints: (optional) rank 3 float32 tensor with shape
[num_instances, num_keypoints, 2]. The keypoints are in y-x normalized
coordinates.
Returns:
image: unchanged input image.
scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the
bounding boxes in pixel coordinates.
scaled_keypoints: a 3D float32 tensor with shape
[num_instances, num_keypoints, 2] containing the keypoints in pixel
coordinates.
"""
boxlist = box_list.BoxList(boxes)
image_height = tf.shape(input=image)[0]
image_width = tf.shape(input=image)[1]
scaled_boxes = box_list_scale(boxlist, image_height, image_width).get()
result = [image, scaled_boxes]
if keypoints is not None:
scaled_keypoints = keypoint_scale(keypoints, image_height, image_width)
result.append(scaled_keypoints)
return tuple(result)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Region Similarity Calculators for BoxLists.
Region Similarity Calculators compare a pairwise measure of similarity
between the boxes in two BoxLists.
"""
from abc import ABCMeta
from abc import abstractmethod
import tensorflow.compat.v2 as tf
def area(boxlist, scope=None):
"""Computes area of boxes.
Args:
boxlist: BoxList holding N boxes
scope: name scope.
Returns:
a tensor with shape [N] representing box areas.
"""
if not scope:
scope = 'Area'
with tf.name_scope(scope):
y_min, x_min, y_max, x_max = tf.split(
value=boxlist.get(), num_or_size_splits=4, axis=1)
return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
def intersection(boxlist1, boxlist2, scope=None):
"""Compute pairwise intersection areas between boxes.
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding M boxes
scope: name scope.
Returns:
a tensor with shape [N, M] representing pairwise intersections
"""
if not scope:
scope = 'Intersection'
with tf.name_scope(scope):
y_min1, x_min1, y_max1, x_max1 = tf.split(
value=boxlist1.get(), num_or_size_splits=4, axis=1)
y_min2, x_min2, y_max2, x_max2 = tf.split(
value=boxlist2.get(), num_or_size_splits=4, axis=1)
all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(a=y_max2))
all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(a=y_min2))
intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(a=x_max2))
all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(a=x_min2))
intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
def iou(boxlist1, boxlist2, scope=None):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxlist1: BoxList holding N boxes
boxlist2: BoxList holding M boxes
scope: name scope.
Returns:
a tensor with shape [N, M] representing pairwise iou scores.
"""
if not scope:
scope = 'IOU'
with tf.name_scope(scope):
intersections = intersection(boxlist1, boxlist2)
areas1 = area(boxlist1)
areas2 = area(boxlist2)
unions = (
tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
return tf.where(
tf.equal(intersections, 0.0), tf.zeros_like(intersections),
tf.truediv(intersections, unions))
class RegionSimilarityCalculator(object):
"""Abstract base class for region similarity calculator."""
__metaclass__ = ABCMeta
def compare(self, boxlist1, boxlist2, scope=None):
"""Computes matrix of pairwise similarity between BoxLists.
This op (to be overriden) computes a measure of pairwise similarity between
the boxes in the given BoxLists. Higher values indicate more similarity.
Note that this method simply measures similarity and does not explicitly
perform a matching.
Args:
boxlist1: BoxList holding N boxes.
boxlist2: BoxList holding M boxes.
scope: Op scope name. Defaults to 'Compare' if None.
Returns:
a (float32) tensor of shape [N, M] with pairwise similarity score.
"""
if not scope:
scope = 'Compare'
with tf.name_scope(scope) as scope:
return self._compare(boxlist1, boxlist2)
@abstractmethod
def _compare(self, boxlist1, boxlist2):
pass
class IouSimilarity(RegionSimilarityCalculator):
"""Class to compute similarity based on Intersection over Union (IOU) metric.
This class computes pairwise similarity between two BoxLists based on IOU.
"""
def _compare(self, boxlist1, boxlist2):
"""Compute pairwise IOU similarity between the two BoxLists.
Args:
boxlist1: BoxList holding N boxes.
boxlist2: BoxList holding M boxes.
Returns:
A tensor with shape [N, M] representing pairwise iou scores.
"""
return iou(boxlist1, boxlist2)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils used to manipulate tensor shapes."""
import tensorflow.compat.v2 as tf
def assert_shape_equal(shape_a, shape_b):
"""Asserts that shape_a and shape_b are equal.
If the shapes are static, raises a ValueError when the shapes
mismatch.
If the shapes are dynamic, raises a tf InvalidArgumentError when the shapes
mismatch.
Args:
shape_a: a list containing shape of the first tensor.
shape_b: a list containing shape of the second tensor.
Returns:
Either a tf.no_op() when shapes are all static and a tf.assert_equal() op
when the shapes are dynamic.
Raises:
ValueError: When shapes are both static and unequal.
"""
if (all(isinstance(dim, int) for dim in shape_a) and
all(isinstance(dim, int) for dim in shape_b)):
if shape_a != shape_b:
raise ValueError('Unequal shapes {}, {}'.format(shape_a, shape_b))
else: return tf.no_op()
else:
return tf.assert_equal(shape_a, shape_b)
def combined_static_and_dynamic_shape(tensor):
"""Returns a list containing static and dynamic values for the dimensions.
Returns a list of static and dynamic values for shape dimensions. This is
useful to preserve static shapes when available in reshape operation.
Args:
tensor: A tensor of any type.
Returns:
A list of size tensor.shape.ndims containing integers or a scalar tensor.
"""
static_tensor_shape = tensor.shape.as_list()
dynamic_tensor_shape = tf.shape(input=tensor)
combined_shape = []
for index, dim in enumerate(static_tensor_shape):
if dim is not None:
combined_shape.append(dim)
else:
combined_shape.append(dynamic_tensor_shape[index])
return combined_shape
def pad_or_clip_nd(tensor, output_shape):
"""Pad or Clip given tensor to the output shape.
Args:
tensor: Input tensor to pad or clip.
output_shape: A list of integers / scalar tensors (or None for dynamic dim)
representing the size to pad or clip each dimension of the input tensor.
Returns:
Input tensor padded and clipped to the output shape.
"""
tensor_shape = tf.shape(input=tensor)
clip_size = [
tf.where(tensor_shape[i] - shape > 0, shape, -1)
if shape is not None else -1 for i, shape in enumerate(output_shape)
]
clipped_tensor = tf.slice(
tensor,
begin=tf.zeros(len(clip_size), dtype=tf.int32),
size=clip_size)
# Pad tensor if the shape of clipped tensor is smaller than the expected
# shape.
clipped_tensor_shape = tf.shape(input=clipped_tensor)
trailing_paddings = [
shape - clipped_tensor_shape[i] if shape is not None else 0
for i, shape in enumerate(output_shape)
]
paddings = tf.stack(
[
tf.zeros(len(trailing_paddings), dtype=tf.int32),
trailing_paddings
],
axis=1)
padded_tensor = tf.pad(tensor=clipped_tensor, paddings=paddings)
output_static_shape = [
dim if not isinstance(dim, tf.Tensor) else None for dim in output_shape
]
padded_tensor.set_shape(output_static_shape)
return padded_tensor
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base target assigner module.
The job of a TargetAssigner is, for a given set of anchors (bounding boxes) and
groundtruth detections (bounding boxes), to assign classification and regression
targets to each anchor as well as weights to each anchor (specifying, e.g.,
which anchors should not contribute to training loss).
It assigns classification/regression targets by performing the following steps:
1) Computing pairwise similarity between anchors and groundtruth boxes using a
provided RegionSimilarity Calculator
2) Computing a matching based on the similarity matrix using a provided Matcher
3) Assigning regression targets based on the matching and a provided BoxCoder
4) Assigning classification targets based on the matching and groundtruth labels
Note that TargetAssigners only operate on detections from a single
image at a time, so any logic for applying a TargetAssigner to multiple
images must be handled externally.
"""
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import box_list
from official.vision.detection.utils.object_detection import shape_utils
KEYPOINTS_FIELD_NAME = 'keypoints'
class TargetAssigner(object):
"""Target assigner to compute classification and regression targets."""
def __init__(self, similarity_calc, matcher, box_coder,
negative_class_weight=1.0, unmatched_cls_target=None):
"""Construct Object Detection Target Assigner.
Args:
similarity_calc: a RegionSimilarityCalculator
matcher: Matcher used to match groundtruth to anchors.
box_coder: BoxCoder used to encode matching groundtruth boxes with
respect to anchors.
negative_class_weight: classification weight to be associated to negative
anchors (default: 1.0). The weight must be in [0., 1.].
unmatched_cls_target: a float32 tensor with shape [d_1, d_2, ..., d_k]
which is consistent with the classification target for each
anchor (and can be empty for scalar targets). This shape must thus be
compatible with the groundtruth labels that are passed to the "assign"
function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]).
If set to None, unmatched_cls_target is set to be [0] for each anchor.
Raises:
ValueError: if similarity_calc is not a RegionSimilarityCalculator or
if matcher is not a Matcher or if box_coder is not a BoxCoder
"""
self._similarity_calc = similarity_calc
self._matcher = matcher
self._box_coder = box_coder
self._negative_class_weight = negative_class_weight
if unmatched_cls_target is None:
self._unmatched_cls_target = tf.constant([0], tf.float32)
else:
self._unmatched_cls_target = unmatched_cls_target
@property
def box_coder(self):
return self._box_coder
def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None,
groundtruth_weights=None, **params):
"""Assign classification and regression targets to each anchor.
For a given set of anchors and groundtruth detections, match anchors
to groundtruth_boxes and assign classification and regression targets to
each anchor as well as weights based on the resulting match (specifying,
e.g., which anchors should not contribute to training loss).
Anchors that are not matched to anything are given a classification target
of self._unmatched_cls_target which can be specified via the constructor.
Args:
anchors: a BoxList representing N anchors
groundtruth_boxes: a BoxList representing M groundtruth boxes
groundtruth_labels: a tensor of shape [M, d_1, ... d_k]
with labels for each of the ground_truth boxes. The subshape
[d_1, ... d_k] can be empty (corresponding to scalar inputs). When set
to None, groundtruth_labels assumes a binary problem where all
ground_truth boxes get a positive label (of 1).
groundtruth_weights: a float tensor of shape [M] indicating the weight to
assign to all anchors match to a particular groundtruth box. The weights
must be in [0., 1.]. If None, all weights are set to 1.
**params: Additional keyword arguments for specific implementations of
the Matcher.
Returns:
cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k],
where the subshape [d_1, ..., d_k] is compatible with groundtruth_labels
which has shape [num_gt_boxes, d_1, d_2, ... d_k].
cls_weights: a float32 tensor with shape [num_anchors]
reg_targets: a float32 tensor with shape [num_anchors, box_code_dimension]
reg_weights: a float32 tensor with shape [num_anchors]
match: a matcher.Match object encoding the match between anchors and
groundtruth boxes, with rows corresponding to groundtruth boxes
and columns corresponding to anchors.
Raises:
ValueError: if anchors or groundtruth_boxes are not of type
box_list.BoxList
"""
if not isinstance(anchors, box_list.BoxList):
raise ValueError('anchors must be an BoxList')
if not isinstance(groundtruth_boxes, box_list.BoxList):
raise ValueError('groundtruth_boxes must be an BoxList')
if groundtruth_labels is None:
groundtruth_labels = tf.ones(tf.expand_dims(groundtruth_boxes.num_boxes(),
0))
groundtruth_labels = tf.expand_dims(groundtruth_labels, -1)
unmatched_shape_assert = shape_utils.assert_shape_equal(
shape_utils.combined_static_and_dynamic_shape(groundtruth_labels)[1:],
shape_utils.combined_static_and_dynamic_shape(
self._unmatched_cls_target))
labels_and_box_shapes_assert = shape_utils.assert_shape_equal(
shape_utils.combined_static_and_dynamic_shape(
groundtruth_labels)[:1],
shape_utils.combined_static_and_dynamic_shape(
groundtruth_boxes.get())[:1])
if groundtruth_weights is None:
num_gt_boxes = groundtruth_boxes.num_boxes_static()
if not num_gt_boxes:
num_gt_boxes = groundtruth_boxes.num_boxes()
groundtruth_weights = tf.ones([num_gt_boxes], dtype=tf.float32)
with tf.control_dependencies(
[unmatched_shape_assert, labels_and_box_shapes_assert]):
match_quality_matrix = self._similarity_calc.compare(groundtruth_boxes,
anchors)
match = self._matcher.match(match_quality_matrix, **params)
reg_targets = self._create_regression_targets(anchors,
groundtruth_boxes,
match)
cls_targets = self._create_classification_targets(groundtruth_labels,
match)
reg_weights = self._create_regression_weights(match, groundtruth_weights)
cls_weights = self._create_classification_weights(match,
groundtruth_weights)
num_anchors = anchors.num_boxes_static()
if num_anchors is not None:
reg_targets = self._reset_target_shape(reg_targets, num_anchors)
cls_targets = self._reset_target_shape(cls_targets, num_anchors)
reg_weights = self._reset_target_shape(reg_weights, num_anchors)
cls_weights = self._reset_target_shape(cls_weights, num_anchors)
return cls_targets, cls_weights, reg_targets, reg_weights, match
def _reset_target_shape(self, target, num_anchors):
"""Sets the static shape of the target.
Args:
target: the target tensor. Its first dimension will be overwritten.
num_anchors: the number of anchors, which is used to override the target's
first dimension.
Returns:
A tensor with the shape info filled in.
"""
target_shape = target.get_shape().as_list()
target_shape[0] = num_anchors
target.set_shape(target_shape)
return target
def _create_regression_targets(self, anchors, groundtruth_boxes, match):
"""Returns a regression target for each anchor.
Args:
anchors: a BoxList representing N anchors
groundtruth_boxes: a BoxList representing M groundtruth_boxes
match: a matcher.Match object
Returns:
reg_targets: a float32 tensor with shape [N, box_code_dimension]
"""
matched_gt_boxes = match.gather_based_on_match(
groundtruth_boxes.get(),
unmatched_value=tf.zeros(4),
ignored_value=tf.zeros(4))
matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
groundtruth_keypoints = groundtruth_boxes.get_field(KEYPOINTS_FIELD_NAME)
matched_keypoints = match.gather_based_on_match(
groundtruth_keypoints,
unmatched_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]),
ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
matched_gt_boxlist.add_field(KEYPOINTS_FIELD_NAME, matched_keypoints)
matched_reg_targets = self._box_coder.encode(matched_gt_boxlist, anchors)
match_results_shape = shape_utils.combined_static_and_dynamic_shape(
match.match_results)
# Zero out the unmatched and ignored regression targets.
unmatched_ignored_reg_targets = tf.tile(
self._default_regression_target(), [match_results_shape[0], 1])
matched_anchors_mask = match.matched_column_indicator()
# To broadcast matched_anchors_mask to the same shape as
# matched_reg_targets.
matched_anchors_mask = tf.tile(
tf.expand_dims(matched_anchors_mask, 1),
[1, tf.shape(matched_reg_targets)[1]])
reg_targets = tf.where(matched_anchors_mask, matched_reg_targets,
unmatched_ignored_reg_targets)
return reg_targets
def _default_regression_target(self):
"""Returns the default target for anchors to regress to.
Default regression targets are set to zero (though in
this implementation what these targets are set to should
not matter as the regression weight of any box set to
regress to the default target is zero).
Returns:
default_target: a float32 tensor with shape [1, box_code_dimension]
"""
return tf.constant([self._box_coder.code_size*[0]], tf.float32)
def _create_classification_targets(self, groundtruth_labels, match):
"""Create classification targets for each anchor.
Assign a classification target of for each anchor to the matching
groundtruth label that is provided by match. Anchors that are not matched
to anything are given the target self._unmatched_cls_target
Args:
groundtruth_labels: a tensor of shape [num_gt_boxes, d_1, ... d_k]
with labels for each of the ground_truth boxes. The subshape
[d_1, ... d_k] can be empty (corresponding to scalar labels).
match: a matcher.Match object that provides a matching between anchors
and groundtruth boxes.
Returns:
a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], where the
subshape [d_1, ..., d_k] is compatible with groundtruth_labels which has
shape [num_gt_boxes, d_1, d_2, ... d_k].
"""
return match.gather_based_on_match(
groundtruth_labels,
unmatched_value=self._unmatched_cls_target,
ignored_value=self._unmatched_cls_target)
def _create_regression_weights(self, match, groundtruth_weights):
"""Set regression weight for each anchor.
Only positive anchors are set to contribute to the regression loss, so this
method returns a weight of 1 for every positive anchor and 0 for every
negative anchor.
Args:
match: a matcher.Match object that provides a matching between anchors
and groundtruth boxes.
groundtruth_weights: a float tensor of shape [M] indicating the weight to
assign to all anchors match to a particular groundtruth box.
Returns:
a float32 tensor with shape [num_anchors] representing regression weights.
"""
return match.gather_based_on_match(
groundtruth_weights, ignored_value=0., unmatched_value=0.)
def _create_classification_weights(self,
match,
groundtruth_weights):
"""Create classification weights for each anchor.
Positive (matched) anchors are associated with a weight of
positive_class_weight and negative (unmatched) anchors are associated with
a weight of negative_class_weight. When anchors are ignored, weights are set
to zero. By default, both positive/negative weights are set to 1.0,
but they can be adjusted to handle class imbalance (which is almost always
the case in object detection).
Args:
match: a matcher.Match object that provides a matching between anchors
and groundtruth boxes.
groundtruth_weights: a float tensor of shape [M] indicating the weight to
assign to all anchors match to a particular groundtruth box.
Returns:
a float32 tensor with shape [num_anchors] representing classification
weights.
"""
return match.gather_based_on_match(
groundtruth_weights,
ignored_value=0.,
unmatched_value=self._negative_class_weight)
def get_box_coder(self):
"""Get BoxCoder of this TargetAssigner.
Returns:
BoxCoder object.
"""
return self._box_coder
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A set of functions that are used for visualization.
These functions often receive an image, perform some visualization on the image.
The functions do not return a value, instead they modify the image itself.
"""
import collections
import functools
# Set headless-friendly backend.
import matplotlib; matplotlib.use('Agg') # pylint: disable=multiple-statements
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import PIL.Image as Image
import PIL.ImageColor as ImageColor
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
import six
import tensorflow.compat.v2 as tf
from official.vision.detection.utils.object_detection import shape_utils
_TITLE_LEFT_MARGIN = 10
_TITLE_TOP_MARGIN = 10
STANDARD_COLORS = [
'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
'WhiteSmoke', 'Yellow', 'YellowGreen'
]
def save_image_array_as_png(image, output_path):
"""Saves an image (represented as a numpy array) to PNG.
Args:
image: a numpy array with shape [height, width, 3].
output_path: path to which image should be written.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
with tf.io.gfile.GFile(output_path, 'w') as fid:
image_pil.save(fid, 'PNG')
def encode_image_array_as_png_str(image):
"""Encodes a numpy array into a PNG string.
Args:
image: a numpy array with shape [height, width, 3].
Returns:
PNG encoded image string.
"""
image_pil = Image.fromarray(np.uint8(image))
output = six.BytesIO()
image_pil.save(output, format='PNG')
png_string = output.getvalue()
output.close()
return png_string
def draw_bounding_box_on_image_array(image,
ymin,
xmin,
ymax,
xmax,
color='red',
thickness=4,
display_str_list=(),
use_normalized_coordinates=True):
"""Adds a bounding box to an image (numpy array).
Bounding box coordinates can be specified in either absolute (pixel) or
normalized coordinates by setting the use_normalized_coordinates argument.
Args:
image: a numpy array with shape [height, width, 3].
ymin: ymin of bounding box.
xmin: xmin of bounding box.
ymax: ymax of bounding box.
xmax: xmax of bounding box.
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list: list of strings to display in box
(each to be shown on its own line).
use_normalized_coordinates: If True (default), treat coordinates
ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
coordinates as absolute.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
draw_bounding_box_on_image(image_pil, ymin, xmin, ymax, xmax, color,
thickness, display_str_list,
use_normalized_coordinates)
np.copyto(image, np.array(image_pil))
def draw_bounding_box_on_image(image,
ymin,
xmin,
ymax,
xmax,
color='red',
thickness=4,
display_str_list=(),
use_normalized_coordinates=True):
"""Adds a bounding box to an image.
Bounding box coordinates can be specified in either absolute (pixel) or
normalized coordinates by setting the use_normalized_coordinates argument.
Each string in display_str_list is displayed on a separate line above the
bounding box in black text on a rectangle filled with the input 'color'.
If the top of the bounding box extends to the edge of the image, the strings
are displayed below the bounding box.
Args:
image: a PIL.Image object.
ymin: ymin of bounding box.
xmin: xmin of bounding box.
ymax: ymax of bounding box.
xmax: xmax of bounding box.
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list: list of strings to display in box
(each to be shown on its own line).
use_normalized_coordinates: If True (default), treat coordinates
ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
coordinates as absolute.
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
if use_normalized_coordinates:
(left, right, top, bottom) = (xmin * im_width, xmax * im_width,
ymin * im_height, ymax * im_height)
else:
(left, right, top, bottom) = (xmin, xmax, ymin, ymax)
draw.line([(left, top), (left, bottom), (right, bottom),
(right, top), (left, top)], width=thickness, fill=color)
try:
font = ImageFont.truetype('arial.ttf', 24)
except IOError:
font = ImageFont.load_default()
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
display_str_heights = [font.getsize(ds)[1] for ds in display_str_list]
# Each display_str has a top and bottom margin of 0.05x.
total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
if top > total_display_str_height:
text_bottom = top
else:
text_bottom = bottom + total_display_str_height
# Reverse list and print from bottom to top.
for display_str in display_str_list[::-1]:
text_width, text_height = font.getsize(display_str)
margin = np.ceil(0.05 * text_height)
draw.rectangle(
[(left, text_bottom - text_height - 2 * margin), (left + text_width,
text_bottom)],
fill=color)
draw.text(
(left + margin, text_bottom - text_height - margin),
display_str,
fill='black',
font=font)
text_bottom -= text_height - 2 * margin
def draw_bounding_boxes_on_image_array(image,
boxes,
color='red',
thickness=4,
display_str_list_list=()):
"""Draws bounding boxes on image (numpy array).
Args:
image: a numpy array object.
boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
The coordinates are in normalized format between [0, 1].
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list_list: list of list of strings.
a list of strings for each bounding box.
The reason to pass a list of strings for a
bounding box is that it might contain
multiple labels.
Raises:
ValueError: if boxes is not a [N, 4] array
"""
image_pil = Image.fromarray(image)
draw_bounding_boxes_on_image(image_pil, boxes, color, thickness,
display_str_list_list)
np.copyto(image, np.array(image_pil))
def draw_bounding_boxes_on_image(image,
boxes,
color='red',
thickness=4,
display_str_list_list=()):
"""Draws bounding boxes on image.
Args:
image: a PIL.Image object.
boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
The coordinates are in normalized format between [0, 1].
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list_list: list of list of strings.
a list of strings for each bounding box.
The reason to pass a list of strings for a
bounding box is that it might contain
multiple labels.
Raises:
ValueError: if boxes is not a [N, 4] array
"""
boxes_shape = boxes.shape
if not boxes_shape:
return
if len(boxes_shape) != 2 or boxes_shape[1] != 4:
raise ValueError('Input must be of size [N, 4]')
for i in range(boxes_shape[0]):
display_str_list = ()
if display_str_list_list:
display_str_list = display_str_list_list[i]
draw_bounding_box_on_image(image, boxes[i, 0], boxes[i, 1], boxes[i, 2],
boxes[i, 3], color, thickness, display_str_list)
def _visualize_boxes(image, boxes, classes, scores, category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image, boxes, classes, scores, category_index=category_index, **kwargs)
def _visualize_boxes_and_masks(image, boxes, classes, scores, masks,
category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
instance_masks=masks,
**kwargs)
def _visualize_boxes_and_keypoints(image, boxes, classes, scores, keypoints,
category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
keypoints=keypoints,
**kwargs)
def _visualize_boxes_and_masks_and_keypoints(
image, boxes, classes, scores, masks, keypoints, category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
instance_masks=masks,
keypoints=keypoints,
**kwargs)
def _resize_original_image(image, image_shape):
image = tf.expand_dims(image, 0)
image = tf.image.resize(
image, image_shape, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
return tf.cast(tf.squeeze(image, 0), tf.uint8)
def draw_bounding_boxes_on_image_tensors(images,
boxes,
classes,
scores,
category_index,
original_image_spatial_shape=None,
true_image_shape=None,
instance_masks=None,
keypoints=None,
max_boxes_to_draw=20,
min_score_thresh=0.2,
use_normalized_coordinates=True):
"""Draws bounding boxes, masks, and keypoints on batch of image tensors.
Args:
images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional
channels will be ignored. If C = 1, then we convert the images to RGB
images.
boxes: [N, max_detections, 4] float32 tensor of detection boxes.
classes: [N, max_detections] int tensor of detection classes. Note that
classes are 1-indexed.
scores: [N, max_detections] float32 tensor of detection scores.
category_index: a dict that maps integer ids to category dicts. e.g.
{1: {1: 'dog'}, 2: {2: 'cat'}, ...}
original_image_spatial_shape: [N, 2] tensor containing the spatial size of
the original image.
true_image_shape: [N, 3] tensor containing the spatial size of unpadded
original_image.
instance_masks: A 4D uint8 tensor of shape [N, max_detection, H, W] with
instance masks.
keypoints: A 4D float32 tensor of shape [N, max_detection, num_keypoints, 2]
with keypoints.
max_boxes_to_draw: Maximum number of boxes to draw on an image. Default 20.
min_score_thresh: Minimum score threshold for visualization. Default 0.2.
use_normalized_coordinates: Whether to assume boxes and kepoints are in
normalized coordinates (as opposed to absolute coordiantes).
Default is True.
Returns:
4D image tensor of type uint8, with boxes drawn on top.
"""
# Additional channels are being ignored.
if images.shape[3] > 3:
images = images[:, :, :, 0:3]
elif images.shape[3] == 1:
images = tf.image.grayscale_to_rgb(images)
visualization_keyword_args = {
'use_normalized_coordinates': use_normalized_coordinates,
'max_boxes_to_draw': max_boxes_to_draw,
'min_score_thresh': min_score_thresh,
'agnostic_mode': False,
'line_thickness': 4
}
if true_image_shape is None:
true_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 3])
else:
true_shapes = true_image_shape
if original_image_spatial_shape is None:
original_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 2])
else:
original_shapes = original_image_spatial_shape
if instance_masks is not None and keypoints is None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks,
category_index=category_index,
**visualization_keyword_args)
elems = [
true_shapes, original_shapes, images, boxes, classes, scores,
instance_masks
]
elif instance_masks is None and keypoints is not None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_keypoints,
category_index=category_index,
**visualization_keyword_args)
elems = [
true_shapes, original_shapes, images, boxes, classes, scores, keypoints
]
elif instance_masks is not None and keypoints is not None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks_and_keypoints,
category_index=category_index,
**visualization_keyword_args)
elems = [
true_shapes, original_shapes, images, boxes, classes, scores,
instance_masks, keypoints
]
else:
visualize_boxes_fn = functools.partial(
_visualize_boxes,
category_index=category_index,
**visualization_keyword_args)
elems = [
true_shapes, original_shapes, images, boxes, classes, scores
]
def draw_boxes(image_and_detections):
"""Draws boxes on image."""
true_shape = image_and_detections[0]
original_shape = image_and_detections[1]
if true_image_shape is not None:
image = shape_utils.pad_or_clip_nd(
image_and_detections[2], [true_shape[0], true_shape[1], 3])
if original_image_spatial_shape is not None:
image_and_detections[2] = _resize_original_image(image, original_shape)
image_with_boxes = tf.compat.v1.py_func(visualize_boxes_fn,
image_and_detections[2:], tf.uint8)
return image_with_boxes
images = tf.map_fn(draw_boxes, elems, dtype=tf.uint8, back_prop=False)
return images
def draw_keypoints_on_image_array(image,
keypoints,
color='red',
radius=2,
use_normalized_coordinates=True):
"""Draws keypoints on an image (numpy array).
Args:
image: a numpy array with shape [height, width, 3].
keypoints: a numpy array with shape [num_keypoints, 2].
color: color to draw the keypoints with. Default is red.
radius: keypoint radius. Default value is 2.
use_normalized_coordinates: if True (default), treat keypoint values as
relative to the image. Otherwise treat them as absolute.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
draw_keypoints_on_image(image_pil, keypoints, color, radius,
use_normalized_coordinates)
np.copyto(image, np.array(image_pil))
def draw_keypoints_on_image(image,
keypoints,
color='red',
radius=2,
use_normalized_coordinates=True):
"""Draws keypoints on an image.
Args:
image: a PIL.Image object.
keypoints: a numpy array with shape [num_keypoints, 2].
color: color to draw the keypoints with. Default is red.
radius: keypoint radius. Default value is 2.
use_normalized_coordinates: if True (default), treat keypoint values as
relative to the image. Otherwise treat them as absolute.
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
keypoints_x = [k[1] for k in keypoints]
keypoints_y = [k[0] for k in keypoints]
if use_normalized_coordinates:
keypoints_x = tuple([im_width * x for x in keypoints_x])
keypoints_y = tuple([im_height * y for y in keypoints_y])
for keypoint_x, keypoint_y in zip(keypoints_x, keypoints_y):
draw.ellipse([(keypoint_x - radius, keypoint_y - radius),
(keypoint_x + radius, keypoint_y + radius)],
outline=color, fill=color)
def draw_mask_on_image_array(image, mask, color='red', alpha=0.4):
"""Draws mask on an image.
Args:
image: uint8 numpy array with shape (img_height, img_height, 3)
mask: a uint8 numpy array of shape (img_height, img_height) with
values between either 0 or 1.
color: color to draw the keypoints with. Default is red.
alpha: transparency value between 0 and 1. (default: 0.4)
Raises:
ValueError: On incorrect data type for image or masks.
"""
if image.dtype != np.uint8:
raise ValueError('`image` not of type np.uint8')
if mask.dtype != np.uint8:
raise ValueError('`mask` not of type np.uint8')
if np.any(np.logical_and(mask != 1, mask != 0)):
raise ValueError('`mask` elements should be in [0, 1]')
if image.shape[:2] != mask.shape:
raise ValueError('The image has spatial dimensions %s but the mask has '
'dimensions %s' % (image.shape[:2], mask.shape))
rgb = ImageColor.getrgb(color)
pil_image = Image.fromarray(image)
solid_color = np.expand_dims(
np.ones_like(mask), axis=2) * np.reshape(list(rgb), [1, 1, 3])
pil_solid_color = Image.fromarray(np.uint8(solid_color)).convert('RGBA')
pil_mask = Image.fromarray(np.uint8(255.0*alpha*mask)).convert('L')
pil_image = Image.composite(pil_solid_color, pil_image, pil_mask)
np.copyto(image, np.array(pil_image.convert('RGB')))
def visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index,
instance_masks=None,
instance_boundaries=None,
keypoints=None,
use_normalized_coordinates=False,
max_boxes_to_draw=20,
min_score_thresh=.5,
agnostic_mode=False,
line_thickness=4,
groundtruth_box_visualization_color='black',
skip_scores=False,
skip_labels=False):
"""Overlay labeled boxes on an image with formatted scores and label names.
This function groups boxes that correspond to the same location
and creates a display string for each detection and overlays these
on the image. Note that this function modifies the image in place, and returns
that same image.
Args:
image: uint8 numpy array with shape (img_height, img_width, 3)
boxes: a numpy array of shape [N, 4]
classes: a numpy array of shape [N]. Note that class indices are 1-based,
and match the keys in the label map.
scores: a numpy array of shape [N] or None. If scores=None, then
this function assumes that the boxes to be plotted are groundtruth
boxes and plot all boxes as black with no classes or scores.
category_index: a dict containing category dictionaries (each holding
category index `id` and category name `name`) keyed by category indices.
instance_masks: a numpy array of shape [N, image_height, image_width] with
values ranging between 0 and 1, can be None.
instance_boundaries: a numpy array of shape [N, image_height, image_width]
with values ranging between 0 and 1, can be None.
keypoints: a numpy array of shape [N, num_keypoints, 2], can
be None
use_normalized_coordinates: whether boxes is to be interpreted as
normalized coordinates or not.
max_boxes_to_draw: maximum number of boxes to visualize. If None, draw
all boxes.
min_score_thresh: minimum score threshold for a box to be visualized
agnostic_mode: boolean (default: False) controlling whether to evaluate in
class-agnostic mode or not. This mode will display scores but ignore
classes.
line_thickness: integer (default: 4) controlling line width of the boxes.
groundtruth_box_visualization_color: box color for visualizing groundtruth
boxes
skip_scores: whether to skip score when drawing a single detection
skip_labels: whether to skip label when drawing a single detection
Returns:
uint8 numpy array with shape (img_height, img_width, 3) with overlaid boxes.
"""
# Create a display string (and color) for every box location, group any boxes
# that correspond to the same location.
box_to_display_str_map = collections.defaultdict(list)
box_to_color_map = collections.defaultdict(str)
box_to_instance_masks_map = {}
box_to_instance_boundaries_map = {}
box_to_keypoints_map = collections.defaultdict(list)
if not max_boxes_to_draw:
max_boxes_to_draw = boxes.shape[0]
for i in range(min(max_boxes_to_draw, boxes.shape[0])):
if scores is None or scores[i] > min_score_thresh:
box = tuple(boxes[i].tolist())
if instance_masks is not None:
box_to_instance_masks_map[box] = instance_masks[i]
if instance_boundaries is not None:
box_to_instance_boundaries_map[box] = instance_boundaries[i]
if keypoints is not None:
box_to_keypoints_map[box].extend(keypoints[i])
if scores is None:
box_to_color_map[box] = groundtruth_box_visualization_color
else:
display_str = ''
if not skip_labels:
if not agnostic_mode:
if classes[i] in category_index.keys():
class_name = category_index[classes[i]]['name']
else:
class_name = 'N/A'
display_str = str(class_name)
if not skip_scores:
if not display_str:
display_str = '{}%'.format(int(100*scores[i]))
else:
display_str = '{}: {}%'.format(display_str, int(100*scores[i]))
box_to_display_str_map[box].append(display_str)
if agnostic_mode:
box_to_color_map[box] = 'DarkOrange'
else:
box_to_color_map[box] = STANDARD_COLORS[
classes[i] % len(STANDARD_COLORS)]
# Draw all boxes onto image.
for box, color in box_to_color_map.items():
ymin, xmin, ymax, xmax = box
if instance_masks is not None:
draw_mask_on_image_array(
image,
box_to_instance_masks_map[box],
color=color
)
if instance_boundaries is not None:
draw_mask_on_image_array(
image,
box_to_instance_boundaries_map[box],
color='red',
alpha=1.0
)
draw_bounding_box_on_image_array(
image,
ymin,
xmin,
ymax,
xmax,
color=color,
thickness=line_thickness,
display_str_list=box_to_display_str_map[box],
use_normalized_coordinates=use_normalized_coordinates)
if keypoints is not None:
draw_keypoints_on_image_array(
image,
box_to_keypoints_map[box],
color=color,
radius=line_thickness / 2,
use_normalized_coordinates=use_normalized_coordinates)
return image
def add_cdf_image_summary(values, name):
"""Adds a tf.summary.image for a CDF plot of the values.
Normalizes `values` such that they sum to 1, plots the cumulative distribution
function and creates a tf image summary.
Args:
values: a 1-D float32 tensor containing the values.
name: name for the image summary.
"""
def cdf_plot(values):
"""Numpy function to plot CDF."""
normalized_values = values / np.sum(values)
sorted_values = np.sort(normalized_values)
cumulative_values = np.cumsum(sorted_values)
fraction_of_examples = (np.arange(cumulative_values.size, dtype=np.float32)
/ cumulative_values.size)
fig = plt.figure(frameon=False)
ax = fig.add_subplot('111')
ax.plot(fraction_of_examples, cumulative_values)
ax.set_ylabel('cumulative normalized values')
ax.set_xlabel('fraction of examples')
fig.canvas.draw()
width, height = fig.get_size_inches() * fig.get_dpi()
image = np.fromstring(fig.canvas.tostring_rgb(), dtype='uint8').reshape(
1, int(height), int(width), 3)
return image
cdf_plot = tf.compat.v1.py_func(cdf_plot, [values], tf.uint8)
tf.compat.v1.summary.image(name, cdf_plot)
def add_hist_image_summary(values, bins, name):
"""Adds a tf.summary.image for a histogram plot of the values.
Plots the histogram of values and creates a tf image summary.
Args:
values: a 1-D float32 tensor containing the values.
bins: bin edges which will be directly passed to np.histogram.
name: name for the image summary.
"""
def hist_plot(values, bins):
"""Numpy function to plot hist."""
fig = plt.figure(frameon=False)
ax = fig.add_subplot('111')
y, x = np.histogram(values, bins=bins)
ax.plot(x[:-1], y)
ax.set_ylabel('count')
ax.set_xlabel('value')
fig.canvas.draw()
width, height = fig.get_size_inches() * fig.get_dpi()
image = np.fromstring(
fig.canvas.tostring_rgb(), dtype='uint8').reshape(
1, int(height), int(width), 3)
return image
hist_plot = tf.compat.v1.py_func(hist_plot, [values, bins], tf.uint8)
tf.compat.v1.summary.image(name, hist_plot)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for segmentations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import cv2
def segm_results(masks, detections, image_height, image_width):
"""Generates segmentation results."""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
mask_size = masks.shape[2]
scale = (mask_size + 2.0) / mask_size
ref_boxes = expand_boxes(detections[:, 1:5], scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_size + 2, mask_size + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
x_0 = max(ref_box[0], 0)
x_1 = min(ref_box[2] + 1, image_width)
y_0 = max(ref_box[1], 0)
y_1 = min(ref_box[3] + 1, image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])
]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to performa spatial transformation for Tensor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v2 as tf
_EPSILON = 1e-8
def nearest_upsampling(data, scale):
"""Nearest neighbor upsampling implementation.
Args:
data: A tensor with a shape of [batch, height_in, width_in, channels].
scale: An integer multiple to scale resolution of input data.
Returns:
data_up: A tensor with a shape of
[batch, height_in*scale, width_in*scale, channels]. Same dtype as input
data.
"""
with tf.name_scope('nearest_upsampling'):
bs, _, _, c = data.get_shape().as_list()
shape = tf.shape(input=data)
h = shape[1]
w = shape[2]
bs = -1 if bs is None else bs
# Uses reshape to quickly upsample the input. The nearest pixel is selected
# implicitly via broadcasting.
data = tf.reshape(data, [bs, h, 1, w, 1, c]) * tf.ones(
[1, 1, scale, 1, scale, 1], dtype=data.dtype)
return tf.reshape(data, [bs, h * scale, w * scale, c])
def selective_crop_and_resize(features,
boxes,
box_levels,
boundaries,
output_size=7,
sample_offset=0.5):
"""Crop and resize boxes on a set of feature maps.
Given multiple features maps indexed by different levels, and a set of boxes
where each box is mapped to a certain level, it selectively crops and resizes
boxes from the corresponding feature maps to generate the box features.
We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
figure 3 for reference). Specifically, for each feature map, we select an
(output_size, output_size) set of pixels corresponding to the box location,
and then use bilinear interpolation to select the feature value for each
pixel.
For performance, we perform the gather and interpolation on all layers as a
single operation. This is op the multi-level features are first stacked and
gathered into [2*output_size, 2*output_size] feature points. Then bilinear
interpolation is performed on the gathered feature points to generate
[output_size, output_size] RoIAlign feature map.
Here is the step-by-step algorithm:
1. The multi-level features are gathered into a
[batch_size, num_boxes, output_size*2, output_size*2, num_filters]
Tensor. The Tensor contains four neighboring feature points for each
vertice in the output grid.
2. Compute the interpolation kernel of shape
[batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
can be seen as stacking 2x2 interpolation kernels for all vertices in the
output grid.
3. Element-wise multiply the gathered features and interpolation kernel.
Then apply 2x2 average pooling to reduce spatial dimension to
output_size.
Args:
features: a 5-D tensor of shape
[batch_size, num_levels, max_height, max_width, num_filters] where
cropping and resizing are based.
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
the 0-based corresponding feature level index of each box.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the bounary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicates the subpixel sample offset
from grid point.
Returns:
features_per_box: a 5-D tensor of shape
[batch_size, num_boxes, output_size, output_size, num_filters]
representing the cropped features.
"""
(batch_size, num_levels, max_feature_height, max_feature_width,
num_filters) = features.get_shape().as_list()
_, num_boxes, _ = boxes.get_shape().as_list()
# Compute the grid position w.r.t. the corresponding feature map.
box_grid_x = []
box_grid_y = []
for i in range(output_size):
box_grid_x.append(boxes[:, :, 1] +
(i + sample_offset) * boxes[:, :, 3] / output_size)
box_grid_y.append(boxes[:, :, 0] +
(i + sample_offset) * boxes[:, :, 2] / output_size)
box_grid_x = tf.stack(box_grid_x, axis=2)
box_grid_y = tf.stack(box_grid_y, axis=2)
# Compute indices for gather operation.
box_grid_y0 = tf.floor(box_grid_y)
box_grid_x0 = tf.floor(box_grid_x)
box_grid_x0 = tf.maximum(0., box_grid_x0)
box_grid_y0 = tf.maximum(0., box_grid_y0)
box_gridx0x1 = tf.stack(
[tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1)),
tf.minimum(box_grid_x0 + 1, tf.expand_dims(boundaries[:, :, 1], -1))],
axis=3)
box_gridy0y1 = tf.stack(
[tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1)),
tf.minimum(box_grid_y0 + 1, tf.expand_dims(boundaries[:, :, 0], -1))],
axis=3)
x_indices = tf.cast(
tf.reshape(box_gridx0x1,
[batch_size, num_boxes, output_size * 2]), dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1,
[batch_size, num_boxes, output_size * 2]), dtype=tf.int32)
height_dim_offset = max_feature_width
level_dim_offset = max_feature_height * height_dim_offset
batch_dim_offset = num_levels * level_dim_offset
indices = tf.reshape(
tf.tile(tf.reshape(tf.range(batch_size) * batch_dim_offset,
[batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2]) +
tf.tile(tf.reshape(box_levels * level_dim_offset,
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2]) +
tf.tile(tf.reshape(y_indices * height_dim_offset,
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2]) +
tf.tile(tf.reshape(x_indices,
[batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1]), [-1])
features = tf.reshape(features, [-1, num_filters])
features_per_box = tf.reshape(
tf.gather(features, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
# The RoIAlign feature f can be computed by bilinear interpolation of four
# neighboring feature points f0, f1, f2, and f3.
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
# f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
ly = box_grid_y - box_grid_y0
lx = box_grid_x - box_grid_x0
hy = 1.0 - ly
hx = 1.0 - lx
kernel_x = tf.reshape(tf.stack([hx, lx], axis=3),
[batch_size, num_boxes, 1, output_size*2])
kernel_y = tf.reshape(tf.stack([hy, ly], axis=3),
[batch_size, num_boxes, output_size*2, 1])
# Uses implicit broadcast to generate the interpolation kernel. The
# multiplier `4` is for avg pooling.
interpolation_kernel = kernel_y * kernel_x * 4
# Interpolates the gathered features with computed interpolation kernels.
features_per_box *= tf.cast(
tf.expand_dims(interpolation_kernel, axis=4),
dtype=features_per_box.dtype)
features_per_box = tf.reshape(
features_per_box,
[batch_size * num_boxes, output_size*2, output_size*2, num_filters])
features_per_box = tf.nn.avg_pool2d(
input=features_per_box,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='VALID')
features_per_box = tf.reshape(
features_per_box,
[batch_size, num_boxes, output_size, output_size, num_filters])
return features_per_box
def multilevel_crop_and_resize(features, boxes, output_size=7):
"""Crop and resize on multilevel feature pyramid.
Generate the (output_size, output_size) set of pixels for each input box
by first locating the box into the correct feature level, and then cropping
and resizing it using the correspoding feature map of that level.
Args:
features: A dictionary with key as pyramid level and value as features.
The features are in shape of [batch_size, height_l, width_l, num_filters].
boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
output_size: A scalar to indicate the output crop size.
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
with tf.name_scope('multilevel_crop_and_resize'):
levels = features.keys()
min_level = min(levels)
max_level = max(levels)
_, max_feature_height, max_feature_width, _ = (
features[min_level].get_shape().as_list())
# Stacks feature pyramid into a features_all of shape
# [batch_size, levels, height, width, num_filters].
features_all = []
for level in range(min_level, max_level + 1):
features_all.append(tf.image.pad_to_bounding_box(
features[level], 0, 0, max_feature_height, max_feature_width))
features_all = tf.stack(features_all, axis=1)
# Assigns boxes to the right level.
box_width = boxes[:, :, 3] - boxes[:, :, 1]
box_height = boxes[:, :, 2] - boxes[:, :, 0]
areas_sqrt = tf.sqrt(box_height * box_width)
levels = tf.cast(
tf.math.floordiv(
tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0))
+ 4.0,
dtype=tf.int32)
# Maps levels between [min_level, max_level].
levels = tf.minimum(max_level, tf.maximum(levels, min_level))
# Projects box location and sizes to corresponding feature levels.
scale_to_level = tf.cast(
tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
dtype=boxes.dtype)
boxes /= tf.expand_dims(scale_to_level, axis=2)
box_width /= scale_to_level
box_height /= scale_to_level
boxes = tf.concat([boxes[:, :, 0:2],
tf.expand_dims(box_height, -1),
tf.expand_dims(box_width, -1)], axis=-1)
# Maps levels to [0, max_level-min_level].
levels -= min_level
level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
boundary = tf.cast(
tf.concat([
tf.expand_dims([[tf.cast(max_feature_height, tf.float32)]] /
level_strides - 1,
axis=-1),
tf.expand_dims([[tf.cast(max_feature_width, tf.float32)]] /
level_strides - 1,
axis=-1),
], axis=-1),
boxes.dtype)
return selective_crop_and_resize(
features_all, boxes, levels, boundary, output_size)
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
min_mask_level, mask_crop_size):
"""Crop the FPN features at the appropriate levels for each detection.
Args:
features: a float tensor of shape [batch_size, num_levels,
max_feature_size, max_feature_size, num_downsample_channels].
level_boxes: a float Tensor of the level boxes to crop from.
[batch_size, num_instances, 4].
detection_prior_levels: an int Tensor of instance assigned level of shape
[batch_size, num_instances].
min_mask_level: minimum FPN level to crop mask feature from.
mask_crop_size: an int of mask crop size.
Returns:
crop_features: a float Tensor of shape [batch_size * num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
"""
(batch_size, num_levels, max_feature_size,
_, num_downsample_channels) = features.get_shape().as_list()
_, num_of_instances, _ = level_boxes.get_shape().as_list()
level_boxes = tf.cast(level_boxes, tf.int32)
assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]
x_start_indices = level_boxes[:, :, 1]
y_start_indices = level_boxes[:, :, 0]
# generate the full indices (not just the starting index)
x_idx_list = []
y_idx_list = []
for i in range(mask_crop_size):
x_idx_list.append(x_start_indices + i)
y_idx_list.append(y_start_indices + i)
x_indices = tf.stack(x_idx_list, axis=2)
y_indices = tf.stack(y_idx_list, axis=2)
levels = detection_prior_levels - min_mask_level
height_dim_size = max_feature_size
level_dim_size = max_feature_size * height_dim_size
batch_dim_size = num_levels * level_dim_size
# TODO(weicheng) change this to gather_nd for better readability.
indices = tf.reshape(
tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size,
[batch_size, 1, 1, 1]),
[1, num_of_instances,
mask_crop_size, mask_crop_size]) +
tf.tile(
tf.reshape(levels * level_dim_size,
[batch_size, num_of_instances, 1, 1]),
[1, 1, mask_crop_size, mask_crop_size]) +
tf.tile(
tf.reshape(y_indices * height_dim_size,
[batch_size, num_of_instances,
mask_crop_size, 1]),
[1, 1, 1, mask_crop_size]) +
tf.tile(
tf.reshape(x_indices,
[batch_size, num_of_instances,
1, mask_crop_size]),
[1, 1, mask_crop_size, 1]), [-1])
features_r2 = tf.reshape(features,
[-1, num_downsample_channels])
crop_features = tf.reshape(
tf.gather(features_r2, indices),
[batch_size * num_of_instances,
mask_crop_size, mask_crop_size,
num_downsample_channels])
return crop_features
def crop_mask_in_target_box(masks, boxes, target_boxes, output_size):
"""Crop masks in target boxes.
Args:
masks: A tensor with a shape of [batch_size, num_masks, height, width].
boxes: a float tensor representing box cooridnates that tightly enclose
masks with a shape of [batch_size, num_masks, 4] in un-normalized
coordinates. A box is represented by [ymin, xmin, ymax, xmax].
target_boxes: a float tensor representing target box cooridnates for
masks with a shape of [batch_size, num_masks, 4] in un-normalized
coordinates. A box is represented by [ymin, xmin, ymax, xmax].
output_size: A scalar to indicate the output crop size. It currently only
supports to output a square shape outputs.
Returns:
A 4-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size].
"""
with tf.name_scope('crop_mask_in_target_box'):
batch_size, num_masks, height, width = masks.get_shape().as_list()
masks = tf.reshape(masks, [batch_size*num_masks, height, width, 1])
# Pad zeros on the boundary of masks.
masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
masks = tf.reshape(masks, [batch_size, num_masks, height+4, width+4, 1])
# Projects target box locations and sizes to corresponding cropped
# mask coordinates.
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=target_boxes, num_or_size_splits=4, axis=2)
y_transform = (bb_y_min - gt_y_min) * height / (
gt_y_max - gt_y_min + _EPSILON) + 2
x_transform = (bb_x_min - gt_x_min) * height / (
gt_x_max - gt_x_min + _EPSILON) + 2
h_transform = (bb_y_max - bb_y_min) * width / (
gt_y_max - gt_y_min + _EPSILON)
w_transform = (bb_x_max - bb_x_min) * width / (
gt_x_max - gt_x_min + _EPSILON)
boundaries = tf.concat([
tf.cast(
tf.ones_like(y_transform) * ((height + 4) - 1), dtype=tf.float32),
tf.cast(
tf.ones_like(x_transform) * ((width + 4) - 1), dtype=tf.float32)
],
axis=-1)
# Reshape tensors to have the right shape for selective_crop_and_resize.
trasnformed_boxes = tf.concat(
[y_transform, x_transform, h_transform, w_transform], -1)
levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
[batch_size, 1])
cropped_masks = selective_crop_and_resize(
masks,
trasnformed_boxes,
levels,
boundaries,
output_size,
sample_offset=0)
cropped_masks = tf.squeeze(cropped_masks, axis=-1)
return cropped_masks
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment