Commit c44482ab authored by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 431756117
parent 10ee28dd
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Anchor box and labeler definition."""
import collections
# Import libraries
import tensorflow as tf
from official.vision.ops import anchor_generator
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
from official.vision.utils.object_detection import balanced_positive_negative_sampler
from official.vision.utils.object_detection import box_list
from official.vision.utils.object_detection import faster_rcnn_box_coder
class Anchor(object):
"""Anchor class for anchor-based object detectors."""
def __init__(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
image_size):
"""Constructs multiscale anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: list of float numbers representing the aspect ratio anchors
added on each level. Each number indicates the ratio of width to height.
For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
scale level.
anchor_size: float number representing the scale of the base anchor size
relative to the feature stride 2^level.
image_size: a list of integer numbers or Tensors representing
[height, width] of the input image size. The image_size should be divisible
by the largest feature stride 2^max_level.
"""
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
self.anchor_size = anchor_size
self.image_size = image_size
self.boxes = self._generate_boxes()
def _generate_boxes(self):
"""Generates multiscale anchor boxes.
Returns:
a Tensor of shape [N, 4], representing anchor boxes of all levels
concatenated together.
"""
boxes_all = []
for level in range(self.min_level, self.max_level + 1):
boxes_l = []
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2 ** level
intermediate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio ** 0.5
aspect_y = aspect_ratio ** -0.5
half_anchor_size_x = base_anchor_size * aspect_x / 2.0
half_anchor_size_y = base_anchor_size * aspect_y / 2.0
x = tf.range(stride / 2, self.image_size[1], stride)
y = tf.range(stride / 2, self.image_size[0], stride)
xv, yv = tf.meshgrid(x, y)
xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
# Tensor shape Nx4.
boxes = tf.stack([yv - half_anchor_size_y, xv - half_anchor_size_x,
yv + half_anchor_size_y, xv + half_anchor_size_x],
axis=1)
boxes_l.append(boxes)
# Concat anchors on the same level to tensor shape NxAx4.
boxes_l = tf.stack(boxes_l, axis=1)
boxes_l = tf.reshape(boxes_l, [-1, 4])
boxes_all.append(boxes_l)
return tf.concat(boxes_all, axis=0)
def unpack_labels(self, labels):
"""Unpacks an array of labels into multiscales labels."""
unpacked_labels = collections.OrderedDict()
count = 0
for level in range(self.min_level, self.max_level + 1):
feat_size_y = tf.cast(self.image_size[0] / 2 ** level, tf.int32)
feat_size_x = tf.cast(self.image_size[1] / 2 ** level, tf.int32)
steps = feat_size_y * feat_size_x * self.anchors_per_location
unpacked_labels[str(level)] = tf.reshape(
labels[count:count + steps], [feat_size_y, feat_size_x, -1])
count += steps
return unpacked_labels
@property
def anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
@property
def multilevel_boxes(self):
return self.unpack_labels(self.boxes)
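# Illustrative usage sketch (an assumption for documentation, not part of the
# module API; the parameter values mirror cases exercised in anchor_test.py in
# this change). Builds multiscale anchors for a 64x64 image and reads them back
# both as a single flattened tensor and as a per-level dictionary.
def _example_anchor_usage():
  anchors = Anchor(min_level=5, max_level=6, num_scales=1, aspect_ratios=[1.0],
                   anchor_size=2.0, image_size=[64, 64])
  flat_boxes = anchors.boxes            # Tensor of shape [N, 4], all levels.
  per_level = anchors.multilevel_boxes  # OrderedDict, e.g. '5' -> [2, 2, 4].
  return flat_boxes, per_level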
class AnchorLabeler(object):
"""Labeler for dense object detector."""
def __init__(self,
match_threshold=0.5,
unmatched_threshold=0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
"""
self.similarity_calc = iou_similarity.IouSimilarity()
self.target_gather = target_gather.TargetGather()
self.matcher = box_matcher.BoxMatcher(
thresholds=[unmatched_threshold, match_threshold],
indicators=[-1, -2, 1],
force_match_for_each_col=True)
self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
def label_anchors(self,
anchor_boxes,
gt_boxes,
gt_labels,
gt_attributes=None,
gt_weights=None):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
gt_attributes: If not None, a dict of (name, gt_attribute) pairs.
`gt_attribute` is a float tensor with shape [N, attribute_size]
representing groundtruth attributes.
gt_weights: If not None, a float tensor with shape [N] representing
groundtruth weights.
Returns:
cls_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
and width_l represent the dimension of bounding box regression output at
l-th level.
attribute_targets_dict: a dict with (name, attribute_targets) pairs. Each
`attribute_targets` represents an ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location * attribute_size].
The height_l and width_l represent the dimension of attribute prediction
output at l-th level.
cls_weights: A flattened Tensor with shape [batch_size, num_anchors] that
serves as masking / sample weight for classification loss. Its value
is 1.0 for positive and negative matched anchors, and 0.0 for ignored
anchors.
box_weights: A flattened Tensor with shape [batch_size, num_anchors] that
serves as masking / sample weight for regression loss. Its value is
1.0 for positive matched anchors, and 0.0 for negative and ignored
anchors.
"""
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
match_indices, match_indicators = self.matcher(similarity_matrix)
mask = tf.less_equal(match_indicators, 0)
cls_mask = tf.expand_dims(mask, -1)
cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1)
box_mask = tf.tile(cls_mask, [1, 4])
box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
att_targets = {}
if gt_attributes:
for k, v in gt_attributes.items():
att_size = v.get_shape().as_list()[-1]
att_mask = tf.tile(cls_mask, [1, att_size])
att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0)
weights = tf.squeeze(tf.ones_like(gt_labels, dtype=tf.float32), -1)
if gt_weights is not None:
weights = tf.math.multiply(weights, gt_weights)
box_weights = self.target_gather(weights, match_indices, mask)
ignore_mask = tf.equal(match_indicators, -2)
cls_weights = self.target_gather(weights, match_indices, ignore_mask)
box_targets_list = box_list.BoxList(box_targets)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)
# Unpacks labels into multi-level representations.
cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
attribute_targets_dict = {}
for k, v in att_targets.items():
attribute_targets_dict[k] = unpack_targets(v, anchor_boxes)
return cls_targets_dict, box_targets_dict, attribute_targets_dict, cls_weights, box_weights
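# Illustrative labeling sketch (an assumption for documentation, not part of
# the module API; it follows the pattern of testLabelAnchors in anchor_test.py
# and uses arbitrary demo values). One anchor box is reused as its own ground
# truth so it is guaranteed to match positively.
def _example_label_anchors():
  anchor_boxes = Anchor(min_level=3, max_level=6, num_scales=2,
                        aspect_ratios=[1.0], anchor_size=2.0,
                        image_size=[512, 512]).multilevel_boxes
  labeler = AnchorLabeler(match_threshold=0.5, unmatched_threshold=0.5)
  gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]          # [1, 4], first anchor.
  gt_classes = tf.constant([[1]], dtype=tf.float32)  # [1, 1], as in the tests.
  return labeler.label_anchors(anchor_boxes, gt_boxes, gt_classes)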
class RpnAnchorLabeler(AnchorLabeler):
"""Labeler for Region Proposal Network."""
def __init__(self,
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5):
AnchorLabeler.__init__(self, match_threshold=match_threshold,
unmatched_threshold=unmatched_threshold)
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
def _get_rpn_samples(self, match_results):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
match_results: An integer tensor with shape [N] representing the
matching results of anchors. (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Returns:
score_targets: an integer tensor with shape [N].
(1) score_targets[i]=1, the anchor is a positive sample.
(2) score_targets[i]=0, the anchor is a negative sample.
(3) score_targets[i]=-1, the anchor is ignored (don't care).
"""
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=self._rpn_fg_fraction, is_static=False))
# indicator includes both positive and negative labels.
# labels includes only positives labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator = tf.greater(match_results, -2)
labels = tf.greater(match_results, -1)
samples = sampler.subsample(
indicator, self._rpn_batch_size_per_im, labels)
positive_labels = tf.where(
tf.logical_and(samples, labels),
tf.constant(2, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
negative_labels = tf.where(
tf.logical_and(samples, tf.logical_not(labels)),
tf.constant(1, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
ignore_labels = tf.fill(match_results.shape, -1)
return (ignore_labels + positive_labels + negative_labels,
positive_labels, negative_labels)
def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
"""
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
match_indices, match_indicators = self.matcher(similarity_matrix)
box_mask = tf.tile(tf.expand_dims(tf.less_equal(match_indicators, 0), -1),
[1, 4])
box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
box_targets_list = box_list.BoxList(box_targets)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)
# Zero out the unmatched and ignored regression targets.
num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0]
unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32)
matched_anchors_mask = tf.greater_equal(match_indicators, 0)
# To broadcast matched_anchors_mask to the same shape as
# matched_reg_targets.
matched_anchors_mask = tf.tile(
tf.expand_dims(matched_anchors_mask, 1),
[1, tf.shape(box_targets)[1]])
box_targets = tf.where(matched_anchors_mask, box_targets,
unmatched_ignored_box_targets)
# score_targets contains the subsampled positive and negative anchors.
score_targets, _, _ = self._get_rpn_samples(match_indicators)
# Unpacks labels.
score_targets_dict = unpack_targets(score_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
return score_targets_dict, box_targets_dict
def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
anchor_size):
"""Build anchor generator from levels."""
anchor_sizes = collections.OrderedDict()
strides = collections.OrderedDict()
scales = []
for scale in range(num_scales):
scales.append(2**(scale / float(num_scales)))
for level in range(min_level, max_level + 1):
stride = 2**level
strides[str(level)] = stride
anchor_sizes[str(level)] = anchor_size * stride
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=scales,
aspect_ratios=aspect_ratios,
strides=strides)
return anchor_gen
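# Illustrative sketch (an assumption for documentation, not part of the module
# API; the configuration matches one of the testEquivalentResult cases in
# anchor_test.py). The generator returns a dict keyed by level ('3', ..., '7')
# whose values are tensors of shape [height_l, width_l, anchors_per_location * 4].
def _example_build_anchor_generator():
  anchor_gen = build_anchor_generator(min_level=3, max_level=7, num_scales=2,
                                      aspect_ratios=[0.5, 1.0, 2.0],
                                      anchor_size=8)
  return anchor_gen([256, 256])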
def unpack_targets(targets, anchor_boxes_dict):
"""Unpacks an array of labels into multiscales labels."""
unpacked_targets = collections.OrderedDict()
count = 0
for level, anchor_boxes in anchor_boxes_dict.items():
feat_size_shape = anchor_boxes.shape.as_list()
feat_size_y = feat_size_shape[0]
feat_size_x = feat_size_shape[1]
anchors_per_location = int(feat_size_shape[2] / 4)
steps = feat_size_y * feat_size_x * anchors_per_location
unpacked_targets[level] = tf.reshape(targets[count:count + steps],
[feat_size_y, feat_size_x, -1])
count += steps
return unpacked_targets
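# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). Shows how a flat per-anchor target tensor is split
# back into a per-level dictionary matching the layout of `anchor_boxes_dict`.
def _example_unpack_targets():
  anchor_boxes = Anchor(min_level=5, max_level=6, num_scales=1,
                        aspect_ratios=[1.0], anchor_size=2.0,
                        image_size=[64, 64]).multilevel_boxes
  # 2x2x1 anchors at level 5 plus 1x1x1 at level 6 -> 5 anchors in total.
  flat_targets = tf.zeros([5, 4], dtype=tf.float32)
  # Returns {'5': [2, 2, 4] tensor, '6': [1, 1, 4] tensor}.
  return unpack_targets(flat_targets, anchor_boxes)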
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi scale anchor generator definition."""
import tensorflow as tf
# (TODO/tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
"""Utility to generate anchors for a single feature map.
Example:
```python
anchor_gen = _SingleAnchorGenerator(
    anchor_size=32, scales=[1.], aspect_ratios=[.5, 1., 2.], stride=16)
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_size,
scales,
aspect_ratios,
stride,
clip_boxes=False):
"""Constructs single scale anchor.
Args:
anchor_size: A single int representing the base anchor size. The anchor
height will be `anchor_size / sqrt(aspect_ratio)`, anchor width will be
`anchor_size * sqrt(aspect_ratio)`.
scales: A list/tuple, or a list/tuple of a list/tuple, of positive floats
representing the scale of the actual anchor size relative to the base
`anchor_size`.
aspect_ratios: a list/tuple of positive floats representing the ratio of
anchor width to anchor height.
stride: A single int representing the anchor stride, i.e. the distance
between the centers of adjacent anchors.
clip_boxes: Boolean representing whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
Input shape: the size of the image, `[H, W, C]`
Output shape: the size of anchors, `[H / stride, W / stride, K * 4]`
"""
self.anchor_size = anchor_size
self.scales = scales
self.aspect_ratios = aspect_ratios
self.stride = stride
self.clip_boxes = clip_boxes
def __call__(self, image_size):
image_height = tf.cast(image_size[0], tf.float32)
image_width = tf.cast(image_size[1], tf.float32)
k = len(self.scales) * len(self.aspect_ratios)
aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
anchor_size = tf.cast(self.anchor_size, tf.float32)
# [K]
anchor_heights = []
anchor_widths = []
for scale in self.scales:
anchor_size_t = anchor_size * scale
anchor_height = anchor_size_t / aspect_ratios_sqrt
anchor_width = anchor_size_t * aspect_ratios_sqrt
anchor_heights.append(anchor_height)
anchor_widths.append(anchor_width)
anchor_heights = tf.concat(anchor_heights, axis=0)
anchor_widths = tf.concat(anchor_widths, axis=0)
half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])
stride = tf.cast(self.stride, tf.float32)
# [W]
cx = tf.range(0.5 * stride, image_width, stride)
# [H]
cy = tf.range(0.5 * stride, image_height, stride)
# [H, W]
cx_grid, cy_grid = tf.meshgrid(cx, cy)
# [H, W, 1]
cx_grid = tf.expand_dims(cx_grid, axis=-1)
cy_grid = tf.expand_dims(cy_grid, axis=-1)
# [H, W, K, 1]
y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)
if self.clip_boxes:
y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)
# [H, W, K, 4]
result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
shape = result.shape.as_list()
# [H, W, K * 4]
return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])
class AnchorGenerator():
"""Utility to generate anchors for a multiple feature maps.
Example:
```python
anchor_gen = AnchorGenerator(
    anchor_sizes=[32, 64], scales=[1.], aspect_ratios=[.5, 1., 2.],
    strides=[16, 32])
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_sizes,
scales,
aspect_ratios,
strides,
clip_boxes=False):
"""Constructs multiscale anchors.
Args:
anchor_sizes: A list of ints representing the anchor size for each scale.
The anchor height will be `anchor_size / sqrt(aspect_ratio)`, anchor width
will be `anchor_size * sqrt(aspect_ratio)` for each scale.
scales: A list/tuple, or a list/tuple of a list/tuple, of positive floats
representing the scale of the actual anchor size relative to the base
`anchor_size`.
aspect_ratios: A list/tuple, or a list/tuple of a list/tuple, of positive
floats representing the ratio of anchor width to anchor height.
strides: A list/tuple of ints representing the anchor stride, i.e. the
distance between the centers of anchors at each scale.
clip_boxes: Boolean representing whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
Input shape: the size of the image, `[H, W, C]`
Output shape: a list or dict of per-level anchors, each of shape
`[H / stride, W / stride, K * 4]`
"""
# aspect_ratio is a single list that is the same across all levels.
aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
if isinstance(anchor_sizes, dict):
self.anchor_generators = {}
for k in anchor_sizes.keys():
self.anchor_generators[k] = _SingleAnchorGenerator(
anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
clip_boxes)
elif isinstance(anchor_sizes, (list, tuple)):
self.anchor_generators = []
for anchor_size, scale_list, ar_list, stride in zip(
anchor_sizes, scales, aspect_ratios, strides):
self.anchor_generators.append(
_SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
clip_boxes))
def __call__(self, image_size):
anchor_generators = tf.nest.flatten(self.anchor_generators)
results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
return tf.nest.pack_sequence_as(self.anchor_generators, results)
def maybe_map_structure_for_anchor(params, anchor_sizes):
"""broadcast the params to match anchor_sizes."""
if all(isinstance(param, (int, float)) for param in params):
if isinstance(anchor_sizes, (tuple, list)):
return [params] * len(anchor_sizes)
elif isinstance(anchor_sizes, dict):
return tf.nest.map_structure(lambda _: params, anchor_sizes)
else:
raise ValueError("the structure of `anchor_sizes` must be a tuple, "
"list, or dict, given {}".format(anchor_sizes))
else:
return params
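# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). A flat list of scalars is broadcast once per anchor
# size, while an already-nested structure passes through unchanged.
def _example_maybe_map_structure_for_anchor():
  broadcast = maybe_map_structure_for_anchor([0.5, 1.0, 2.0], [32, 64])
  # -> [[0.5, 1.0, 2.0], [0.5, 1.0, 2.0]]
  passthrough = maybe_map_structure_for_anchor([[1.0], [1.0]], [32, 64])
  # -> [[1.0], [1.0]] (unchanged: the elements are not all scalars).
  return broadcast, passthrough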
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for anchor_generator.py."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.ops import anchor_generator
class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
# Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25],
[[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
)
def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=False)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
[[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
# Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25
], [[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
)
def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=True)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [1.0], {
'5': [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]],
'6': [[[-32, -32, 96, 96]]]
}),)
def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels)
strides = dict((str(level), 2**level) for level in levels)
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
for k in expected_boxes.keys():
self.assertAllClose(expected_boxes[k], anchors[k].numpy())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for anchor.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.ops import anchor
class AnchorTest(parameterized.TestCase, tf.test.TestCase):
# The set of parameters are tailored for the MLPerf configuration, where
# the number of anchors is 495132, rpn_batch_size_per_im=256, and
# rpn_fg_fraction=0.5.
@parameterized.parameters(
(512, 25, 25, 25, 25, (512, 512)),
(512, 25, 25, 25, 25, (512, 640)),
(512, 25, 25, 25, 25, (640, 512)),
(495132, 100, 100, 100, 100, (512, 512)),
(495132, 200, 100, 128, 100, (512, 512)),
(495132, 100, 120, 100, 120, (512, 512)),
(495132, 100, 200, 100, 156, (512, 512)),
(495132, 200, 200, 128, 128, (512, 512)),
)
def testAnchorRpnSample(self, num_anchors, num_positives,
num_negatives, expected_positives,
expected_negatives, image_size):
match_results_np = np.empty([num_anchors])
match_results_np.fill(-2)
match_results_np[:num_positives] = 0
match_results_np[num_positives:num_positives + num_negatives] = -1
match_results = tf.convert_to_tensor(value=match_results_np, dtype=tf.int32)
anchor_labeler = anchor.RpnAnchorLabeler(
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5)
rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
labels = [v.numpy() for v in rpn_sample_op]
self.assertLen(labels[0], num_anchors)
positives = np.sum(np.array(labels[0]) == 1)
negatives = np.sum(np.array(labels[0]) == 0)
self.assertEqual(positives, expected_positives)
self.assertEqual(negatives, expected_negatives)
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGeneration(self, min_level, max_level, num_scales,
aspect_ratios, anchor_size, expected_boxes):
image_size = [64, 64]
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGenerationWithImageSizeAsTensor(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
expected_boxes):
image_size = tf.constant([64, 64], tf.int32)
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
(3, 6, 2, [1.0], 2.0, False),
(3, 6, 2, [1.0], 2.0, True),
)
def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
anchor_size, has_attribute):
input_size = [512, 512]
ground_truth_class_id = 2
attribute_name = 'depth'
ground_truth_depth = 3.0
# The matched anchors are the anchors used as ground truth and the anchors
# at the next octave scale on the same location.
expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
anchor_gen = anchor.build_anchor_generator(min_level, max_level, num_scales,
aspect_ratios, anchor_size)
anchor_boxes = anchor_gen(input_size)
anchor_labeler = anchor.AnchorLabeler()
# Uses the first anchors as ground truth. The ground truth should map to
# two anchors with two intermediate scales at the same location.
gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]
gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
gt_attributes = {
attribute_name: tf.constant([[ground_truth_depth]], dtype=tf.float32)
} if has_attribute else {}
(cls_targets, box_targets, att_targets, _,
box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes,
gt_classes, gt_attributes)
for k, v in cls_targets.items():
cls_targets[k] = v.numpy()
for k, v in box_targets.items():
box_targets[k] = v.numpy()
box_weights = box_weights.numpy()
anchor_locations = np.vstack(
np.where(cls_targets[str(min_level)] > -1)).transpose()
self.assertAllClose(expected_anchor_locations, anchor_locations)
# Two anchor boxes on min_level got matched to the gt_boxes.
self.assertAllClose(tf.reduce_sum(box_weights), 2)
if has_attribute:
self.assertIn(attribute_name, att_targets)
for k, v in att_targets[attribute_name].items():
att_targets[attribute_name][k] = v.numpy()
anchor_locations = np.vstack(
np.where(
att_targets[attribute_name][str(min_level)] > 0.0)).transpose()
self.assertAllClose(expected_anchor_locations, anchor_locations)
else:
self.assertEmpty(att_targets)
@parameterized.parameters(
(3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
(3, 8, [1.], 3, 32, (512, 512)),
(3, 3, [1.], 2, 4, (32, 32)),
)
def testEquivalentResult(self, min_level, max_level, aspect_ratios,
num_scales, anchor_size, image_size):
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
anchors = anchor_gen(image_size)
expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
aspect_ratios, anchor_size, image_size)
expected_anchors = expected_anchor_gen.multilevel_boxes
for k in expected_anchors.keys():
self.assertAllClose(expected_anchors[k], anchors[k])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Augmentation policies for enhanced image/video preprocessing.
AutoAugment Reference:
- AutoAugment Reference: https://arxiv.org/abs/1805.09501
- AutoAugment for Object Detection Reference: https://arxiv.org/abs/1906.11172
RandAugment Reference: https://arxiv.org/abs/1909.13719
RandomErasing Reference: https://arxiv.org/abs/1708.04896
MixupAndCutmix:
- Mixup: https://arxiv.org/abs/1710.09412
- Cutmix: https://arxiv.org/abs/1905.04899
RandomErasing, Mixup and Cutmix are inspired by
https://github.com/rwightman/pytorch-image-models
"""
import inspect
import math
from typing import Any, List, Iterable, Optional, Text, Tuple
from keras.layers.preprocessing import image_preprocessing as image_ops
import numpy as np
import tensorflow as tf
# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.
def to_4d(image: tf.Tensor) -> tf.Tensor:
"""Converts an input Tensor to 4 dimensions.
4D image => [N, H, W, C] or [N, C, H, W]
3D image => [1, H, W, C] or [1, C, H, W]
2D image => [1, H, W, 1]
Args:
image: The 2/3/4D input tensor.
Returns:
A 4D image tensor.
Raises:
`TypeError` if `image` is not a 2/3/4D tensor.
"""
shape = tf.shape(image)
original_rank = tf.rank(image)
left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32)
right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32)
new_shape = tf.concat(
[
tf.ones(shape=left_pad, dtype=tf.int32),
shape,
tf.ones(shape=right_pad, dtype=tf.int32),
],
axis=0,
)
return tf.reshape(image, new_shape)
def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
"""Converts a 4D image back to `ndims` rank."""
shape = tf.shape(image)
begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
new_shape = shape[begin:end]
return tf.reshape(image, new_shape)
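# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo shapes only). A rank-3 HWC image is padded to rank 4 for batched
# ops and then restored to its original rank.
def _example_to_from_4d():
  image = tf.zeros([24, 24, 3], dtype=tf.uint8)
  original_ndims = tf.rank(image)
  image_4d = to_4d(image)                   # Shape [1, 24, 24, 3].
  return from_4d(image_4d, original_ndims)  # Shape [24, 24, 3] again.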
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
"""Converts translations to a projective transform.
The translation matrix looks like this:
[[1 0 -dx]
[0 1 -dy]
[0 0 1]]
Args:
translations: The 2-element list representing [dx, dy], or a matrix of
2-element lists representing [dx dy] to translate for each image. The
shape must be static.
Returns:
The transformation matrix of shape (num_images, 8).
Raises:
`TypeError` if
- the shape of `translations` is not known or
- the shape of `translations` is not rank 1 or 2.
"""
translations = tf.convert_to_tensor(translations, dtype=tf.float32)
if translations.get_shape().ndims is None:
raise TypeError('translations rank must be statically known')
elif len(translations.get_shape()) == 1:
translations = translations[None]
elif len(translations.get_shape()) != 2:
raise TypeError('translations should have rank 1 or 2.')
num_translations = tf.shape(translations)[0]
return tf.concat(
values=[
tf.ones((num_translations, 1), tf.dtypes.float32),
tf.zeros((num_translations, 1), tf.dtypes.float32),
-translations[:, 0, None],
tf.zeros((num_translations, 1), tf.dtypes.float32),
tf.ones((num_translations, 1), tf.dtypes.float32),
-translations[:, 1, None],
tf.zeros((num_translations, 2), tf.dtypes.float32),
],
axis=1,
)
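# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). A single [dx, dy] translation becomes one 8-element
# projective transform row consumable by `image_ops.transform`.
def _example_translation_transform():
  transforms = _convert_translation_to_transform([4.0, -2.0])
  # -> [[1., 0., -4., 0., 1., 2., 0., 0.]], shape (1, 8).
  return transforms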
def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor,
image_height: tf.Tensor) -> tf.Tensor:
"""Converts an angle or angles to a projective transform.
Args:
angles: A scalar angle to rotate all images by, or a vector of angles to
rotate a batch of images by.
image_width: The width of the image(s) to be transformed.
image_height: The height of the image(s) to be transformed.
Returns:
A tensor of shape (num_images, 8).
Raises:
`TypeError` if `angles` is not rank 0 or 1.
"""
angles = tf.convert_to_tensor(angles, dtype=tf.float32)
if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test
angles = angles[None]
elif len(angles.get_shape()) != 1:
raise TypeError('Angles should have a rank 0 or 1.')
x_offset = ((image_width - 1) -
(tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) *
(image_height - 1))) / 2.0
y_offset = ((image_height - 1) -
(tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) *
(image_height - 1))) / 2.0
num_angles = tf.shape(angles)[0]
return tf.concat(
values=[
tf.math.cos(angles)[:, None],
-tf.math.sin(angles)[:, None],
x_offset[:, None],
tf.math.sin(angles)[:, None],
tf.math.cos(angles)[:, None],
y_offset[:, None],
tf.zeros((num_angles, 2), tf.dtypes.float32),
],
axis=1,
)
def transform(image: tf.Tensor, transforms) -> tf.Tensor:
"""Prepares input data for `image_ops.transform`."""
original_ndims = tf.rank(image)
transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
if transforms.shape.rank == 1:
transforms = transforms[None]
image = to_4d(image)
image = image_ops.transform(
images=image, transforms=transforms, interpolation='nearest')
return from_4d(image, original_ndims)
def translate(image: tf.Tensor, translations) -> tf.Tensor:
"""Translates image(s) by provided vectors.
Args:
image: An image Tensor of type uint8.
translations: A vector or matrix representing [dx dy].
Returns:
The translated version of the image.
"""
transforms = _convert_translation_to_transform(translations)
return transform(image, transforms=transforms)
def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
"""Rotates the image by degrees either clockwise or counterclockwise.
Args:
image: An image Tensor of type uint8.
degrees: Float, a scalar angle in degrees to rotate all images by. If
degrees is positive the image will be rotated clockwise otherwise it will
be rotated counterclockwise.
Returns:
The rotated version of image.
"""
# Convert from degrees to radians.
degrees_to_radians = math.pi / 180.0
radians = tf.cast(degrees * degrees_to_radians, tf.float32)
original_ndims = tf.rank(image)
image = to_4d(image)
image_height = tf.cast(tf.shape(image)[1], tf.float32)
image_width = tf.cast(tf.shape(image)[2], tf.float32)
transforms = _convert_angles_to_transform(
angles=radians, image_width=image_width, image_height=image_height)
# In practice, we should randomize the rotation degrees by flipping
# it negatively half the time, but that's done on 'degrees' outside
# of the function.
image = transform(image, transforms=transforms)
return from_4d(image, original_ndims)
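# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). Rotates an HWC uint8 image by 30 degrees with
# nearest-neighbor resampling; the output keeps the input rank and dtype.
def _example_rotate():
  image = tf.zeros([32, 32, 3], dtype=tf.uint8)
  return rotate(image, degrees=30.0)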
def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
"""Blend image1 and image2 using 'factor'.
Factor can be above 0.0. A value of 0.0 means only image1 is used.
A value of 1.0 means only image2 is used. A value between 0.0 and
1.0 means we linearly interpolate the pixel values between the two
images. A value greater than 1.0 "extrapolates" the difference
between the two pixel values, and we clip the results to values
between 0 and 255.
Args:
image1: An image Tensor of type uint8.
image2: An image Tensor of type uint8.
factor: A floating point value above 0.0.
Returns:
A blended image Tensor of type uint8.
"""
if factor == 0.0:
return tf.convert_to_tensor(image1)
if factor == 1.0:
return tf.convert_to_tensor(image2)
image1 = tf.cast(image1, tf.float32)
image2 = tf.cast(image2, tf.float32)
difference = image2 - image1
scaled = factor * difference
# Do addition in float.
temp = tf.cast(image1, tf.float32) + scaled
# Interpolate
if factor > 0.0 and factor < 1.0:
# Interpolation means we always stay within 0 and 255.
return tf.cast(temp, tf.uint8)
# Extrapolate:
#
# We need to clip and then cast.
return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
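# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). With factor=0.5 the result is the per-pixel midpoint
# of the two uint8 images; factors above 1.0 extrapolate and are clipped back
# to [0, 255].
def _example_blend():
  dark = tf.zeros([8, 8, 3], dtype=tf.uint8)
  bright = tf.fill([8, 8, 3], tf.constant(200, dtype=tf.uint8))
  return blend(dark, bright, factor=0.5)  # Every pixel becomes 100.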
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
"""Apply cutout (https://arxiv.org/abs/1708.04552) to image.
This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
a random location within `image`. The pixel values filled in will be of the
value `replace`. The location where the mask will be applied is randomly
chosen uniformly over the whole image.
Args:
image: An image Tensor of type uint8.
pad_size: Specifies how big the zero mask applied to the image will be.
The mask will be of size (2*pad_size x 2*pad_size).
replace: What pixel value to fill in the image in the area that has the
cutout mask applied to it.
Returns:
An image Tensor that is of type uint8.
"""
if image.shape.rank not in [3, 4]:
raise ValueError('Bad image rank: {}'.format(image.shape.rank))
if image.shape.rank == 4:
return cutout_video(image, replace=replace)
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# Sample the center location in the image where the zero mask will be applied.
cutout_center_height = tf.random.uniform(
shape=[], minval=0, maxval=image_height, dtype=tf.int32)
cutout_center_width = tf.random.uniform(
shape=[], minval=0, maxval=image_width, dtype=tf.int32)
image = _fill_rectangle(image, cutout_center_width, cutout_center_height,
pad_size, pad_size, replace)
return image
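# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). Zeroes out one randomly placed square of up to
# 2*pad_size x 2*pad_size pixels and fills it with the gray value 128.
def _example_cutout():
  image = tf.cast(
      tf.random.uniform([32, 32, 3], maxval=256, dtype=tf.int32), tf.uint8)
  return cutout(image, pad_size=4, replace=128)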
def _fill_rectangle(image,
center_width,
center_height,
half_width,
half_height,
replace=None):
"""Fill blank area."""
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
lower_pad = tf.maximum(0, center_height - half_height)
upper_pad = tf.maximum(0, image_height - center_height - half_height)
left_pad = tf.maximum(0, center_width - half_width)
right_pad = tf.maximum(0, image_width - center_width - half_width)
cutout_shape = [
image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad)
]
padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
mask = tf.pad(
tf.zeros(cutout_shape, dtype=image.dtype),
padding_dims,
constant_values=1)
mask = tf.expand_dims(mask, -1)
mask = tf.tile(mask, [1, 1, 3])
if replace is None:
fill = tf.random.normal(tf.shape(image), dtype=image.dtype)
elif isinstance(replace, tf.Tensor):
fill = replace
else:
fill = tf.ones_like(image, dtype=image.dtype) * replace
image = tf.where(tf.equal(mask, 0), fill, image)
return image
def cutout_video(image: tf.Tensor, replace: int = 0) -> tf.Tensor:
"""Apply cutout (https://arxiv.org/abs/1708.04552) to a video.
This operation applies a random size 3D mask of zeros to a random location
within `image`. The pixel values filled in will be of the value `replace`.
The location where the mask will be applied is randomly chosen uniformly over
the whole image. The size of the mask is randomly sampled uniformly from
[0.25*height, 0.5*height], [0.25*width, 0.5*width], and [1, 0.25*depth],
which represent the height, width, and number of frames of the input video
tensor respectively.
Args:
image: A video Tensor of type uint8.
replace: What pixel value to fill in the image in the area that has the
cutout mask applied to it.
Returns:
A video Tensor that is of type uint8.
"""
image_depth = tf.shape(image)[0]
image_height = tf.shape(image)[1]
image_width = tf.shape(image)[2]
# Sample the center location in the image where the zero mask will be applied.
cutout_center_height = tf.random.uniform(
shape=[], minval=0, maxval=image_height, dtype=tf.int32)
cutout_center_width = tf.random.uniform(
shape=[], minval=0, maxval=image_width, dtype=tf.int32)
cutout_center_depth = tf.random.uniform(
shape=[], minval=0, maxval=image_depth, dtype=tf.int32)
pad_size_height = tf.random.uniform(
shape=[],
minval=tf.maximum(1, tf.cast(image_height / 4, tf.int32)),
maxval=tf.maximum(2, tf.cast(image_height / 2, tf.int32)),
dtype=tf.int32)
pad_size_width = tf.random.uniform(
shape=[],
minval=tf.maximum(1, tf.cast(image_width / 4, tf.int32)),
maxval=tf.maximum(2, tf.cast(image_width / 2, tf.int32)),
dtype=tf.int32)
pad_size_depth = tf.random.uniform(
shape=[],
minval=1,
maxval=tf.maximum(2, tf.cast(image_depth / 4, tf.int32)),
dtype=tf.int32)
lower_pad = tf.maximum(0, cutout_center_height - pad_size_height)
upper_pad = tf.maximum(
0, image_height - cutout_center_height - pad_size_height)
left_pad = tf.maximum(0, cutout_center_width - pad_size_width)
right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size_width)
back_pad = tf.maximum(0, cutout_center_depth - pad_size_depth)
forward_pad = tf.maximum(
0, image_depth - cutout_center_depth - pad_size_depth)
cutout_shape = [
image_depth - (back_pad + forward_pad),
image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad),
]
padding_dims = [[back_pad, forward_pad],
[lower_pad, upper_pad],
[left_pad, right_pad]]
mask = tf.pad(
tf.zeros(cutout_shape, dtype=image.dtype),
padding_dims,
constant_values=1)
mask = tf.expand_dims(mask, -1)
mask = tf.tile(mask, [1, 1, 1, 3])
image = tf.where(
tf.equal(mask, 0),
tf.ones_like(image, dtype=image.dtype) * replace, image)
return image
def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
"""Solarize the input image(s)."""
# For each pixel in the image, select the pixel
# if the value is less than the threshold.
# Otherwise, subtract the pixel value from 255.
return tf.where(image < threshold, image, 255 - image)
def solarize_add(image: tf.Tensor,
addition: int = 0,
threshold: int = 128) -> tf.Tensor:
"""Additive solarize the input image(s)."""
# For each pixel in the image less than threshold
# we add 'addition' amount to it and then clip the
# pixel value to be between 0 and 255. The value
# of 'addition' is between -128 and 128.
added_image = tf.cast(image, tf.int64) + addition
added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
return tf.where(image < threshold, added_image, image)
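# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). Pixels at or above the threshold are inverted by
# solarize(), while solarize_add() brightens only the pixels below the
# threshold and leaves the rest untouched.
def _example_solarize():
  image = tf.constant([[[10, 130, 250]]], dtype=tf.uint8)
  solarized = solarize(image, threshold=128)     # -> [[[10, 125, 5]]]
  sol_added = solarize_add(image, addition=40)   # -> [[[50, 130, 250]]]
  return solarized, sol_added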
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Color."""
degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
return blend(degenerate, image, factor)
def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Contrast."""
degenerate = tf.image.rgb_to_grayscale(image)
# Cast before calling tf.histogram.
degenerate = tf.cast(degenerate, tf.int32)
# Compute the grayscale histogram, then compute the mean pixel value,
# and create a constant image size of that value. Use that as the
# blending degenerate target of the original image.
hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0
degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
return blend(degenerate, image, factor)
def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Brightness."""
degenerate = tf.zeros_like(image)
return blend(degenerate, image, factor)
def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
"""Equivalent of PIL Posterize."""
shift = 8 - bits
return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
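# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). Keeping bits=4 zeroes the four least-significant
# bits of every pixel, e.g. 173 (0b10101101) -> 160 (0b10100000).
def _example_posterize():
  image = tf.constant([[[173, 200, 37]]], dtype=tf.uint8)
  return posterize(image, bits=4)  # -> [[[160, 192, 32]]]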
def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor:
"""Applies rotation with wrap/unwrap."""
image = rotate(wrap(image), degrees=degrees)
return unwrap(image, replace)
def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in X dimension."""
image = translate(wrap(image), [-pixels, 0])
return unwrap(image, replace)
def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in Y dimension."""
image = translate(wrap(image), [0, -pixels])
return unwrap(image, replace)
def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image = transform(
image=wrap(image), transforms=[1., level, 0., 0., 1., 0., 0., 0.])
return unwrap(image, replace)
def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image = transform(
image=wrap(image), transforms=[1., 0., 0., level, 1., 0., 0., 0.])
return unwrap(image, replace)
def autocontrast(image: tf.Tensor) -> tf.Tensor:
"""Implements Autocontrast function from PIL using TF ops.
Args:
image: A 3D uint8 tensor.
Returns:
The image after it has had autocontrast applied to it and will be of type
uint8.
"""
def scale_channel(image: tf.Tensor) -> tf.Tensor:
"""Scale the 2D image using the autocontrast rule."""
# A possibly cheaper version can be done using cumsum/unique_with_counts
# over the histogram values, rather than iterating over the entire image
# to compute mins and maxes.
lo = tf.cast(tf.reduce_min(image), tf.float32)
hi = tf.cast(tf.reduce_max(image), tf.float32)
# Scale the image, making the lowest value 0 and the highest value 255.
def scale_values(im):
scale = 255.0 / (hi - lo)
offset = -lo * scale
im = tf.cast(im, tf.float32) * scale + offset
im = tf.clip_by_value(im, 0.0, 255.0)
return tf.cast(im, tf.uint8)
result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
return result
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image[..., 0])
s2 = scale_channel(image[..., 1])
s3 = scale_channel(image[..., 2])
image = tf.stack([s1, s2, s3], -1)
return image
def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Implements Sharpness function from PIL using TF ops."""
orig_image = image
image = tf.cast(image, tf.float32)
# Make image 4D for conv operation.
image = tf.expand_dims(image, 0)
# SMOOTH PIL Kernel.
if orig_image.shape.rank == 3:
kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
dtype=tf.float32,
shape=[3, 3, 1, 1]) / 13.
# Tile across channel dimension.
kernel = tf.tile(kernel, [1, 1, 3, 1])
strides = [1, 1, 1, 1]
degenerate = tf.nn.depthwise_conv2d(
image, kernel, strides, padding='VALID', dilations=[1, 1])
elif orig_image.shape.rank == 4:
kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
dtype=tf.float32,
shape=[1, 3, 3, 1, 1]) / 13.
strides = [1, 1, 1, 1, 1]
# Run the kernel across each channel
channels = tf.split(image, 3, axis=-1)
degenerates = [
tf.nn.conv3d(channel, kernel, strides, padding='VALID',
dilations=[1, 1, 1, 1, 1])
for channel in channels
]
degenerate = tf.concat(degenerates, -1)
else:
raise ValueError('Bad image rank: {}'.format(image.shape.rank))
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])
# For the borders of the resulting image, fill in the values of the
# original image.
mask = tf.ones_like(degenerate)
paddings = [[0, 0]] * (orig_image.shape.rank - 3)
padded_mask = tf.pad(mask, paddings + [[1, 1], [1, 1], [0, 0]])
padded_degenerate = tf.pad(degenerate, paddings + [[1, 1], [1, 1], [0, 0]])
result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)
# Blend the final result.
return blend(result, orig_image, factor)
def equalize(image: tf.Tensor) -> tf.Tensor:
"""Implements Equalize function from PIL using TF ops."""
def scale_channel(im, c):
"""Scale the data in the channel to implement equalize."""
im = tf.cast(im[..., c], tf.int32)
# Compute the histogram of the image channel.
histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)
# For the purposes of computing the step, filter out the nonzeros.
nonzero = tf.where(tf.not_equal(histo, 0))
nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255
def build_lut(histo, step):
# Compute the cumulative sum, shifting by step // 2
# and then normalization by step.
lut = (tf.cumsum(histo) + (step // 2)) // step
# Shift lut, prepending with 0.
lut = tf.concat([[0], lut[:-1]], 0)
# Clip the counts to be in range. This is done
# in the C code for image.point.
return tf.clip_by_value(lut, 0, 255)
# If step is zero, return the original image. Otherwise, build
# lut from the full histogram and step and then index from it.
result = tf.cond(
tf.equal(step, 0), lambda: im,
lambda: tf.gather(build_lut(histo, step), im))
return tf.cast(result, tf.uint8)
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image, 0)
s2 = scale_channel(image, 1)
s3 = scale_channel(image, 2)
image = tf.stack([s1, s2, s3], -1)
return image
def invert(image: tf.Tensor) -> tf.Tensor:
"""Inverts the image pixels."""
image = tf.convert_to_tensor(image)
return 255 - image
def wrap(image: tf.Tensor) -> tf.Tensor:
"""Returns 'image' with an extra channel set to all 1s."""
shape = tf.shape(image)
extended_channel = tf.expand_dims(tf.ones(shape[:-1], image.dtype), -1)
extended = tf.concat([image, extended_channel], axis=-1)
return extended
def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
"""Unwraps an image produced by wrap.
Where there is a 0 in the last channel for every spatial position,
the rest of the three channels in that spatial position are filled with
the `replace` value (e.g. gray, 128). Operations like translate and shear
on a wrapped Tensor will leave 0s in empty locations. Some transformations
look at the intensity of values to do preprocessing, and we want these
empty pixels to assume the 'average' value, rather than pure black.
Args:
image: A 3D Image Tensor with 4 channels.
replace: A one or three value 1D tensor to fill empty pixels.
Returns:
image: A 3D image Tensor with 3 channels.
"""
image_shape = tf.shape(image)
# Flatten the spatial dimensions.
flattened_image = tf.reshape(image, [-1, image_shape[-1]])
# Find all pixels where the last channel is zero.
alpha_channel = tf.expand_dims(flattened_image[..., 3], axis=-1)
replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)
# Where they are zero, fill them in with 'replace'.
flattened_image = tf.where(
tf.equal(alpha_channel, 0),
tf.ones_like(flattened_image, dtype=image.dtype) * replace,
flattened_image)
image = tf.reshape(flattened_image, image_shape)
image = tf.slice(
image,
[0] * image.shape.rank,
tf.concat([image_shape[:-1], [3]], -1))
return image
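# Illustrative sketch (an assumption for documentation, not part of the module
# API; demo values only). wrap() adds an all-ones extra channel, translate_x()
# shifts the wrapped image, and unwrap() fills the revealed border pixels with
# gray before dropping the extra channel. `replace` is passed as a three-value
# list, matching the "one or three value" contract in the unwrap() docstring.
def _example_wrap_translate_unwrap():
  image = tf.cast(
      tf.random.uniform([32, 32, 3], maxval=256, dtype=tf.int32), tf.uint8)
  return translate_x(image, pixels=6, replace=[128, 128, 128])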
def _scale_bbox_only_op_probability(prob):
"""Reduce the probability of the bbox-only operation.
Probability is reduced so that we do not distort the content of too many
bounding boxes that are close to each other. The value of 3.0 was a chosen
hyper parameter when designing the autoaugment algorithm that we found
empirically to work well.
Args:
prob: Float that is the probability of applying the bbox-only operation.
Returns:
Reduced probability.
"""
return prob / 3.0
def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
"""Applies augmentation_func to the subsection of image indicated by bbox.
Args:
image: 3D uint8 Tensor.
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
augmentation_func: Augmentation function that will be applied to the
subsection of image.
*args: Additional parameters that will be passed into augmentation_func
when it is called.
Returns:
A modified version of image, where the bbox location in the image will
have `augmentation_func` applied to it.
"""
image_height = tf.cast(tf.shape(image)[0], tf.float32)
image_width = tf.cast(tf.shape(image)[1], tf.float32)
min_y = tf.cast(image_height * bbox[0], tf.int32)
min_x = tf.cast(image_width * bbox[1], tf.int32)
max_y = tf.cast(image_height * bbox[2], tf.int32)
max_x = tf.cast(image_width * bbox[3], tf.int32)
image_height = tf.cast(image_height, tf.int32)
image_width = tf.cast(image_width, tf.int32)
# Clip to be sure the max values do not fall out of range.
max_y = tf.minimum(max_y, image_height - 1)
max_x = tf.minimum(max_x, image_width - 1)
# Get the sub-tensor that is the image within the bounding box region.
bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]
# Apply the augmentation function to the bbox portion of the image.
augmented_bbox_content = augmentation_func(bbox_content, *args)
# Pad the augmented_bbox_content and the mask to match the shape of original
# image.
augmented_bbox_content = tf.pad(augmented_bbox_content,
[[min_y, (image_height - 1) - max_y],
[min_x, (image_width - 1) - max_x],
[0, 0]])
# Create a mask that will be used to zero out a part of the original image.
mask_tensor = tf.zeros_like(bbox_content)
mask_tensor = tf.pad(mask_tensor,
[[min_y, (image_height - 1) - max_y],
[min_x, (image_width - 1) - max_x],
[0, 0]],
constant_values=1)
# Replace the old bbox content with the new augmented content.
image = image * mask_tensor + augmented_bbox_content
return image
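# Illustrative sketch (hypothetical values, not part of the library API):
# applying an augmentation only inside a normalized bbox. Assuming a uint8
# image `img` of shape [H, W, 3]:
#
#   box = tf.constant([0.25, 0.25, 0.75, 0.75])  # (min_y, min_x, max_y, max_x)
#   out = _apply_bbox_augmentation(img, box, invert)
#
# Only the central region of `img` is inverted; the rest is untouched.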
def _concat_bbox(bbox, bboxes):
"""Helper function that concates bbox to bboxes along the first dimension."""
# Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
# we discard bboxes and start the bboxes Tensor with the current bbox.
bboxes_sum_check = tf.reduce_sum(bboxes)
bbox = tf.expand_dims(bbox, 0)
# This check will be true when it is an _INVALID_BOX
bboxes = tf.cond(tf.equal(bboxes_sum_check, -4.0),
lambda: bbox,
lambda: tf.concat([bboxes, bbox], 0))
return bboxes
def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
augmentation_func, func_changes_bbox,
*args):
"""Applies _apply_bbox_augmentation with probability prob.
Args:
image: 3D uint8 Tensor.
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
have been altered by aug_func. These will only be changed when
func_changes_bbox is set to true. Each bbox has 4 elements
(min_y, min_x, max_y, max_x) of type float that are the normalized
bbox coordinates between 0 and 1.
prob: Float that is the probability of applying _apply_bbox_augmentation.
augmentation_func: Augmentation function that will be applied to the
subsection of image.
func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
to image.
*args: Additional parameters that will be passed into augmentation_func
when it is called.
Returns:
    A tuple. The first element is a modified version of image, where the bbox
location in the image will have augmentation_func applied to it if it is
chosen to be called with probability `prob`. The second element is a
Tensor of Tensors of length 4 that will contain the altered bbox after
applying augmentation_func.
"""
should_apply_op = tf.cast(
tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
if func_changes_bbox:
augmented_image, bbox = tf.cond(
should_apply_op,
lambda: augmentation_func(image, bbox, *args),
lambda: (image, bbox))
else:
augmented_image = tf.cond(
should_apply_op,
lambda: _apply_bbox_augmentation(image, bbox, augmentation_func, *args),
lambda: image)
new_bboxes = _concat_bbox(bbox, new_bboxes)
return augmented_image, new_bboxes
def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,
func_changes_bbox, *args):
"""Checks to be sure num bboxes > 0 before calling inner function."""
num_bboxes = tf.shape(bboxes)[0]
image, bboxes = tf.cond(
tf.equal(num_bboxes, 0),
lambda: (image, bboxes),
# pylint:disable=g-long-lambda
lambda: _apply_multi_bbox_augmentation(
image, bboxes, prob, aug_func, func_changes_bbox, *args))
# pylint:enable=g-long-lambda
return image, bboxes
# Represents an invalid bounding box. It is used as padding in lists of
# bounding box coordinates for a few augmentation operations.
_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]]
def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
func_changes_bbox, *args):
"""Applies aug_func to the image for each bbox in bboxes.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float.
prob: Float that is the probability of applying aug_func to a specific
bounding box within the image.
aug_func: Augmentation function that will be applied to the
subsections of image indicated by the bbox values in bboxes.
func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
to image.
*args: Additional parameters that will be passed into augmentation_func
when it is called.
Returns:
A modified version of image, where each bbox location in the image will
have augmentation_func applied to it if it is chosen to be called with
probability prob independently across all bboxes. Also the final
bboxes are returned that will be unchanged if func_changes_bbox is set to
false and if true, the new altered ones will be returned.
Raises:
ValueError if applied to video.
"""
if image.shape.rank == 4:
raise ValueError('Image rank 4 is not supported')
# Will keep track of the new altered bboxes after aug_func is repeatedly
# applied. The -1 values are a dummy value and this first Tensor will be
# removed upon appending the first real bbox.
new_bboxes = tf.constant(_INVALID_BOX)
# If the bboxes are empty, then just give it _INVALID_BOX. The result
# will be thrown away.
bboxes = tf.cond(tf.equal(tf.size(bboxes), 0),
lambda: tf.constant(_INVALID_BOX),
lambda: bboxes)
bboxes = tf.ensure_shape(bboxes, (None, 4))
# pylint:disable=g-long-lambda
wrapped_aug_func = (
lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(
_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args))
# pylint:enable=g-long-lambda
# Setup the while_loop.
num_bboxes = tf.shape(bboxes)[0] # We loop until we go over all bboxes.
idx = tf.constant(0) # Counter for the while loop.
  # Conditional function that decides when to end the loop: we stop once we
  # have gone over all bboxes. images_and_bboxes contains (_image, _new_bboxes).
cond = lambda _idx, _images_and_bboxes: tf.less(_idx, num_bboxes)
# Shuffle the bboxes so that the augmentation order is not deterministic if
# we are not changing the bboxes with aug_func.
if not func_changes_bbox:
loop_bboxes = tf.random.shuffle(bboxes)
else:
loop_bboxes = bboxes
# Main function of while_loop where we repeatedly apply augmentation on the
# bboxes in the image.
# pylint:disable=g-long-lambda
body = lambda _idx, _images_and_bboxes: [
_idx + 1, wrapped_aug_func(_images_and_bboxes[0],
loop_bboxes[_idx],
_images_and_bboxes[1])]
# pylint:enable=g-long-lambda
_, (image, new_bboxes) = tf.while_loop(
cond, body, [idx, (image, new_bboxes)],
shape_invariants=[idx.get_shape(),
(image.get_shape(), tf.TensorShape([None, 4]))])
  # Either return the altered bboxes or the original ones depending on whether
  # we altered them in any way.
if func_changes_bbox:
final_bboxes = new_bboxes
else:
final_bboxes = bboxes
return image, final_bboxes
def _clip_bbox(min_y, min_x, max_y, max_x):
"""Clip bounding box coordinates between 0 and 1.
Args:
min_y: Normalized bbox coordinate of type float between 0 and 1.
min_x: Normalized bbox coordinate of type float between 0 and 1.
max_y: Normalized bbox coordinate of type float between 0 and 1.
max_x: Normalized bbox coordinate of type float between 0 and 1.
Returns:
Clipped coordinate values between 0 and 1.
"""
min_y = tf.clip_by_value(min_y, 0.0, 1.0)
min_x = tf.clip_by_value(min_x, 0.0, 1.0)
max_y = tf.clip_by_value(max_y, 0.0, 1.0)
max_x = tf.clip_by_value(max_x, 0.0, 1.0)
return min_y, min_x, max_y, max_x
def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):
"""Adjusts bbox coordinates to make sure the area is > 0.
Args:
min_y: Normalized bbox coordinate of type float between 0 and 1.
min_x: Normalized bbox coordinate of type float between 0 and 1.
max_y: Normalized bbox coordinate of type float between 0 and 1.
max_x: Normalized bbox coordinate of type float between 0 and 1.
delta: Float, this is used to create a gap of size 2 * delta between
bbox min/max coordinates that are the same on the boundary.
This prevents the bbox from having an area of zero.
Returns:
Tuple of new bbox coordinates between 0 and 1 that will now have a
guaranteed area > 0.
"""
height = max_y - min_y
width = max_x - min_x
def _adjust_bbox_boundaries(min_coord, max_coord):
# Make sure max is never 0 and min is never 1.
max_coord = tf.maximum(max_coord, 0.0 + delta)
min_coord = tf.minimum(min_coord, 1.0 - delta)
return min_coord, max_coord
min_y, max_y = tf.cond(tf.equal(height, 0.0),
lambda: _adjust_bbox_boundaries(min_y, max_y),
lambda: (min_y, max_y))
min_x, max_x = tf.cond(tf.equal(width, 0.0),
lambda: _adjust_bbox_boundaries(min_x, max_x),
lambda: (min_x, max_x))
return min_y, min_x, max_y, max_x
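# Illustrative sketch (hypothetical values): a degenerate box whose min and max
# coordinates coincide on a boundary is given a small non-zero extent.
#
#   _check_bbox_area(1.0, 0.2, 1.0, 0.6)
#   # -> (0.95, 0.2, 1.0, 0.6) with the default delta=0.05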
def _rotate_bbox(bbox, image_height, image_width, degrees):
"""Rotates the bbox coordinated by degrees.
Args:
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
image_height: Int, height of the image.
    image_width: Int, width of the image.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive, the image will be rotated clockwise; otherwise it
      will be rotated counterclockwise.
Returns:
A tensor of the same shape as bbox, but now with the rotated coordinates.
"""
image_height, image_width = (
tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32))
# Convert from degrees to radians.
degrees_to_radians = math.pi / 180.0
radians = degrees * degrees_to_radians
# Translate the bbox to the center of the image and turn the normalized 0-1
# coordinates to absolute pixel locations.
# Y coordinates are made negative as the y axis of images goes down with
# increasing pixel values, so we negate to make sure x axis and y axis points
# are in the traditionally positive direction.
min_y = -tf.cast(image_height * (bbox[0] - 0.5), tf.int32)
min_x = tf.cast(image_width * (bbox[1] - 0.5), tf.int32)
max_y = -tf.cast(image_height * (bbox[2] - 0.5), tf.int32)
max_x = tf.cast(image_width * (bbox[3] - 0.5), tf.int32)
coordinates = tf.stack(
[[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
coordinates = tf.cast(coordinates, tf.float32)
# Rotate the coordinates according to the rotation matrix clockwise if
# radians is positive, else negative
rotation_matrix = tf.stack(
[[tf.cos(radians), tf.sin(radians)],
[-tf.sin(radians), tf.cos(radians)]])
new_coords = tf.cast(
tf.matmul(rotation_matrix, tf.transpose(coordinates)), tf.int32)
# Find min/max values and convert them back to normalized 0-1 floats.
min_y = -(
tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height - 0.5)
min_x = tf.cast(tf.reduce_min(new_coords[1, :]),
tf.float32) / image_width + 0.5
max_y = -(
tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height - 0.5)
max_x = tf.cast(tf.reduce_max(new_coords[1, :]),
tf.float32) / image_width + 0.5
  # Clip the bboxes to be sure they fall between [0, 1].
min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
return tf.stack([min_y, min_x, max_y, max_x])
def rotate_with_bboxes(image, bboxes, degrees, replace):
"""Equivalent of PIL Rotate that rotates the image and bbox.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive, the image will be rotated clockwise; otherwise it
      will be rotated counterclockwise.
replace: A one or three value 1D tensor to fill empty pixels.
Returns:
A tuple containing a 3D uint8 Tensor that will be the result of rotating
image by degrees. The second element of the tuple is bboxes, where now
the coordinates will be shifted to reflect the rotated image.
Raises:
ValueError: If applied to video.
"""
if image.shape.rank == 4:
raise ValueError('Image rank 4 is not supported')
# Rotate the image.
image = wrapped_rotate(image, degrees, replace)
# Convert bbox coordinates to pixel values.
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# pylint:disable=g-long-lambda
wrapped_rotate_bbox = lambda bbox: _rotate_bbox(
bbox, image_height, image_width, degrees)
# pylint:enable=g-long-lambda
bboxes = tf.map_fn(wrapped_rotate_bbox, bboxes)
return image, bboxes
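# Illustrative usage sketch (hypothetical values, not part of the library API):
# rotating an image together with its boxes. Assuming a uint8 image `img` of
# shape [H, W, 3]:
#
#   boxes = tf.constant([[0.1, 0.1, 0.4, 0.4],
#                        [0.5, 0.5, 0.9, 0.9]])
#   rot_img, rot_boxes = rotate_with_bboxes(
#       img, boxes, degrees=30., replace=[128] * 3)
#
# `rot_boxes` are the axis-aligned boxes that enclose the rotated originals.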
def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
"""Shifts the bbox according to how the image was sheared.
Args:
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
image_height: Int, height of the image.
    image_width: Int, width of the image.
level: Float. How much to shear the image.
shear_horizontal: If true then shear in X dimension else shear in
the Y dimension.
Returns:
A tensor of the same shape as bbox, but now with the shifted coordinates.
"""
image_height, image_width = (
tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32))
# Change bbox coordinates to be pixels.
min_y = tf.cast(image_height * bbox[0], tf.int32)
min_x = tf.cast(image_width * bbox[1], tf.int32)
max_y = tf.cast(image_height * bbox[2], tf.int32)
max_x = tf.cast(image_width * bbox[3], tf.int32)
coordinates = tf.stack(
[[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
coordinates = tf.cast(coordinates, tf.float32)
# Shear the coordinates according to the translation matrix.
if shear_horizontal:
translation_matrix = tf.stack(
[[1, 0], [-level, 1]])
else:
translation_matrix = tf.stack(
[[1, -level], [0, 1]])
translation_matrix = tf.cast(translation_matrix, tf.float32)
new_coords = tf.cast(
tf.matmul(translation_matrix, tf.transpose(coordinates)), tf.int32)
# Find min/max values and convert them back to floats.
min_y = tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height
min_x = tf.cast(tf.reduce_min(new_coords[1, :]), tf.float32) / image_width
max_y = tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height
max_x = tf.cast(tf.reduce_max(new_coords[1, :]), tf.float32) / image_width
  # Clip the bboxes to be sure they fall between [0, 1].
min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
return tf.stack([min_y, min_x, max_y, max_x])
def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
"""Applies Shear Transformation to the image and shifts the bboxes.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float with values
between [0, 1].
level: Float. How much to shear the image. This value will be between
-0.3 to 0.3.
replace: A one or three value 1D tensor to fill empty pixels.
shear_horizontal: Boolean. If true then shear in X dimension else shear in
the Y dimension.
Returns:
A tuple containing a 3D uint8 Tensor that will be the result of shearing
image by level. The second element of the tuple is bboxes, where now
the coordinates will be shifted to reflect the sheared image.
Raises:
ValueError: If applied to video.
"""
if image.shape.rank == 4:
raise ValueError('Image rank 4 is not supported')
if shear_horizontal:
image = shear_x(image, level, replace)
else:
image = shear_y(image, level, replace)
# Convert bbox coordinates to pixel values.
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# pylint:disable=g-long-lambda
wrapped_shear_bbox = lambda bbox: _shear_bbox(
bbox, image_height, image_width, level, shear_horizontal)
# pylint:enable=g-long-lambda
bboxes = tf.map_fn(wrapped_shear_bbox, bboxes)
return image, bboxes
def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
"""Shifts the bbox coordinates by pixels.
Args:
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
image_height: Int, height of the image.
image_width: Int, width of the image.
pixels: An int. How many pixels to shift the bbox.
shift_horizontal: Boolean. If true then shift in X dimension else shift in
Y dimension.
Returns:
A tensor of the same shape as bbox, but now with the shifted coordinates.
"""
pixels = tf.cast(pixels, tf.int32)
# Convert bbox to integer pixel locations.
min_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[0], tf.int32)
min_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[1], tf.int32)
max_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[2], tf.int32)
max_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[3], tf.int32)
if shift_horizontal:
min_x = tf.maximum(0, min_x - pixels)
max_x = tf.minimum(image_width, max_x - pixels)
else:
min_y = tf.maximum(0, min_y - pixels)
max_y = tf.minimum(image_height, max_y - pixels)
# Convert bbox back to floats.
min_y = tf.cast(min_y, tf.float32) / tf.cast(image_height, tf.float32)
min_x = tf.cast(min_x, tf.float32) / tf.cast(image_width, tf.float32)
max_y = tf.cast(max_y, tf.float32) / tf.cast(image_height, tf.float32)
max_x = tf.cast(max_x, tf.float32) / tf.cast(image_width, tf.float32)
  # Clip the bboxes to be sure they fall between [0, 1].
min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
return tf.stack([min_y, min_x, max_y, max_x])
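# Illustrative sketch (hypothetical values): a horizontal shift of 20 pixels on
# a 200-pixel-wide image moves the normalized x-coordinates by 0.1 (clamped to
# the image bounds), so
#
#   _shift_bbox([0.2, 0.3, 0.6, 0.7], image_height=100, image_width=200,
#               pixels=20, shift_horizontal=True)
#   # -> approximately [0.2, 0.2, 0.6, 0.6]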
def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
"""Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float with values
between [0, 1].
pixels: An int. How many pixels to shift the image and bboxes
replace: A one or three value 1D tensor to fill empty pixels.
shift_horizontal: Boolean. If true then shift in X dimension else shift in
Y dimension.
Returns:
A tuple containing a 3D uint8 Tensor that will be the result of translating
image by pixels. The second element of the tuple is bboxes, where now
the coordinates will be shifted to reflect the shifted image.
Raises:
ValueError if applied to video.
"""
if image.shape.rank == 4:
raise ValueError('Image rank 4 is not supported')
if shift_horizontal:
image = translate_x(image, pixels, replace)
else:
image = translate_y(image, pixels, replace)
# Convert bbox coordinates to pixel values.
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# pylint:disable=g-long-lambda
wrapped_shift_bbox = lambda bbox: _shift_bbox(
bbox, image_height, image_width, pixels, shift_horizontal)
# pylint:enable=g-long-lambda
bboxes = tf.map_fn(wrapped_shift_bbox, bboxes)
return image, bboxes
def translate_y_only_bboxes(
image: tf.Tensor, bboxes: tf.Tensor, prob: float, pixels: int, replace):
"""Apply translate_y to each bbox in the image with probability prob."""
if bboxes.shape.rank == 4:
raise ValueError('translate_y_only_bboxes does not support rank 4 boxes')
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace)
def _randomly_negate_tensor(tensor):
"""With 50% prob turn the tensor negative."""
should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool)
final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
return final_tensor
def _rotate_level_to_arg(level: float):
level = (level / _MAX_LEVEL) * 30.
level = _randomly_negate_tensor(level)
return (level,)
def _shrink_level_to_arg(level: float):
"""Converts level to ratio by which we shrink the image content."""
if level == 0:
return (1.0,) # if level is zero, do not shrink the image
# Maximum shrinking ratio is 2.9.
level = 2. / (_MAX_LEVEL / level) + 0.9
return (level,)
def _enhance_level_to_arg(level: float):
return ((level / _MAX_LEVEL) * 1.8 + 0.1,)
def _shear_level_to_arg(level: float):
level = (level / _MAX_LEVEL) * 0.3
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _translate_level_to_arg(level: float, translate_const: float):
level = (level / _MAX_LEVEL) * float(translate_const)
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _mult_to_arg(level: float, multiplier: float = 1.):
return (int((level / _MAX_LEVEL) * multiplier),)
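# Illustrative sketch of the level-to-argument conversions (hypothetical
# values), assuming the module-level _MAX_LEVEL of 10:
#
#   _rotate_level_to_arg(10.)    # -> (30.,) or (-30.,), sign chosen randomly
#   _shear_level_to_arg(5.)      # -> (0.15,) or (-0.15,)
#   _mult_to_arg(10., 4)         # -> (4,), e.g. the Posterize bit count
#   _enhance_level_to_arg(5.)    # -> (1.0,), an enhancement factor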
def _apply_func_with_prob(func: Any, image: tf.Tensor,
bboxes: Optional[tf.Tensor], args: Any, prob: float):
"""Apply `func` to image w/ `args` as input with probability `prob`."""
assert isinstance(args, tuple)
assert inspect.getfullargspec(func)[0][1] == 'bboxes'
# Apply the function with probability `prob`.
should_apply_op = tf.cast(
tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
augmented_image, augmented_bboxes = tf.cond(
should_apply_op,
lambda: func(image, bboxes, *args),
lambda: (image, bboxes))
return augmented_image, augmented_bboxes
def select_and_apply_random_policy(policies: Any,
image: tf.Tensor,
bboxes: Optional[tf.Tensor] = None):
"""Select a random policy from `policies` and apply it to `image`."""
policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32)
# Note that using tf.case instead of tf.conds would result in significantly
# larger graphs and would even break export for some larger policies.
for (i, policy) in enumerate(policies):
image, bboxes = tf.cond(
tf.equal(i, policy_to_select),
lambda selected_policy=policy: selected_policy(image, bboxes),
lambda: (image, bboxes))
return image, bboxes
NAME_TO_FUNC = {
'AutoContrast': autocontrast,
'Equalize': equalize,
'Invert': invert,
'Rotate': wrapped_rotate,
'Posterize': posterize,
'Solarize': solarize,
'SolarizeAdd': solarize_add,
'Color': color,
'Contrast': contrast,
'Brightness': brightness,
'Sharpness': sharpness,
'ShearX': shear_x,
'ShearY': shear_y,
'TranslateX': translate_x,
'TranslateY': translate_y,
'Cutout': cutout,
'Rotate_BBox': rotate_with_bboxes,
# pylint:disable=g-long-lambda
'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
image, bboxes, level, replace, shear_horizontal=True),
'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
image, bboxes, level, replace, shear_horizontal=False),
'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
image, bboxes, pixels, replace, shift_horizontal=True),
'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
image, bboxes, pixels, replace, shift_horizontal=False),
# pylint:enable=g-long-lambda
'TranslateY_Only_BBoxes': translate_y_only_bboxes,
}
# Functions that require a `bboxes` parameter.
REQUIRE_BOXES_FUNCS = frozenset({
'Rotate_BBox',
'ShearX_BBox',
'ShearY_BBox',
'TranslateX_BBox',
'TranslateY_BBox',
'TranslateY_Only_BBoxes',
})
# Functions that have a 'prob' parameter
PROB_FUNCS = frozenset({
'TranslateY_Only_BBoxes',
})
# Functions that have a 'replace' parameter
REPLACE_FUNCS = frozenset({
'Rotate',
'TranslateX',
'ShearX',
'ShearY',
'TranslateY',
'Cutout',
'Rotate_BBox',
'ShearX_BBox',
'ShearY_BBox',
'TranslateX_BBox',
'TranslateY_BBox',
'TranslateY_Only_BBoxes',
})
def level_to_arg(cutout_const: float, translate_const: float):
"""Creates a dict mapping image operation names to their arguments."""
no_arg = lambda level: ()
posterize_arg = lambda level: _mult_to_arg(level, 4)
solarize_arg = lambda level: _mult_to_arg(level, 256)
solarize_add_arg = lambda level: _mult_to_arg(level, 110)
cutout_arg = lambda level: _mult_to_arg(level, cutout_const)
translate_arg = lambda level: _translate_level_to_arg(level, translate_const)
translate_bbox_arg = lambda level: _translate_level_to_arg(level, 120)
args = {
'AutoContrast': no_arg,
'Equalize': no_arg,
'Invert': no_arg,
'Rotate': _rotate_level_to_arg,
'Posterize': posterize_arg,
'Solarize': solarize_arg,
'SolarizeAdd': solarize_add_arg,
'Color': _enhance_level_to_arg,
'Contrast': _enhance_level_to_arg,
'Brightness': _enhance_level_to_arg,
'Sharpness': _enhance_level_to_arg,
'ShearX': _shear_level_to_arg,
'ShearY': _shear_level_to_arg,
'Cutout': cutout_arg,
'TranslateX': translate_arg,
'TranslateY': translate_arg,
'Rotate_BBox': _rotate_level_to_arg,
'ShearX_BBox': _shear_level_to_arg,
'ShearY_BBox': _shear_level_to_arg,
# pylint:disable=g-long-lambda
'TranslateX_BBox': lambda level: _translate_level_to_arg(
level, translate_const),
'TranslateY_BBox': lambda level: _translate_level_to_arg(
level, translate_const),
# pylint:enable=g-long-lambda
'TranslateY_Only_BBoxes': translate_bbox_arg,
}
return args
def bbox_wrapper(func):
"""Adds a bboxes function argument to func and returns unchanged bboxes."""
def wrapper(images, bboxes, *args, **kwargs):
return (func(images, *args, **kwargs), bboxes)
return wrapper
def _parse_policy_info(name: Text,
prob: float,
level: float,
replace_value: List[int],
cutout_const: float,
translate_const: float,
level_std: float = 0.) -> Tuple[Any, float, Any]:
"""Return the function that corresponds to `name` and update `level` param."""
func = NAME_TO_FUNC[name]
if level_std > 0:
    level += tf.random.normal([], stddev=level_std, dtype=tf.float32)
level = tf.clip_by_value(level, 0., _MAX_LEVEL)
args = level_to_arg(cutout_const, translate_const)[name](level)
if name in PROB_FUNCS:
# Add in the prob arg if it is required for the function that is called.
args = tuple([prob] + list(args))
if name in REPLACE_FUNCS:
# Add in replace arg if it is required for the function that is called.
args = tuple(list(args) + [replace_value])
# Add bboxes as the second positional argument for the function if it does
# not already exist.
if 'bboxes' not in inspect.getfullargspec(func)[0]:
func = bbox_wrapper(func)
return func, prob, args
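# Illustrative sketch (hypothetical values, not part of the library API):
# resolving a policy entry into a callable and its arguments.
#
#   func, prob, args = _parse_policy_info(
#       'Rotate', prob=0.8, level=9., replace_value=[128] * 3,
#       cutout_const=100., translate_const=250.)
#
# `func` wraps `wrapped_rotate` so it also accepts (and returns) bboxes, and
# `args` is roughly (27., [128, 128, 128]) since level 9 maps to 27 degrees
# (randomly negated) and 'Rotate' takes a `replace` argument.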
class ImageAugment(object):
"""Image augmentation class for applying image distortions."""
def distort(
self,
image: tf.Tensor
) -> tf.Tensor:
"""Given an image tensor, returns a distorted image with the same shape.
Args:
image: `Tensor` of shape [height, width, 3] or
[num_frames, height, width, 3] representing an image or image sequence.
Returns:
The augmented version of `image`.
"""
raise NotImplementedError()
def distort_with_boxes(
self,
image: tf.Tensor,
bboxes: tf.Tensor
) -> Tuple[tf.Tensor, tf.Tensor]:
"""Distorts the image and bounding boxes.
Args:
image: `Tensor` of shape [height, width, 3] or
[num_frames, height, width, 3] representing an image or image sequence.
bboxes: `Tensor` of shape [num_boxes, 4] or [num_frames, num_boxes, 4]
representing bounding boxes for an image or image sequence.
Returns:
The augmented version of `image` and `bboxes`.
"""
raise NotImplementedError
class AutoAugment(ImageAugment):
"""Applies the AutoAugment policy to images.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
"""
def __init__(self,
augmentation_name: Text = 'v0',
policies: Optional[Iterable[Iterable[Tuple[Text, float,
float]]]] = None,
cutout_const: float = 100,
translate_const: float = 250):
"""Applies the AutoAugment policy to images.
Args:
      augmentation_name: The name of the AutoAugment policy to use. The
        available options are `v0`, `test`, `simple`, `reduced_cifar10`,
        `svhn`, `reduced_imagenet` and `detection_v0`. `v0` is the policy used
        for all of the results in the paper and was found to achieve the best
        results on the COCO dataset. `v1`, `v2` and `v3` are additional good
        policies found on the COCO dataset that differ slightly in which
        operations were used during the search procedure and in how many
        operations are applied in parallel to a single image (2 vs 3). Make
        sure to set `policies` to `None` (the default) if you want to select a
        policy via `augmentation_name`.
policies: list of lists of tuples in the form `(func, prob, level)`,
`func` is a string name of the augmentation function, `prob` is the
probability of applying the `func` operation, `level` (or magnitude) is
the input argument for `func`. For example:
```
[[('Equalize', 0.9, 3), ('Color', 0.7, 8)],
[('Invert', 0.6, 5), ('Rotate', 0.2, 9), ('ShearX', 0.1, 2)], ...]
```
        The nested structure must be three levels deep (shape `(:, :, 3)`).
        The number of operations in a sub-policy can vary from one sub-policy
        to another. If you provide `policies` as input, any option set with
        `augmentation_name` will be overridden, as the two are mutually
        exclusive.
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
Raises:
ValueError if `augmentation_name` is unsupported.
"""
super(AutoAugment, self).__init__()
self.augmentation_name = augmentation_name
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
self.available_policies = {
'detection_v0': self.detection_policy_v0(),
'v0': self.policy_v0(),
'test': self.policy_test(),
'simple': self.policy_simple(),
'reduced_cifar10': self.policy_reduced_cifar10(),
'svhn': self.policy_svhn(),
'reduced_imagenet': self.policy_reduced_imagenet(),
}
if not policies:
if augmentation_name not in self.available_policies:
raise ValueError(
'Invalid augmentation_name: {}'.format(augmentation_name))
self.policies = self.available_policies[augmentation_name]
else:
self._check_policy_shape(policies)
self.policies = policies
def _check_policy_shape(self, policies):
"""Checks dimension and shape of the custom policy.
Args:
policies: List of list of tuples in the form `(func, prob, level)`. Must
have shape of `(:, :, 3)`.
Raises:
ValueError if the shape of `policies` is unexpected.
"""
in_shape = np.array(policies).shape
if len(in_shape) != 3 or in_shape[-1:] != (3,):
raise ValueError('Wrong shape detected for custom policy. Expected '
'(:, :, 3) but got {}.'.format(in_shape))
def _make_tf_policies(self):
"""Prepares the TF functions for augmentations based on the policies."""
replace_value = [128] * 3
# func is the string name of the augmentation function, prob is the
# probability of applying the operation and level is the parameter
# associated with the tf op.
# tf_policies are functions that take in an image and return an augmented
# image.
tf_policies = []
for policy in self.policies:
tf_policy = []
assert_ranges = []
# Link string name to the correct python function and make sure the
# correct argument is passed into that function.
for policy_info in policy:
_, prob, level = policy_info
assert_ranges.append(tf.Assert(tf.less_equal(prob, 1.), [prob]))
assert_ranges.append(
tf.Assert(tf.less_equal(level, int(_MAX_LEVEL)), [level]))
policy_info = list(policy_info) + [
replace_value, self.cutout_const, self.translate_const
]
tf_policy.append(_parse_policy_info(*policy_info))
      # Now build the tf policy that will apply the augmentation procedure
      # on image.
def make_final_policy(tf_policy_):
def final_policy(image_, bboxes_):
for func, prob, args in tf_policy_:
image_, bboxes_ = _apply_func_with_prob(func, image_, bboxes_, args,
prob)
return image_, bboxes_
return final_policy
with tf.control_dependencies(assert_ranges):
tf_policies.append(make_final_policy(tf_policy))
return tf_policies
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""See base class."""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
tf_policies = self._make_tf_policies()
image, _ = select_and_apply_random_policy(tf_policies, image, bboxes=None)
return image
def distort_with_boxes(self, image: tf.Tensor,
bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""See base class."""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
tf_policies = self._make_tf_policies()
image, bboxes = select_and_apply_random_policy(tf_policies, image, bboxes)
return image, bboxes
@staticmethod
def detection_policy_v0():
"""Autoaugment policy that was used in AutoAugment Paper for Detection.
https://arxiv.org/pdf/1906.11172
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
[('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
[('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
[('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
[('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
]
return policy
@staticmethod
def policy_v0():
"""Autoaugment policy that was used in AutoAugment Paper.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Color', 0.4, 1), ('Rotate', 0.6, 8)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
[('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
[('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
[('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
[('ShearY', 0.8, 0), ('Color', 0.6, 4)],
[('Color', 1.0, 0), ('Rotate', 0.6, 2)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
[('Color', 0.8, 6), ('Rotate', 0.4, 5)],
]
return policy
@staticmethod
def policy_reduced_cifar10():
"""Autoaugment policy for reduced CIFAR-10 dataset.
Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('Invert', 0.1, 7), ('Contrast', 0.2, 6)],
[('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)],
[('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)],
[('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)],
[('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)],
[('ShearY', 0.2, 7), ('Posterize', 0.3, 7)],
[('Color', 0.4, 3), ('Brightness', 0.6, 7)],
[('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)],
[('Equalize', 0.6, 5), ('Equalize', 0.5, 1)],
[('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)],
[('Color', 0.7, 7), ('TranslateX', 0.5, 8)],
[('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)],
[('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)],
[('Brightness', 0.9, 6), ('Color', 0.2, 8)],
[('Solarize', 0.5, 2), ('Invert', 0.0, 3)],
[('Equalize', 0.2, 0), ('AutoContrast', 0.6, 0)],
[('Equalize', 0.2, 8), ('Equalize', 0.6, 4)],
[('Color', 0.9, 9), ('Equalize', 0.6, 6)],
[('AutoContrast', 0.8, 4), ('Solarize', 0.2, 8)],
[('Brightness', 0.1, 3), ('Color', 0.7, 0)],
[('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)],
[('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)],
[('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)],
[('Equalize', 0.8, 8), ('Invert', 0.1, 3)],
[('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)],
]
return policy
@staticmethod
def policy_svhn():
"""Autoaugment policy for SVHN dataset.
Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('ShearX', 0.9, 4), ('Invert', 0.2, 3)],
[('ShearY', 0.9, 8), ('Invert', 0.7, 5)],
[('Equalize', 0.6, 5), ('Solarize', 0.6, 6)],
[('Invert', 0.9, 3), ('Equalize', 0.6, 3)],
[('Equalize', 0.6, 1), ('Rotate', 0.9, 3)],
[('ShearX', 0.9, 4), ('AutoContrast', 0.8, 3)],
[('ShearY', 0.9, 8), ('Invert', 0.4, 5)],
[('ShearY', 0.9, 5), ('Solarize', 0.2, 6)],
[('Invert', 0.9, 6), ('AutoContrast', 0.8, 1)],
[('Equalize', 0.6, 3), ('Rotate', 0.9, 3)],
[('ShearX', 0.9, 4), ('Solarize', 0.3, 3)],
[('ShearY', 0.8, 8), ('Invert', 0.7, 4)],
[('Equalize', 0.9, 5), ('TranslateY', 0.6, 6)],
[('Invert', 0.9, 4), ('Equalize', 0.6, 7)],
[('Contrast', 0.3, 3), ('Rotate', 0.8, 4)],
[('Invert', 0.8, 5), ('TranslateY', 0.0, 2)],
[('ShearY', 0.7, 6), ('Solarize', 0.4, 8)],
[('Invert', 0.6, 4), ('Rotate', 0.8, 4)],
[('ShearY', 0.3, 7), ('TranslateX', 0.9, 3)],
[('ShearX', 0.1, 6), ('Invert', 0.6, 5)],
[('Solarize', 0.7, 2), ('TranslateY', 0.6, 7)],
[('ShearY', 0.8, 4), ('Invert', 0.8, 8)],
[('ShearX', 0.7, 9), ('TranslateY', 0.8, 3)],
[('ShearY', 0.8, 5), ('AutoContrast', 0.7, 3)],
[('ShearX', 0.7, 2), ('Invert', 0.1, 5)],
]
return policy
@staticmethod
def policy_reduced_imagenet():
"""Autoaugment policy for reduced ImageNet dataset.
Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('Posterize', 0.4, 8), ('Rotate', 0.6, 9)],
[('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
[('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
[('Posterize', 0.6, 7), ('Posterize', 0.6, 6)],
[('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
[('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
[('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
[('Posterize', 0.8, 5), ('Equalize', 1.0, 2)],
[('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
[('Equalize', 0.6, 8), ('Posterize', 0.4, 6)],
[('Rotate', 0.8, 8), ('Color', 0.4, 0)],
[('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
[('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
[('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
[('Color', 0.6, 4), ('Contrast', 1.0, 8)],
[('Rotate', 0.8, 8), ('Color', 1.0, 2)],
[('Color', 0.8, 8), ('Solarize', 0.8, 7)],
[('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
[('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
[('Color', 0.4, 0), ('Equalize', 0.6, 3)],
[('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
[('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
[('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
[('Color', 0.6, 4), ('Contrast', 1.0, 8)],
[('Equalize', 0.8, 8), ('Equalize', 0.6, 3)]
]
return policy
@staticmethod
def policy_simple():
"""Same as `policy_v0`, except with custom ops removed."""
policy = [
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
]
return policy
@staticmethod
def policy_test():
"""Autoaugment test policy for debugging."""
policy = [
[('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
]
return policy
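# Illustrative usage sketch (not part of the library API): applying AutoAugment
# to a single image, and to an image together with its boxes.
#
#   augmenter = AutoAugment(augmentation_name='v0')
#   image = tf.zeros([224, 224, 3], dtype=tf.uint8)
#   aug_image = augmenter.distort(image)
#
#   boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]], dtype=tf.float32)
#   aug_image, aug_boxes = augmenter.distort_with_boxes(image, boxes)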
def _maybe_identity(x: Optional[tf.Tensor]) -> Optional[tf.Tensor]:
return tf.identity(x) if x is not None else None
class RandAugment(ImageAugment):
"""Applies the RandAugment policy to images.
  RandAugment is from the paper https://arxiv.org/abs/1909.13719.
"""
def __init__(self,
num_layers: int = 2,
magnitude: float = 10.,
cutout_const: float = 40.,
translate_const: float = 100.,
magnitude_std: float = 0.0,
prob_to_apply: Optional[float] = None,
exclude_ops: Optional[List[str]] = None):
"""Applies the RandAugment policy to images.
Args:
num_layers: Integer, the number of augmentation transformations to apply
sequentially to an image. Represented as (N) in the paper. Usually best
values will be in the range [1, 3].
      magnitude: Float, shared magnitude across all augmentation operations.
Represented as (M) in the paper. Usually best values are in the range
[5, 10].
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
magnitude_std: randomness of the severity as proposed by the authors of
the timm library.
prob_to_apply: The probability to apply the selected augmentation at each
layer.
      exclude_ops: Names of augmentation operations to exclude.
"""
super(RandAugment, self).__init__()
self.num_layers = num_layers
self.magnitude = float(magnitude)
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
self.prob_to_apply = (
float(prob_to_apply) if prob_to_apply is not None else None)
self.available_ops = [
'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize',
'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY',
'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
]
self.magnitude_std = magnitude_std
if exclude_ops:
self.available_ops = [
op for op in self.available_ops if op not in exclude_ops
]
@classmethod
def build_for_detection(cls,
num_layers: int = 2,
magnitude: float = 10.,
cutout_const: float = 40.,
translate_const: float = 100.,
magnitude_std: float = 0.0,
prob_to_apply: Optional[float] = None,
exclude_ops: Optional[List[str]] = None):
"""Builds a RandAugment that modifies bboxes for geometric transforms."""
augmenter = cls(
num_layers=num_layers,
magnitude=magnitude,
cutout_const=cutout_const,
translate_const=translate_const,
magnitude_std=magnitude_std,
prob_to_apply=prob_to_apply,
exclude_ops=exclude_ops)
box_aware_ops_by_base_name = {
'Rotate': 'Rotate_BBox',
'ShearX': 'ShearX_BBox',
'ShearY': 'ShearY_BBox',
'TranslateX': 'TranslateX_BBox',
'TranslateY': 'TranslateY_BBox',
}
augmenter.available_ops = [
box_aware_ops_by_base_name.get(op_name) or op_name
for op_name in augmenter.available_ops
]
return augmenter
def _distort_common(
self,
image: tf.Tensor,
bboxes: Optional[tf.Tensor] = None
) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
"""Distorts the image and optionally bounding boxes."""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
min_prob, max_prob = 0.2, 0.8
aug_image = image
aug_bboxes = bboxes
for _ in range(self.num_layers):
op_to_select = tf.random.uniform([],
maxval=len(self.available_ops) + 1,
dtype=tf.int32)
branch_fns = []
for (i, op_name) in enumerate(self.available_ops):
prob = tf.random.uniform([],
minval=min_prob,
maxval=max_prob,
dtype=tf.float32)
func, _, args = _parse_policy_info(op_name, prob, self.magnitude,
replace_value, self.cutout_const,
self.translate_const,
self.magnitude_std)
branch_fns.append((
i,
# pylint:disable=g-long-lambda
lambda selected_func=func, selected_args=args: selected_func(
image, bboxes, *selected_args)))
# pylint:enable=g-long-lambda
aug_image, aug_bboxes = tf.switch_case(
branch_index=op_to_select,
branch_fns=branch_fns,
default=lambda: (tf.identity(image), _maybe_identity(bboxes)))
if self.prob_to_apply is not None:
aug_image, aug_bboxes = tf.cond(
tf.random.uniform(shape=[], dtype=tf.float32) < self.prob_to_apply,
lambda: (tf.identity(aug_image), _maybe_identity(aug_bboxes)),
lambda: (tf.identity(image), _maybe_identity(bboxes)))
image = aug_image
bboxes = aug_bboxes
image = tf.cast(image, dtype=input_image_type)
return image, bboxes
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""See base class."""
image, _ = self._distort_common(image)
return image
def distort_with_boxes(self, image: tf.Tensor,
bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""See base class."""
image, bboxes = self._distort_common(image, bboxes)
return image, bboxes
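# Illustrative usage sketch (not part of the library API): RandAugment with the
# default two layers and magnitude 10, and the detection variant that swaps in
# box-aware geometric ops.
#
#   augmenter = RandAugment(num_layers=2, magnitude=10.)
#   aug_image = augmenter.distort(tf.zeros([224, 224, 3], dtype=tf.uint8))
#
#   det_augmenter = RandAugment.build_for_detection()
#   boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]], dtype=tf.float32)
#   aug_image, aug_boxes = det_augmenter.distort_with_boxes(
#       tf.zeros([224, 224, 3], dtype=tf.uint8), boxes)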
class RandomErasing(ImageAugment):
"""Applies RandomErasing to a single image.
Reference: https://arxiv.org/abs/1708.04896
  Implementation is inspired by https://github.com/rwightman/pytorch-image-models
"""
def __init__(self,
probability: float = 0.25,
min_area: float = 0.02,
max_area: float = 1 / 3,
min_aspect: float = 0.3,
max_aspect=None,
min_count=1,
max_count=1,
trials=10):
"""Applies RandomErasing to a single image.
Args:
probability (float, optional): Probability of augmenting the image.
Defaults to 0.25.
min_area (float, optional): Minimum area of the random erasing rectangle.
Defaults to 0.02.
max_area (float, optional): Maximum area of the random erasing rectangle.
Defaults to 1/3.
      min_aspect (float, optional): Minimum aspect ratio of the random erasing
        rectangle. Defaults to 0.3.
      max_aspect (float, optional): Maximum aspect ratio of the random erasing
        rectangle. Defaults to None.
min_count (int, optional): Minimum number of erased rectangles. Defaults
to 1.
max_count (int, optional): Maximum number of erased rectangles. Defaults
to 1.
      trials (int, optional): Maximum number of trials to randomly sample a
        rectangle that fulfills the constraints. Defaults to 10.
"""
self._probability = probability
self._min_area = float(min_area)
self._max_area = float(max_area)
self._min_log_aspect = math.log(min_aspect)
self._max_log_aspect = math.log(max_aspect or 1 / min_aspect)
self._min_count = min_count
self._max_count = max_count
self._trials = trials
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies RandomErasing to single `image`.
Args:
image (tf.Tensor): Of shape [height, width, 3] representing an image.
Returns:
tf.Tensor: The augmented version of `image`.
"""
uniform_random = tf.random.uniform(shape=[], minval=0., maxval=1.0)
mirror_cond = tf.less(uniform_random, self._probability)
image = tf.cond(mirror_cond, lambda: self._erase(image), lambda: image)
return image
@tf.function
def _erase(self, image: tf.Tensor) -> tf.Tensor:
"""Erase an area."""
if self._min_count == self._max_count:
count = self._min_count
else:
      # Sample the number of rectangles uniformly in [min_count, max_count].
      count = tf.random.uniform(
          shape=[],
          minval=int(self._min_count),
          maxval=int(self._max_count + 1),
          dtype=tf.int32)
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
area = tf.cast(image_width * image_height, tf.float32)
for _ in range(count):
      # Workaround: `break` is not supported inside tf.function, so instead we
      # skip the remaining trials once a rectangle has been erased.
      is_trial_successful = False
      for _ in range(self._trials):
        if not is_trial_successful:
erase_area = tf.random.uniform(
shape=[],
minval=area * self._min_area,
maxval=area * self._max_area)
aspect_ratio = tf.math.exp(
tf.random.uniform(
shape=[],
minval=self._min_log_aspect,
maxval=self._max_log_aspect))
half_height = tf.cast(
tf.math.round(tf.math.sqrt(erase_area * aspect_ratio) / 2),
dtype=tf.int32)
half_width = tf.cast(
tf.math.round(tf.math.sqrt(erase_area / aspect_ratio) / 2),
dtype=tf.int32)
if 2 * half_height < image_height and 2 * half_width < image_width:
center_height = tf.random.uniform(
shape=[],
minval=0,
maxval=int(image_height - 2 * half_height),
dtype=tf.int32)
center_width = tf.random.uniform(
shape=[],
minval=0,
maxval=int(image_width - 2 * half_width),
dtype=tf.int32)
image = _fill_rectangle(
image,
center_width,
center_height,
half_width,
half_height,
replace=None)
            is_trial_successful = True
return image
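# Illustrative usage sketch (not part of the library API): RandomErasing blanks
# out a randomly sized rectangle with probability `probability`.
#
#   eraser = RandomErasing(probability=0.25, min_area=0.02, max_area=1 / 3)
#   aug_image = eraser.distort(tf.zeros([224, 224, 3], dtype=tf.float32))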
class MixupAndCutmix:
"""Applies Mixup and/or Cutmix to a batch of images.
- Mixup: https://arxiv.org/abs/1710.09412
- Cutmix: https://arxiv.org/abs/1905.04899
  Implementation is inspired by https://github.com/rwightman/pytorch-image-models
"""
def __init__(self,
mixup_alpha: float = .8,
cutmix_alpha: float = 1.,
prob: float = 1.0,
switch_prob: float = 0.5,
label_smoothing: float = 0.1,
num_classes: int = 1001):
"""Applies Mixup and/or Cutmix to a batch of images.
Args:
mixup_alpha (float, optional): For drawing a random lambda (`lam`) from a
beta distribution (for each image). If zero Mixup is deactivated.
Defaults to .8.
cutmix_alpha (float, optional): For drawing a random lambda (`lam`) from a
beta distribution (for each image). If zero Cutmix is deactivated.
        Defaults to 1.0.
      prob (float, optional): Probability of augmenting the batch. Defaults
        to 1.0.
switch_prob (float, optional): Probability of applying Cutmix for the
batch. Defaults to 0.5.
label_smoothing (float, optional): Constant for label smoothing. Defaults
to 0.1.
num_classes (int, optional): Number of classes. Defaults to 1001.
"""
self.mixup_alpha = mixup_alpha
self.cutmix_alpha = cutmix_alpha
self.mix_prob = prob
self.switch_prob = switch_prob
self.label_smoothing = label_smoothing
self.num_classes = num_classes
self.mode = 'batch'
self.mixup_enabled = True
if self.mixup_alpha and not self.cutmix_alpha:
self.switch_prob = -1
elif not self.mixup_alpha and self.cutmix_alpha:
self.switch_prob = 1
def __call__(self, images: tf.Tensor,
labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
return self.distort(images, labels)
def distort(self, images: tf.Tensor,
labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""Applies Mixup and/or Cutmix to batch of images and transforms labels.
Args:
      images (tf.Tensor): Of shape [batch_size, height, width, 3] representing
        a batch of images.
labels (tf.Tensor): Of shape [batch_size, ] representing the class id for
each image of the batch.
Returns:
Tuple[tf.Tensor, tf.Tensor]: The augmented version of `image` and
`labels`.
"""
augment_cond = tf.less(
tf.random.uniform(shape=[], minval=0., maxval=1.0), self.mix_prob)
# pylint: disable=g-long-lambda
augment_a = lambda: self._update_labels(*tf.cond(
tf.less(
tf.random.uniform(shape=[], minval=0., maxval=1.0), self.switch_prob
), lambda: self._cutmix(images, labels), lambda: self._mixup(
images, labels)))
augment_b = lambda: (images, self._smooth_labels(labels))
# pylint: enable=g-long-lambda
return tf.cond(augment_cond, augment_a, augment_b)
@staticmethod
def _sample_from_beta(alpha, beta, shape):
sample_alpha = tf.random.gamma(shape, 1., beta=alpha)
sample_beta = tf.random.gamma(shape, 1., beta=beta)
return sample_alpha / (sample_alpha + sample_beta)
def _cutmix(self, images: tf.Tensor,
labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
"""Apply cutmix."""
lam = MixupAndCutmix._sample_from_beta(self.cutmix_alpha, self.cutmix_alpha,
labels.shape)
ratio = tf.math.sqrt(1 - lam)
batch_size = tf.shape(images)[0]
image_height, image_width = tf.shape(images)[1], tf.shape(images)[2]
cut_height = tf.cast(
ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32)
    cut_width = tf.cast(
        ratio * tf.cast(image_width, dtype=tf.float32), dtype=tf.int32)
random_center_height = tf.random.uniform(
shape=[batch_size], minval=0, maxval=image_height, dtype=tf.int32)
random_center_width = tf.random.uniform(
shape=[batch_size], minval=0, maxval=image_width, dtype=tf.int32)
bbox_area = cut_height * cut_width
lam = 1. - bbox_area / (image_height * image_width)
lam = tf.cast(lam, dtype=tf.float32)
images = tf.map_fn(
lambda x: _fill_rectangle(*x),
(images, random_center_width, random_center_height, cut_width // 2,
cut_height // 2, tf.reverse(images, [0])),
dtype=(tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.float32),
fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=tf.float32))
return images, labels, lam
def _mixup(self, images: tf.Tensor,
labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha,
labels.shape)
lam = tf.reshape(lam, [-1, 1, 1, 1])
images = lam * images + (1. - lam) * tf.reverse(images, [0])
return images, labels, tf.squeeze(lam)
def _smooth_labels(self, labels: tf.Tensor) -> tf.Tensor:
off_value = self.label_smoothing / self.num_classes
on_value = 1. - self.label_smoothing + off_value
smooth_labels = tf.one_hot(
labels, self.num_classes, on_value=on_value, off_value=off_value)
return smooth_labels
def _update_labels(self, images: tf.Tensor, labels: tf.Tensor,
lam: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
labels_1 = self._smooth_labels(labels)
labels_2 = tf.reverse(labels_1, [0])
lam = tf.reshape(lam, [-1, 1])
labels = lam * labels_1 + (1. - lam) * labels_2
return images, labels
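# Illustrative usage sketch (hypothetical shapes, not part of the library API):
# MixupAndCutmix expects a batch of images and sparse integer labels, and
# returns mixed images together with smoothed, mixed one-hot labels.
#
#   mixer = MixupAndCutmix(mixup_alpha=.8, cutmix_alpha=1., num_classes=1001)
#   images = tf.random.uniform([8, 224, 224, 3], maxval=255., dtype=tf.float32)
#   labels = tf.random.uniform([8], maxval=1001, dtype=tf.int32)
#   mixed_images, mixed_labels = mixer.distort(images, labels)
#   # mixed_images: [8, 224, 224, 3], mixed_labels: [8, 1001]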
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for autoaugment."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
from absl.testing import parameterized
import tensorflow as tf
from official.vision.ops import augment
def get_dtype_test_cases():
return [
('uint8', tf.uint8),
('int32', tf.int32),
('float16', tf.float16),
('float32', tf.float32),
]
@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
"""Basic tests for fundamental transformations."""
def test_to_from_4d(self, dtype):
for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
original_ndims = len(shape)
image = tf.zeros(shape, dtype=dtype)
image_4d = augment.to_4d(image)
self.assertEqual(4, tf.rank(image_4d))
self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))
def test_transform(self, dtype):
image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
self.assertAllEqual(
augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]])
def test_translate(self, dtype):
image = tf.constant(
[[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype)
translations = [-1, -1]
translated = augment.translate(image=image, translations=translations)
expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]]
self.assertAllEqual(translated, expected)
def test_translate_shapes(self, dtype):
translation = [0, 0]
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.translate(image, translation))
def test_translate_invalid_translation(self, dtype):
image = tf.zeros((1, 1), dtype=dtype)
invalid_translation = [[[1, 1]]]
with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
_ = augment.translate(image, invalid_translation)
def test_rotate(self, dtype):
image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
rotation = 90.
transformed = augment.rotate(image=image, degrees=rotation)
expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]]
self.assertAllEqual(transformed, expected)
def test_rotate_shapes(self, dtype):
degrees = 0.
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.rotate(image, degrees))
class AutoaugmentTest(tf.test.TestCase, parameterized.TestCase):
AVAILABLE_POLICIES = [
'v0',
'test',
'simple',
'reduced_cifar10',
'svhn',
'reduced_imagenet',
'detection_v0',
]
def test_autoaugment(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
for policy in self.AVAILABLE_POLICIES:
augmenter = augment.AutoAugment(augmentation_name=policy)
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_autoaugment_with_bboxes(self):
"""Smoke test to be sure there are no syntax errors with bboxes."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
bboxes = tf.ones((2, 4), dtype=tf.float32)
for policy in self.AVAILABLE_POLICIES:
augmenter = augment.AutoAugment(augmentation_name=policy)
aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
self.assertEqual((224, 224, 3), aug_image.shape)
self.assertEqual((2, 4), aug_bboxes.shape)
def test_randaug(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.RandAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_randaug_with_bboxes(self):
"""Smoke test to be sure there are no syntax errors with bboxes."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
bboxes = tf.ones((2, 4), dtype=tf.float32)
augmenter = augment.RandAugment()
aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
self.assertEqual((224, 224, 3), aug_image.shape)
self.assertEqual((2, 4), aug_bboxes.shape)
def test_randaug_build_for_detection(self):
"""Smoke test to be sure there are no syntax errors built for detection."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
bboxes = tf.ones((2, 4), dtype=tf.float32)
augmenter = augment.RandAugment.build_for_detection()
self.assertCountEqual(augmenter.available_ops, [
'AutoContrast', 'Equalize', 'Invert', 'Posterize', 'Solarize', 'Color',
'Contrast', 'Brightness', 'Sharpness', 'Cutout', 'SolarizeAdd',
'Rotate_BBox', 'ShearX_BBox', 'ShearY_BBox', 'TranslateX_BBox',
'TranslateY_BBox'
])
aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
self.assertEqual((224, 224, 3), aug_image.shape)
self.assertEqual((2, 4), aug_bboxes.shape)
def test_all_policy_ops(self):
"""Smoke test to be sure all augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((224, 224, 3), dtype=tf.uint8)
bboxes = None
for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS:
func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
replace_value, cutout_const,
translate_const)
image, bboxes = func(image, bboxes, *args)
self.assertEqual((224, 224, 3), image.shape)
self.assertIsNone(bboxes)
def test_all_policy_ops_with_bboxes(self):
"""Smoke test to be sure all augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((224, 224, 3), dtype=tf.uint8)
bboxes = tf.ones((2, 4), dtype=tf.float32)
for op_name in augment.NAME_TO_FUNC:
func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
replace_value, cutout_const,
translate_const)
image, bboxes = func(image, bboxes, *args)
self.assertEqual((224, 224, 3), image.shape)
self.assertEqual((2, 4), bboxes.shape)
def test_autoaugment_video(self):
"""Smoke test with video to be sure there are no syntax errors."""
image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
for policy in self.AVAILABLE_POLICIES:
augmenter = augment.AutoAugment(augmentation_name=policy)
aug_image = augmenter.distort(image)
self.assertEqual((2, 224, 224, 3), aug_image.shape)
def test_autoaugment_video_with_boxes(self):
"""Smoke test with video to be sure there are no syntax errors."""
image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
bboxes = tf.ones((2, 2, 4), dtype=tf.float32)
for policy in self.AVAILABLE_POLICIES:
augmenter = augment.AutoAugment(augmentation_name=policy)
aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
self.assertEqual((2, 224, 224, 3), aug_image.shape)
self.assertEqual((2, 2, 4), aug_bboxes.shape)
def test_randaug_video(self):
"""Smoke test with video to be sure there are no syntax errors."""
image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
augmenter = augment.RandAugment()
aug_image = augmenter.distort(image)
self.assertEqual((2, 224, 224, 3), aug_image.shape)
def test_all_policy_ops_video(self):
"""Smoke test to be sure all video augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((2, 224, 224, 3), dtype=tf.uint8)
bboxes = None
for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS:
func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
replace_value, cutout_const,
translate_const)
image, bboxes = func(image, bboxes, *args)
self.assertEqual((2, 224, 224, 3), image.shape)
self.assertIsNone(bboxes)
def test_all_policy_ops_video_with_bboxes(self):
"""Smoke test to be sure all video augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((2, 224, 224, 3), dtype=tf.uint8)
bboxes = tf.ones((2, 2, 4), dtype=tf.float32)
for op_name in augment.NAME_TO_FUNC:
func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
replace_value, cutout_const,
translate_const)
if op_name in {
'Rotate_BBox',
'ShearX_BBox',
'ShearY_BBox',
'TranslateX_BBox',
'TranslateY_BBox',
'TranslateY_Only_BBoxes',
}:
with self.assertRaises(ValueError):
func(image, bboxes, *args)
else:
image, bboxes = func(image, bboxes, *args)
self.assertEqual((2, 224, 224, 3), image.shape)
self.assertEqual((2, 2, 4), bboxes.shape)
def _generate_test_policy(self):
"""Generate a test policy at random."""
op_list = list(augment.NAME_TO_FUNC.keys())
size = 6
prob = [round(random.uniform(0., 1.), 1) for _ in range(size)]
mag = [round(random.uniform(0, 10)) for _ in range(size)]
policy = []
for i in range(0, size, 2):
policy.append([(op_list[i], prob[i], mag[i]),
(op_list[i + 1], prob[i + 1], mag[i + 1])])
return policy
def test_custom_policy(self):
"""Test autoaugment with a custom policy."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.AutoAugment(policies=self._generate_test_policy())
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
@parameterized.named_parameters(
{'testcase_name': '_OutOfRangeProb',
'sub_policy': ('Equalize', 1.1, 3), 'value': '1.1'},
{'testcase_name': '_OutOfRangeMag',
'sub_policy': ('Equalize', 0.9, 11), 'value': '11'},
)
def test_invalid_custom_sub_policy(self, sub_policy, value):
"""Test autoaugment with out-of-range values in the custom policy."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
policy = self._generate_test_policy()
policy[0][0] = sub_policy
augmenter = augment.AutoAugment(policies=policy)
with self.assertRaisesRegex(
tf.errors.InvalidArgumentError,
r'Expected \'tf.Tensor\(False, shape=\(\), dtype=bool\)\' to be true. '
r'Summarized data: ({})'.format(value)):
augmenter.distort(image)
def test_invalid_custom_policy_ndim(self):
"""Test autoaugment with wrong dimension in the custom policy."""
policy = [[('Equalize', 0.8, 1), ('Shear', 0.8, 4)],
[('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]]
policy = [[policy]]
with self.assertRaisesRegex(
ValueError,
r'Expected \(:, :, 3\) but got \(1, 1, 2, 2, 3\).'):
augment.AutoAugment(policies=policy)
def test_invalid_custom_policy_shape(self):
"""Test autoaugment with wrong shape in the custom policy."""
policy = [[('Equalize', 0.8, 1, 1), ('Shear', 0.8, 4, 1)],
[('TranslateY', 0.6, 3, 1), ('Rotate', 0.9, 3, 1)]]
with self.assertRaisesRegex(
ValueError,
r'Expected \(:, :, 3\) but got \(2, 2, 4\)'):
augment.AutoAugment(policies=policy)
def test_invalid_custom_policy_key(self):
"""Test autoaugment with invalid key in the custom policy."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
policy = [[('AAAAA', 0.8, 1), ('Shear', 0.8, 4)],
[('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]]
augmenter = augment.AutoAugment(policies=policy)
with self.assertRaisesRegex(KeyError, '\'AAAAA\''):
augmenter.distort(image)
class RandomErasingTest(tf.test.TestCase, parameterized.TestCase):
def test_random_erase_replaces_some_pixels(self):
image = tf.zeros((224, 224, 3), dtype=tf.float32)
augmenter = augment.RandomErasing(probability=1., max_count=10)
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
self.assertNotEqual(0, tf.reduce_max(aug_image))
class MixupAndCutmixTest(tf.test.TestCase, parameterized.TestCase):
def test_mixup_and_cutmix_smoothes_labels(self):
batch_size = 12
num_classes = 1000
label_smoothing = 0.1
images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
labels = tf.range(batch_size)
augmenter = augment.MixupAndCutmix(
num_classes=num_classes, label_smoothing=label_smoothing)
aug_images, aug_labels = augmenter.distort(images, labels)
self.assertEqual(images.shape, aug_images.shape)
self.assertEqual(images.dtype, aug_images.dtype)
self.assertEqual([batch_size, num_classes], aug_labels.shape)
self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
2. / num_classes) # With tolerance
self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
1e-4)  # With tolerance
def test_mixup_changes_image(self):
batch_size = 12
num_classes = 1000
label_smoothing = 0.1
images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
labels = tf.range(batch_size)
augmenter = augment.MixupAndCutmix(
mixup_alpha=1., cutmix_alpha=0., num_classes=num_classes)
aug_images, aug_labels = augmenter.distort(images, labels)
self.assertEqual(images.shape, aug_images.shape)
self.assertEqual(images.dtype, aug_images.dtype)
self.assertEqual([batch_size, num_classes], aug_labels.shape)
self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
2. / num_classes) # With tolerance
self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
1e-4)  # With tolerance
self.assertFalse(tf.math.reduce_all(images == aug_images))
def test_cutmix_changes_image(self):
batch_size = 12
num_classes = 1000
label_smoothing = 0.1
images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
labels = tf.range(batch_size)
augmenter = augment.MixupAndCutmix(
mixup_alpha=0., cutmix_alpha=1., num_classes=num_classes)
aug_images, aug_labels = augmenter.distort(images, labels)
self.assertEqual(images.shape, aug_images.shape)
self.assertEqual(images.dtype, aug_images.dtype)
self.assertEqual([batch_size, num_classes], aug_labels.shape)
self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
2. / num_classes) # With tolerance
self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
1e-4)  # With tolerance
self.assertFalse(tf.math.reduce_all(images == aug_images))
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Box matcher implementation."""
import tensorflow as tf
class BoxMatcher:
"""Matcher based on highest value.
This class computes matches from a similarity matrix. Each row is matched
to a single column.
To support object detection target assignment, this class allows setting both
a positive_threshold (upper threshold) and a negative_threshold (lower
threshold), defining three similarity buckets that determine whether examples
are positive, negative, or ignored. For example:
(1) thresholds=[negative_threshold, positive_threshold], and
indicators=[negative_value, ignore_value, positive_value]: similarities below
negative_threshold are assigned negative_value, similarities between
negative_threshold and positive_threshold are assigned ignore_value, and
similarities above positive_threshold are assigned positive_value.
(2) thresholds=[negative_threshold, positive_threshold], and
indicators=[ignore_value, negative_value, positive_value]: similarities below
negative_threshold are assigned ignore_value, similarities between
negative_threshold and positive_threshold are assigned negative_value, and
similarities above positive_threshold are assigned positive_value.
"""
def __init__(self, thresholds, indicators, force_match_for_each_col=False):
"""Construct BoxMatcher.
Args:
thresholds: A list of thresholds to classify boxes into
different buckets. The list needs to be sorted, and will be prepended
with -Inf and appended with +Inf.
indicators: A list of values to assign for each bucket. len(`indicators`)
must equal len(`thresholds`) + 1.
force_match_for_each_col: If True, ensures that each column is matched to
at least one row (which is not guaranteed otherwise if the
positive_threshold is high). Defaults to False. If True, all
force-matched rows will be assigned `indicators[-1]`.
Raises:
ValueError: If `thresholds` is not sorted,
or if len(`indicators`) != len(`thresholds`) + 1.
"""
if not all([lo <= hi for (lo, hi) in zip(thresholds[:-1], thresholds[1:])]):
raise ValueError('`thresholds` must be sorted, got {}'.format(thresholds))
self.indicators = indicators
if len(indicators) != len(thresholds) + 1:
raise ValueError('len(`indicators`) must be len(`thresholds`) + 1, got '
'indicators {}, thresholds {}'.format(
indicators, thresholds))
thresholds = thresholds[:]
thresholds.insert(0, -float('inf'))
thresholds.append(float('inf'))
self.thresholds = thresholds
self._force_match_for_each_col = force_match_for_each_col
def __call__(self, similarity_matrix):
"""Tries to match each column of the similarity matrix to a row.
Args:
similarity_matrix: A float tensor of shape [N, M] representing any
similarity metric.
Returns:
A integer tensor of shape [N] with corresponding match indices for each
of M columns, for positive match, the match result will be the
corresponding row index, for negative match, the match will be
`negative_value`, for ignored match, the match result will be
`ignore_value`.
"""
squeeze_result = False
if len(similarity_matrix.shape) == 2:
squeeze_result = True
similarity_matrix = tf.expand_dims(similarity_matrix, axis=0)
static_shape = similarity_matrix.shape.as_list()
num_rows = static_shape[1] or tf.shape(similarity_matrix)[1]
batch_size = static_shape[0] or tf.shape(similarity_matrix)[0]
def _match_when_rows_are_empty():
"""Performs matching when the rows of similarity matrix are empty.
When the rows are empty, all detections are false positives. So we return
a tensor of -1's to indicate that the columns do not match to any rows.
Returns:
matches: int32 tensor indicating the row each column matches to.
"""
with tf.name_scope('empty_gt_boxes'):
matches = tf.zeros([batch_size, num_rows], dtype=tf.int32)
match_labels = -tf.ones([batch_size, num_rows], dtype=tf.int32)
return matches, match_labels
def _match_when_rows_are_non_empty():
"""Performs matching when the rows of similarity matrix are non empty.
Returns:
matches: int32 tensor indicating the row each column matches to.
"""
# Matches for each column
with tf.name_scope('non_empty_gt_boxes'):
matches = tf.argmax(similarity_matrix, axis=-1, output_type=tf.int32)
# Bucket the best match value of each row into the indicator values
# defined by the thresholds.
matched_vals = tf.reduce_max(similarity_matrix, axis=-1)
matched_indicators = tf.zeros([batch_size, num_rows], tf.int32)
match_dtype = matched_vals.dtype
for (ind, low, high) in zip(self.indicators, self.thresholds[:-1],
self.thresholds[1:]):
low_threshold = tf.cast(low, match_dtype)
high_threshold = tf.cast(high, match_dtype)
mask = tf.logical_and(
tf.greater_equal(matched_vals, low_threshold),
tf.less(matched_vals, high_threshold))
matched_indicators = self._set_values_using_indicator(
matched_indicators, mask, ind)
if self._force_match_for_each_col:
# [batch_size, M], for each col (groundtruth_box), find the best
# matching row (anchor).
force_match_column_ids = tf.argmax(
input=similarity_matrix, axis=1, output_type=tf.int32)
# [batch_size, M, N]
force_match_column_indicators = tf.one_hot(
force_match_column_ids, depth=num_rows)
# [batch_size, N], for each row (anchor), find the largest column
# index for groundtruth box
force_match_row_ids = tf.argmax(
input=force_match_column_indicators, axis=1, output_type=tf.int32)
# [batch_size, N]
force_match_column_mask = tf.cast(
tf.reduce_max(force_match_column_indicators, axis=1),
tf.bool)
# [batch_size, N]
final_matches = tf.where(force_match_column_mask, force_match_row_ids,
matches)
final_matched_indicators = tf.where(
force_match_column_mask, self.indicators[-1] *
tf.ones([batch_size, num_rows], dtype=tf.int32),
matched_indicators)
return final_matches, final_matched_indicators
else:
return matches, matched_indicators
num_gt_boxes = similarity_matrix.shape.as_list()[-1] or tf.shape(
similarity_matrix)[-1]
result_match, result_matched_indicators = tf.cond(
pred=tf.greater(num_gt_boxes, 0),
true_fn=_match_when_rows_are_non_empty,
false_fn=_match_when_rows_are_empty)
if squeeze_result:
result_match = tf.squeeze(result_match, axis=0)
result_matched_indicators = tf.squeeze(result_matched_indicators, axis=0)
return result_match, result_matched_indicators
def _set_values_using_indicator(self, x, indicator, val):
"""Set the indicated fields of x to val.
Args:
x: tensor.
indicator: boolean with same shape as x.
val: scalar with value to set.
Returns:
modified tensor.
"""
indicator = tf.cast(indicator, x.dtype)
return tf.add(tf.multiply(x, 1 - indicator), val * indicator)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for box_matcher.py."""
import tensorflow as tf
from official.vision.ops import box_matcher
class BoxMatcherTest(tf.test.TestCase):
def test_box_matcher_unbatched(self):
sim_matrix = tf.constant(
[[0.04, 0, 0, 0],
[0, 0, 1., 0]],
dtype=tf.float32)
fg_threshold = 0.5
bg_thresh_hi = 0.2
bg_thresh_lo = 0.0
matcher = box_matcher.BoxMatcher(
thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold],
indicators=[-3, -2, -1, 1])
match_indices, match_indicators = matcher(sim_matrix)
positive_matches = tf.greater_equal(match_indicators, 0)
negative_matches = tf.equal(match_indicators, -2)
self.assertAllEqual(
positive_matches.numpy(), [False, True])
self.assertAllEqual(
negative_matches.numpy(), [True, False])
self.assertAllEqual(
match_indices.numpy(), [0, 2])
self.assertAllEqual(
match_indicators.numpy(), [-2, 1])
def test_box_matcher_batched(self):
sim_matrix = tf.constant(
[[[0.04, 0, 0, 0],
[0, 0, 1., 0]]],
dtype=tf.float32)
fg_threshold = 0.5
bg_thresh_hi = 0.2
bg_thresh_lo = 0.0
matcher = box_matcher.BoxMatcher(
thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold],
indicators=[-3, -2, -1, 1])
match_indices, match_indicators = matcher(sim_matrix)
positive_matches = tf.greater_equal(match_indicators, 0)
negative_matches = tf.equal(match_indicators, -2)
self.assertAllEqual(
positive_matches.numpy(), [[False, True]])
self.assertAllEqual(
negative_matches.numpy(), [[True, False]])
self.assertAllEqual(
match_indices.numpy(), [[0, 2]])
self.assertAllEqual(
match_indicators.numpy(), [[-2, 1]])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Box related ops."""
# Import libraries
import numpy as np
import tensorflow as tf
EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack(
[boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=-1)
return new_boxes
def yxyx_to_cycxhw(boxes):
"""Converts box corner coordinates to center plus height and width terms.
Args:
boxes: a `Tensor` with last dimension of 4, representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a `Tensor` with the same shape as the input boxes, in the format
of cy, cx, height, width.
Raises:
ValueError: if the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('Last dimension of boxes must be 4 but is {:d}'.format(
boxes.shape[-1]))
boxes_ycenter = (boxes[..., 0] + boxes[..., 2]) / 2
boxes_xcenter = (boxes[..., 1] + boxes[..., 3]) / 2
boxes_height = boxes[..., 2] - boxes[..., 0]
boxes_width = boxes[..., 3] - boxes[..., 1]
new_boxes = tf.stack(
[boxes_ycenter, boxes_xcenter, boxes_height, boxes_width], axis=-1)
return new_boxes
def cycxhw_to_yxyx(boxes):
"""Converts box center coordinates plus height and width terms to corner.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in cy, cx, height, width order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
boxes_ymin = boxes[..., 0] - boxes[..., 2] / 2
boxes_xmin = boxes[..., 1] - boxes[..., 3] / 2
boxes_ymax = boxes[..., 0] + boxes[..., 2] / 2
boxes_xmax = boxes[..., 1] + boxes[..., 3] / 2
new_boxes = tf.stack([
boxes_ymin, boxes_xmin, boxes_ymax, boxes_xmax], axis=-1)
return new_boxes
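# Illustrative example (not part of the original module, names are
# hypothetical): converting a box between corner (ymin, xmin, ymax, xmax) and
# center (cy, cx, height, width) formats is a lossless round trip.
def _example_corner_center_round_trip():
  boxes = tf.constant([[10., 20., 50., 80.]])
  center_boxes = yxyx_to_cycxhw(boxes)         # [[30., 50., 40., 60.]]
  corner_boxes = cycxhw_to_yxyx(center_boxes)  # [[10., 20., 50., 80.]]
  return center_boxes, corner_boxes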
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float which specifies the magnitude of noise. The rule
of thumb is to set this within (0, 0.1]. The default value was found
empirically to best mimic noisy detections.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(tf.shape(boxes), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat(
[new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5],
axis=-1)
return jittered_boxes
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
normalized_boxes: a tensor whose shape is the same as `boxes` representing
the normalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
ymin = boxes[..., 0:1] / height
xmin = boxes[..., 1:2] / width
ymax = boxes[..., 2:3] / height
xmax = boxes[..., 3:4] / width
normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return normalized_boxes
def denormalize_boxes(boxes, image_shape):
"""Converts boxes normalized by [height, width] to pixel coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
the denormalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
with tf.name_scope('denormalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.split(image_shape, 2, axis=-1)
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
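# Illustrative example (not part of the original module, names are
# hypothetical): normalizing boxes by [height, width] and denormalizing them
# again recovers the original pixel coordinates.
def _example_normalize_round_trip():
  boxes = tf.constant([[10., 20., 50., 80.]])
  image_shape = [100, 200]  # [height, width]
  normalized = normalize_boxes(boxes, image_shape)        # [[0.1, 0.1, 0.5, 0.4]]
  restored = denormalize_boxes(normalized, image_shape)   # [[10., 20., 50., 80.]]
  return normalized, restored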
def clip_boxes(boxes, image_shape):
"""Clips boxes to image boundaries.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
clipped boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height, width, height, width]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack([height, width, height, width], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Compute outer box encloses an object with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
Raises:
ValueError: If `scale` is less than 1.0.
"""
if scale < 1.0:
raise ValueError(
'scale is {}, but outer box scale must be greater than or equal '
'to 1.0.'.format(scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack(
[centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
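# Illustrative example (not part of the original module, names are
# hypothetical): scaling a box by 2x around its center and clipping the result
# to a 32x32 image. The unclipped outer box would be [0, 0, 40, 40].
def _example_compute_outer_boxes():
  boxes = tf.constant([[10., 10., 30., 30.]])
  outer = compute_outer_boxes(boxes, image_shape=[32, 32], scale=2.0)
  return outer  # [[0., 0., 32., 32.]]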
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
encoded box targets.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
box_h = ymax - ymin
box_w = xmax - xmin
box_yc = ymin + 0.5 * box_h
box_xc = xmin + 0.5 * box_w
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
encoded_dw = tf.math.log(box_w / anchor_w)
if weights:
encoded_dy *= weights[0]
encoded_dx *= weights[1]
encoded_dh *= weights[2]
encoded_dw *= weights[3]
encoded_boxes = tf.concat(
[encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1)
return encoded_boxes
def decode_boxes(encoded_boxes, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
representing the decoded boxes.
Raises:
ValueError: If the last dimension of `encoded_boxes` is not 4.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError(
'encoded_boxes.shape[-1] is {:d}, but must be 4.'
.format(encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
dx = encoded_boxes[..., 1:2]
dh = encoded_boxes[..., 2:3]
dw = encoded_boxes[..., 3:4]
if weights:
dy /= weights[0]
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h
decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w
decoded_boxes = tf.concat(
[decoded_boxes_ymin, decoded_boxes_xmin,
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes
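# Illustrative example (not part of the original module, names are
# hypothetical): encoding boxes against anchors and decoding the targets
# recovers the original boxes up to numerical precision. The weights below are
# an assumed example of the commonly used [10, 10, 5, 5] scaling.
def _example_encode_decode_round_trip():
  anchors = tf.constant([[0., 0., 10., 10.]])
  boxes = tf.constant([[1., 2., 9., 8.]])
  weights = [10.0, 10.0, 5.0, 5.0]
  targets = encode_boxes(boxes, anchors, weights=weights)
  decoded = decode_boxes(targets, anchors, weights=weights)  # ~[[1., 2., 9., 8.]]
  return targets, decoded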
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin
w = xmax - xmin
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 0.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(
filtered_size_mask, filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose score are smaller than it will be filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with -1.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
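# Illustrative example (not part of the original module, names are
# hypothetical): boxes whose scores do not exceed the threshold are zeroed out
# and their scores are replaced with -1.
def _example_filter_boxes_by_scores():
  boxes = tf.constant([[0., 0., 1., 1.], [0., 0., 2., 2.]])
  scores = tf.constant([0.9, 0.1])
  filtered_boxes, filtered_scores = filter_boxes_by_scores(
      boxes, scores, min_score_threshold=0.5)
  # filtered_boxes -> [[0., 0., 1., 1.], [0., 0., 0., 0.]]
  # filtered_scores -> [0.9, -1.]
  return filtered_boxes, filtered_scores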
def gather_instances(selected_indices, instances, *aux_instances):
"""Gather instances by indices.
Args:
selected_indices: a Tensor of shape [batch, K] which indicates the selected
indices in instance dimension (2nd dimension).
instances: a Tensor of shape [batch, N, ...] where the 2nd dimension is
the instance dimension to be selected from.
*aux_instances: the additional Tensors whose shapes are in [batch, N, ...]
which are the tensors to be selected from using the `selected_indices`.
Returns:
selected_instances: the tensor of shape [batch, K, ...] which corresponds to
the selected instances of the `instances` tensor.
selected_aux_instances: the additional tensors of shape [batch, K, ...]
which correspond to the selected instances of the `aux_instances`
tensors.
"""
batch_size = instances.shape[0]
if batch_size == 1:
selected_instances = tf.squeeze(
tf.gather(instances, selected_indices, axis=1), axis=1)
if aux_instances:
selected_aux_instances = [
tf.squeeze(
tf.gather(a, selected_indices, axis=1), axis=1)
for a in aux_instances
]
return tuple([selected_instances] + selected_aux_instances)
else:
return selected_instances
else:
indices_shape = tf.shape(selected_indices)
batch_indices = (
tf.expand_dims(tf.range(indices_shape[0]), axis=-1) *
tf.ones([1, indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack(
[batch_indices, selected_indices], axis=-1)
selected_instances = tf.gather_nd(instances, gather_nd_indices)
if aux_instances:
selected_aux_instances = [
tf.gather_nd(a, gather_nd_indices) for a in aux_instances
]
return tuple([selected_instances] + selected_aux_instances)
else:
return selected_instances
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
boxes: a tensor of shape [batch_size, N, 4] representing the coordinate of
the boxes. N is the number of boxes per image.
scores: a tensor of shape [batch_size, N] representing the score of the
boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
selected_boxes = gather_instances(top_k_indices, boxes)
return selected_boxes, selected_scores
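# Illustrative example (not part of the original module, names are
# hypothetical): selecting the two highest-scoring boxes from a batch of one
# image.
def _example_top_k_boxes():
  boxes = tf.constant([[[0., 0., 1., 1.], [0., 0., 2., 2.], [0., 0., 3., 3.]]])
  scores = tf.constant([[0.2, 0.9, 0.5]])
  selected_boxes, selected_scores = top_k_boxes(boxes, scores, k=2)
  # selected_scores -> [[0.9, 0.5]]; selected_boxes holds the matching boxes.
  return selected_boxes, selected_scores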
def get_non_empty_box_indices(boxes):
"""Get indices for non-empty boxes."""
# Selects indices whose box height and width are both greater than 0.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(tf.logical_and(tf.greater(height, 0),
tf.greater(width, 0)))
return indices[:, 0]
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `boxes` or `gt_boxes` may have been padded. The returned `iou` tensor
for these boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = (
tf.math.maximum((i_xmax - i_xmin), 0) *
tf.math.maximum((i_ymax - i_ymin), 0))
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries between the padded ground truth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
# Fills -1 for invalid (-1) boxes.
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
iou = tf.where(boxes_invalid_mask, -tf.ones_like(iou), iou)
return iou
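# Illustrative example (not part of the original module, names are
# hypothetical): pairwise IoU between one proposal and two groundtruth boxes,
# the second of which is padding and therefore receives an IoU of -1.
def _example_bbox_overlap():
  boxes = tf.constant([[[0., 0., 10., 10.]]])
  gt_boxes = tf.constant([[[0., 0., 10., 10.], [-1., -1., -1., -1.]]])
  return bbox_overlap(boxes, gt_boxes)  # ~[[[1.0, -1.0]]]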
def bbox_generalized_overlap(boxes, gt_boxes):
"""Calculates the GIOU between proposal and ground truth boxes.
The generalized intersection of union is an adjustment of the traditional IOU
metric which provides continuous updates even for predictions with no overlap.
This metric is defined in https://giou.stanford.edu/GIoU.pdf. Note, some
`gt_boxes` may have been padded. The returned `giou` tensor for these boxes
will be -1.
Args:
boxes: a `Tensor` with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a `Tensor` with a shape of [batch_size, max_num_instances, 4].
This tensor may have paddings with a negative value and will also be in
the [ymin, xmin, ymax, xmax] format.
Returns:
giou: a `Tensor` with a shape of [batch_size, N, max_num_instances].
"""
with tf.name_scope('bbox_generalized_overlap'):
assert boxes.shape.as_list(
)[-1] == 4, 'Boxes must be defined by 4 coordinates.'
assert gt_boxes.shape.as_list(
)[-1] == 4, 'Groundtruth boxes must be defined by 4 coordinates.'
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the hull area for each pair of boxes, with one from
# boxes and the other from gt_boxes.
# Outputs for coordinates are of shape [batch_size, N, max_num_instances]
h_xmin = tf.minimum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
h_xmax = tf.maximum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
h_ymin = tf.minimum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
h_ymax = tf.maximum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
h_area = tf.maximum((h_xmax - h_xmin), 0) * tf.maximum((h_ymax - h_ymin), 0)
# Add a small epsilon to avoid divide-by-zero.
h_area = h_area + 1e-8
# Calculates the intersection area.
i_xmin = tf.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = tf.maximum((i_xmax - i_xmin), 0) * tf.maximum((i_ymax - i_ymin), 0)
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Calculates GIoU.
giou = iou - (h_area - u_area) / h_area
# Fills -1 for GIoU entries between the padded ground truth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.broadcast_to(
tf.transpose(gt_invalid_mask, [0, 2, 1]), tf.shape(giou))
giou = tf.where(padding_mask, -tf.ones_like(giou), giou)
return giou
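# Illustrative example (not part of the original module, names are
# hypothetical): GIoU equals IoU for perfectly overlapping boxes and turns
# negative for disjoint boxes, which keeps the signal informative even when
# there is no overlap.
def _example_bbox_generalized_overlap():
  boxes = tf.constant([[[0., 0., 1., 1.], [0., 0., 1., 1.]]])
  gt_boxes = tf.constant([[[0., 0., 1., 1.], [2., 2., 3., 3.]]])
  giou = bbox_generalized_overlap(boxes, gt_boxes)
  # giou[0, 0, 0] -> ~1.0 (identical); giou[0, 0, 1] -> ~-0.78 (disjoint).
  return giou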
def box_matching(boxes, gt_boxes, gt_classes):
"""Match boxes to groundtruth boxes.
Given the proposal boxes and the groundtruth boxes and classes, perform the
groundtruth matching by taking the argmax of the IoU between boxes and
groundtruth boxes.
Args:
boxes: a tensor of shape of [batch_size, N, 4] representing the box
coordinates to be matched to groundtruth boxes.
gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
the groundtruth box coordinates. It is padded with -1s to indicate the
invalid boxes.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
Returns:
matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
the matched groundtruth box coordinates for each input box. If the box
does not overlap with any groundtruth boxes, its matched box is set to
all 0s.
matched_gt_classes: a tensor of shape of [batch_size, N], representing
the matched groundtruth classes for each input box. If the box does not
overlap with any groundtruth boxes, its matched class is set to 0, which
corresponds to the background class.
matched_gt_indices: a tensor of shape of [batch_size, N], representing
the indices of the matched groundtruth boxes in the original gt_boxes
tensor. If the box does not overlap with any groundtruth boxes, the
index of the matched groundtruth will be set to -1.
matched_iou: a tensor of shape of [batch_size, N], representing the IoU
between the box and its matched groundtruth box. The matched IoU is the
maximum IoU of the box and all the groundtruth boxes.
iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
between boxes and the groundtruth boxes. The IoU between a box and the
invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
"""
# Compute IoU between boxes and gt_boxes.
# iou <- [batch_size, N, K]
iou = bbox_overlap(boxes, gt_boxes)
# matched_iou <- [batch_size, N]
# 0.0 means the box has no overlap with any groundtruth; -1.0 means the
# groundtruth entries are padding.
matched_iou = tf.reduce_max(iou, axis=-1)
# background_box_mask <- bool, [batch_size, N]
background_box_mask = tf.less_equal(matched_iou, 0.0)
argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)
matched_gt_boxes, matched_gt_classes = gather_instances(
argmax_iou_indices, gt_boxes, gt_classes)
matched_gt_boxes = tf.where(
tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
matched_gt_boxes)
matched_gt_classes = tf.where(
background_box_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(
background_box_mask,
-tf.ones_like(argmax_iou_indices),
argmax_iou_indices)
return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
matched_iou, iou)
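# Illustrative example (not part of the original module, names are
# hypothetical): matching two proposals against one valid groundtruth box and
# one padded box.
def _example_box_matching():
  boxes = tf.constant([[[0., 0., 10., 10.], [20., 20., 30., 30.]]])
  gt_boxes = tf.constant([[[0., 0., 10., 10.], [-1., -1., -1., -1.]]])
  gt_classes = tf.constant([[3, -1]])
  matched_boxes, matched_classes, matched_indices, matched_iou, _ = (
      box_matching(boxes, gt_boxes, gt_classes))
  # The first proposal matches groundtruth 0 (IoU ~1.0, class 3); the second
  # has no overlap, so it gets the background class 0 and index -1.
  return matched_boxes, matched_classes, matched_indices, matched_iou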
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Region Similarity Calculators."""
import tensorflow as tf
def area(box):
"""Computes area of boxes.
B: batch_size
N: number of boxes
Args:
box: a float Tensor with [N, 4], or [B, N, 4].
Returns:
a float Tensor with [N], or [B, N]
"""
with tf.name_scope('Area'):
y_min, x_min, y_max, x_max = tf.split(
value=box, num_or_size_splits=4, axis=-1)
return tf.squeeze((y_max - y_min) * (x_max - x_min), axis=-1)
def intersection(gt_boxes, boxes):
"""Compute pairwise intersection areas between boxes.
B: batch_size
N: number of groundtruth boxes.
M: number of anchor boxes.
Args:
gt_boxes: a float Tensor with [N, 4], or [B, N, 4]
boxes: a float Tensor with [M, 4], or [B, M, 4]
Returns:
a float Tensor with shape [N, M] or [B, N, M] representing pairwise
intersections.
"""
with tf.name_scope('Intersection'):
y_min1, x_min1, y_max1, x_max1 = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=-1)
y_min2, x_min2, y_max2, x_max2 = tf.split(
value=boxes, num_or_size_splits=4, axis=-1)
boxes_rank = len(boxes.shape)
perm = [1, 0] if boxes_rank == 2 else [0, 2, 1]
# [N, M] or [B, N, M]
y_min_max = tf.minimum(y_max1, tf.transpose(y_max2, perm))
y_max_min = tf.maximum(y_min1, tf.transpose(y_min2, perm))
x_min_max = tf.minimum(x_max1, tf.transpose(x_max2, perm))
x_max_min = tf.maximum(x_min1, tf.transpose(x_min2, perm))
intersect_heights = y_min_max - y_max_min
intersect_widths = x_min_max - x_max_min
zeros_t = tf.cast(0, intersect_heights.dtype)
intersect_heights = tf.maximum(zeros_t, intersect_heights)
intersect_widths = tf.maximum(zeros_t, intersect_widths)
return intersect_heights * intersect_widths
def iou(gt_boxes, boxes):
"""Computes pairwise intersection-over-union between box collections.
Args:
gt_boxes: a float Tensor with [N, 4].
boxes: a float Tensor with [M, 4].
Returns:
a Tensor with shape [N, M] representing pairwise iou scores.
"""
with tf.name_scope('IOU'):
intersections = intersection(gt_boxes, boxes)
gt_boxes_areas = area(gt_boxes)
boxes_areas = area(boxes)
boxes_rank = len(boxes_areas.shape)
boxes_axis = 1 if (boxes_rank == 2) else 0
gt_boxes_areas = tf.expand_dims(gt_boxes_areas, -1)
boxes_areas = tf.expand_dims(boxes_areas, boxes_axis)
unions = gt_boxes_areas + boxes_areas
unions = unions - intersections
return tf.where(
tf.equal(intersections, 0.0), tf.zeros_like(intersections),
tf.truediv(intersections, unions))
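# Illustrative example (not part of the original module, names are
# hypothetical): pairwise IoU between two groundtruth boxes and three
# candidate boxes.
def _example_pairwise_iou():
  gt_boxes = tf.constant([[0., 0., 2., 2.], [0., 0., 1., 1.]])
  boxes = tf.constant([[0., 0., 2., 2.], [0., 0., 1., 1.], [5., 5., 6., 6.]])
  # Expected result: [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0]]
  return iou(gt_boxes, boxes)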
class IouSimilarity:
"""Class to compute similarity based on Intersection over Union (IOU) metric.
"""
def __init__(self, mask_val=-1):
self.mask_val = mask_val
def __call__(self, boxes_1, boxes_2, boxes_1_masks=None, boxes_2_masks=None):
"""Compute pairwise IOU similarity between ground truth boxes and anchors.
B: batch_size
N: Number of groundtruth boxes.
M: Number of anchor boxes.
Args:
boxes_1: a float Tensor with shape [N, 4] or [B, N, 4].
boxes_2: a float Tensor with shape [M, 4] or [B, M, 4]; its rank must be
less than or equal to the rank of `boxes_1`.
boxes_1_masks: a boolean Tensor with shape [N, 1] or [B, N, 1]. Optional.
boxes_2_masks: a boolean Tensor with shape [M, 1] or [B, M, 1]. Optional.
Returns:
A Tensor with shape [N, M] or [B, N, M] representing pairwise
iou scores, with one row per box in `boxes_1` and one column per box in
`boxes_2`.
Input shape:
boxes_1: [N, 4], or [B, N, 4]
boxes_2: [M, 4], or [B, M, 4]
boxes_1_masks: [N, 1], or [B, N, 1]
boxes_2_masks: [M, 1], or [B, M, 1]
Output shape:
[N, M], or [B, N, M]
"""
boxes_1 = tf.cast(boxes_1, tf.float32)
boxes_2 = tf.cast(boxes_2, tf.float32)
boxes_1_rank = len(boxes_1.shape)
boxes_2_rank = len(boxes_2.shape)
if boxes_1_rank < 2 or boxes_1_rank > 3:
raise ValueError(
'`groundtruth_boxes` must be rank 2 or 3, got {}'.format(boxes_1_rank))
if boxes_2_rank < 2 or boxes_2_rank > 3:
raise ValueError(
'`anchors` must be rank 2 or 3, got {}'.format(boxes_2_rank))
if boxes_1_rank < boxes_2_rank:
raise ValueError('Unbatched `groundtruth_boxes` with batched `anchors` '
'is not a valid use case; got groundtruth_boxes '
'rank {} and anchors rank {}'.format(
boxes_1_rank, boxes_2_rank))
result = iou(boxes_1, boxes_2)
if boxes_1_masks is None and boxes_2_masks is None:
return result
background_mask = None
mask_val_t = tf.cast(self.mask_val, result.dtype) * tf.ones_like(result)
perm = [1, 0] if boxes_2_rank == 2 else [0, 2, 1]
if boxes_1_masks is not None and boxes_2_masks is not None:
background_mask = tf.logical_or(boxes_1_masks,
tf.transpose(boxes_2_masks, perm))
elif boxes_1_masks is not None:
background_mask = boxes_1_masks
else:
background_mask = tf.logical_or(
tf.zeros(tf.shape(boxes_2)[:-1], dtype=tf.bool),
tf.transpose(boxes_2_masks, perm))
return tf.where(background_mask, mask_val_t, result)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for iou_similarity.py."""
import tensorflow as tf
from official.vision.ops import iou_similarity
class BoxMatcherTest(tf.test.TestCase):
def test_similarity_unbatched(self):
boxes = tf.constant(
[
[0, 0, 1, 1],
[5, 0, 10, 5],
],
dtype=tf.float32)
gt_boxes = tf.constant(
[
[0, 0, 5, 5],
[0, 5, 5, 10],
[5, 0, 10, 5],
[5, 5, 10, 10],
],
dtype=tf.float32)
sim_calc = iou_similarity.IouSimilarity()
sim_matrix = sim_calc(boxes, gt_boxes)
self.assertAllClose(
sim_matrix.numpy(),
[[0.04, 0, 0, 0],
[0, 0, 1., 0]])
def test_similarity_batched(self):
boxes = tf.constant(
[[
[0, 0, 1, 1],
[5, 0, 10, 5],
]],
dtype=tf.float32)
gt_boxes = tf.constant(
[[
[0, 0, 5, 5],
[0, 5, 5, 10],
[5, 0, 10, 5],
[5, 5, 10, 10],
]],
dtype=tf.float32)
sim_calc = iou_similarity.IouSimilarity()
sim_matrix = sim_calc(boxes, gt_boxes)
self.assertAllClose(
sim_matrix.numpy(),
[[[0.04, 0, 0, 0],
[0, 0, 1., 0]]])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for segmentations."""
import math
# Import libraries
import cv2
import numpy as np
def paste_instance_masks(masks,
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation results.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = min(max(ref_box[0], 0), image_width)
x_1 = min(max(ref_box[2] + 1, 0), image_width)
y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])
]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
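# Illustrative example (not part of the original module, names are
# hypothetical): pasting a single all-ones 2x2 mask onto a 4x4 canvas, with
# the reference box given in [x, y, width, height] form as expected by
# `expand_boxes`.
def _example_paste_instance_masks():
  masks = np.ones((1, 2, 2), dtype=np.float32)
  detected_boxes = np.array([[1.0, 1.0, 2.0, 2.0]])
  segms = paste_instance_masks(
      masks, detected_boxes, image_height=4, image_width=4)
  return segms  # shape (1, 4, 4) binary mask covering the box region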
def paste_instance_masks_v2(masks,
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation (v2).
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
_, mask_height, mask_width = masks.shape
segms = []
for i, mask in enumerate(masks):
box = detected_boxes[i, :]
xmin = box[0]
ymin = box[1]
xmax = xmin + box[2]
ymax = ymin + box[3]
# Sample points of the cropped mask w.r.t. the image grid.
# Note that these coordinates may fall beyond the image.
# Pixel clipping will happen after warping.
xmin_int = int(math.floor(xmin))
xmax_int = int(math.ceil(xmax))
ymin_int = int(math.floor(ymin))
ymax_int = int(math.ceil(ymax))
alpha = box[2] / (1.0 * mask_width)
beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array(
[[alpha, 0, xmin],
[0, beta, ymin],
[0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array(
[[1, 0, -xmin_int],
[0, 1, -ymin_int],
[0, 0, 1]],
dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot(
np.dot(
np.array([[1, 0, -0.5],
[0, 1, -0.5],
[0, 0, 1]], np.float32),
M),
np.array([[1, 0, 0.5],
[0, 1, 0.5],
[0, 0, 1]], np.float32))
# pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M,
(xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width))
x0 = max(min(xmin_int, image_width), 0)
x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[
(y0 - ymin_int):(y1 - ymin_int),
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask)
segms = np.array(segms)
return segms
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mask_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.ops import mask_ops
class MaskUtilsTest(tf.test.TestCase):
def testPasteInstanceMasks(self):
image_height = 10
image_width = 10
mask_height = 6
mask_width = 6
masks = np.random.randint(0, 255, (1, mask_height, mask_width))
detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])
_ = mask_ops.paste_instance_masks(
masks, detected_boxes, image_height, image_width)
def testPasteInstanceMasksV2(self):
image_height = 10
image_width = 10
mask_height = 6
mask_width = 6
masks = np.random.randint(0, 255, (1, mask_height, mask_width))
detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])
image_masks = mask_ops.paste_instance_masks_v2(
masks, detected_boxes, image_height, image_width)
self.assertNDArrayNear(
image_masks[:, 2:8, 0:6],
np.array(masks > 0.5, dtype=np.uint8),
1e-5)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation of non max suppression."""
# Import libraries
import tensorflow as tf
from official.vision.ops import box_ops
NMS_TILE_SIZE = 512
def _self_suppression(iou, _, iou_sum):
batch_size = tf.shape(iou)[0]
can_suppress_others = tf.cast(
tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
iou_suppressed = tf.reshape(
tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
[batch_size, -1, 1]) * iou
iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
return [
iou_suppressed,
tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
]
def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
batch_size = tf.shape(boxes)[0]
new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
iou = box_ops.bbox_overlap(new_slice, box_slice)
ret_slice = tf.expand_dims(
tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
2) * box_slice
return boxes, ret_slice, iou_threshold, inner_idx + 1
def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
"""Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
Args:
boxes: a tensor with a shape of [batch_size, anchors, 4].
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
output_size: an int32 tensor of size [batch_size]. Representing the number
of selected boxes for each batch.
idx: an integer scalar representing induction variable.
Returns:
boxes: updated boxes.
iou_threshold: pass down iou_threshold to the next iteration.
output_size: the updated output_size.
idx: the updated induction variable.
"""
num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
batch_size = tf.shape(boxes)[0]
# Iterates over tiles that can possibly suppress the current tile.
box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
_, box_slice, _, _ = tf.while_loop(
lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
_cross_suppression, [boxes, box_slice, iou_threshold,
tf.constant(0)])
# Iterates over the current tile to compute self-suppression.
iou = box_ops.bbox_overlap(box_slice, box_slice)
mask = tf.expand_dims(
tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
suppressed_iou, _, _ = tf.while_loop(
lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
[iou, tf.constant(True),
tf.reduce_sum(iou, [1, 2])])
suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
# Uses box_slice to update the input boxes.
mask = tf.reshape(
tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
boxes = tf.tile(tf.expand_dims(
box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
boxes = tf.reshape(boxes, [batch_size, -1, 4])
# Updates output_size.
output_size += tf.reduce_sum(
tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
return boxes, iou_threshold, output_size, idx + 1
def sorted_non_max_suppression_padded(scores,
boxes,
max_output_size,
iou_threshold):
"""A wrapper that handles non-maximum suppression.
Assumption:
* The boxes are sorted by scores unless the box is a dot (all coordinates
are zero).
* Boxes with higher scores can be used to suppress boxes with lower scores.
  The overall design of the algorithm is to handle boxes tile-by-tile:
  boxes = boxes.pad_to_multiple_of(tile_size)
num_tiles = len(boxes) // tile_size
output_boxes = []
for i in range(num_tiles):
box_tile = boxes[i*tile_size : (i+1)*tile_size]
    for j in range(i):
suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
iou = bbox_overlap(box_tile, suppressing_tile)
# if the box is suppressed in iou, clear it to a dot
box_tile *= _update_boxes(iou)
    # Iteratively handle the diagonal tile.
    iou = bbox_overlap(box_tile, box_tile)
iou_changed = True
while iou_changed:
# boxes that are not suppressed by anything else
suppressing_boxes = _get_suppressing_boxes(iou)
# boxes that are suppressed by suppressing_boxes
suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
# clear iou to 0 for boxes that are suppressed, as they cannot be used
# to suppress other boxes any more
new_iou = _clear_iou(iou, suppressed_boxes)
iou_changed = (new_iou != iou)
iou = new_iou
    # remaining boxes that can still suppress others are the selected boxes.
output_boxes.append(_get_suppressing_boxes(iou))
if len(output_boxes) >= max_output_size:
break
Args:
scores: a tensor with a shape of [batch_size, anchors].
boxes: a tensor with a shape of [batch_size, anchors, 4].
max_output_size: a scalar integer `Tensor` representing the maximum number
of boxes to be selected by non max suppression.
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
Returns:
nms_scores: a tensor with a shape of [batch_size, anchors]. It has same
dtype as input scores.
nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has
same dtype as input boxes.
"""
batch_size = tf.shape(boxes)[0]
num_boxes = tf.shape(boxes)[1]
pad = tf.cast(
tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
tf.int32) * NMS_TILE_SIZE - num_boxes
boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
scores = tf.pad(
tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
num_boxes += pad
def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
return tf.logical_and(
tf.reduce_min(output_size) < max_output_size,
idx < num_boxes // NMS_TILE_SIZE)
selected_boxes, _, output_size, _ = tf.while_loop(
_loop_cond, _suppression_loop_body, [
boxes, iou_threshold,
tf.zeros([batch_size], tf.int32),
tf.constant(0)
])
idx = num_boxes - tf.cast(
tf.nn.top_k(
tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
tf.int32)
idx = tf.minimum(idx, num_boxes - 1)
idx = tf.reshape(
idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1])
boxes = tf.reshape(
tf.gather(tf.reshape(boxes, [-1, 4]), idx),
[batch_size, max_output_size, 4])
boxes = boxes * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
output_size, [-1, 1, 1]), boxes.dtype)
scores = tf.reshape(
tf.gather(tf.reshape(scores, [-1, 1]), idx),
[batch_size, max_output_size])
scores = scores * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
output_size, [-1, 1]), scores.dtype)
return scores, boxes
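# A minimal usage sketch (illustrative only; the scores and boxes below are
# made up). Per the assumptions above, scores must already be sorted in
# descending order along the anchors dimension.
def _example_sorted_non_max_suppression_padded():
  scores = tf.constant([[0.9, 0.8, 0.7, 0.1]])
  boxes = tf.constant([[[0.0, 0.0, 10.0, 10.0],
                        [0.0, 0.0, 9.0, 9.0],
                        [20.0, 20.0, 30.0, 30.0],
                        [0.0, 0.0, 0.0, 0.0]]])
  nms_scores, nms_boxes = sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)
  return nms_scores, nms_boxes  # shapes: [1, 2] and [1, 2, 4]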
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Preprocessing ops."""
import math
from typing import Optional, Tuple, Union
from six.moves import range
import tensorflow as tf
from official.vision.ops import augment
from official.vision.ops import box_ops
CENTER_CROP_FRACTION = 0.875
def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0):
"""Pads data to a fixed length at the first dimension.
Args:
input_tensor: `Tensor` with any dimension.
size: `int` number for the first dimension of output Tensor.
constant_values: `int` value assigned to the paddings.
Returns:
`Tensor` with the first dimension padded to `size`.
"""
input_shape = input_tensor.get_shape().as_list()
padding_shape = []
# Computes the padding length on the first dimension, clip input tensor if it
# is longer than `size`.
input_length = tf.shape(input_tensor)[0]
input_length = tf.clip_by_value(input_length, 0, size)
input_tensor = input_tensor[:input_length]
padding_length = tf.maximum(0, size - input_length)
padding_shape.append(padding_length)
# Copies shapes of the rest of input shape dimensions.
for i in range(1, len(input_shape)):
padding_shape.append(tf.shape(input_tensor)[i])
# Pads input tensor to the fixed first dimension.
paddings = tf.cast(constant_values * tf.ones(padding_shape),
input_tensor.dtype)
padded_tensor = tf.concat([input_tensor, paddings], axis=0)
output_shape = input_shape
output_shape[0] = size
padded_tensor.set_shape(output_shape)
return padded_tensor
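# A minimal usage sketch (illustrative only): pad or clip a variable-length
# per-image tensor, e.g. ground truth boxes, to a fixed first dimension so
# that examples can be batched.
def _example_clip_or_pad_to_fixed_size():
  boxes = tf.ones([7, 4], dtype=tf.float32)
  padded = clip_or_pad_to_fixed_size(boxes, size=10)  # shape: [10, 4]
  clipped = clip_or_pad_to_fixed_size(boxes, size=5)  # shape: [5, 4]
  return padded, clipped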
def normalize_image(image,
offset=(0.485, 0.456, 0.406),
scale=(0.229, 0.224, 0.225)):
"""Normalizes the image to zero mean and unit variance."""
with tf.name_scope('normalize_image'):
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
offset = tf.constant(offset)
offset = tf.expand_dims(offset, axis=0)
offset = tf.expand_dims(offset, axis=0)
image -= offset
scale = tf.constant(scale)
scale = tf.expand_dims(scale, axis=0)
scale = tf.expand_dims(scale, axis=0)
image /= scale
return image
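# A minimal usage sketch (illustrative only): the default offset/scale are the
# common ImageNet per-channel mean/std. `convert_image_dtype` first maps a
# uint8 image into [0, 1], so the statistics are expressed in that range.
def _example_normalize_image():
  image = tf.zeros([224, 224, 3], dtype=tf.uint8)
  return normalize_image(image)  # float32, roughly zero-mean per channel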
def compute_padded_size(desired_size, stride):
"""Compute the padded size given the desired size and the stride.
  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride that is greater than or equal to the
  desired dimension. For example, if desired_size = (100, 200) and stride = 32,
the output padded_size = (128, 224).
Args:
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the target output image size.
stride: an integer, the stride of the backbone network.
Returns:
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size.
"""
if isinstance(desired_size, list) or isinstance(desired_size, tuple):
padded_size = [int(math.ceil(d * 1.0 / stride) * stride)
for d in desired_size]
else:
padded_size = tf.cast(
tf.math.ceil(
tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32)
return padded_size
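# A minimal worked example (illustrative only), reproducing the case from the
# docstring above: each dimension is rounded up to the next multiple of the
# stride.
def _example_compute_padded_size():
  return compute_padded_size((100, 200), stride=32)  # -> [128, 224]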
def resize_and_crop_image(image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (RetinaNet style).
Resize and pad images given the desired output size of the image and
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and rescale the image to make it
the largest rectangle to be bounded by the rectangle specified by the
`desired_size`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the desired actual output image size.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(
scaled_size[0] / image_size[0], scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.constant(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
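# A minimal usage sketch (illustrative only; the input shape and output size
# are made up): RetinaNet-style resizing of an arbitrary image onto a fixed
# 640x640 canvas padded to a multiple of the backbone stride.
def _example_resize_and_crop_image():
  image = tf.zeros([423, 617, 3], dtype=tf.float32)
  desired_size = (640, 640)
  output_image, image_info = resize_and_crop_image(
      image,
      desired_size=desired_size,
      padded_size=compute_padded_size(desired_size, stride=32))
  # output_image has shape [640, 640, 3]; image_info rows are the original
  # size, the scaled size, the scale factors, and the crop offset.
  return output_image, image_info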
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resize and pad images given the specified short / long side length and the
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and first try to rescale the short
side of the original image to `short_side`.
  2. If the scaled image after step 1 has a long side that exceeds `long_side`,
  keep the aspect ratio and rescale the long side of the image to `long_side`.
  3. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side),
scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
def resize_image(
image: tf.Tensor,
size: Union[Tuple[int, int], int],
max_size: Optional[int] = None,
method: tf.image.ResizeMethod = tf.image.ResizeMethod.BILINEAR):
"""Resize image with size and max_size.
Args:
image: the image to be resized.
    size: if a list or tuple, resize to it. If a scalar, keep the same aspect
      ratio and resize the short side to this value.
    max_size: only used when size is a scalar. If the longer side would exceed
      max_size after resizing with size, max_size is used for the longer side
      instead, keeping the aspect ratio.
method: the method argument passed to tf.image.resize.
Returns:
the resized image and image_info to be used for downstream processing.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [resized_height, resized_width],
[y_scale, x_scale], [0, 0]], where [resized_height, resized_width]
is the actual scaled image size, and [y_scale, x_scale] is the
scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
def get_size_with_aspect_ratio(image_size, size, max_size=None):
h = image_size[0]
w = image_size[1]
if max_size is not None:
min_original_size = tf.cast(tf.math.minimum(w, h), dtype=tf.float32)
max_original_size = tf.cast(tf.math.maximum(w, h), dtype=tf.float32)
if max_original_size / min_original_size * size > max_size:
size = tf.cast(
tf.math.floor(max_size * min_original_size / max_original_size),
dtype=tf.int32)
else:
size = tf.cast(size, tf.int32)
else:
size = tf.cast(size, tf.int32)
if (w <= h and w == size) or (h <= w and h == size):
return tf.stack([h, w])
if w < h:
ow = size
oh = tf.cast(
(tf.cast(size, dtype=tf.float32) * tf.cast(h, dtype=tf.float32) /
tf.cast(w, dtype=tf.float32)),
dtype=tf.int32)
else:
oh = size
ow = tf.cast(
(tf.cast(size, dtype=tf.float32) * tf.cast(w, dtype=tf.float32) /
tf.cast(h, dtype=tf.float32)),
dtype=tf.int32)
return tf.stack([oh, ow])
def get_size(image_size, size, max_size=None):
if isinstance(size, (list, tuple)):
return size[::-1]
else:
return get_size_with_aspect_ratio(image_size, size, max_size)
  original_size = tf.shape(image)[0:2]
  size = get_size(original_size, size, max_size)
  rescaled_image = tf.image.resize(
      image, tf.cast(size, tf.int32), method=method)
  image_scale = size / original_size
  image_info = tf.stack([
      tf.cast(original_size, dtype=tf.float32),
tf.cast(size, dtype=tf.float32),
tf.cast(image_scale, tf.float32),
tf.constant([0.0, 0.0], dtype=tf.float32)
])
return rescaled_image, image_info
def center_crop_image(image):
"""Center crop a square shape slice from the input image.
It crops a square shape slice from the image. The side of the actual crop
is 224 / 256 = 0.875 of the short side of the original image. References:
[1] Very Deep Convolutional Networks for Large-Scale Image Recognition
https://arxiv.org/abs/1409.1556
[2] Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
Args:
image: a Tensor of shape [height, width, 3] representing the input image.
Returns:
cropped_image: a Tensor representing the center cropped image.
"""
with tf.name_scope('center_crop_image'):
image_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
crop_size = (
CENTER_CROP_FRACTION * tf.math.minimum(image_size[0], image_size[1]))
crop_offset = tf.cast((image_size - crop_size) / 2.0, dtype=tf.int32)
crop_size = tf.cast(crop_size, dtype=tf.int32)
cropped_image = image[
crop_offset[0]:crop_offset[0] + crop_size,
crop_offset[1]:crop_offset[1] + crop_size, :]
return cropped_image
def center_crop_image_v2(image_bytes, image_shape):
"""Center crop a square shape slice from the input image.
It crops a square shape slice from the image. The side of the actual crop
is 224 / 256 = 0.875 of the short side of the original image. References:
[1] Very Deep Convolutional Networks for Large-Scale Image Recognition
https://arxiv.org/abs/1409.1556
[2] Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
This is a faster version of `center_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decodes the JPEG
bytes according to the center crop.
Args:
image_bytes: a Tensor of type string representing the raw image bytes.
image_shape: a Tensor specifying the shape of the raw image.
Returns:
cropped_image: a Tensor representing the center cropped image.
"""
with tf.name_scope('center_image_crop_v2'):
image_shape = tf.cast(image_shape, tf.float32)
crop_size = (
CENTER_CROP_FRACTION * tf.math.minimum(image_shape[0], image_shape[1]))
crop_offset = tf.cast((image_shape - crop_size) / 2.0, dtype=tf.int32)
crop_size = tf.cast(crop_size, dtype=tf.int32)
crop_window = tf.stack(
[crop_offset[0], crop_offset[1], crop_size, crop_size])
cropped_image = tf.image.decode_and_crop_jpeg(
image_bytes, crop_window, channels=3)
return cropped_image
def random_crop_image(image,
aspect_ratio_range=(3. / 4., 4. / 3.),
area_range=(0.08, 1.0),
max_attempts=10,
seed=1):
"""Randomly crop an arbitrary shaped slice from the input image.
Args:
image: a Tensor of shape [height, width, 3] representing the input image.
aspect_ratio_range: a list of floats. The cropped area of the image must
have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
a fraction of the input image within this range.
max_attempts: the number of attempts at generating a cropped region of the
image of the specified constraints. After max_attempts failures, return
the entire image.
seed: the seed of the random generator.
Returns:
cropped_image: a Tensor representing the random cropped image. Can be the
original image if max_attempts is exhausted.
"""
with tf.name_scope('random_crop_image'):
crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
tf.shape(image),
tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
seed=seed,
min_object_covered=area_range[0],
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts)
cropped_image = tf.slice(image, crop_offset, crop_size)
return cropped_image
def random_crop_image_v2(image_bytes,
image_shape,
aspect_ratio_range=(3. / 4., 4. / 3.),
area_range=(0.08, 1.0),
max_attempts=10,
seed=1):
"""Randomly crop an arbitrary shaped slice from the input image.
This is a faster version of `random_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decodes the JPEG
bytes according to the generated crop.
Args:
image_bytes: a Tensor of type string representing the raw image bytes.
image_shape: a Tensor specifying the shape of the raw image.
aspect_ratio_range: a list of floats. The cropped area of the image must
have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
a fraction of the input image within this range.
max_attempts: the number of attempts at generating a cropped region of the
image of the specified constraints. After max_attempts failures, return
the entire image.
seed: the seed of the random generator.
Returns:
cropped_image: a Tensor representing the random cropped image. Can be the
original image if max_attempts is exhausted.
"""
with tf.name_scope('random_crop_image_v2'):
crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
image_shape,
tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
seed=seed,
min_object_covered=area_range[0],
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts)
offset_y, offset_x, _ = tf.unstack(crop_offset)
crop_height, crop_width, _ = tf.unstack(crop_size)
crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width])
cropped_image = tf.image.decode_and_crop_jpeg(
image_bytes, crop_window, channels=3)
return cropped_image
def resize_and_crop_boxes(boxes,
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset.
Args:
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
"""
with tf.name_scope('resize_and_crop_boxes'):
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = box_ops.clip_boxes(boxes, output_size)
return boxes
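# A minimal usage sketch (illustrative only) of how the `image_info` returned
# by `resize_and_crop_image` is typically threaded into this function. `boxes`
# is assumed to be in absolute [ymin, xmin, ymax, xmax] pixel coordinates of
# the original image.
def _example_resize_and_crop_boxes(image, boxes, desired_size, stride=32):
  _, image_info = resize_and_crop_image(
      image,
      desired_size=desired_size,
      padded_size=compute_padded_size(desired_size, stride))
  image_scale = image_info[2, :]
  offset = image_info[3, :]
  # Clip the scaled boxes to the actual (unpadded) scaled image size.
  return resize_and_crop_boxes(boxes, image_scale, image_info[1, :], offset)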
def resize_and_crop_masks(masks,
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset.
Args:
masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
"""
with tf.name_scope('resize_and_crop_masks'):
mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32)
# Pad masks to avoid empty mask annotations.
masks = tf.concat(
[tf.zeros([1, mask_size[0], mask_size[1], 1]), masks], axis=0)
scaled_size = tf.cast(image_scale * mask_size, tf.int32)
scaled_masks = tf.image.resize(
masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
offset = tf.cast(offset, tf.int32)
scaled_masks = scaled_masks[
:,
offset[0]:offset[0] + output_size[0],
offset[1]:offset[1] + output_size[1],
:]
output_masks = tf.image.pad_to_bounding_box(
scaled_masks, 0, 0, output_size[0], output_size[1])
# Remove padding.
output_masks = output_masks[1::]
return output_masks
def horizontal_flip_image(image):
"""Flips image horizontally."""
return tf.image.flip_left_right(image)
def horizontal_flip_boxes(normalized_boxes):
"""Flips normalized boxes horizontally."""
ymin, xmin, ymax, xmax = tf.split(
value=normalized_boxes, num_or_size_splits=4, axis=1)
flipped_xmin = tf.subtract(1.0, xmax)
flipped_xmax = tf.subtract(1.0, xmin)
flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
return flipped_boxes
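# A minimal worked example (illustrative only): flipping reflects only the x
# coordinates, xmin' = 1 - xmax and xmax' = 1 - xmin, so a box hugging the
# left edge ends up hugging the right edge.
def _example_horizontal_flip_boxes():
  boxes = tf.constant([[0.1, 0.0, 0.5, 0.25]])  # [ymin, xmin, ymax, xmax]
  return horizontal_flip_boxes(boxes)  # -> [[0.1, 0.75, 0.5, 1.0]]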
def horizontal_flip_masks(masks):
"""Flips masks horizontally."""
return masks[:, :, ::-1]
def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
"""Randomly flips input image and bounding boxes."""
with tf.name_scope('random_horizontal_flip'):
do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)
image = tf.cond(
do_flip,
lambda: horizontal_flip_image(image),
lambda: image)
if normalized_boxes is not None:
normalized_boxes = tf.cond(
do_flip,
lambda: horizontal_flip_boxes(normalized_boxes),
lambda: normalized_boxes)
if masks is not None:
masks = tf.cond(
do_flip,
lambda: horizontal_flip_masks(masks),
lambda: masks)
return image, normalized_boxes, masks
def color_jitter(image: tf.Tensor,
brightness: Optional[float] = 0.,
contrast: Optional[float] = 0.,
saturation: Optional[float] = 0.,
seed: Optional[int] = None) -> tf.Tensor:
"""Applies color jitter to an image, similarly to torchvision`s ColorJitter.
Args:
image (tf.Tensor): Of shape [height, width, 3] and type uint8.
brightness (float, optional): Magnitude for brightness jitter. Defaults to
0.
contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
saturation (float, optional): Magnitude for saturation jitter. Defaults to
0.
seed (int, optional): Random seed. Defaults to None.
Returns:
tf.Tensor: The augmented `image` of type uint8.
"""
image = tf.cast(image, dtype=tf.uint8)
image = random_brightness(image, brightness, seed=seed)
image = random_contrast(image, contrast, seed=seed)
image = random_saturation(image, saturation, seed=seed)
return image
def random_brightness(image: tf.Tensor,
brightness: float = 0.,
seed: Optional[int] = None) -> tf.Tensor:
"""Jitters brightness of an image.
Args:
image (tf.Tensor): Of shape [height, width, 3] and type uint8.
brightness (float, optional): Magnitude for brightness jitter. Defaults to
0.
seed (int, optional): Random seed. Defaults to None.
Returns:
tf.Tensor: The augmented `image` of type uint8.
"""
  assert brightness >= 0, '`brightness` must be non-negative'
brightness = tf.random.uniform([],
max(0, 1 - brightness),
1 + brightness,
seed=seed,
dtype=tf.float32)
return augment.brightness(image, brightness)
def random_contrast(image: tf.Tensor,
contrast: float = 0.,
seed: Optional[int] = None) -> tf.Tensor:
"""Jitters contrast of an image, similarly to torchvision`s ColorJitter.
Args:
image (tf.Tensor): Of shape [height, width, 3] and type uint8.
contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
seed (int, optional): Random seed. Defaults to None.
Returns:
tf.Tensor: The augmented `image` of type uint8.
"""
  assert contrast >= 0, '`contrast` must be non-negative'
contrast = tf.random.uniform([],
max(0, 1 - contrast),
1 + contrast,
seed=seed,
dtype=tf.float32)
return augment.contrast(image, contrast)
def random_saturation(image: tf.Tensor,
saturation: float = 0.,
seed: Optional[int] = None) -> tf.Tensor:
"""Jitters saturation of an image, similarly to torchvision`s ColorJitter.
Args:
image (tf.Tensor): Of shape [height, width, 3] and type uint8.
saturation (float, optional): Magnitude for saturation jitter. Defaults to
0.
seed (int, optional): Random seed. Defaults to None.
Returns:
tf.Tensor: The augmented `image` of type uint8.
"""
  assert saturation >= 0, '`saturation` must be non-negative'
saturation = tf.random.uniform([],
max(0, 1 - saturation),
1 + saturation,
seed=seed,
dtype=tf.float32)
return _saturation(image, saturation)
def _saturation(image: tf.Tensor,
saturation: Optional[float] = 0.) -> tf.Tensor:
return augment.blend(
tf.repeat(tf.image.rgb_to_grayscale(image), 3, axis=-1), image,
saturation)
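# A minimal usage sketch (illustrative only; the magnitudes are made up):
# apply torchvision-style color jitter to a uint8 image.
def _example_color_jitter():
  image = tf.zeros([224, 224, 3], dtype=tf.uint8)
  return color_jitter(image, brightness=0.4, contrast=0.4, saturation=0.4,
                      seed=1)  # uint8 image of the same shape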
def random_crop_image_with_boxes_and_labels(img, boxes, labels, min_scale,
aspect_ratio_range,
min_overlap_params, max_retry):
"""Crops a random slice from the input image.
  The function correspondingly recomputes the bounding boxes and filters out
  boxes (and their labels) that fall outside the crop.
References:
[1] End-to-End Object Detection with Transformers
https://arxiv.org/abs/2005.12872
The preprocessing steps:
1. Sample a minimum IoU overlap.
2. For each trial, sample the new image width, height, and top-left corner.
3. Compute the IoUs of bounding boxes with the cropped image and retry if
the maximum IoU is below the sampled threshold.
4. Find boxes whose centers are in the cropped image.
5. Compute new bounding boxes in the cropped region and only select those
boxes' labels.
Args:
img: a 'Tensor' of shape [height, width, 3] representing the input image.
boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
boxes with (ymin, xmin, ymax, xmax).
labels: a 'Tensor' of shape [N,] representing the class labels of the boxes.
min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
scale variable.
aspect_ratio_range: a list of two 'float' that specifies the lower and upper
bound of the random aspect ratio.
min_overlap_params: a list of four 'float' representing the min value, max
value, step size, and offset for the minimum overlap sample.
max_retry: an 'int' representing the number of trials for cropping. If it is
exhausted, no cropping will be performed.
Returns:
img: a Tensor representing the random cropped image. Can be the
original image if max_retry is exhausted.
boxes: a Tensor representing the bounding boxes in the cropped image.
labels: a Tensor representing the new bounding boxes' labels.
"""
shape = tf.shape(img)
original_h = shape[0]
original_w = shape[1]
minval, maxval, step, offset = min_overlap_params
min_overlap = tf.math.floordiv(
tf.random.uniform([], minval=minval, maxval=maxval), step) * step - offset
min_overlap = tf.clip_by_value(min_overlap, 0.0, 1.1)
if min_overlap > 1.0:
return img, boxes, labels
aspect_ratio_low = aspect_ratio_range[0]
aspect_ratio_high = aspect_ratio_range[1]
for _ in tf.range(max_retry):
scale_h = tf.random.uniform([], min_scale, 1.0)
scale_w = tf.random.uniform([], min_scale, 1.0)
new_h = tf.cast(
scale_h * tf.cast(original_h, dtype=tf.float32), dtype=tf.int32)
new_w = tf.cast(
scale_w * tf.cast(original_w, dtype=tf.float32), dtype=tf.int32)
# Aspect ratio has to be in the prespecified range
aspect_ratio = new_h / new_w
if aspect_ratio_low > aspect_ratio or aspect_ratio > aspect_ratio_high:
continue
left = tf.random.uniform([], 0, original_w - new_w, dtype=tf.int32)
right = left + new_w
top = tf.random.uniform([], 0, original_h - new_h, dtype=tf.int32)
bottom = top + new_h
normalized_left = tf.cast(
left, dtype=tf.float32) / tf.cast(
original_w, dtype=tf.float32)
normalized_right = tf.cast(
right, dtype=tf.float32) / tf.cast(
original_w, dtype=tf.float32)
normalized_top = tf.cast(
top, dtype=tf.float32) / tf.cast(
original_h, dtype=tf.float32)
normalized_bottom = tf.cast(
bottom, dtype=tf.float32) / tf.cast(
original_h, dtype=tf.float32)
cropped_box = tf.expand_dims(
tf.stack([
normalized_top,
normalized_left,
normalized_bottom,
normalized_right,
]),
axis=0)
iou = box_ops.bbox_overlap(
tf.expand_dims(cropped_box, axis=0),
tf.expand_dims(boxes, axis=0)) # (1, 1, n_ground_truth)
iou = tf.squeeze(iou, axis=[0, 1])
# If not a single bounding box has a Jaccard overlap of greater than
# the minimum, try again
if tf.reduce_max(iou) < min_overlap:
continue
centroids = box_ops.yxyx_to_cycxhw(boxes)
mask = tf.math.logical_and(
tf.math.logical_and(centroids[:, 0] > normalized_top,
centroids[:, 0] < normalized_bottom),
tf.math.logical_and(centroids[:, 1] > normalized_left,
centroids[:, 1] < normalized_right))
    # Proceed only if at least one bounding box has its center in the crop;
    # otherwise try again.
if tf.reduce_sum(tf.cast(mask, dtype=tf.int32)) > 0:
indices = tf.squeeze(tf.where(mask), axis=1)
filtered_boxes = tf.gather(boxes, indices)
boxes = tf.clip_by_value(
(filtered_boxes[..., :] * tf.cast(
tf.stack([original_h, original_w, original_h, original_w]),
dtype=tf.float32) -
tf.cast(tf.stack([top, left, top, left]), dtype=tf.float32)) /
tf.cast(tf.stack([new_h, new_w, new_h, new_w]), dtype=tf.float32),
0.0, 1.0)
img = tf.image.crop_to_bounding_box(img, top, left, bottom - top,
right - left)
labels = tf.gather(labels, indices)
break
return img, boxes, labels
def random_crop(image,
boxes,
labels,
min_scale=0.3,
aspect_ratio_range=(0.5, 2.0),
min_overlap_params=(0.0, 1.4, 0.2, 0.1),
max_retry=50,
seed=None):
"""Randomly crop the image and boxes, filtering labels.
Args:
image: a 'Tensor' of shape [height, width, 3] representing the input image.
boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
boxes with (ymin, xmin, ymax, xmax).
labels: a 'Tensor' of shape [N,] representing the class labels of the boxes.
min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
scale variable.
aspect_ratio_range: a list of two 'float' that specifies the lower and upper
bound of the random aspect ratio.
min_overlap_params: a list of four 'float' representing the min value, max
value, step size, and offset for the minimum overlap sample.
max_retry: an 'int' representing the number of trials for cropping. If it is
exhausted, no cropping will be performed.
seed: the random number seed of int, but could be None.
Returns:
image: a Tensor representing the random cropped image. Can be the
original image if max_retry is exhausted.
boxes: a Tensor representing the bounding boxes in the cropped image.
labels: a Tensor representing the new bounding boxes' labels.
"""
with tf.name_scope('random_crop'):
do_crop = tf.greater(tf.random.uniform([], seed=seed), 0.5)
if do_crop:
return random_crop_image_with_boxes_and_labels(image, boxes, labels,
min_scale,
aspect_ratio_range,
min_overlap_params,
max_retry)
else:
return image, boxes, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utils for processing video dataset features."""
from typing import Optional, Tuple
import tensorflow as tf
def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
num_steps: int,
stride: int,
offset: tf.Tensor) -> tf.Tensor:
"""Returns indices to take for sampling or padding sequences to fixed size."""
sequence_length = tf.shape(sequence)[0]
sel_idx = tf.range(sequence_length)
# Repeats sequence until num_steps are available in total.
max_length = num_steps * stride + offset
num_repeats = tf.math.floordiv(
max_length + sequence_length - 1, sequence_length)
sel_idx = tf.tile(sel_idx, [num_repeats])
steps = tf.range(offset, offset + num_steps * stride, stride)
return tf.gather(sel_idx, steps)
def sample_linspace_sequence(sequence: tf.Tensor,
num_windows: int,
num_steps: int,
stride: int) -> tf.Tensor:
"""Samples `num_windows` segments from sequence with linearly spaced offsets.
The samples are concatenated in a single `tf.Tensor` in order to have the same
format structure per timestep (e.g. a single frame). If `num_steps` * `stride`
is bigger than the number of timesteps, the sequence is repeated. This
function can be used in evaluation in order to extract enough segments to span
the entire sequence.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_windows: Number of windows retrieved from the sequence.
num_steps: Number of steps (e.g. frames) to take.
stride: Distance to sample between timesteps.
Returns:
    A single `tf.Tensor` with first dimension `num_windows` * `num_steps`. The
    tensor contains the concatenated list of `num_windows` tensors whose
    offsets are linearly spaced over the input sequence.
"""
sequence_length = tf.shape(sequence)[0]
max_offset = tf.maximum(0, sequence_length - num_steps * stride)
offsets = tf.linspace(0.0, tf.cast(max_offset, tf.float32), num_windows)
offsets = tf.cast(offsets, tf.int32)
all_indices = []
for i in range(num_windows):
all_indices.append(_sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offsets[i]))
indices = tf.concat(all_indices, axis=0)
indices.set_shape((num_windows * num_steps,))
return tf.gather(sequence, indices)
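# A minimal worked example (illustrative only), consistent with the unit test
# further down: 7 windows of 10 steps over a 100-step sequence use start
# offsets linearly spaced at 0, 15, 30, ..., 90.
def _example_sample_linspace_sequence():
  sequence = tf.range(100)
  return sample_linspace_sequence(
      sequence, num_windows=7, num_steps=10, stride=1)  # shape: [70]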
def sample_sequence(sequence: tf.Tensor,
num_steps: int,
random: bool,
stride: int,
seed: Optional[int] = None) -> tf.Tensor:
"""Samples a single segment of size `num_steps` from a given sequence.
  If `random` is not `True`, this function will simply sample the central window
  of the sequence. Otherwise, a random offset is chosen so that the desired
  `num_steps` can be extracted from the sequence.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_steps: Number of steps (e.g. frames) to take.
    random: A boolean indicating whether to randomly sample the single window. If
`True`, the offset is randomized. If `False`, the middle frame minus half
of `num_steps` is the first frame.
stride: Distance to sample between timesteps.
seed: A deterministic seed to use when sampling.
Returns:
A single `tf.Tensor` with first dimension `num_steps` with the sampled
segment.
"""
sequence_length = tf.shape(sequence)[0]
if random:
sequence_length = tf.cast(sequence_length, tf.float32)
frame_stride = tf.cast(stride, tf.float32)
max_offset = tf.cond(
sequence_length > (num_steps - 1) * frame_stride,
lambda: sequence_length - (num_steps - 1) * frame_stride,
lambda: sequence_length)
offset = tf.random.uniform(
(),
maxval=tf.cast(max_offset, dtype=tf.int32),
dtype=tf.int32,
seed=seed)
else:
offset = (sequence_length - num_steps * stride) // 2
offset = tf.maximum(0, offset)
indices = _sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offset)
indices.set_shape((num_steps,))
return tf.gather(sequence, indices)
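# A minimal worked example (illustrative only), consistent with the unit test
# further down: with `random=False` the central window is returned.
def _example_sample_sequence():
  sequence = tf.range(100)
  return sample_sequence(
      sequence, num_steps=10, random=False, stride=1)  # [45, 46, ..., 54]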
def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""Decodes JPEG raw bytes string into a RGB uint8 Tensor.
Args:
image_string: A `tf.Tensor` of type strings with the raw JPEG bytes where
the first dimension is timesteps.
channels: Number of channels of the JPEG image. Allowed values are 0, 1 and
3. If 0, the number of channels will be calculated at runtime and no
static shape is set.
Returns:
A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
"""
return tf.map_fn(
lambda x: tf.image.decode_jpeg(x, channels=channels),
image_string, back_prop=False, dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
target_height: int,
target_width: int,
random: bool = False,
num_crops: int = 1,
seed: Optional[int] = None) -> tf.Tensor:
"""Crops the image sequence of images.
If requested size is bigger than image size, image is padded with 0. If not
random cropping, a central crop is performed if num_crops is 1.
Args:
frames: A Tensor of dimension [timesteps, in_height, in_width, channels].
target_height: Target cropped image height.
target_width: Target cropped image width.
random: A boolean indicating if crop should be randomized.
num_crops: Number of crops (support 1 for central crop and 3 for 3-crop).
seed: A deterministic seed to use when random cropping.
Returns:
A Tensor of shape [timesteps, out_height, out_width, channels] of type uint8
with the cropped images.
"""
if random:
# Random spatial crop.
shape = tf.shape(frames)
# If a static_shape is available (e.g. when using this method from add_image
# method), it will be used to have an output tensor with static shape.
static_shape = frames.shape.as_list()
seq_len = shape[0] if static_shape[0] is None else static_shape[0]
channels = shape[3] if static_shape[3] is None else static_shape[3]
frames = tf.image.random_crop(
frames, (seq_len, target_height, target_width, channels), seed)
else:
if num_crops == 1:
# Central crop or pad.
frames = tf.image.resize_with_crop_or_pad(frames, target_height,
target_width)
elif num_crops == 3:
# Three-crop evaluation.
shape = tf.shape(frames)
static_shape = frames.shape.as_list()
seq_len = shape[0] if static_shape[0] is None else static_shape[0]
height = shape[1] if static_shape[1] is None else static_shape[1]
width = shape[2] if static_shape[2] is None else static_shape[2]
channels = shape[3] if static_shape[3] is None else static_shape[3]
size = tf.convert_to_tensor(
(seq_len, target_height, target_width, channels))
offset_1 = tf.broadcast_to([0, 0, 0, 0], [4])
# pylint:disable=g-long-lambda
offset_2 = tf.cond(
tf.greater_equal(height, width),
true_fn=lambda: tf.broadcast_to([
0, tf.cast(height, tf.float32) / 2 - target_height // 2, 0, 0
], [4]),
false_fn=lambda: tf.broadcast_to([
0, 0, tf.cast(width, tf.float32) / 2 - target_width // 2, 0
], [4]))
offset_3 = tf.cond(
tf.greater_equal(height, width),
true_fn=lambda: tf.broadcast_to(
[0, tf.cast(height, tf.float32) - target_height, 0, 0], [4]),
false_fn=lambda: tf.broadcast_to(
[0, 0, tf.cast(width, tf.float32) - target_width, 0], [4]))
      # pylint:enable=g-long-lambda
crops = []
for offset in [offset_1, offset_2, offset_3]:
offset = tf.cast(tf.math.round(offset), tf.int32)
crops.append(tf.slice(frames, offset, size))
frames = tf.concat(crops, axis=0)
else:
raise NotImplementedError(
f"Only 1-crop and 3-crop are supported. Found {num_crops!r}.")
return frames
def resize_smallest(frames: tf.Tensor,
min_resize: int) -> tf.Tensor:
"""Resizes frames so that min(`height`, `width`) is equal to `min_resize`.
  This function will not do anything if the min(`height`, `width`) is already
  equal to `min_resize`, which saves compute time.
Args:
frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
min_resize: Minimum size of the final image dimensions.
Returns:
A Tensor of shape [timesteps, output_h, output_w, channels] of type
frames.dtype where min(output_h, output_w) = min_resize.
"""
shape = tf.shape(frames)
input_h = shape[1]
input_w = shape[2]
output_h = tf.maximum(min_resize, (input_h * min_resize) // input_w)
output_w = tf.maximum(min_resize, (input_w * min_resize) // input_h)
def resize_fn():
frames_resized = tf.image.resize(frames, (output_h, output_w))
return tf.cast(frames_resized, frames.dtype)
should_resize = tf.math.logical_or(tf.not_equal(input_w, output_w),
tf.not_equal(input_h, output_h))
frames = tf.cond(should_resize, resize_fn, lambda: frames)
return frames
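# A minimal worked example (illustrative only), consistent with the unit test
# further down: a 90x120 clip resized so that its smallest side becomes 45
# keeps its aspect ratio and comes out as 45x60.
def _example_resize_smallest():
  frames = tf.zeros([6, 90, 120, 3], dtype=tf.uint8)
  return resize_smallest(frames, min_resize=45)  # shape: [6, 45, 60, 3]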
def random_crop_resize(frames: tf.Tensor,
output_h: int,
output_w: int,
num_frames: int,
num_channels: int,
aspect_ratio: Tuple[float, float],
area_range: Tuple[float, float]) -> tf.Tensor:
"""First crops clip with jittering and then resizes to (output_h, output_w).
Args:
frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
output_h: Resized image height.
output_w: Resized image width.
num_frames: Number of input frames per clip.
num_channels: Number of channels of the clip.
aspect_ratio: Float tuple with the aspect range for cropping.
area_range: Float tuple with the area range for cropping.
Returns:
A Tensor of shape [timesteps, output_h, output_w, channels] of type
frames.dtype.
"""
shape = tf.shape(frames)
seq_len, _, _, channels = shape[0], shape[1], shape[2], shape[3]
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
factor = output_w / output_h
aspect_ratio = (aspect_ratio[0] * factor, aspect_ratio[1] * factor)
sample_distorted_bbox = tf.image.sample_distorted_bounding_box(
shape[1:],
bounding_boxes=bbox,
min_object_covered=0.1,
aspect_ratio_range=aspect_ratio,
area_range=area_range,
max_attempts=100,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, _ = sample_distorted_bbox
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
size = tf.convert_to_tensor((
seq_len, target_height, target_width, channels))
offset = tf.convert_to_tensor((
0, offset_y, offset_x, 0))
frames = tf.slice(frames, offset, size)
frames = tf.cast(
tf.image.resize(frames, (output_h, output_w)),
frames.dtype)
frames.set_shape((num_frames, output_h, output_w, num_channels))
return frames
def random_flip_left_right(
frames: tf.Tensor,
seed: Optional[int] = None) -> tf.Tensor:
"""Flips all the frames with a probability of 50%.
Args:
frames: A Tensor of shape [timesteps, input_h, input_w, channels].
seed: A seed to use for the random sampling.
Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels], possibly
    flipped left-right.
"""
is_flipped = tf.random.uniform(
(), minval=0, maxval=2, dtype=tf.int32, seed=seed)
frames = tf.cond(tf.equal(is_flipped, 1),
true_fn=lambda: tf.image.flip_left_right(frames),
false_fn=lambda: frames)
return frames
def normalize_image(frames: tf.Tensor,
zero_centering_image: bool,
dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
"""Normalizes images.
Args:
frames: A Tensor of numbers.
zero_centering_image: If True, results are in [-1, 1], if False, results are
in [0, 1].
dtype: Type of output Tensor.
Returns:
A Tensor of same shape as the input and of the given type.
"""
frames = tf.cast(frames, dtype)
if zero_centering_image:
return frames * (2.0 / 255.0) - 1.0
else:
return frames / 255.0
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
import io
import itertools
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.ops import preprocess_ops_3d
class ParserUtilsTest(tf.test.TestCase):
def setUp(self):
super().setUp()
    # [[0, 1, ..., 119], [1, 2, ..., 120], ..., [89, 90, ..., 208]].
self._frames = tf.stack([tf.range(i, i + 120) for i in range(90)])
self._frames = tf.cast(self._frames, tf.uint8)
self._frames = self._frames[tf.newaxis, :, :, tf.newaxis]
self._frames = tf.broadcast_to(self._frames, (6, 90, 120, 3))
# Create an equivalent numpy array for assertions.
self._np_frames = np.array([range(i, i + 120) for i in range(90)])
self._np_frames = self._np_frames[np.newaxis, :, :, np.newaxis]
self._np_frames = np.broadcast_to(self._np_frames, (6, 90, 120, 3))
def test_sample_linspace_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 10, 10, 1)
sampled_seq_2 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 7, 10, 1)
sampled_seq_3 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 7, 5, 2)
sampled_seq_4 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 101, 1, 1)
self.assertAllEqual(sampled_seq_1, range(100))
# [0, 1, 2, 3, 4, ..., 8, 9, 15, 16, ..., 97, 98, 99]
self.assertAllEqual(
sampled_seq_2,
[15 * i + j for i, j in itertools.product(range(7), range(10))])
# [0, 2, 4, 6, 8, 15, 17, 19, ..., 96, 98]
self.assertAllEqual(
sampled_seq_3,
[15 * i + 2 * j for i, j in itertools.product(range(7), range(5))])
self.assertAllEqual(sampled_seq_4, [0] + list(range(100)))
def test_sample_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 1)
sampled_seq_2 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 2)
sampled_seq_3 = preprocess_ops_3d.sample_sequence(sequence, 10, True, 1)
self.assertAllEqual(sampled_seq_1, range(45, 55))
self.assertAllEqual(sampled_seq_2, range(40, 60, 2))
offset_3 = sampled_seq_3[0]
self.assertBetween(offset_3, 0, 99)
self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))
def test_decode_jpeg(self):
# Create a random RGB JPEG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_jpeg(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_crop_image(self):
cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
cropped_image_3 = preprocess_ops_3d.crop_image(self._frames, 50, 70, True)
cropped_image_4 = preprocess_ops_3d.crop_image(
self._frames, 90, 90, False, 3)
self.assertAllEqual(cropped_image_1.shape, (6, 50, 70, 3))
self.assertAllEqual(cropped_image_1, self._np_frames[:, 20:70, 25:95, :])
self.assertAllEqual(cropped_image_2.shape, (6, 200, 200, 3))
expected = np.pad(
self._np_frames, ((0, 0), (55, 55), (40, 40), (0, 0)), 'constant')
self.assertAllEqual(cropped_image_2, expected)
self.assertAllEqual(cropped_image_3.shape, (6, 50, 70, 3))
offset = cropped_image_3[0, 0, 0, 0]
expected = np.array([range(i, i + 70) for i in range(offset, offset + 50)])
expected = expected[np.newaxis, :, :, np.newaxis]
expected = np.broadcast_to(expected, (6, 50, 70, 3))
self.assertAllEqual(cropped_image_3, expected)
self.assertAllEqual(cropped_image_4.shape, (18, 90, 90, 3))
def test_resize_smallest(self):
resized_frames_1 = preprocess_ops_3d.resize_smallest(self._frames, 180)
resized_frames_2 = preprocess_ops_3d.resize_smallest(self._frames, 45)
resized_frames_3 = preprocess_ops_3d.resize_smallest(self._frames, 90)
resized_frames_4 = preprocess_ops_3d.resize_smallest(
tf.transpose(self._frames, (0, 2, 1, 3)), 45)
self.assertAllEqual(resized_frames_1.shape, (6, 180, 240, 3))
self.assertAllEqual(resized_frames_2.shape, (6, 45, 60, 3))
self.assertAllEqual(resized_frames_3.shape, (6, 90, 120, 3))
self.assertAllEqual(resized_frames_4.shape, (6, 60, 45, 3))
def test_random_crop_resize(self):
resized_frames_1 = preprocess_ops_3d.random_crop_resize(
self._frames, 256, 256, 6, 3, (0.5, 2), (0.3, 1))
resized_frames_2 = preprocess_ops_3d.random_crop_resize(
self._frames, 224, 224, 6, 3, (0.5, 2), (0.3, 1))
resized_frames_3 = preprocess_ops_3d.random_crop_resize(
self._frames, 256, 256, 6, 3, (0.8, 1.2), (0.3, 1))
resized_frames_4 = preprocess_ops_3d.random_crop_resize(
self._frames, 256, 256, 6, 3, (0.5, 2), (0.1, 1))
self.assertAllEqual(resized_frames_1.shape, (6, 256, 256, 3))
self.assertAllEqual(resized_frames_2.shape, (6, 224, 224, 3))
self.assertAllEqual(resized_frames_3.shape, (6, 256, 256, 3))
self.assertAllEqual(resized_frames_4.shape, (6, 256, 256, 3))
def test_random_flip_left_right(self):
flipped_frames = preprocess_ops_3d.random_flip_left_right(self._frames)
flipped = np.fliplr(self._np_frames[0, :, :, 0])
flipped = flipped[np.newaxis, :, :, np.newaxis]
flipped = np.broadcast_to(flipped, (6, 90, 120, 3))
self.assertTrue((flipped_frames == self._np_frames).numpy().all() or (
flipped_frames == flipped).numpy().all())
def test_normalize_image(self):
normalized_images_1 = preprocess_ops_3d.normalize_image(
self._frames, False, tf.float32)
normalized_images_2 = preprocess_ops_3d.normalize_image(
self._frames, True, tf.float32)
self.assertAllClose(normalized_images_1, self._np_frames / 255)
self.assertAllClose(normalized_images_2, self._np_frames * 2 / 255 - 1.0)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for preprocess_ops.py."""
import io
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.ops import preprocess_ops
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class InputUtilsTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
([1], 10),
([1, 2], 10),
([1, 2, 3], 10),
([11], 10),
([12, 2], 10),
([13, 2, 3], 10),
)
def test_pad_to_fixed_size(self, input_shape, output_size):
# Copies input shape to padding shape.
clip_shape = input_shape[:]
clip_shape[0] = min(output_size, clip_shape[0])
padding_shape = input_shape[:]
padding_shape[0] = max(output_size - input_shape[0], 0)
expected_outputs = np.concatenate(
[np.ones(clip_shape), np.zeros(padding_shape)], axis=0)
data = tf.ones(input_shape)
output_data = preprocess_ops.clip_or_pad_to_fixed_size(
data, output_size, constant_values=0)
output_data = output_data.numpy()
self.assertAllClose(output_size, output_data.shape[0])
self.assertAllClose(expected_outputs, output_data)
@parameterized.parameters(
(100, 200, 100, 200, 32, 1.0, 1.0, 128, 224),
(100, 256, 128, 256, 32, 1.0, 1.0, 128, 256),
(200, 512, 200, 128, 32, 0.25, 0.25, 224, 128),
)
def test_resize_and_crop_image_rectangular_case(self, input_height,
input_width, desired_height,
desired_width, stride,
scale_y, scale_x,
output_height, output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
desired_size = (desired_height, desired_width)
resized_image, image_info = preprocess_ops.resize_and_crop_image(
image,
desired_size=desired_size,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
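# image_info rows: [original size, desired (pre-padding) size,
# (y_scale, x_scale), (y_offset, x_offset)].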
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(100, 200, 220, 220, 32, 1.1, 1.1, 224, 224),
(512, 512, 1024, 1024, 32, 2.0, 2.0, 1024, 1024),
)
def test_resize_and_crop_image_square_case(self, input_height, input_width,
desired_height, desired_width,
stride, scale_y, scale_x,
output_height, output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
desired_size = (desired_height, desired_width)
resized_image, image_info = preprocess_ops.resize_and_crop_image(
image,
desired_size=desired_size,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(100, 200, 100, 300, 32, 1.0, 1.0, 100, 200, 128, 320),
(200, 100, 100, 300, 32, 1.0, 1.0, 200, 100, 320, 128),
(100, 200, 80, 100, 32, 0.5, 0.5, 50, 100, 96, 128),
(200, 100, 80, 100, 32, 0.5, 0.5, 100, 50, 128, 96),
)
def test_resize_and_crop_image_v2(self, input_height, input_width, short_side,
long_side, stride, scale_y, scale_x,
desired_height, desired_width,
output_height, output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
image_shape = tf.shape(image)[0:2]
desired_size = tf.where(
tf.greater(image_shape[0], image_shape[1]),
tf.constant([long_side, short_side], dtype=tf.int32),
tf.constant([short_side, long_side], dtype=tf.int32))
resized_image, image_info = preprocess_ops.resize_and_crop_image_v2(
image,
short_side=short_side,
long_side=long_side,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(400, 600), (600, 400),
)
def test_center_crop_image(self, input_height, input_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
cropped_image = preprocess_ops.center_crop_image(image)
cropped_image_shape = tf.shape(cropped_image)
self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())
@parameterized.parameters(
(400, 600), (600, 400),
)
def test_center_crop_image_v2(self, input_height, input_width):
image_bytes = tf.constant(
_encode_image(
np.uint8(np.random.rand(input_height, input_width, 3) * 255),
fmt='JPEG'),
dtype=tf.string)
cropped_image = preprocess_ops.center_crop_image_v2(
image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
cropped_image_shape = tf.shape(cropped_image)
self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())
@parameterized.parameters(
(400, 600), (600, 400),
)
def test_random_crop_image(self, input_height, input_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
_ = preprocess_ops.random_crop_image(image)
@parameterized.parameters(
(400, 600), (600, 400),
)
def test_random_crop_image_v2(self, input_height, input_width):
image_bytes = tf.constant(
_encode_image(
np.uint8(np.random.rand(input_height, input_width, 3) * 255),
fmt='JPEG'),
dtype=tf.string)
_ = preprocess_ops.random_crop_image_v2(
image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
@parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1.4))
def testColorJitter(self, input_height, input_width, color_jitter):
image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
jittered_image = preprocess_ops.color_jitter(image, color_jitter,
color_jitter, color_jitter)
assert jittered_image.shape == image.shape
@parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1))
def testSaturation(self, input_height, input_width, saturation):
image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
jittered_image = preprocess_ops._saturation(image, saturation)
assert jittered_image.shape == image.shape
@parameterized.parameters((640, 640, 20), (1280, 1280, 30))
def test_random_crop(self, input_height, input_width, num_boxes):
image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
boxes_height = np.random.randint(0, input_height, size=(num_boxes, 1))
top = np.random.randint(0, high=(input_height - boxes_height))
down = top + boxes_height
boxes_width = np.random.randint(0, input_width, size=(num_boxes, 1))
left = np.random.randint(0, high=(input_width - boxes_width))
right = left + boxes_width
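# Boxes are stacked as [ymin, xmin, ymax, xmax] in absolute pixel coordinates.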
boxes = tf.constant(
np.concatenate([top, left, down, right], axis=-1), tf.float32)
labels = tf.constant(
np.random.randint(low=0, high=num_boxes, size=(num_boxes,)), tf.int64)
_ = preprocess_ops.random_crop(image, boxes, labels)
@parameterized.parameters(
((640, 640, 3), (1000, 1000), None, (1000, 1000, 3)),
((1280, 640, 3), 320, None, (640, 320, 3)),
((640, 1280, 3), 320, None, (320, 640, 3)),
((640, 640, 3), 320, 100, (100, 100, 3)))
def test_resize_image(self, input_shape, size, max_size, expected_shape):
resized_img, image_info = preprocess_ops.resize_image(
tf.zeros((input_shape)), size, max_size)
self.assertAllEqual(tf.shape(resized_img), expected_shape)
self.assertAllEqual(image_info[0], input_shape[:-1])
self.assertAllEqual(image_info[1], expected_shape[:-1])
self.assertAllEqual(
image_info[2],
np.array(expected_shape[:-1]) / np.array(input_shape[:-1]))
self.assertAllEqual(image_info[3], [0, 0])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range
[0,1]. The class presumes there are many more negatives than positive examples:
if the desired batch_size cannot be achieved with the pre-specified positive
fraction, it fills the rest with negative examples. If this is not sufficient
for obtaining the desired batch_size, it returns fewer examples.
The main function to call is Subsample(self, indicator, labels). For convenience
one can also call SubsampleWeights(self, weights, labels) which is defined in
the minibatch_sampler base class.
When is_static is True, it implements a method that guarantees static shapes.
It also ensures the length of output of the subsample is always batch_size, even
when number of examples set to True in indicator is less than batch_size.
This is originally implemented in TensorFlow Object Detection API.
"""
# Import libraries
import tensorflow as tf
def combined_static_and_dynamic_shape(tensor):
"""Returns a list containing static and dynamic values for the dimensions.
Returns a list of static and dynamic values for shape dimensions. This is
useful to preserve static shapes when available in reshape operation.
Args:
tensor: A tensor of any type.
Returns:
A list of length tensor.shape.ndims; each element is either a Python
integer (static dimension) or a scalar int32 tensor (dynamic dimension).
"""
static_tensor_shape = tensor.shape.as_list()
dynamic_tensor_shape = tf.shape(input=tensor)
combined_shape = []
for index, dim in enumerate(static_tensor_shape):
if dim is not None:
combined_shape.append(dim)
else:
combined_shape.append(dynamic_tensor_shape[index])
return combined_shape
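# Illustrative sketch (not part of the original change): when a tf.function is
# traced with an unknown batch dimension, the helper above returns a mix of
# Python ints (static dims) and scalar tensors (dynamic dims), so a reshape
# keeps whatever static shape information is available. The example function
# name is hypothetical.
def _example_combined_static_and_dynamic_shape():
  @tf.function(input_signature=[tf.TensorSpec([None, 8, 8, 3], tf.float32)])
  def flatten_spatial(images):
    shape = combined_static_and_dynamic_shape(images)
    # shape == [<scalar batch tensor>, 8, 8, 3]; the trailing dims stay ints.
    return tf.reshape(images, [shape[0], shape[1] * shape[2] * shape[3]])
  return flatten_spatial(tf.zeros([2, 8, 8, 3]))  # -> shape (2, 192)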
def indices_to_dense_vector(indices,
size,
indices_value=1.,
default_value=0,
dtype=tf.float32):
"""Creates dense vector with indices set to specific value and rest to zeros.
This function exists because it is unclear if it is safe to use
tf.sparse_to_dense(indices, [size], 1, validate_indices=False)
with indices which are not ordered.
This function accepts a dynamic size (e.g. tf.shape(tensor)[0])
Args:
indices: 1d Tensor with integer indices which are to be set to
indices_values.
size: scalar with size (integer) of output Tensor.
indices_value: values of elements specified by indices in the output vector
default_value: values of other elements in the output vector.
dtype: data type.
Returns:
dense 1D Tensor of shape [size] with indices set to indices_values and the
rest set to default_value.
"""
size = tf.cast(size, dtype=tf.int32)
zeros = tf.ones([size], dtype=dtype) * default_value
values = tf.ones_like(indices, dtype=dtype) * indices_value
return tf.dynamic_stitch(
[tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values])
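# Illustrative sketch (not part of the original change): scattering two indices
# into a dense vector of length five. The example function name is hypothetical.
def _example_indices_to_dense_vector():
  dense = indices_to_dense_vector(tf.constant([1, 3]), size=5)
  # dense == [0., 1., 0., 1., 0.] (tf.float32)
  return dense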
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
"""Matrix multiplication based implementation of tf.gather on zeroth axis.
TODO(rathodv, jonathanhuang): enable sparse matmul option.
Args:
params: A float32 Tensor. The tensor from which to gather values.
Must be at least rank 1.
indices: A Tensor. Must be one of the following types: int32, int64.
Must be in range [0, params.shape[0])
scope: A name for the operation (optional).
Returns:
A Tensor. Has the same type as params. Values from params gathered at
positions given by indices, with shape indices.shape + params.shape[1:].
"""
scope = scope or 'MatMulGather'
with tf.name_scope(scope):
params_shape = combined_static_and_dynamic_shape(params)
indices_shape = combined_static_and_dynamic_shape(indices)
params2d = tf.reshape(params, [params_shape[0], -1])
indicator_matrix = tf.one_hot(indices, params_shape[0])
gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
return tf.reshape(gathered_result_flattened,
tf.stack(indices_shape + params_shape[1:]))
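# Illustrative sketch (not part of the original change): the matmul-based
# gather reproduces tf.gather on the zeroth axis using only dense ops, which
# is what the static-shape subsampling below relies on. The example function
# name is hypothetical.
def _example_matmul_gather_on_zeroth_axis():
  params = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
  indices = tf.constant([2, 0], dtype=tf.int32)
  gathered = matmul_gather_on_zeroth_axis(params, indices)
  # gathered == [[5., 6.], [1., 2.]], the same as tf.gather(params, indices).
  return gathered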
class BalancedPositiveNegativeSampler:
"""Subsamples minibatches to a desired balance of positives and negatives."""
def __init__(self, positive_fraction=0.5, is_static=False):
"""Constructs a minibatch sampler.
Args:
positive_fraction: desired fraction of positive examples (scalar in [0,1])
in the batch.
is_static: If True, uses an implementation with static shape guarantees.
Raises:
ValueError: if positive_fraction < 0, or positive_fraction > 1
"""
if positive_fraction < 0 or positive_fraction > 1:
raise ValueError('positive_fraction should be in range [0,1]. '
'Received: %s.' % positive_fraction)
self._positive_fraction = positive_fraction
self._is_static = is_static
@staticmethod
def subsample_indicator(indicator, num_samples):
"""Subsample indicator vector.
Given a boolean indicator vector with M elements set to `True`, the function
assigns all but `num_samples` of these previously `True` elements to
`False`. If `num_samples` is greater than M, the original indicator vector
is returned.
Args:
indicator: a 1-dimensional boolean tensor indicating which elements
are allowed to be sampled and which are not.
num_samples: int32 scalar tensor
Returns:
a boolean tensor with the same shape as input (indicator) tensor
"""
indices = tf.where(indicator)
indices = tf.random.shuffle(indices)
indices = tf.reshape(indices, [-1])
num_samples = tf.minimum(tf.size(input=indices), num_samples)
selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))
selected_indicator = indices_to_dense_vector(
selected_indices,
tf.shape(input=indicator)[0])
return tf.equal(selected_indicator, 1)
def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size):
"""Counts the number of positives and negatives numbers to be sampled.
Args:
sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains
the signed indices of the examples where the sign is based on the label
value. The examples that cannot be sampled are set to 0. At most
sample_size * positive_fraction positive examples are sampled; the rest
of the subsample is filled with negative examples.
sample_size: Size of subsamples.
Returns:
A tuple containing the number of positive and negative labels in the
subsample.
"""
input_length = tf.shape(input=sorted_indices_tensor)[0]
valid_positive_index = tf.greater(sorted_indices_tensor,
tf.zeros(input_length, tf.int32))
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(valid_positive_index, tf.int32))
max_num_positive_samples = tf.constant(
int(sample_size * self._positive_fraction), tf.int32)
num_positive_samples = tf.minimum(max_num_positive_samples, num_sampled_pos)
num_negative_samples = tf.constant(sample_size,
tf.int32) - num_positive_samples
return num_positive_samples, num_negative_samples
def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
num_end_samples, total_num_samples):
"""slices num_start_samples and last num_end_samples from input_tensor.
Args:
input_tensor: An int32 tensor of shape [N] to be sliced.
num_start_samples: Number of examples to be sliced from the beginning
of the input tensor.
num_end_samples: Number of examples to be sliced from the end of the
input tensor.
total_num_samples: Sum of num_start_samples and num_end_samples. This
should be a scalar.
Returns:
A tensor containing the first num_start_samples and last num_end_samples
from input_tensor.
"""
input_length = tf.shape(input=input_tensor)[0]
start_positions = tf.less(tf.range(input_length), num_start_samples)
end_positions = tf.greater_equal(
tf.range(input_length), input_length - num_end_samples)
selected_positions = tf.logical_or(start_positions, end_positions)
selected_positions = tf.cast(selected_positions, tf.float32)
indexed_positions = tf.multiply(tf.cumsum(selected_positions),
selected_positions)
one_hot_selector = tf.one_hot(tf.cast(indexed_positions, tf.int32) - 1,
total_num_samples,
dtype=tf.float32)
return tf.cast(tf.tensordot(tf.cast(input_tensor, tf.float32),
one_hot_selector, axes=[0, 0]), tf.int32)
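# Worked example (illustrative, not part of the original change): with
# input_tensor = [5, 3, 0, 0, -2, -7], num_start_samples = 2,
# num_end_samples = 1 and total_num_samples = 3, the start/end masks select
# positions 0, 1 and 5, indexed_positions becomes [1, 2, 0, 0, 0, 3], and the
# one-hot tensordot returns [5, 3, -7]: the first two and the last entries.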
def _static_subsample(self, indicator, batch_size, labels):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
N should be a compile-time constant.
batch_size: desired batch size. This scalar cannot be None.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples. N should be a compile-time constant.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled. It ensures the length of output of the subsample is always
batch_size, even when number of examples set to True in indicator is
less than batch_size.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
# Check if indicator and labels have a static size.
if not indicator.shape.is_fully_defined():
raise ValueError('indicator must be static in shape when is_static is '
'True.')
if not labels.shape.is_fully_defined():
raise ValueError('labels must be static in shape when is_static is '
'True.')
if not isinstance(batch_size, int):
raise ValueError('batch_size has to be an integer when is_static is '
'True.')
input_length = tf.shape(input=indicator)[0]
# Set the number of examples set True in indicator to be at least
# batch_size.
num_true_sampled = tf.reduce_sum(
input_tensor=tf.cast(indicator, tf.float32))
additional_false_sample = tf.less_equal(
tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
batch_size - num_true_sampled)
indicator = tf.logical_or(indicator, additional_false_sample)
# Shuffle indicator and label. Need to store the permutation to restore the
# order post sampling.
permutation = tf.random.shuffle(tf.range(input_length))
indicator = matmul_gather_on_zeroth_axis(
tf.cast(indicator, tf.float32), permutation)
labels = matmul_gather_on_zeroth_axis(
tf.cast(labels, tf.float32), permutation)
# index (starting from 1) when indicator is True, 0 when False
indicator_idx = tf.where(
tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
tf.zeros(input_length, tf.int32))
# Replace -1 for negative, +1 for positive labels
signed_label = tf.where(
tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
# negative of index for negative label, positive index for positive label,
# 0 when indicator is False.
signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
sorted_signed_indicator_idx = tf.nn.top_k(
signed_indicator_idx, input_length, sorted=True).values
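# Worked example (illustrative): for a shuffled indicator [T, T, F, T] and
# labels [T, F, F, F], indicator_idx = [1, 2, 0, 4], signed_label =
# [1, -1, -1, -1], so signed_indicator_idx = [1, -2, 0, -4] and the sorted
# result is [1, 0, -2, -4]: positive-label indices first, then unsampled
# zeros, then negative-label indices.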
[num_positive_samples,
num_negative_samples] = self._get_num_pos_neg_samples(
sorted_signed_indicator_idx, batch_size)
sampled_idx = self._get_values_from_start_and_end(
sorted_signed_indicator_idx, num_positive_samples,
num_negative_samples, batch_size)
# Shift the indices to start from 0 and remove any samples that are set as
# False.
sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
sampled_idx = tf.multiply(
tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
sampled_idx)
sampled_idx_indicator = tf.cast(
tf.reduce_sum(
input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
tf.bool)
# project back the order based on stored permutations
reprojections = tf.one_hot(permutation, depth=input_length,
dtype=tf.float32)
return tf.cast(tf.tensordot(
tf.cast(sampled_idx_indicator, tf.float32),
reprojections, axes=[0, 0]), tf.bool)
def subsample(self, indicator, batch_size, labels, scope=None):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
batch_size: desired batch size. If None, keeps all positive samples and
randomly selects negative samples so that the positive sample fraction
matches self._positive_fraction. It cannot be None if is_static is True.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples.
scope: name scope.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
if len(indicator.get_shape().as_list()) != 1:
raise ValueError('indicator must be 1 dimensional, got a tensor of '
'shape %s' % indicator.get_shape())
if len(labels.get_shape().as_list()) != 1:
raise ValueError('labels must be 1 dimensional, got a tensor of '
'shape %s' % labels.get_shape())
if labels.dtype != tf.bool:
raise ValueError('labels should be of type bool. Received: %s' %
labels.dtype)
if indicator.dtype != tf.bool:
raise ValueError('indicator should be of type bool. Received: %s' %
indicator.dtype)
scope = scope or 'BalancedPositiveNegativeSampler'
with tf.name_scope(scope):
if self._is_static:
return self._static_subsample(indicator, batch_size, labels)
else:
# Only sample from indicated samples
negative_idx = tf.logical_not(labels)
positive_idx = tf.logical_and(labels, indicator)
negative_idx = tf.logical_and(negative_idx, indicator)
# Sample positive and negative samples separately
if batch_size is None:
max_num_pos = tf.reduce_sum(
input_tensor=tf.cast(positive_idx, dtype=tf.int32))
else:
max_num_pos = int(self._positive_fraction * batch_size)
sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos)
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(sampled_pos_idx, tf.int32))
if batch_size is None:
negative_positive_ratio = (
1 - self._positive_fraction) / self._positive_fraction
max_num_neg = tf.cast(
negative_positive_ratio *
tf.cast(num_sampled_pos, dtype=tf.float32),
dtype=tf.int32)
else:
max_num_neg = batch_size - num_sampled_pos
sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg)
return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
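# Illustrative usage sketch (not part of the original change). The sampler
# keeps at most positive_fraction * batch_size positives and fills the rest of
# the minibatch with negatives; the example function name is hypothetical.
def _example_balanced_sampling():
  labels = tf.constant([True, True, False, False, False, False, False, False])
  # Every example is eligible for sampling.
  indicator = tf.ones_like(labels, dtype=tf.bool)
  sampler = BalancedPositiveNegativeSampler(positive_fraction=0.5)
  sampled = sampler.subsample(indicator, batch_size=4, labels=labels)
  # `sampled` marks four entries: both positives plus two randomly chosen
  # negatives, since 0.5 * 4 = 2 positives are requested and available.
  return sampled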