ModelZoo / ResNet50_tensorflow · Commits

Commit c44482ab, authored Mar 01, 2022 by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 431756117
Parent: 10ee28dd

Changes: 235 · Showing 20 changed files with 7433 additions and 0 deletions (+7433 −0)
official/vision/ops/__init__.py                 +14    −0
official/vision/ops/anchor.py                   +378   −0
official/vision/ops/anchor_generator.py         +182   −0
official/vision/ops/anchor_generator_test.py    +137   −0
official/vision/ops/anchor_test.py              +186   −0
official/vision/ops/augment.py                  +2317  −0
official/vision/ops/augment_test.py             +435   −0
official/vision/ops/box_matcher.py              +191   −0
official/vision/ops/box_matcher_test.py         +78    −0
official/vision/ops/box_ops.py                  +763   −0
official/vision/ops/iou_similarity.py           +167   −0
official/vision/ops/iou_similarity_test.py      +76    −0
official/vision/ops/mask_ops.py                 +190   −0
official/vision/ops/mask_ops_test.py            +55    −0
official/vision/ops/nms.py                      +202   −0
official/vision/ops/preprocess_ops.py           +919   −0
official/vision/ops/preprocess_ops_3d.py        +355   −0
official/vision/ops/preprocess_ops_3d_test.py   +159   −0
official/vision/ops/preprocess_ops_test.py      +246   −0
official/vision/ops/sampling_ops.py             +383   −0
official/vision/ops/__init__.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
official/vision/ops/anchor.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Anchor box and labeler definition."""

import collections

# Import libraries
import tensorflow as tf

from official.vision.ops import anchor_generator
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
from official.vision.utils.object_detection import balanced_positive_negative_sampler
from official.vision.utils.object_detection import box_list
from official.vision.utils.object_detection import faster_rcnn_box_coder


class Anchor(object):
  """Anchor class for anchor-based object detectors."""

  def __init__(self, min_level, max_level, num_scales, aspect_ratios,
               anchor_size, image_size):
    """Constructs multiscale anchors.

    Args:
      min_level: integer number of minimum level of the output feature pyramid.
      max_level: integer number of maximum level of the output feature pyramid.
      num_scales: integer number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: list of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: float number representing the scale of size of the base
        anchor to the feature stride 2^level.
      image_size: a list of integer numbers or Tensors representing [height,
        width] of the input image size. The image_size should be divisible by
        the largest feature stride 2^max_level.
    """
    self.min_level = min_level
    self.max_level = max_level
    self.num_scales = num_scales
    self.aspect_ratios = aspect_ratios
    self.anchor_size = anchor_size
    self.image_size = image_size
    self.boxes = self._generate_boxes()

  def _generate_boxes(self):
    """Generates multiscale anchor boxes.

    Returns:
      a Tensor of shape [N, 4], representing anchor boxes of all levels
      concatenated together.
    """
    boxes_all = []
    for level in range(self.min_level, self.max_level + 1):
      boxes_l = []
      for scale in range(self.num_scales):
        for aspect_ratio in self.aspect_ratios:
          stride = 2**level
          intermidate_scale = 2**(scale / float(self.num_scales))
          base_anchor_size = self.anchor_size * stride * intermidate_scale
          aspect_x = aspect_ratio**0.5
          aspect_y = aspect_ratio**-0.5
          half_anchor_size_x = base_anchor_size * aspect_x / 2.0
          half_anchor_size_y = base_anchor_size * aspect_y / 2.0
          x = tf.range(stride / 2, self.image_size[1], stride)
          y = tf.range(stride / 2, self.image_size[0], stride)
          xv, yv = tf.meshgrid(x, y)
          xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
          yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
          # Tensor shape Nx4.
          boxes = tf.stack([
              yv - half_anchor_size_y, xv - half_anchor_size_x,
              yv + half_anchor_size_y, xv + half_anchor_size_x
          ], axis=1)
          boxes_l.append(boxes)
      # Concat anchors on the same level to tensor shape NxAx4.
      boxes_l = tf.stack(boxes_l, axis=1)
      boxes_l = tf.reshape(boxes_l, [-1, 4])
      boxes_all.append(boxes_l)
    return tf.concat(boxes_all, axis=0)

  def unpack_labels(self, labels):
    """Unpacks an array of labels into multiscales labels."""
    unpacked_labels = collections.OrderedDict()
    count = 0
    for level in range(self.min_level, self.max_level + 1):
      feat_size_y = tf.cast(self.image_size[0] / 2**level, tf.int32)
      feat_size_x = tf.cast(self.image_size[1] / 2**level, tf.int32)
      steps = feat_size_y * feat_size_x * self.anchors_per_location
      unpacked_labels[str(level)] = tf.reshape(labels[count:count + steps],
                                               [feat_size_y, feat_size_x, -1])
      count += steps
    return unpacked_labels

  @property
  def anchors_per_location(self):
    return self.num_scales * len(self.aspect_ratios)

  @property
  def multilevel_boxes(self):
    return self.unpack_labels(self.boxes)


class AnchorLabeler(object):
  """Labeler for dense object detector."""

  def __init__(self, match_threshold=0.5, unmatched_threshold=0.5):
    """Constructs anchor labeler to assign labels to anchors.

    Args:
      match_threshold: a float number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: a float number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
    """
    self.similarity_calc = iou_similarity.IouSimilarity()
    self.target_gather = target_gather.TargetGather()
    self.matcher = box_matcher.BoxMatcher(
        thresholds=[unmatched_threshold, match_threshold],
        indicators=[-1, -2, 1],
        force_match_for_each_col=True)
    self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()

  def label_anchors(self,
                    anchor_boxes,
                    gt_boxes,
                    gt_labels,
                    gt_attributes=None,
                    gt_weights=None):
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth
        boxes. For each row, it stores [y0, x0, y1, x1] for four corners of a
        box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.
      gt_attributes: If not None, a dict of (name, gt_attribute) pairs.
        `gt_attribute` is a float tensor with shape [N, attribute_size]
        representing groundtruth attributes.
      gt_weights: If not None, a float tensor with shape [N] representing
        groundtruth weights.

    Returns:
      cls_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors_per_location]. height_l and
        width_l represent the dimension of class logits at the l-th level.
      box_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors_per_location * 4]. height_l and
        width_l represent the dimension of bounding box regression output at
        the l-th level.
      attribute_targets_dict: A dict with (name, attribute_targets) pairs. Each
        `attribute_targets` is an ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors_per_location * attribute_size].
        height_l and width_l represent the dimension of attribute prediction
        output at the l-th level.
      cls_weights: A flattened Tensor with shape [batch_size, num_anchors] that
        serves as masking / sample weight for classification loss. Its value
        is 1.0 for positive and negative matched anchors, and 0.0 for ignored
        anchors.
      box_weights: A flattened Tensor with shape [batch_size, num_anchors] that
        serves as masking / sample weight for regression loss. Its value is
        1.0 for positive matched anchors, and 0.0 for negative and ignored
        anchors.
    """
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)
    mask = tf.less_equal(match_indicators, 0)
    cls_mask = tf.expand_dims(mask, -1)
    cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1)
    box_mask = tf.tile(cls_mask, [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    att_targets = {}
    if gt_attributes:
      for k, v in gt_attributes.items():
        att_size = v.get_shape().as_list()[-1]
        att_mask = tf.tile(cls_mask, [1, att_size])
        att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0)
    weights = tf.squeeze(tf.ones_like(gt_labels, dtype=tf.float32), -1)
    if gt_weights is not None:
      weights = tf.math.multiply(weights, gt_weights)
    box_weights = self.target_gather(weights, match_indices, mask)
    ignore_mask = tf.equal(match_indicators, -2)
    cls_weights = self.target_gather(weights, match_indices, ignore_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Unpacks labels into multi-level representations.
    cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)
    attribute_targets_dict = {}
    for k, v in att_targets.items():
      attribute_targets_dict[k] = unpack_targets(v, anchor_boxes)

    return (cls_targets_dict, box_targets_dict, attribute_targets_dict,
            cls_weights, box_weights)


class RpnAnchorLabeler(AnchorLabeler):
  """Labeler for Region Proposal Network."""

  def __init__(self,
               match_threshold=0.7,
               unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5):
    AnchorLabeler.__init__(
        self,
        match_threshold=match_threshold,
        unmatched_threshold=unmatched_threshold)
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

  def _get_rpn_samples(self, match_results):
    """Computes anchor labels.

    This function performs subsampling for foreground (fg) and background (bg)
    anchors.

    Args:
      match_results: An integer tensor with shape [N] representing the matching
        results of anchors. (1) match_results[i]>=0, meaning that column i is
        matched with row match_results[i]. (2) match_results[i]=-1, meaning
        that column i is not matched. (3) match_results[i]=-2, meaning that
        column i is ignored.

    Returns:
      score_targets: An integer tensor with shape [N]. (1) score_targets[i]=1,
        the anchor is a positive sample. (2) score_targets[i]=0, negative.
        (3) score_targets[i]=-1, the anchor is ignored (don't care).
    """
    sampler = (
        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=self._rpn_fg_fraction, is_static=False))
    # indicator includes both positive and negative labels.
    # labels includes only positives labels.
    # positives = indicator & labels.
    # negatives = indicator & !labels.
    # ignore = !indicator.
    indicator = tf.greater(match_results, -2)
    labels = tf.greater(match_results, -1)

    samples = sampler.subsample(indicator, self._rpn_batch_size_per_im, labels)
    positive_labels = tf.where(
        tf.logical_and(samples, labels),
        tf.constant(2, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    negative_labels = tf.where(
        tf.logical_and(samples, tf.logical_not(labels)),
        tf.constant(1, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    ignore_labels = tf.fill(match_results.shape, -1)

    return (ignore_labels + positive_labels + negative_labels,
            positive_labels, negative_labels)

  def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth
        boxes. For each row, it stores [y0, x0, y1, x1] for four corners of a
        box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.

    Returns:
      score_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors]. height_l and width_l represent
        the dimension of class logits at the l-th level.
      box_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4]. height_l and width_l
        represent the dimension of bounding box regression output at the l-th
        level.
    """
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)
    box_mask = tf.tile(
        tf.expand_dims(tf.less_equal(match_indicators, 0), -1), [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Zero out the unmatched and ignored regression targets.
    num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0]
    unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32)
    matched_anchors_mask = tf.greater_equal(match_indicators, 0)
    # To broadcast matched_anchors_mask to the same shape as
    # matched_reg_targets.
    matched_anchors_mask = tf.tile(
        tf.expand_dims(matched_anchors_mask, 1),
        [1, tf.shape(box_targets)[1]])
    box_targets = tf.where(matched_anchors_mask, box_targets,
                           unmatched_ignored_box_targets)

    # score_targets contains the subsampled positive and negative anchors.
    score_targets, _, _ = self._get_rpn_samples(match_indicators)

    # Unpacks labels.
    score_targets_dict = unpack_targets(score_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)

    return score_targets_dict, box_targets_dict


def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
                           anchor_size):
  """Build anchor generator from levels."""
  anchor_sizes = collections.OrderedDict()
  strides = collections.OrderedDict()
  scales = []
  for scale in range(num_scales):
    scales.append(2**(scale / float(num_scales)))
  for level in range(min_level, max_level + 1):
    stride = 2**level
    strides[str(level)] = stride
    anchor_sizes[str(level)] = anchor_size * stride
  anchor_gen = anchor_generator.AnchorGenerator(
      anchor_sizes=anchor_sizes,
      scales=scales,
      aspect_ratios=aspect_ratios,
      strides=strides)
  return anchor_gen


def unpack_targets(targets, anchor_boxes_dict):
  """Unpacks an array of labels into multiscales labels."""
  unpacked_targets = collections.OrderedDict()
  count = 0
  for level, anchor_boxes in anchor_boxes_dict.items():
    feat_size_shape = anchor_boxes.shape.as_list()
    feat_size_y = feat_size_shape[0]
    feat_size_x = feat_size_shape[1]
    anchors_per_location = int(feat_size_shape[2] / 4)
    steps = feat_size_y * feat_size_x * anchors_per_location
    unpacked_targets[level] = tf.reshape(targets[count:count + steps],
                                         [feat_size_y, feat_size_x, -1])
    count += steps
  return unpacked_targets
```
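For orientation, here is a minimal usage sketch of how the pieces above compose. It is not part of the commit, and it assumes the imported `target_gather`, `box_matcher`, and `iou_similarity` modules behave as the code above expects; the shapes, keys, and dtypes follow the docstrings and the tests below.

```python
import tensorflow as tf

from official.vision.ops import anchor

# Generate multilevel anchors for a 512x512 image: an OrderedDict keyed by
# level ('3'..'7') of [H_l, W_l, K*4] tensors.
anchor_boxes = anchor.build_anchor_generator(
    min_level=3, max_level=7, num_scales=3,
    aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0)([512, 512])

# Label the anchors against a single ground-truth box.
labeler = anchor.AnchorLabeler(match_threshold=0.5, unmatched_threshold=0.5)
gt_boxes = tf.constant([[10.0, 10.0, 100.0, 100.0]])   # [N, 4], [y0, x0, y1, x1]
gt_labels = tf.constant([[1]], dtype=tf.float32)       # [N, 1]
(cls_targets, box_targets, _, cls_weights,
 box_weights) = labeler.label_anchors(anchor_boxes, gt_boxes, gt_labels)
# cls_targets / box_targets are OrderedDicts keyed by level ('3'..'7').
```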
official/vision/ops/anchor_generator.py (new file · 0 → 100644)

````python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Multi scale anchor generator definition."""

import tensorflow as tf


# (TODO/tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
  """Utility to generate anchors for a single feature map.

  Example:
  ```python
  anchor_gen = _SingleAnchorGenerator(32, [.5, 1., 2.], stride=16)
  anchors = anchor_gen([512, 512, 3])
  ```
  """

  def __init__(self,
               anchor_size,
               scales,
               aspect_ratios,
               stride,
               clip_boxes=False):
    """Constructs single scale anchor.

    Args:
      anchor_size: A single int represents the base anchor size. The anchor
        height will be `anchor_size / sqrt(aspect_ratio)`, and anchor width
        will be `anchor_size * sqrt(aspect_ratio)`.
      scales: A list/tuple, or a list/tuple of a list/tuple, of positive floats
        representing the actual anchor size to the base `anchor_size`.
      aspect_ratios: a list/tuple of positive floats representing the ratio of
        anchor width to anchor height.
      stride: A single int represents the anchor stride size between centers of
        each anchor.
      clip_boxes: Boolean to represent whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.

    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors, `[(H / stride) * (W / stride), 4]`
    """
    self.anchor_size = anchor_size
    self.scales = scales
    self.aspect_ratios = aspect_ratios
    self.stride = stride
    self.clip_boxes = clip_boxes

  def __call__(self, image_size):
    image_height = tf.cast(image_size[0], tf.float32)
    image_width = tf.cast(image_size[1], tf.float32)

    k = len(self.scales) * len(self.aspect_ratios)
    aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
    anchor_size = tf.cast(self.anchor_size, tf.float32)

    # [K]
    anchor_heights = []
    anchor_widths = []
    for scale in self.scales:
      anchor_size_t = anchor_size * scale
      anchor_height = anchor_size_t / aspect_ratios_sqrt
      anchor_width = anchor_size_t * aspect_ratios_sqrt
      anchor_heights.append(anchor_height)
      anchor_widths.append(anchor_width)
    anchor_heights = tf.concat(anchor_heights, axis=0)
    anchor_widths = tf.concat(anchor_widths, axis=0)
    half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
    half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])

    stride = tf.cast(self.stride, tf.float32)
    # [W]
    cx = tf.range(0.5 * stride, image_width, stride)
    # [H]
    cy = tf.range(0.5 * stride, image_height, stride)
    # [H, W]
    cx_grid, cy_grid = tf.meshgrid(cx, cy)
    # [H, W, 1]
    cx_grid = tf.expand_dims(cx_grid, axis=-1)
    cy_grid = tf.expand_dims(cy_grid, axis=-1)
    # [H, W, K, 1]
    y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
    y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
    x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
    x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)

    if self.clip_boxes:
      y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
      y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
      x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
      x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)

    # [H, W, K, 4]
    result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
    shape = result.shape.as_list()
    # [H, W, K * 4]
    return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])


class AnchorGenerator():
  """Utility to generate anchors for multiple feature maps.

  Example:
  ```python
  anchor_gen = AnchorGenerator([32, 64], [.5, 1., 2.],
                               strides=[16, 32])
  anchors = anchor_gen([512, 512, 3])
  ```
  """

  def __init__(self,
               anchor_sizes,
               scales,
               aspect_ratios,
               strides,
               clip_boxes=False):
    """Constructs multiscale anchors.

    Args:
      anchor_sizes: A list of ints represents the anchor size for each scale.
        The anchor height will be `anchor_size / sqrt(aspect_ratio)`, and
        anchor width will be `anchor_size * sqrt(aspect_ratio)` for each scale.
      scales: A list/tuple, or a list/tuple of a list/tuple, of positive floats
        representing the actual anchor size to the base `anchor_size`.
      aspect_ratios: A list/tuple, or a list/tuple of a list/tuple, of positive
        floats representing the ratio of anchor width to anchor height.
      strides: A list/tuple of ints represent the anchor stride size between
        centers of anchors at each scale.
      clip_boxes: Boolean to represent whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.

    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors concatenated on each level,
      `[(H / strides) * (W / strides), K * 4]`
    """
    # aspect_ratio is a single list that is the same across all levels.
    aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
    scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
    if isinstance(anchor_sizes, dict):
      self.anchor_generators = {}
      for k in anchor_sizes.keys():
        self.anchor_generators[k] = _SingleAnchorGenerator(
            anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
            clip_boxes)
    elif isinstance(anchor_sizes, (list, tuple)):
      self.anchor_generators = []
      for anchor_size, scale_list, ar_list, stride in zip(
          anchor_sizes, scales, aspect_ratios, strides):
        self.anchor_generators.append(
            _SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
                                   clip_boxes))

  def __call__(self, image_size):
    anchor_generators = tf.nest.flatten(self.anchor_generators)
    results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
    return tf.nest.pack_sequence_as(self.anchor_generators, results)


def maybe_map_structure_for_anchor(params, anchor_sizes):
  """Broadcasts the params to match the structure of anchor_sizes."""
  if all(isinstance(param, (int, float)) for param in params):
    if isinstance(anchor_sizes, (tuple, list)):
      return [params] * len(anchor_sizes)
    elif isinstance(anchor_sizes, dict):
      return tf.nest.map_structure(lambda _: params, anchor_sizes)
    else:
      raise ValueError('the structure of `anchor_sizes` must be a tuple, '
                       'list, or dict, given {}'.format(anchor_sizes))
  else:
    return params
````
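A small sketch, not part of the commit, of the dict-keyed path: when `anchor_sizes` and `strides` are dicts keyed by pyramid level, a single flat `scales`/`aspect_ratios` list is broadcast to every level by `maybe_map_structure_for_anchor`, and the output is a dict of per-level anchor tensors with the same keys.

```python
from official.vision.ops import anchor_generator

anchor_gen = anchor_generator.AnchorGenerator(
    anchor_sizes={'5': 64, '6': 128},   # per-level base anchor sizes
    scales=[1.0],                       # broadcast to both levels
    aspect_ratios=[1.0],                # broadcast to both levels
    strides={'5': 32, '6': 64})
anchors = anchor_gen([64, 64])
# anchors['5'].shape == [2, 2, 4] (2x2 grid at stride 32, K=1)
# anchors['6'].shape == [1, 1, 4] (1x1 grid at stride 64, K=1)
```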
official/vision/ops/anchor_generator_test.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for anchor_generator.py."""

from absl.testing import parameterized
import tensorflow as tf

from official.vision.ops import anchor_generator


class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      # Single scale anchor.
      (5, [1.0],
       [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
        [[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
      # Multi aspect ratio anchor.
      (6, [1.0, 4.0, 0.25],
       [[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
  )
  def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
    image_size = [64, 64]
    anchor_size = 2**(level + 1)
    stride = 2**level
    anchor_gen = anchor_generator._SingleAnchorGenerator(
        anchor_size=anchor_size,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        stride=stride,
        clip_boxes=False)
    anchors = anchor_gen(image_size).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Single scale anchor.
      (5, [1.0],
       [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
        [[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
      # Multi aspect ratio anchor.
      (6, [1.0, 4.0, 0.25],
       [[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
  )
  def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
    image_size = [64, 64]
    anchor_size = 2**(level + 1)
    stride = 2**level
    anchor_gen = anchor_generator._SingleAnchorGenerator(
        anchor_size=anchor_size,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        stride=stride,
        clip_boxes=True)
    anchors = anchor_gen(image_size).numpy()
    self.assertAllClose(expected_boxes, anchors)


class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [[1.0], [1.0]],
       [[-16, -16, 48, 48], [-16, 16, 48, 80], [16, -16, 80, 48],
        [16, 16, 80, 80], [-32, -32, 96, 96]]),)
  def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
                           expected_boxes):
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = [2**(level + 1) for level in levels]
    strides = [2**level for level in levels]
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides)
    anchors = anchor_gen(image_size)
    anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
    anchors = tf.concat(anchors, axis=0).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [[1.0], [1.0]],
       [[-16, -16, 48, 48], [-16, 16, 48, 80], [16, -16, 80, 48],
        [16, 16, 80, 80], [-32, -32, 96, 96]]),)
  def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
                                  expected_boxes):
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = [2**(level + 1) for level in levels]
    strides = [2**level for level in levels]
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides,
        clip_boxes=False)
    anchors = anchor_gen(image_size)
    anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
    anchors = tf.concat(anchors, axis=0).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [1.0], {
          '5': [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
                [[16., -16., 80., 48.], [16., 16., 80., 80.]]],
          '6': [[[-32, -32, 96, 96]]]
      }),)
  def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
                               expected_boxes):
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels)
    strides = dict((str(level), 2**level) for level in levels)
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides,
        clip_boxes=False)
    anchors = anchor_gen(image_size)
    for k in expected_boxes.keys():
      self.assertAllClose(expected_boxes[k], anchors[k].numpy())


if __name__ == '__main__':
  tf.test.main()
```
official/vision/ops/anchor_test.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for anchor.py."""

# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.ops import anchor


class AnchorTest(parameterized.TestCase, tf.test.TestCase):

  # The set of parameters are tailored for the MLPerf configuration, where
  # the number of anchors is 495132, rpn_batch_size_per_im=256, and
  # rpn_fg_fraction=0.5.
  @parameterized.parameters(
      (512, 25, 25, 25, 25, (512, 512)),
      (512, 25, 25, 25, 25, (512, 640)),
      (512, 25, 25, 25, 25, (640, 512)),
      (495132, 100, 100, 100, 100, (512, 512)),
      (495132, 200, 100, 128, 100, (512, 512)),
      (495132, 100, 120, 100, 120, (512, 512)),
      (495132, 100, 200, 100, 156, (512, 512)),
      (495132, 200, 200, 128, 128, (512, 512)),
  )
  def testAnchorRpnSample(self, num_anchors, num_positives, num_negatives,
                          expected_positives, expected_negatives, image_size):
    match_results_np = np.empty([num_anchors])
    match_results_np.fill(-2)
    match_results_np[:num_positives] = 0
    match_results_np[num_positives:num_positives + num_negatives] = -1
    match_results = tf.convert_to_tensor(
        value=match_results_np, dtype=tf.int32)
    anchor_labeler = anchor.RpnAnchorLabeler(
        match_threshold=0.7,
        unmatched_threshold=0.3,
        rpn_batch_size_per_im=256,
        rpn_fg_fraction=0.5)
    rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
    labels = [v.numpy() for v in rpn_sample_op]
    self.assertLen(labels[0], num_anchors)
    positives = np.sum(np.array(labels[0]) == 1)
    negatives = np.sum(np.array(labels[0]) == 0)
    self.assertEqual(positives, expected_positives)
    self.assertEqual(negatives, expected_negatives)

  @parameterized.parameters(
      # Single scale anchor.
      (5, 5, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80]]),
      # Multi scale anchor.
      (5, 6, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
      # Multi aspect ratio anchor.
      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
  )
  def testAnchorGeneration(self, min_level, max_level, num_scales,
                           aspect_ratios, anchor_size, expected_boxes):
    image_size = [64, 64]
    anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                            anchor_size, image_size)
    boxes = anchors.boxes.numpy()
    self.assertEqual(expected_boxes, boxes.tolist())

  @parameterized.parameters(
      # Single scale anchor.
      (5, 5, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80]]),
      # Multi scale anchor.
      (5, 6, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
      # Multi aspect ratio anchor.
      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
  )
  def testAnchorGenerationWithImageSizeAsTensor(self, min_level, max_level,
                                                num_scales, aspect_ratios,
                                                anchor_size, expected_boxes):
    image_size = tf.constant([64, 64], tf.int32)
    anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                            anchor_size, image_size)
    boxes = anchors.boxes.numpy()
    self.assertEqual(expected_boxes, boxes.tolist())

  @parameterized.parameters(
      (3, 6, 2, [1.0], 2.0, False),
      (3, 6, 2, [1.0], 2.0, True),
  )
  def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
                       anchor_size, has_attribute):
    input_size = [512, 512]
    ground_truth_class_id = 2
    attribute_name = 'depth'
    ground_truth_depth = 3.0

    # The matched anchors are the anchors used as ground truth and the anchors
    # at the next octave scale on the same location.
    expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
    anchor_gen = anchor.build_anchor_generator(min_level, max_level,
                                               num_scales, aspect_ratios,
                                               anchor_size)
    anchor_boxes = anchor_gen(input_size)
    anchor_labeler = anchor.AnchorLabeler()

    # Uses the first anchors as ground truth. The ground truth should map to
    # two anchors with two intermediate scales at the same location.
    gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]
    gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
    gt_attributes = {
        attribute_name: tf.constant([[ground_truth_depth]], dtype=tf.float32)
    } if has_attribute else {}

    (cls_targets, box_targets, att_targets, _,
     box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes,
                                                 gt_classes, gt_attributes)

    for k, v in cls_targets.items():
      cls_targets[k] = v.numpy()
    for k, v in box_targets.items():
      box_targets[k] = v.numpy()
    box_weights = box_weights.numpy()

    anchor_locations = np.vstack(
        np.where(cls_targets[str(min_level)] > -1)).transpose()
    self.assertAllClose(expected_anchor_locations, anchor_locations)
    # Two anchor boxes on min_level got matched to the gt_boxes.
    self.assertAllClose(tf.reduce_sum(box_weights), 2)

    if has_attribute:
      self.assertIn(attribute_name, att_targets)
      for k, v in att_targets[attribute_name].items():
        att_targets[attribute_name][k] = v.numpy()
      anchor_locations = np.vstack(
          np.where(
              att_targets[attribute_name][str(min_level)] > 0.0)).transpose()
      self.assertAllClose(expected_anchor_locations, anchor_locations)
    else:
      self.assertEmpty(att_targets)

  @parameterized.parameters(
      (3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
      (3, 8, [1.], 3, 32, (512, 512)),
      (3, 3, [1.], 2, 4, (32, 32)),
  )
  def testEquivalentResult(self, min_level, max_level, aspect_ratios,
                           num_scales, anchor_size, image_size):
    anchor_gen = anchor.build_anchor_generator(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)
    anchors = anchor_gen(image_size)
    expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
                                        aspect_ratios, anchor_size, image_size)
    expected_anchors = expected_anchor_gen.multilevel_boxes
    for k in expected_anchors.keys():
      self.assertAllClose(expected_anchors[k], anchors[k])


if __name__ == '__main__':
  tf.test.main()
```
official/vision/ops/augment.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Augmentation policies for enhanced image/video preprocessing.

AutoAugment Reference:
  - AutoAugment Reference: https://arxiv.org/abs/1805.09501
  - AutoAugment for Object Detection Reference: https://arxiv.org/abs/1906.11172
RandAugment Reference: https://arxiv.org/abs/1909.13719
RandomErasing Reference: https://arxiv.org/abs/1708.04896
MixupAndCutmix:
  - Mixup: https://arxiv.org/abs/1710.09412
  - Cutmix: https://arxiv.org/abs/1905.04899

RandomErasing, Mixup and Cutmix are inspired by
https://github.com/rwightman/pytorch-image-models
"""

import inspect
import math
from typing import Any, List, Iterable, Optional, Text, Tuple

from keras.layers.preprocessing import image_preprocessing as image_ops
import numpy as np
import tensorflow as tf


# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.


def to_4d(image: tf.Tensor) -> tf.Tensor:
  """Converts an input Tensor to 4 dimensions.

  4D image => [N, H, W, C] or [N, C, H, W]
  3D image => [1, H, W, C] or [1, C, H, W]
  2D image => [1, H, W, 1]

  Args:
    image: The 2/3/4D input tensor.

  Returns:
    A 4D image tensor.

  Raises:
    `TypeError` if `image` is not a 2/3/4D tensor.
  """
  shape = tf.shape(image)
  original_rank = tf.rank(image)
  left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32)
  right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32)
  new_shape = tf.concat(
      [
          tf.ones(shape=left_pad, dtype=tf.int32),
          shape,
          tf.ones(shape=right_pad, dtype=tf.int32),
      ],
      axis=0,
  )
  return tf.reshape(image, new_shape)


def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
  """Converts a 4D image back to `ndims` rank."""
  shape = tf.shape(image)
  begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
  end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
  new_shape = shape[begin:end]
  return tf.reshape(image, new_shape)
```
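`to_4d` and `from_4d` pair up so that per-image ops can be written once against a batched `[N, H, W, C]` layout. A quick round-trip sketch (not part of the commit), using only the two functions above:

```python
import tensorflow as tf

img = tf.zeros([32, 32, 3], dtype=tf.uint8)
ndims = tf.rank(img)                 # 3
batched = to_4d(img)                 # shape [1, 32, 32, 3]
restored = from_4d(batched, ndims)   # shape [32, 32, 3] again
```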
```python
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
  """Converts translations to a projective transform.

  The translation matrix looks like this:
    [[1 0 -dx]
     [0 1 -dy]
     [0 0 1]]

  Args:
    translations: The 2-element list representing [dx, dy], or a matrix of
      2-element lists representing [dx dy] to translate for each image. The
      shape must be static.

  Returns:
    The transformation matrix of shape (num_images, 8).

  Raises:
    `TypeError` if
      - the shape of `translations` is not known or
      - the shape of `translations` is not rank 1 or 2.
  """
  translations = tf.convert_to_tensor(translations, dtype=tf.float32)
  if translations.get_shape().ndims is None:
    raise TypeError('translations rank must be statically known')
  elif len(translations.get_shape()) == 1:
    translations = translations[None]
  elif len(translations.get_shape()) != 2:
    raise TypeError('translations should have rank 1 or 2.')
  num_translations = tf.shape(translations)[0]

  return tf.concat(
      values=[
          tf.ones((num_translations, 1), tf.dtypes.float32),
          tf.zeros((num_translations, 1), tf.dtypes.float32),
          -translations[:, 0, None],
          tf.zeros((num_translations, 1), tf.dtypes.float32),
          tf.ones((num_translations, 1), tf.dtypes.float32),
          -translations[:, 1, None],
          tf.zeros((num_translations, 2), tf.dtypes.float32),
      ],
      axis=1,
  )
```
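The eight returned values are the first two rows of the 3x3 matrix in the docstring, flattened row-major; the fixed bottom row `[0 0 1]` is implied. A tiny check (not part of the commit):

```python
# For a single translation [dx=2, dy=3] the flattened projective transform is
# [1, 0, -dx, 0, 1, -dy, 0, 0].
print(_convert_translation_to_transform([2.0, 3.0]).numpy())
# [[ 1.  0. -2.  0.  1. -3.  0.  0.]]
```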
```python
def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor,
                                 image_height: tf.Tensor) -> tf.Tensor:
  """Converts an angle or angles to a projective transform.

  Args:
    angles: A scalar angle to rotate all images by, or a vector of angles to
      rotate a batch of images by.
    image_width: The width of the image(s) to be transformed.
    image_height: The height of the image(s) to be transformed.

  Returns:
    A tensor of shape (num_images, 8).

  Raises:
    `TypeError` if `angles` is not rank 0 or 1.
  """
  angles = tf.convert_to_tensor(angles, dtype=tf.float32)
  if len(angles.get_shape()) == 0:  # pylint:disable=g-explicit-length-test
    angles = angles[None]
  elif len(angles.get_shape()) != 1:
    raise TypeError('Angles should have a rank 0 or 1.')
  x_offset = ((image_width - 1) -
              (tf.math.cos(angles) * (image_width - 1) -
               tf.math.sin(angles) * (image_height - 1))) / 2.0
  y_offset = ((image_height - 1) -
              (tf.math.sin(angles) * (image_width - 1) +
               tf.math.cos(angles) * (image_height - 1))) / 2.0
  num_angles = tf.shape(angles)[0]
  return tf.concat(
      values=[
          tf.math.cos(angles)[:, None],
          -tf.math.sin(angles)[:, None],
          x_offset[:, None],
          tf.math.sin(angles)[:, None],
          tf.math.cos(angles)[:, None],
          y_offset[:, None],
          tf.zeros((num_angles, 2), tf.dtypes.float32),
      ],
      axis=1,
  )


def transform(image: tf.Tensor, transforms) -> tf.Tensor:
  """Prepares input data for `image_ops.transform`."""
  original_ndims = tf.rank(image)
  transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
  if transforms.shape.rank == 1:
    transforms = transforms[None]
  image = to_4d(image)
  image = image_ops.transform(
      images=image, transforms=transforms, interpolation='nearest')
  return from_4d(image, original_ndims)


def translate(image: tf.Tensor, translations) -> tf.Tensor:
  """Translates image(s) by provided vectors.

  Args:
    image: An image Tensor of type uint8.
    translations: A vector or matrix representing [dx dy].

  Returns:
    The translated version of the image.
  """
  transforms = _convert_translation_to_transform(translations)
  return transform(image, transforms=transforms)


def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
  """Rotates the image by degrees either clockwise or counterclockwise.

  Args:
    image: An image Tensor of type uint8.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.

  Returns:
    The rotated version of image.
  """
  # Convert from degrees to radians.
  degrees_to_radians = math.pi / 180.0
  radians = tf.cast(degrees * degrees_to_radians, tf.float32)

  original_ndims = tf.rank(image)
  image = to_4d(image)

  image_height = tf.cast(tf.shape(image)[1], tf.float32)
  image_width = tf.cast(tf.shape(image)[2], tf.float32)
  transforms = _convert_angles_to_transform(
      angles=radians, image_width=image_width, image_height=image_height)
  # In practice, we should randomize the rotation degrees by flipping
  # it negatively half the time, but that's done on 'degrees' outside
  # of the function.
  image = transform(image, transforms=transforms)
  return from_4d(image, original_ndims)


def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
  """Blend image1 and image2 using 'factor'.

  Factor can be above 0.0. A value of 0.0 means only image1 is used.
  A value of 1.0 means only image2 is used. A value between 0.0 and
  1.0 means we linearly interpolate the pixel values between the two
  images. A value greater than 1.0 "extrapolates" the difference
  between the two pixel values, and we clip the results to values
  between 0 and 255.

  Args:
    image1: An image Tensor of type uint8.
    image2: An image Tensor of type uint8.
    factor: A floating point value above 0.0.

  Returns:
    A blended image Tensor of type uint8.
  """
  if factor == 0.0:
    return tf.convert_to_tensor(image1)
  if factor == 1.0:
    return tf.convert_to_tensor(image2)

  image1 = tf.cast(image1, tf.float32)
  image2 = tf.cast(image2, tf.float32)

  difference = image2 - image1
  scaled = factor * difference

  # Do addition in float.
  temp = tf.cast(image1, tf.float32) + scaled

  # Interpolate
  if factor > 0.0 and factor < 1.0:
    # Interpolation means we always stay within 0 and 255.
    return tf.cast(temp, tf.uint8)

  # Extrapolate:
  #
  # We need to clip and then cast.
  return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
```
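A small sketch of the blend semantics (not part of the commit): factor 0.0 returns `image1`, 1.0 returns `image2`, values in between interpolate, and values above 1.0 extrapolate then clip.

```python
a = tf.fill([2, 2, 3], tf.constant(100, tf.uint8))
b = tf.fill([2, 2, 3], tf.constant(200, tf.uint8))
print(blend(a, b, 0.5).numpy()[0, 0, 0])  # 150, the midpoint
print(blend(a, b, 2.0).numpy()[0, 0, 0])  # 255: 100 + 2*100 = 300, clipped
```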
```python
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
  """Apply cutout (https://arxiv.org/abs/1708.04552) to image.

  This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
  a random location within `image`. The pixel values filled in will be of the
  value `replace`. The location where the mask will be applied is randomly
  chosen uniformly over the whole image.

  Args:
    image: An image Tensor of type uint8.
    pad_size: Specifies how big the zero mask that will be generated is that is
      applied to the image. The mask will be of size (2*pad_size x 2*pad_size).
    replace: What pixel value to fill in the image in the area that has the
      cutout mask applied to it.

  Returns:
    An image Tensor that is of type uint8.
  """
  if image.shape.rank not in [3, 4]:
    raise ValueError('Bad image rank: {}'.format(image.shape.rank))

  if image.shape.rank == 4:
    return cutout_video(image, replace=replace)

  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]

  # Sample the center location in the image where the zero mask will be
  # applied.
  cutout_center_height = tf.random.uniform(
      shape=[], minval=0, maxval=image_height, dtype=tf.int32)
  cutout_center_width = tf.random.uniform(
      shape=[], minval=0, maxval=image_width, dtype=tf.int32)

  image = _fill_rectangle(image, cutout_center_width, cutout_center_height,
                          pad_size, pad_size, replace)

  return image
```
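A minimal usage sketch (not part of the commit): cut a `2*pad_size x 2*pad_size` gray square out of a random location of a single uint8 image.

```python
img = tf.zeros([64, 64, 3], dtype=tf.uint8)
cut = cutout(img, pad_size=8, replace=128)  # a random 16x16 patch becomes 128
```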
```python
def _fill_rectangle(image,
                    center_width,
                    center_height,
                    half_width,
                    half_height,
                    replace=None):
  """Fill blank area."""
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]

  lower_pad = tf.maximum(0, center_height - half_height)
  upper_pad = tf.maximum(0, image_height - center_height - half_height)
  left_pad = tf.maximum(0, center_width - half_width)
  right_pad = tf.maximum(0, image_width - center_width - half_width)

  cutout_shape = [
      image_height - (lower_pad + upper_pad),
      image_width - (left_pad + right_pad)
  ]
  padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
  mask = tf.pad(
      tf.zeros(cutout_shape, dtype=image.dtype),
      padding_dims,
      constant_values=1)
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, 3])

  if replace is None:
    fill = tf.random.normal(tf.shape(image), dtype=image.dtype)
  elif isinstance(replace, tf.Tensor):
    fill = replace
  else:
    fill = tf.ones_like(image, dtype=image.dtype) * replace
  image = tf.where(tf.equal(mask, 0), fill, image)

  return image


def cutout_video(image: tf.Tensor, replace: int = 0) -> tf.Tensor:
  """Apply cutout (https://arxiv.org/abs/1708.04552) to a video.

  This operation applies a random-size 3D mask of zeros to a random location
  within `image`. The pixel values filled in will be of the value `replace`.
  The location where the mask will be applied is randomly chosen uniformly
  over the whole image. The size of the mask is randomly sampled uniformly
  from [0.25*height, 0.5*height], [0.25*width, 0.5*width], and
  [1, 0.25*depth], which represent the height, width, and number of frames of
  the input video tensor respectively.

  Args:
    image: A video Tensor of type uint8.
    replace: What pixel value to fill in the image in the area that has the
      cutout mask applied to it.

  Returns:
    A video Tensor that is of type uint8.
  """
  image_depth = tf.shape(image)[0]
  image_height = tf.shape(image)[1]
  image_width = tf.shape(image)[2]

  # Sample the center location in the image where the zero mask will be
  # applied.
  cutout_center_height = tf.random.uniform(
      shape=[], minval=0, maxval=image_height, dtype=tf.int32)
  cutout_center_width = tf.random.uniform(
      shape=[], minval=0, maxval=image_width, dtype=tf.int32)
  cutout_center_depth = tf.random.uniform(
      shape=[], minval=0, maxval=image_depth, dtype=tf.int32)

  pad_size_height = tf.random.uniform(
      shape=[],
      minval=tf.maximum(1, tf.cast(image_height / 4, tf.int32)),
      maxval=tf.maximum(2, tf.cast(image_height / 2, tf.int32)),
      dtype=tf.int32)
  pad_size_width = tf.random.uniform(
      shape=[],
      minval=tf.maximum(1, tf.cast(image_width / 4, tf.int32)),
      maxval=tf.maximum(2, tf.cast(image_width / 2, tf.int32)),
      dtype=tf.int32)
  pad_size_depth = tf.random.uniform(
      shape=[],
      minval=1,
      maxval=tf.maximum(2, tf.cast(image_depth / 4, tf.int32)),
      dtype=tf.int32)

  lower_pad = tf.maximum(0, cutout_center_height - pad_size_height)
  upper_pad = tf.maximum(
      0, image_height - cutout_center_height - pad_size_height)
  left_pad = tf.maximum(0, cutout_center_width - pad_size_width)
  right_pad = tf.maximum(
      0, image_width - cutout_center_width - pad_size_width)
  back_pad = tf.maximum(0, cutout_center_depth - pad_size_depth)
  forward_pad = tf.maximum(
      0, image_depth - cutout_center_depth - pad_size_depth)

  cutout_shape = [
      image_depth - (back_pad + forward_pad),
      image_height - (lower_pad + upper_pad),
      image_width - (left_pad + right_pad),
  ]
  padding_dims = [[back_pad, forward_pad], [lower_pad, upper_pad],
                  [left_pad, right_pad]]
  mask = tf.pad(
      tf.zeros(cutout_shape, dtype=image.dtype),
      padding_dims,
      constant_values=1)
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, 1, 3])
  image = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(image, dtype=image.dtype) * replace, image)
  return image


def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
  """Solarize the input image(s)."""
  # For each pixel in the image, select the pixel
  # if the value is less than the threshold.
  # Otherwise, subtract 255 from the pixel.
  return tf.where(image < threshold, image, 255 - image)


def solarize_add(image: tf.Tensor,
                 addition: int = 0,
                 threshold: int = 128) -> tf.Tensor:
  """Additive solarize the input image(s)."""
  # For each pixel in the image less than threshold
  # we add 'addition' amount to it and then clip the
  # pixel value to be between 0 and 255. The value
  # of 'addition' is between -128 and 128.
  added_image = tf.cast(image, tf.int64) + addition
  added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
  return tf.where(image < threshold, added_image, image)
```
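A quick sketch of the solarize rule (not part of the commit): pixels at or above the threshold are inverted to `255 - v`, pixels below pass through unchanged.

```python
px = tf.constant([[0, 100, 128, 200, 255]], dtype=tf.uint8)
print(solarize(px).numpy())  # [[  0 100 127  55   0]]
```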
```python
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Color."""
  degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
  return blend(degenerate, image, factor)


def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Contrast."""
  degenerate = tf.image.rgb_to_grayscale(image)
  # Cast before calling tf.histogram.
  degenerate = tf.cast(degenerate, tf.int32)

  # Compute the grayscale histogram, then compute the mean pixel value,
  # and create a constant image size of that value. Use that as the
  # blending degenerate target of the original image.
  hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
  mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0
  degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
  degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
  degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
  return blend(degenerate, image, factor)


def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Brightness."""
  degenerate = tf.zeros_like(image)
  return blend(degenerate, image, factor)


def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
  """Equivalent of PIL Posterize."""
  shift = 8 - bits
  return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
```
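A worked example of the posterize bit math (not part of the commit): keeping `bits` bits means shifting right then left by `8 - bits`, which zeroes the low-order bits of each uint8 value.

```python
px = tf.constant([[37, 200, 255]], dtype=tf.uint8)
print(posterize(px, bits=2).numpy())  # [[  0 192 192]]
# 37 >> 6 = 0 -> 0 << 6 = 0; 200 >> 6 = 3 -> 3 << 6 = 192; same for 255.
```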
def
wrapped_rotate
(
image
:
tf
.
Tensor
,
degrees
:
float
,
replace
:
int
)
->
tf
.
Tensor
:
"""Applies rotation with wrap/unwrap."""
image
=
rotate
(
wrap
(
image
),
degrees
=
degrees
)
return
unwrap
(
image
,
replace
)
def
translate_x
(
image
:
tf
.
Tensor
,
pixels
:
int
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Translate in X dimension."""
image
=
translate
(
wrap
(
image
),
[
-
pixels
,
0
])
return
unwrap
(
image
,
replace
)
def
translate_y
(
image
:
tf
.
Tensor
,
pixels
:
int
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Translate in Y dimension."""
image
=
translate
(
wrap
(
image
),
[
0
,
-
pixels
])
return
unwrap
(
image
,
replace
)
def
shear_x
(
image
:
tf
.
Tensor
,
level
:
float
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image
=
transform
(
image
=
wrap
(
image
),
transforms
=
[
1.
,
level
,
0.
,
0.
,
1.
,
0.
,
0.
,
0.
])
return
unwrap
(
image
,
replace
)
def
shear_y
(
image
:
tf
.
Tensor
,
level
:
float
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image
=
transform
(
image
=
wrap
(
image
),
transforms
=
[
1.
,
0.
,
0.
,
level
,
1.
,
0.
,
0.
,
0.
])
return
unwrap
(
image
,
replace
)


def autocontrast(image: tf.Tensor) -> tf.Tensor:
  """Implements Autocontrast function from PIL using TF ops.

  Args:
    image: A 3D uint8 tensor.

  Returns:
    The image after it has had autocontrast applied to it and will be of type
    uint8.
  """

  def scale_channel(image: tf.Tensor) -> tf.Tensor:
    """Scale the 2D image using the autocontrast rule."""
    # A possibly cheaper version can be done using cumsum/unique_with_counts
    # over the histogram values, rather than iterating over the entire image,
    # to compute mins and maxes.
    lo = tf.cast(tf.reduce_min(image), tf.float32)
    hi = tf.cast(tf.reduce_max(image), tf.float32)

    # Scale the image, making the lowest value 0 and the highest value 255.
    def scale_values(im):
      scale = 255.0 / (hi - lo)
      offset = -lo * scale
      im = tf.cast(im, tf.float32) * scale + offset
      im = tf.clip_by_value(im, 0.0, 255.0)
      return tf.cast(im, tf.uint8)

    result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
    return result

  # Assumes RGB for now. Scales each channel independently
  # and then stacks the result.
  s1 = scale_channel(image[..., 0])
  s2 = scale_channel(image[..., 1])
  s3 = scale_channel(image[..., 2])
  image = tf.stack([s1, s2, s3], -1)
  return image
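

# Illustrative sketch (an assumed example, not part of the upstream file):
# autocontrast linearly stretches each channel so its minimum maps to 0 and
# its maximum to 255. A channel spanning [50, 150] ends up spanning [0, 255].
def _autocontrast_example():
  low = tf.fill([1, 1, 3], tf.constant(50, tf.uint8))
  high = tf.fill([1, 1, 3], tf.constant(150, tf.uint8))
  image = tf.concat([low, high], axis=0)  # Shape [2, 1, 3].
  return autocontrast(image)  # Rows become 0 and 255 in every channel.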


def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Implements Sharpness function from PIL using TF ops."""
  orig_image = image
  image = tf.cast(image, tf.float32)
  # Make image 4D for conv operation.
  image = tf.expand_dims(image, 0)
  # SMOOTH PIL Kernel.
  if orig_image.shape.rank == 3:
    kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
                         dtype=tf.float32,
                         shape=[3, 3, 1, 1]) / 13.
    # Tile across channel dimension.
    kernel = tf.tile(kernel, [1, 1, 3, 1])
    strides = [1, 1, 1, 1]
    degenerate = tf.nn.depthwise_conv2d(
        image, kernel, strides, padding='VALID', dilations=[1, 1])
  elif orig_image.shape.rank == 4:
    kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
                         dtype=tf.float32,
                         shape=[1, 3, 3, 1, 1]) / 13.
    strides = [1, 1, 1, 1, 1]
    # Run the kernel across each channel.
    channels = tf.split(image, 3, axis=-1)
    degenerates = [
        tf.nn.conv3d(channel, kernel, strides, padding='VALID',
                     dilations=[1, 1, 1, 1, 1])
        for channel in channels
    ]
    degenerate = tf.concat(degenerates, -1)
  else:
    raise ValueError('Bad image rank: {}'.format(image.shape.rank))
  degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
  degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])

  # For the borders of the resulting image, fill in the values of the
  # original image.
  mask = tf.ones_like(degenerate)
  paddings = [[0, 0]] * (orig_image.shape.rank - 3)
  padded_mask = tf.pad(mask, paddings + [[1, 1], [1, 1], [0, 0]])
  padded_degenerate = tf.pad(degenerate, paddings + [[1, 1], [1, 1], [0, 0]])
  result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)

  # Blend the final result.
  return blend(result, orig_image, factor)


def equalize(image: tf.Tensor) -> tf.Tensor:
  """Implements Equalize function from PIL using TF ops."""

  def scale_channel(im, c):
    """Scale the data in the channel to implement equalize."""
    im = tf.cast(im[..., c], tf.int32)
    # Compute the histogram of the image channel.
    histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)

    # For the purposes of computing the step, filter out the zero values.
    nonzero = tf.where(tf.not_equal(histo, 0))
    nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
    step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255

    def build_lut(histo, step):
      # Compute the cumulative sum, shifting by step // 2
      # and then normalization by step.
      lut = (tf.cumsum(histo) + (step // 2)) // step
      # Shift lut, prepending with 0.
      lut = tf.concat([[0], lut[:-1]], 0)
      # Clip the counts to be in range. This is done
      # in the C code for image.point.
      return tf.clip_by_value(lut, 0, 255)

    # If step is zero, return the original image. Otherwise, build
    # lut from the full histogram and step and then index from it.
    result = tf.cond(
        tf.equal(step, 0), lambda: im,
        lambda: tf.gather(build_lut(histo, step), im))

    return tf.cast(result, tf.uint8)

  # Assumes RGB for now. Scales each channel independently
  # and then stacks the result.
  s1 = scale_channel(image, 0)
  s2 = scale_channel(image, 1)
  s3 = scale_channel(image, 2)
  image = tf.stack([s1, s2, s3], -1)
  return image
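

# Illustrative sketch (an assumed example, not part of the upstream file):
# equalize builds a per-channel lookup table from the cumulative histogram.
# A constant image yields step == 0, so it is returned unchanged.
def _equalize_example():
  image = tf.fill([4, 4, 3], tf.constant(7, tf.uint8))
  return equalize(image)  # Degenerate histogram; output equals input.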


def invert(image: tf.Tensor) -> tf.Tensor:
  """Inverts the image pixels."""
  image = tf.convert_to_tensor(image)
  return 255 - image


def wrap(image: tf.Tensor) -> tf.Tensor:
  """Returns 'image' with an extra channel set to all 1s."""
  shape = tf.shape(image)
  extended_channel = tf.expand_dims(tf.ones(shape[:-1], image.dtype), -1)
  extended = tf.concat([image, extended_channel], axis=-1)
  return extended


def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
  """Unwraps an image produced by wrap.

  Where there is a 0 in the last channel for every spatial position,
  the rest of the three channels in that spatial dimension are grayed
  (set to 128). Operations like translate and shear on a wrapped
  Tensor will leave 0s in empty locations. Some transformations look
  at the intensity of values to do preprocessing, and we want these
  empty pixels to assume the 'average' value, rather than pure black.

  Args:
    image: A 3D Image Tensor with 4 channels.
    replace: A one or three value 1D tensor to fill empty pixels.

  Returns:
    image: A 3D image Tensor with 3 channels.
  """
  image_shape = tf.shape(image)
  # Flatten the spatial dimensions.
  flattened_image = tf.reshape(image, [-1, image_shape[-1]])

  # Find all pixels where the last channel is zero.
  alpha_channel = tf.expand_dims(flattened_image[..., 3], axis=-1)

  replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)

  # Where they are zero, fill them in with 'replace'.
  flattened_image = tf.where(
      tf.equal(alpha_channel, 0),
      tf.ones_like(flattened_image, dtype=image.dtype) * replace,
      flattened_image)

  image = tf.reshape(flattened_image, image_shape)
  image = tf.slice(
      image, [0] * image.shape.rank,
      tf.concat([image_shape[:-1], [3]], -1))
  return image
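

# Illustrative sketch (an assumed example, not part of the upstream file):
# a wrap/unwrap round trip. `wrap` appends an all-ones marker channel;
# `unwrap` drops it and fills any position whose marker became 0 with
# `replace`. With no intervening transform, nothing is replaced.
def _wrap_unwrap_example():
  image = tf.zeros([2, 2, 3], dtype=tf.uint8)
  wrapped = wrap(image)            # Shape [2, 2, 4], last channel all 1s.
  replace = tf.constant([128, 128, 128], tf.uint8)
  return unwrap(wrapped, replace)  # Shape [2, 2, 3]; unchanged here.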


def _scale_bbox_only_op_probability(prob):
  """Reduce the probability of the bbox-only operation.

  Probability is reduced so that we do not distort the content of too many
  bounding boxes that are close to each other. The value of 3.0 was a
  hyperparameter chosen when designing the autoaugment algorithm that we
  found empirically to work well.

  Args:
    prob: Float that is the probability of applying the bbox-only operation.

  Returns:
    Reduced probability.
  """
  return prob / 3.0


def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
  """Applies augmentation_func to the subsection of image indicated by bbox.

  Args:
    image: 3D uint8 Tensor.
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    augmentation_func: Augmentation function that will be applied to the
      subsection of image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A modified version of image, where the bbox location in the image will
    have augmentation_func applied to it.
  """
  image_height = tf.cast(tf.shape(image)[0], tf.float32)
  image_width = tf.cast(tf.shape(image)[1], tf.float32)
  min_y = tf.cast(image_height * bbox[0], tf.int32)
  min_x = tf.cast(image_width * bbox[1], tf.int32)
  max_y = tf.cast(image_height * bbox[2], tf.int32)
  max_x = tf.cast(image_width * bbox[3], tf.int32)
  image_height = tf.cast(image_height, tf.int32)
  image_width = tf.cast(image_width, tf.int32)

  # Clip to be sure the max values do not fall out of range.
  max_y = tf.minimum(max_y, image_height - 1)
  max_x = tf.minimum(max_x, image_width - 1)

  # Get the sub-tensor that is the image within the bounding box region.
  bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]

  # Apply the augmentation function to the bbox portion of the image.
  augmented_bbox_content = augmentation_func(bbox_content, *args)

  # Pad the augmented_bbox_content and the mask to match the shape of
  # original image.
  augmented_bbox_content = tf.pad(
      augmented_bbox_content,
      [[min_y, (image_height - 1) - max_y],
       [min_x, (image_width - 1) - max_x], [0, 0]])

  # Create a mask that will be used to zero out a part of the original image.
  mask_tensor = tf.zeros_like(bbox_content)

  mask_tensor = tf.pad(
      mask_tensor,
      [[min_y, (image_height - 1) - max_y],
       [min_x, (image_width - 1) - max_x], [0, 0]],
      constant_values=1)

  # Replace the old bbox content with the new augmented content.
  image = image * mask_tensor + augmented_bbox_content
  return image


def _concat_bbox(bbox, bboxes):
  """Helper function that concatenates bbox to bboxes along the first dimension."""

  # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
  # we discard bboxes and start the bboxes Tensor with the current bbox.
  bboxes_sum_check = tf.reduce_sum(bboxes)
  bbox = tf.expand_dims(bbox, 0)
  # This check will be true when it is an _INVALID_BOX.
  bboxes = tf.cond(
      tf.equal(bboxes_sum_check, -4.0), lambda: bbox,
      lambda: tf.concat([bboxes, bbox], 0))
  return bboxes


def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
                                     augmentation_func, func_changes_bbox,
                                     *args):
  """Applies _apply_bbox_augmentation with probability prob.

  Args:
    image: 3D uint8 Tensor.
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
      have been altered by aug_func. These will only be changed when
      func_changes_bbox is set to true. Each bbox has 4 elements
      (min_y, min_x, max_y, max_x) of type float that are the normalized
      bbox coordinates between 0 and 1.
    prob: Float that is the probability of applying _apply_bbox_augmentation.
    augmentation_func: Augmentation function that will be applied to the
      subsection of image.
    func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
      to image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A tuple. First element is a modified version of image, where the bbox
    location in the image will have augmentation_func applied to it if it is
    chosen to be called with probability `prob`. The second element is a
    Tensor of Tensors of length 4 that will contain the altered bbox after
    applying augmentation_func.
  """
  should_apply_op = tf.cast(
      tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
  if func_changes_bbox:
    augmented_image, bbox = tf.cond(
        should_apply_op,
        lambda: augmentation_func(image, bbox, *args),
        lambda: (image, bbox))
  else:
    augmented_image = tf.cond(
        should_apply_op,
        lambda: _apply_bbox_augmentation(image, bbox, augmentation_func,
                                         *args),
        lambda: image)
  new_bboxes = _concat_bbox(bbox, new_bboxes)
  return augmented_image, new_bboxes


def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,
                                           func_changes_bbox, *args):
  """Checks to be sure num bboxes > 0 before calling inner function."""
  num_bboxes = tf.shape(bboxes)[0]
  image, bboxes = tf.cond(
      tf.equal(num_bboxes, 0),
      lambda: (image, bboxes),
      # pylint:disable=g-long-lambda
      lambda: _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
                                             func_changes_bbox, *args))
  # pylint:enable=g-long-lambda
  return image, bboxes


# Represents an invalid bounding box that is used for checking for padding
# lists of bounding box coordinates for a few augmentation operations.
_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]]


def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
                                   func_changes_bbox, *args):
  """Applies aug_func to the image for each bbox in bboxes.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float.
    prob: Float that is the probability of applying aug_func to a specific
      bounding box within the image.
    aug_func: Augmentation function that will be applied to the
      subsections of image indicated by the bbox values in bboxes.
    func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
      to image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A modified version of image, where each bbox location in the image will
    have augmentation_func applied to it if it is chosen to be called with
    probability prob independently across all bboxes. Also the final
    bboxes are returned that will be unchanged if func_changes_bbox is set to
    false and if true, the new altered ones will be returned.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  # Will keep track of the new altered bboxes after aug_func is repeatedly
  # applied. The -1 values are a dummy value and this first Tensor will be
  # removed upon appending the first real bbox.
  new_bboxes = tf.constant(_INVALID_BOX)

  # If the bboxes are empty, then just give it _INVALID_BOX. The result
  # will be thrown away.
  bboxes = tf.cond(
      tf.equal(tf.size(bboxes), 0),
      lambda: tf.constant(_INVALID_BOX),
      lambda: bboxes)

  bboxes = tf.ensure_shape(bboxes, (None, 4))

  # pylint:disable=g-long-lambda
  wrapped_aug_func = (
      lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(
          _image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args))
  # pylint:enable=g-long-lambda

  # Setup the while_loop.
  num_bboxes = tf.shape(bboxes)[0]  # We loop until we go over all bboxes.
  idx = tf.constant(0)  # Counter for the while loop.

  # Conditional function when to end the loop once we go over all bboxes.
  # images_and_bboxes contain (_image, _new_bboxes).
  cond = lambda _idx, _images_and_bboxes: tf.less(_idx, num_bboxes)

  # Shuffle the bboxes so that the augmentation order is not deterministic if
  # we are not changing the bboxes with aug_func.
  if not func_changes_bbox:
    loop_bboxes = tf.random.shuffle(bboxes)
  else:
    loop_bboxes = bboxes

  # Main function of while_loop where we repeatedly apply augmentation on the
  # bboxes in the image.
  # pylint:disable=g-long-lambda
  body = lambda _idx, _images_and_bboxes: [
      _idx + 1,
      wrapped_aug_func(_images_and_bboxes[0], loop_bboxes[_idx],
                       _images_and_bboxes[1])
  ]
  # pylint:enable=g-long-lambda

  _, (image, new_bboxes) = tf.while_loop(
      cond, body, [idx, (image, new_bboxes)],
      shape_invariants=[
          idx.get_shape(),
          (image.get_shape(), tf.TensorShape([None, 4]))
      ])

  # Either return the altered bboxes or the original ones depending on if
  # we altered them in any way.
  if func_changes_bbox:
    final_bboxes = new_bboxes
  else:
    final_bboxes = bboxes

  return image, final_bboxes


def _clip_bbox(min_y, min_x, max_y, max_x):
  """Clip bounding box coordinates between 0 and 1.

  Args:
    min_y: Normalized bbox coordinate of type float between 0 and 1.
    min_x: Normalized bbox coordinate of type float between 0 and 1.
    max_y: Normalized bbox coordinate of type float between 0 and 1.
    max_x: Normalized bbox coordinate of type float between 0 and 1.

  Returns:
    Clipped coordinate values between 0 and 1.
  """
  min_y = tf.clip_by_value(min_y, 0.0, 1.0)
  min_x = tf.clip_by_value(min_x, 0.0, 1.0)
  max_y = tf.clip_by_value(max_y, 0.0, 1.0)
  max_x = tf.clip_by_value(max_x, 0.0, 1.0)
  return min_y, min_x, max_y, max_x


def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):
  """Adjusts bbox coordinates to make sure the area is > 0.

  Args:
    min_y: Normalized bbox coordinate of type float between 0 and 1.
    min_x: Normalized bbox coordinate of type float between 0 and 1.
    max_y: Normalized bbox coordinate of type float between 0 and 1.
    max_x: Normalized bbox coordinate of type float between 0 and 1.
    delta: Float, this is used to create a gap of size 2 * delta between
      bbox min/max coordinates that are the same on the boundary.
      This prevents the bbox from having an area of zero.

  Returns:
    Tuple of new bbox coordinates between 0 and 1 that will now have a
    guaranteed area > 0.
  """
  height = max_y - min_y
  width = max_x - min_x

  def _adjust_bbox_boundaries(min_coord, max_coord):
    # Make sure max is never 0 and min is never 1.
    max_coord = tf.maximum(max_coord, 0.0 + delta)
    min_coord = tf.minimum(min_coord, 1.0 - delta)
    return min_coord, max_coord

  min_y, max_y = tf.cond(
      tf.equal(height, 0.0),
      lambda: _adjust_bbox_boundaries(min_y, max_y),
      lambda: (min_y, max_y))
  min_x, max_x = tf.cond(
      tf.equal(width, 0.0),
      lambda: _adjust_bbox_boundaries(min_x, max_x),
      lambda: (min_x, max_x))
  return min_y, min_x, max_y, max_x
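

# Illustrative sketch (an assumed example, not part of the upstream file):
# a box collapsed onto the y=0 boundary has zero height, so max_y gets
# pushed out to the default delta of 0.05; the nonzero width is untouched.
def _check_bbox_area_example():
  zero = tf.constant(0.0)
  # Returns approximately (0.0, 0.0, 0.05, 0.5).
  return _check_bbox_area(
      min_y=zero, min_x=zero, max_y=zero, max_x=tf.constant(0.5))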


def _rotate_bbox(bbox, image_height, image_width, degrees):
  """Rotates the bbox coordinates by degrees.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.

  Returns:
    A tensor of the same shape as bbox, but now with the rotated coordinates.
  """
  image_height, image_width = (
      tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32))

  # Convert from degrees to radians.
  degrees_to_radians = math.pi / 180.0
  radians = degrees * degrees_to_radians

  # Translate the bbox to the center of the image and turn the normalized 0-1
  # coordinates to absolute pixel locations.
  # Y coordinates are made negative as the y axis of images goes down with
  # increasing pixel values, so we negate to make sure x axis and y axis points
  # are in the traditionally positive direction.
  min_y = -tf.cast(image_height * (bbox[0] - 0.5), tf.int32)
  min_x = tf.cast(image_width * (bbox[1] - 0.5), tf.int32)
  max_y = -tf.cast(image_height * (bbox[2] - 0.5), tf.int32)
  max_x = tf.cast(image_width * (bbox[3] - 0.5), tf.int32)
  coordinates = tf.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],
                          [max_y, max_x]])
  coordinates = tf.cast(coordinates, tf.float32)
  # Rotate the coordinates according to the rotation matrix clockwise if
  # radians is positive, else negative.
  rotation_matrix = tf.stack([[tf.cos(radians), tf.sin(radians)],
                              [-tf.sin(radians), tf.cos(radians)]])
  new_coords = tf.cast(
      tf.matmul(rotation_matrix, tf.transpose(coordinates)), tf.int32)

  # Find min/max values and convert them back to normalized 0-1 floats.
  min_y = -(tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) /
            image_height - 0.5)
  min_x = tf.cast(tf.reduce_min(new_coords[1, :]),
                  tf.float32) / image_width + 0.5
  max_y = -(tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) /
            image_height - 0.5)
  max_x = tf.cast(tf.reduce_max(new_coords[1, :]),
                  tf.float32) / image_width + 0.5

  # Clip the bboxes to be sure they fall between [0, 1].
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])


def rotate_with_bboxes(image, bboxes, degrees, replace):
  """Equivalent of PIL Rotate that rotates the image and bbox.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.
    replace: A one or three value 1D tensor to fill empty pixels.

  Returns:
    A tuple containing a 3D uint8 Tensor that will be the result of rotating
    image by degrees. The second element of the tuple is bboxes, where now
    the coordinates will be shifted to reflect the rotated image.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  # Rotate the image.
  image = wrapped_rotate(image, degrees, replace)

  # Convert bbox coordinates to pixel values.
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]
  # pylint:disable=g-long-lambda
  wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height,
                                                  image_width, degrees)
  # pylint:enable=g-long-lambda
  bboxes = tf.map_fn(wrapped_rotate_bbox, bboxes)
  return image, bboxes
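

# Illustrative usage sketch (an assumed example, not part of the upstream
# file): rotating an image and its normalized boxes together keeps the boxes
# aligned with the content. Relies on the module-level `rotate` helper
# defined earlier in this file.
def _rotate_with_bboxes_example():
  image = tf.zeros([64, 64, 3], dtype=tf.uint8)
  bboxes = tf.constant([[0.25, 0.25, 0.75, 0.75]], tf.float32)
  replace = tf.constant([128, 128, 128], tf.uint8)
  return rotate_with_bboxes(image, bboxes, degrees=15.0, replace=replace)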


def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
  """Shifts the bbox according to how the image was sheared.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    level: Float. How much to shear the image.
    shear_horizontal: If true then shear in X dimension else shear in
      the Y dimension.

  Returns:
    A tensor of the same shape as bbox, but now with the shifted coordinates.
  """
  image_height, image_width = (
      tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32))

  # Change bbox coordinates to be pixels.
  min_y = tf.cast(image_height * bbox[0], tf.int32)
  min_x = tf.cast(image_width * bbox[1], tf.int32)
  max_y = tf.cast(image_height * bbox[2], tf.int32)
  max_x = tf.cast(image_width * bbox[3], tf.int32)
  coordinates = tf.stack(
      [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
  coordinates = tf.cast(coordinates, tf.float32)

  # Shear the coordinates according to the translation matrix.
  if shear_horizontal:
    translation_matrix = tf.stack([[1, 0], [-level, 1]])
  else:
    translation_matrix = tf.stack([[1, -level], [0, 1]])
  translation_matrix = tf.cast(translation_matrix, tf.float32)
  new_coords = tf.cast(
      tf.matmul(translation_matrix, tf.transpose(coordinates)), tf.int32)

  # Find min/max values and convert them back to floats.
  min_y = tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height
  min_x = tf.cast(tf.reduce_min(new_coords[1, :]), tf.float32) / image_width
  max_y = tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height
  max_x = tf.cast(tf.reduce_max(new_coords[1, :]), tf.float32) / image_width

  # Clip the bboxes to be sure they fall between [0, 1].
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])


def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
  """Applies Shear Transformation to the image and shifts the bboxes.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
      between [0, 1].
    level: Float. How much to shear the image. This value will be between
      -0.3 and 0.3.
    replace: A one or three value 1D tensor to fill empty pixels.
    shear_horizontal: Boolean. If true then shear in X dimension else shear in
      the Y dimension.

  Returns:
    A tuple containing a 3D uint8 Tensor that will be the result of shearing
    image by level. The second element of the tuple is bboxes, where now
    the coordinates will be shifted to reflect the sheared image.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  if shear_horizontal:
    image = shear_x(image, level, replace)
  else:
    image = shear_y(image, level, replace)

  # Convert bbox coordinates to pixel values.
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]
  # pylint:disable=g-long-lambda
  wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height,
                                                image_width, level,
                                                shear_horizontal)
  # pylint:enable=g-long-lambda
  bboxes = tf.map_fn(wrapped_shear_bbox, bboxes)
  return image, bboxes


def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
  """Shifts the bbox coordinates by pixels.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    pixels: An int. How many pixels to shift the bbox.
    shift_horizontal: Boolean. If true then shift in X dimension else shift in
      Y dimension.

  Returns:
    A tensor of the same shape as bbox, but now with the shifted coordinates.
  """
  pixels = tf.cast(pixels, tf.int32)
  # Convert bbox to integer pixel locations.
  min_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[0], tf.int32)
  min_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[1], tf.int32)
  max_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[2], tf.int32)
  max_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[3], tf.int32)

  if shift_horizontal:
    min_x = tf.maximum(0, min_x - pixels)
    max_x = tf.minimum(image_width, max_x - pixels)
  else:
    min_y = tf.maximum(0, min_y - pixels)
    max_y = tf.minimum(image_height, max_y - pixels)

  # Convert bbox back to floats.
  min_y = tf.cast(min_y, tf.float32) / tf.cast(image_height, tf.float32)
  min_x = tf.cast(min_x, tf.float32) / tf.cast(image_width, tf.float32)
  max_y = tf.cast(max_y, tf.float32) / tf.cast(image_height, tf.float32)
  max_x = tf.cast(max_x, tf.float32) / tf.cast(image_width, tf.float32)

  # Clip the bboxes to be sure they fall between [0, 1].
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])


def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
  """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
      between [0, 1].
    pixels: An int. How many pixels to shift the image and bboxes.
    replace: A one or three value 1D tensor to fill empty pixels.
    shift_horizontal: Boolean. If true then shift in X dimension else shift in
      Y dimension.

  Returns:
    A tuple containing a 3D uint8 Tensor that will be the result of translating
    image by pixels. The second element of the tuple is bboxes, where now
    the coordinates will be shifted to reflect the shifted image.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  if shift_horizontal:
    image = translate_x(image, pixels, replace)
  else:
    image = translate_y(image, pixels, replace)

  # Convert bbox coordinates to pixel values.
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]
  # pylint:disable=g-long-lambda
  wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height,
                                                image_width, pixels,
                                                shift_horizontal)
  # pylint:enable=g-long-lambda
  bboxes = tf.map_fn(wrapped_shift_bbox, bboxes)
  return image, bboxes


def translate_y_only_bboxes(image: tf.Tensor, bboxes: tf.Tensor, prob: float,
                            pixels: int, replace):
  """Apply translate_y to each bbox in the image with probability prob."""
  if bboxes.shape.rank == 4:
    raise ValueError('translate_y_only_bboxes does not support rank 4 boxes')

  func_changes_bbox = False
  prob = _scale_bbox_only_op_probability(prob)
  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,
                                                translate_y, func_changes_bbox,
                                                pixels, replace)


def _randomly_negate_tensor(tensor):
  """With 50% prob turn the tensor negative."""
  should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool)
  final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
  return final_tensor


def _rotate_level_to_arg(level: float):
  level = (level / _MAX_LEVEL) * 30.
  level = _randomly_negate_tensor(level)
  return (level,)


def _shrink_level_to_arg(level: float):
  """Converts level to ratio by which we shrink the image content."""
  if level == 0:
    return (1.0,)  # if level is zero, do not shrink the image
  # Maximum shrinking ratio is 2.9.
  level = 2. / (_MAX_LEVEL / level) + 0.9
  return (level,)


def _enhance_level_to_arg(level: float):
  return ((level / _MAX_LEVEL) * 1.8 + 0.1,)


def _shear_level_to_arg(level: float):
  level = (level / _MAX_LEVEL) * 0.3
  # Flip level to negative with 50% chance.
  level = _randomly_negate_tensor(level)
  return (level,)


def _translate_level_to_arg(level: float, translate_const: float):
  level = (level / _MAX_LEVEL) * float(translate_const)
  # Flip level to negative with 50% chance.
  level = _randomly_negate_tensor(level)
  return (level,)


def _mult_to_arg(level: float, multiplier: float = 1.):
  return (int((level / _MAX_LEVEL) * multiplier),)
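

# Illustrative sketch (an assumed example, not part of the upstream file):
# the *_level_to_arg helpers map a policy magnitude in [0, _MAX_LEVEL]
# (with _MAX_LEVEL = 10 defined earlier in this file) onto each op's native
# argument range, e.g. Solarize uses a multiplier of 256.
def _level_to_arg_example():
  threshold = _mult_to_arg(level=5.0, multiplier=256)  # -> (128,)
  enhance = _enhance_level_to_arg(level=5.0)           # -> (1.0,)
  return threshold, enhance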


def _apply_func_with_prob(func: Any, image: tf.Tensor,
                          bboxes: Optional[tf.Tensor], args: Any, prob: float):
  """Apply `func` to image w/ `args` as input with probability `prob`."""
  assert isinstance(args, tuple)
  assert inspect.getfullargspec(func)[0][1] == 'bboxes'

  # Apply the function with probability `prob`.
  should_apply_op = tf.cast(
      tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
  augmented_image, augmented_bboxes = tf.cond(
      should_apply_op,
      lambda: func(image, bboxes, *args),
      lambda: (image, bboxes))
  return augmented_image, augmented_bboxes


def select_and_apply_random_policy(policies: Any,
                                   image: tf.Tensor,
                                   bboxes: Optional[tf.Tensor] = None):
  """Select a random policy from `policies` and apply it to `image`."""
  policy_to_select = tf.random.uniform([], maxval=len(policies),
                                       dtype=tf.int32)
  # Note that using tf.case instead of tf.conds would result in significantly
  # larger graphs and would even break export for some larger policies.
  for (i, policy) in enumerate(policies):
    image, bboxes = tf.cond(
        tf.equal(i, policy_to_select),
        lambda selected_policy=policy: selected_policy(image, bboxes),
        lambda: (image, bboxes))
  return image, bboxes


NAME_TO_FUNC = {
    'AutoContrast': autocontrast,
    'Equalize': equalize,
    'Invert': invert,
    'Rotate': wrapped_rotate,
    'Posterize': posterize,
    'Solarize': solarize,
    'SolarizeAdd': solarize_add,
    'Color': color,
    'Contrast': contrast,
    'Brightness': brightness,
    'Sharpness': sharpness,
    'ShearX': shear_x,
    'ShearY': shear_y,
    'TranslateX': translate_x,
    'TranslateY': translate_y,
    'Cutout': cutout,
    'Rotate_BBox': rotate_with_bboxes,
    # pylint:disable=g-long-lambda
    'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
        image, bboxes, level, replace, shear_horizontal=True),
    'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
        image, bboxes, level, replace, shear_horizontal=False),
    'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
        image, bboxes, pixels, replace, shift_horizontal=True),
    'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
        image, bboxes, pixels, replace, shift_horizontal=False),
    # pylint:enable=g-long-lambda
    'TranslateY_Only_BBoxes': translate_y_only_bboxes,
}


# Functions that require a `bboxes` parameter.
REQUIRE_BOXES_FUNCS = frozenset({
    'Rotate_BBox',
    'ShearX_BBox',
    'ShearY_BBox',
    'TranslateX_BBox',
    'TranslateY_BBox',
    'TranslateY_Only_BBoxes',
})

# Functions that have a 'prob' parameter.
PROB_FUNCS = frozenset({
    'TranslateY_Only_BBoxes',
})

# Functions that have a 'replace' parameter.
REPLACE_FUNCS = frozenset({
    'Rotate',
    'TranslateX',
    'ShearX',
    'ShearY',
    'TranslateY',
    'Cutout',
    'Rotate_BBox',
    'ShearX_BBox',
    'ShearY_BBox',
    'TranslateX_BBox',
    'TranslateY_BBox',
    'TranslateY_Only_BBoxes',
})


def level_to_arg(cutout_const: float, translate_const: float):
  """Creates a dict mapping image operation names to their arguments."""

  no_arg = lambda level: ()
  posterize_arg = lambda level: _mult_to_arg(level, 4)
  solarize_arg = lambda level: _mult_to_arg(level, 256)
  solarize_add_arg = lambda level: _mult_to_arg(level, 110)
  cutout_arg = lambda level: _mult_to_arg(level, cutout_const)
  translate_arg = lambda level: _translate_level_to_arg(level, translate_const)
  translate_bbox_arg = lambda level: _translate_level_to_arg(level, 120)

  args = {
      'AutoContrast': no_arg,
      'Equalize': no_arg,
      'Invert': no_arg,
      'Rotate': _rotate_level_to_arg,
      'Posterize': posterize_arg,
      'Solarize': solarize_arg,
      'SolarizeAdd': solarize_add_arg,
      'Color': _enhance_level_to_arg,
      'Contrast': _enhance_level_to_arg,
      'Brightness': _enhance_level_to_arg,
      'Sharpness': _enhance_level_to_arg,
      'ShearX': _shear_level_to_arg,
      'ShearY': _shear_level_to_arg,
      'Cutout': cutout_arg,
      'TranslateX': translate_arg,
      'TranslateY': translate_arg,
      'Rotate_BBox': _rotate_level_to_arg,
      'ShearX_BBox': _shear_level_to_arg,
      'ShearY_BBox': _shear_level_to_arg,
      # pylint:disable=g-long-lambda
      'TranslateX_BBox': lambda level: _translate_level_to_arg(
          level, translate_const),
      'TranslateY_BBox': lambda level: _translate_level_to_arg(
          level, translate_const),
      # pylint:enable=g-long-lambda
      'TranslateY_Only_BBoxes': translate_bbox_arg,
  }
  return args


def bbox_wrapper(func):
  """Adds a bboxes function argument to func and returns unchanged bboxes."""

  def wrapper(images, bboxes, *args, **kwargs):
    return (func(images, *args, **kwargs), bboxes)

  return wrapper


def _parse_policy_info(name: Text,
                       prob: float,
                       level: float,
                       replace_value: List[int],
                       cutout_const: float,
                       translate_const: float,
                       level_std: float = 0.) -> Tuple[Any, float, Any]:
  """Return the function that corresponds to `name` and update `level` param."""
  func = NAME_TO_FUNC[name]

  if level_std > 0:
    level += tf.random.normal([], dtype=tf.float32)
    level = tf.clip_by_value(level, 0., _MAX_LEVEL)

  args = level_to_arg(cutout_const, translate_const)[name](level)

  if name in PROB_FUNCS:
    # Add in the prob arg if it is required for the function that is called.
    args = tuple([prob] + list(args))

  if name in REPLACE_FUNCS:
    # Add in replace arg if it is required for the function that is called.
    args = tuple(list(args) + [replace_value])

  # Add bboxes as the second positional argument for the function if it does
  # not already exist.
  if 'bboxes' not in inspect.getfullargspec(func)[0]:
    func = bbox_wrapper(func)

  return func, prob, args
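

# Illustrative sketch (an assumed example, not part of the upstream file):
# parsing a single policy entry yields the op's function, its probability,
# and the fully resolved argument tuple (with the replace/prob arguments
# added where the op requires them).
def _parse_policy_info_example():
  func, prob, args = _parse_policy_info(
      name='Rotate', prob=0.8, level=7.0, replace_value=[128] * 3,
      cutout_const=100., translate_const=250.)
  # `func` is bbox-aware (wrapped if needed); call it as
  # func(image, bboxes, *args).
  return func, prob, args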


class ImageAugment(object):
  """Image augmentation class for applying image distortions."""

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """Given an image tensor, returns a distorted image with the same shape.

    Args:
      image: `Tensor` of shape [height, width, 3] or
        [num_frames, height, width, 3] representing an image or image sequence.

    Returns:
      The augmented version of `image`.
    """
    raise NotImplementedError()

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Distorts the image and bounding boxes.

    Args:
      image: `Tensor` of shape [height, width, 3] or
        [num_frames, height, width, 3] representing an image or image sequence.
      bboxes: `Tensor` of shape [num_boxes, 4] or [num_frames, num_boxes, 4]
        representing bounding boxes for an image or image sequence.

    Returns:
      The augmented version of `image` and `bboxes`.
    """
    raise NotImplementedError


class AutoAugment(ImageAugment):
  """Applies the AutoAugment policy to images.

  AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
  """

  def __init__(self,
               augmentation_name: Text = 'v0',
               policies: Optional[Iterable[Iterable[Tuple[Text, float,
                                                          float]]]] = None,
               cutout_const: float = 100,
               translate_const: float = 250):
    """Applies the AutoAugment policy to images.

    Args:
      augmentation_name: The name of the AutoAugment policy to use. The
        available options are `v0`, `test`, `reduced_cifar10`, `svhn` and
        `reduced_imagenet`. `v0` is the policy used for all
        of the results in the paper and was found to achieve the best results
        on the COCO dataset. `v1`, `v2` and `v3` are additional good policies
        found on the COCO dataset that have slight variation in what
        operations were used during the search procedure along with how many
        operations are applied in parallel to a single image (2 vs 3). Make
        sure to set `policies` to `None` (the default) if you want to set
        options using `augmentation_name`.
      policies: list of lists of tuples in the form `(func, prob, level)`,
        `func` is a string name of the augmentation function, `prob` is the
        probability of applying the `func` operation, `level` (or magnitude)
        is the input argument for `func`. For example:
        ```
        [[('Equalize', 0.9, 3), ('Color', 0.7, 8)],
         [('Invert', 0.6, 5), ('Rotate', 0.2, 9), ('ShearX', 0.1, 2)], ...]
        ```
        The outer-most list must be 3-d. The number of operations in a
        sub-policy can vary from one sub-policy to another.
        If you provide `policies` as input, any option set with
        `augmentation_name` will get overridden as they are mutually
        exclusive.
      cutout_const: multiplier for applying cutout.
      translate_const: multiplier for applying translation.

    Raises:
      ValueError if `augmentation_name` is unsupported.
    """
    super(AutoAugment, self).__init__()

    self.augmentation_name = augmentation_name
    self.cutout_const = float(cutout_const)
    self.translate_const = float(translate_const)
    self.available_policies = {
        'detection_v0': self.detection_policy_v0(),
        'v0': self.policy_v0(),
        'test': self.policy_test(),
        'simple': self.policy_simple(),
        'reduced_cifar10': self.policy_reduced_cifar10(),
        'svhn': self.policy_svhn(),
        'reduced_imagenet': self.policy_reduced_imagenet(),
    }

    if not policies:
      if augmentation_name not in self.available_policies:
        raise ValueError(
            'Invalid augmentation_name: {}'.format(augmentation_name))
      self.policies = self.available_policies[augmentation_name]
    else:
      self._check_policy_shape(policies)
      self.policies = policies

  def _check_policy_shape(self, policies):
    """Checks dimension and shape of the custom policy.

    Args:
      policies: List of list of tuples in the form `(func, prob, level)`. Must
        have shape of `(:, :, 3)`.

    Raises:
      ValueError if the shape of `policies` is unexpected.
    """
    in_shape = np.array(policies).shape
    if len(in_shape) != 3 or in_shape[-1:] != (3,):
      raise ValueError('Wrong shape detected for custom policy. Expected '
                       '(:, :, 3) but got {}.'.format(in_shape))

  def _make_tf_policies(self):
    """Prepares the TF functions for augmentations based on the policies."""
    replace_value = [128] * 3

    # func is the string name of the augmentation function, prob is the
    # probability of applying the operation and level is the parameter
    # associated with the tf op.

    # tf_policies are functions that take in an image and return an augmented
    # image.
    tf_policies = []
    for policy in self.policies:
      tf_policy = []
      assert_ranges = []
      # Link string name to the correct python function and make sure the
      # correct argument is passed into that function.
      for policy_info in policy:
        _, prob, level = policy_info
        assert_ranges.append(tf.Assert(tf.less_equal(prob, 1.), [prob]))
        assert_ranges.append(
            tf.Assert(tf.less_equal(level, int(_MAX_LEVEL)), [level]))

        policy_info = list(policy_info) + [
            replace_value, self.cutout_const, self.translate_const
        ]
        tf_policy.append(_parse_policy_info(*policy_info))

      # Now build the tf policy that will apply the augmentation procedure
      # on image.
      def make_final_policy(tf_policy_):

        def final_policy(image_, bboxes_):
          for func, prob, args in tf_policy_:
            image_, bboxes_ = _apply_func_with_prob(func, image_, bboxes_,
                                                    args, prob)
          return image_, bboxes_

        return final_policy

      with tf.control_dependencies(assert_ranges):
        tf_policies.append(make_final_policy(tf_policy))
    return tf_policies

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """See base class."""
    input_image_type = image.dtype
    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    tf_policies = self._make_tf_policies()
    image, _ = select_and_apply_random_policy(tf_policies, image, bboxes=None)
    return image

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """See base class."""
    input_image_type = image.dtype
    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    tf_policies = self._make_tf_policies()
    image, bboxes = select_and_apply_random_policy(tf_policies, image, bboxes)
    return image, bboxes

  @staticmethod
  def detection_policy_v0():
    """Autoaugment policy that was used in AutoAugment Paper for Detection.

    https://arxiv.org/pdf/1906.11172

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
    ]
    return policy

  @staticmethod
  def policy_v0():
    """Autoaugment policy that was used in AutoAugment Paper.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
        [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
        [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
        [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
        [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
        [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
        [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
    ]
    return policy

  @staticmethod
  def policy_reduced_cifar10():
    """Autoaugment policy for reduced CIFAR-10 dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Invert', 0.1, 7), ('Contrast', 0.2, 6)],
        [('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)],
        [('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)],
        [('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)],
        [('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)],
        [('ShearY', 0.2, 7), ('Posterize', 0.3, 7)],
        [('Color', 0.4, 3), ('Brightness', 0.6, 7)],
        [('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)],
        [('Equalize', 0.6, 5), ('Equalize', 0.5, 1)],
        [('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)],
        [('Color', 0.7, 7), ('TranslateX', 0.5, 8)],
        [('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)],
        [('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)],
        [('Brightness', 0.9, 6), ('Color', 0.2, 8)],
        [('Solarize', 0.5, 2), ('Invert', 0.0, 3)],
        [('Equalize', 0.2, 0), ('AutoContrast', 0.6, 0)],
        [('Equalize', 0.2, 8), ('Equalize', 0.6, 4)],
        [('Color', 0.9, 9), ('Equalize', 0.6, 6)],
        [('AutoContrast', 0.8, 4), ('Solarize', 0.2, 8)],
        [('Brightness', 0.1, 3), ('Color', 0.7, 0)],
        [('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)],
        [('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)],
        [('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)],
        [('Equalize', 0.8, 8), ('Invert', 0.1, 3)],
        [('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)],
    ]
    return policy

  @staticmethod
  def policy_svhn():
    """Autoaugment policy for SVHN dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('ShearX', 0.9, 4), ('Invert', 0.2, 3)],
        [('ShearY', 0.9, 8), ('Invert', 0.7, 5)],
        [('Equalize', 0.6, 5), ('Solarize', 0.6, 6)],
        [('Invert', 0.9, 3), ('Equalize', 0.6, 3)],
        [('Equalize', 0.6, 1), ('Rotate', 0.9, 3)],
        [('ShearX', 0.9, 4), ('AutoContrast', 0.8, 3)],
        [('ShearY', 0.9, 8), ('Invert', 0.4, 5)],
        [('ShearY', 0.9, 5), ('Solarize', 0.2, 6)],
        [('Invert', 0.9, 6), ('AutoContrast', 0.8, 1)],
        [('Equalize', 0.6, 3), ('Rotate', 0.9, 3)],
        [('ShearX', 0.9, 4), ('Solarize', 0.3, 3)],
        [('ShearY', 0.8, 8), ('Invert', 0.7, 4)],
        [('Equalize', 0.9, 5), ('TranslateY', 0.6, 6)],
        [('Invert', 0.9, 4), ('Equalize', 0.6, 7)],
        [('Contrast', 0.3, 3), ('Rotate', 0.8, 4)],
        [('Invert', 0.8, 5), ('TranslateY', 0.0, 2)],
        [('ShearY', 0.7, 6), ('Solarize', 0.4, 8)],
        [('Invert', 0.6, 4), ('Rotate', 0.8, 4)],
        [('ShearY', 0.3, 7), ('TranslateX', 0.9, 3)],
        [('ShearX', 0.1, 6), ('Invert', 0.6, 5)],
        [('Solarize', 0.7, 2), ('TranslateY', 0.6, 7)],
        [('ShearY', 0.8, 4), ('Invert', 0.8, 8)],
        [('ShearX', 0.7, 9), ('TranslateY', 0.8, 3)],
        [('ShearY', 0.8, 5), ('AutoContrast', 0.7, 3)],
        [('ShearX', 0.7, 2), ('Invert', 0.1, 5)],
    ]
    return policy

  @staticmethod
  def policy_reduced_imagenet():
    """Autoaugment policy for reduced ImageNet dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
        [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
        [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
        [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)],
        [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
        [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)],
        [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
        [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
        [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Rotate', 0.8, 8), ('Color', 1.0, 2)],
        [('Color', 0.8, 8), ('Solarize', 0.8, 7)],
        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
        [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
        [('Color', 0.4, 0), ('Equalize', 0.6, 3)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
    ]
    return policy

  @staticmethod
  def policy_simple():
    """Same as `policy_v0`, except with custom ops removed."""
    policy = [
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
    ]
    return policy

  @staticmethod
  def policy_test():
    """Autoaugment test policy for debugging."""
    policy = [
        [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
    ]
    return policy
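

# Illustrative usage sketch (an assumed example, not part of the upstream
# file): build an AutoAugment with a custom two-op sub-policy and distort a
# dummy image. Custom policies must have shape (:, :, 3).
def _autoaugment_example():
  augmenter = AutoAugment(
      policies=[[('Equalize', 0.9, 3), ('Color', 0.7, 8)]])
  image = tf.zeros([224, 224, 3], dtype=tf.uint8)
  return augmenter.distort(image)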


def _maybe_identity(x: Optional[tf.Tensor]) -> Optional[tf.Tensor]:
  return tf.identity(x) if x is not None else None


class RandAugment(ImageAugment):
  """Applies the RandAugment policy to images.

  RandAugment is from the paper https://arxiv.org/abs/1909.13719.
  """

  def __init__(self,
               num_layers: int = 2,
               magnitude: float = 10.,
               cutout_const: float = 40.,
               translate_const: float = 100.,
               magnitude_std: float = 0.0,
               prob_to_apply: Optional[float] = None,
               exclude_ops: Optional[List[str]] = None):
    """Applies the RandAugment policy to images.

    Args:
      num_layers: Integer, the number of augmentation transformations to apply
        sequentially to an image. Represented as (N) in the paper. Usually best
        values will be in the range [1, 3].
      magnitude: Integer, shared magnitude across all augmentation operations.
        Represented as (M) in the paper. Usually best values are in the range
        [5, 10].
      cutout_const: multiplier for applying cutout.
      translate_const: multiplier for applying translation.
      magnitude_std: randomness of the severity as proposed by the authors of
        the timm library.
      prob_to_apply: The probability to apply the selected augmentation at each
        layer.
      exclude_ops: exclude selected operations.
    """
    super(RandAugment, self).__init__()

    self.num_layers = num_layers
    self.magnitude = float(magnitude)
    self.cutout_const = float(cutout_const)
    self.translate_const = float(translate_const)
    self.prob_to_apply = (
        float(prob_to_apply) if prob_to_apply is not None else None)
    self.available_ops = [
        'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize',
        'Solarize', 'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX',
        'ShearY', 'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
    ]
    self.magnitude_std = magnitude_std
    if exclude_ops:
      self.available_ops = [
          op for op in self.available_ops if op not in exclude_ops
      ]

  @classmethod
  def build_for_detection(cls,
                          num_layers: int = 2,
                          magnitude: float = 10.,
                          cutout_const: float = 40.,
                          translate_const: float = 100.,
                          magnitude_std: float = 0.0,
                          prob_to_apply: Optional[float] = None,
                          exclude_ops: Optional[List[str]] = None):
    """Builds a RandAugment that modifies bboxes for geometric transforms."""
    augmenter = cls(
        num_layers=num_layers,
        magnitude=magnitude,
        cutout_const=cutout_const,
        translate_const=translate_const,
        magnitude_std=magnitude_std,
        prob_to_apply=prob_to_apply,
        exclude_ops=exclude_ops)
    box_aware_ops_by_base_name = {
        'Rotate': 'Rotate_BBox',
        'ShearX': 'ShearX_BBox',
        'ShearY': 'ShearY_BBox',
        'TranslateX': 'TranslateX_BBox',
        'TranslateY': 'TranslateY_BBox',
    }
    augmenter.available_ops = [
        box_aware_ops_by_base_name.get(op_name) or op_name
        for op_name in augmenter.available_ops
    ]
    return augmenter

  def _distort_common(
      self,
      image: tf.Tensor,
      bboxes: Optional[tf.Tensor] = None
  ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
    """Distorts the image and optionally bounding boxes."""
    input_image_type = image.dtype

    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    replace_value = [128] * 3
    min_prob, max_prob = 0.2, 0.8

    aug_image = image
    aug_bboxes = bboxes

    for _ in range(self.num_layers):
      op_to_select = tf.random.uniform([],
                                       maxval=len(self.available_ops) + 1,
                                       dtype=tf.int32)

      branch_fns = []
      for (i, op_name) in enumerate(self.available_ops):
        prob = tf.random.uniform([],
                                 minval=min_prob,
                                 maxval=max_prob,
                                 dtype=tf.float32)
        func, _, args = _parse_policy_info(op_name, prob, self.magnitude,
                                           replace_value, self.cutout_const,
                                           self.translate_const,
                                           self.magnitude_std)
        branch_fns.append((
            i,
            # pylint:disable=g-long-lambda
            lambda selected_func=func, selected_args=args: selected_func(
                image, bboxes, *selected_args)))
        # pylint:enable=g-long-lambda

      aug_image, aug_bboxes = tf.switch_case(
          branch_index=op_to_select,
          branch_fns=branch_fns,
          default=lambda: (tf.identity(image), _maybe_identity(bboxes)))

      if self.prob_to_apply is not None:
        aug_image, aug_bboxes = tf.cond(
            tf.random.uniform(shape=[], dtype=tf.float32) <
            self.prob_to_apply,
            lambda: (tf.identity(aug_image), _maybe_identity(aug_bboxes)),
            lambda: (tf.identity(image), _maybe_identity(bboxes)))
      image = aug_image
      bboxes = aug_bboxes

    image = tf.cast(image, dtype=input_image_type)
    return image, bboxes

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """See base class."""
    image, _ = self._distort_common(image)
    return image

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """See base class."""
    image, bboxes = self._distort_common(image, bboxes)
    return image, bboxes
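

# Illustrative usage sketch (an assumed example, not part of the upstream
# file): the detection variant swaps geometric ops for their *_BBox
# counterparts so that boxes track the image transforms.
def _randaugment_detection_example():
  augmenter = RandAugment.build_for_detection(num_layers=2, magnitude=10.)
  image = tf.zeros([64, 64, 3], dtype=tf.uint8)
  bboxes = tf.constant([[0.1, 0.1, 0.9, 0.9]], tf.float32)
  return augmenter.distort_with_boxes(image, bboxes)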


class RandomErasing(ImageAugment):
  """Applies RandomErasing to a single image.

  Reference: https://arxiv.org/abs/1708.04896

  Implementation is inspired by https://github.com/rwightman/pytorch-image-models
  """

  def __init__(self,
               probability: float = 0.25,
               min_area: float = 0.02,
               max_area: float = 1 / 3,
               min_aspect: float = 0.3,
               max_aspect=None,
               min_count=1,
               max_count=1,
               trials=10):
    """Applies RandomErasing to a single image.

    Args:
      probability (float, optional): Probability of augmenting the image.
        Defaults to 0.25.
      min_area (float, optional): Minimum area of the random erasing rectangle.
        Defaults to 0.02.
      max_area (float, optional): Maximum area of the random erasing rectangle.
        Defaults to 1/3.
      min_aspect (float, optional): Minimum aspect rate of the random erasing
        rectangle. Defaults to 0.3.
      max_aspect ([type], optional): Maximum aspect rate of the random erasing
        rectangle. Defaults to None.
      min_count (int, optional): Minimum number of erased rectangles. Defaults
        to 1.
      max_count (int, optional): Maximum number of erased rectangles. Defaults
        to 1.
      trials (int, optional): Maximum number of trials to randomly sample a
        rectangle that fulfills constraint. Defaults to 10.
    """
    self._probability = probability
    self._min_area = float(min_area)
    self._max_area = float(max_area)
    self._min_log_aspect = math.log(min_aspect)
    self._max_log_aspect = math.log(max_aspect or 1 / min_aspect)
    self._min_count = min_count
    self._max_count = max_count
    self._trials = trials

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """Applies RandomErasing to single `image`.

    Args:
      image (tf.Tensor): Of shape [height, width, 3] representing an image.

    Returns:
      tf.Tensor: The augmented version of `image`.
    """
    uniform_random = tf.random.uniform(shape=[], minval=0., maxval=1.0)
    mirror_cond = tf.less(uniform_random, self._probability)
    image = tf.cond(mirror_cond, lambda: self._erase(image), lambda: image)
    return image
@
tf
.
function
def
_erase
(
self
,
image
:
tf
.
Tensor
)
->
tf
.
Tensor
:
"""Erase an area."""
if
self
.
_min_count
==
self
.
_max_count
:
count
=
self
.
_min_count
else
:
count
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
int
(
self
.
_min_count
),
maxval
=
int
(
self
.
_max_count
-
self
.
_min_count
+
1
),
dtype
=
tf
.
int32
)
image_height
=
tf
.
shape
(
image
)[
0
]
image_width
=
tf
.
shape
(
image
)[
1
]
area
=
tf
.
cast
(
image_width
*
image_height
,
tf
.
float32
)
for
_
in
range
(
count
):
# Work around since break is not supported in tf.function
is_trial_successfull
=
False
for
_
in
range
(
self
.
_trials
):
if
not
is_trial_successfull
:
erase_area
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
area
*
self
.
_min_area
,
maxval
=
area
*
self
.
_max_area
)
aspect_ratio
=
tf
.
math
.
exp
(
tf
.
random
.
uniform
(
shape
=
[],
minval
=
self
.
_min_log_aspect
,
maxval
=
self
.
_max_log_aspect
))
half_height
=
tf
.
cast
(
tf
.
math
.
round
(
tf
.
math
.
sqrt
(
erase_area
*
aspect_ratio
)
/
2
),
dtype
=
tf
.
int32
)
half_width
=
tf
.
cast
(
tf
.
math
.
round
(
tf
.
math
.
sqrt
(
erase_area
/
aspect_ratio
)
/
2
),
dtype
=
tf
.
int32
)
if
2
*
half_height
<
image_height
and
2
*
half_width
<
image_width
:
center_height
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
0
,
maxval
=
int
(
image_height
-
2
*
half_height
),
dtype
=
tf
.
int32
)
center_width
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
0
,
maxval
=
int
(
image_width
-
2
*
half_width
),
dtype
=
tf
.
int32
)
image
=
_fill_rectangle
(
image
,
center_width
,
center_height
,
half_width
,
half_height
,
replace
=
None
)
is_trial_successfull
=
True
return
image
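
# A minimal usage sketch for RandomErasing (illustrative values only):
#
#   eraser = RandomErasing(probability=1.0, max_count=10)
#   image = tf.zeros((224, 224, 3), dtype=tf.float32)
#   erased = eraser.distort(image)  # Same shape; up to 10 rectangles filled
#                                   # with random noise via _fill_rectangle.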

class MixupAndCutmix:
  """Applies Mixup and/or Cutmix to a batch of images.

  - Mixup: https://arxiv.org/abs/1710.09412
  - Cutmix: https://arxiv.org/abs/1905.04899

  Implementation is inspired by
  https://github.com/rwightman/pytorch-image-models.
  """

  def __init__(self,
               mixup_alpha: float = .8,
               cutmix_alpha: float = 1.,
               prob: float = 1.0,
               switch_prob: float = 0.5,
               label_smoothing: float = 0.1,
               num_classes: int = 1001):
    """Applies Mixup and/or Cutmix to a batch of images.

    Args:
      mixup_alpha (float, optional): For drawing a random lambda (`lam`) from a
        beta distribution (for each image). If zero Mixup is deactivated.
        Defaults to .8.
      cutmix_alpha (float, optional): For drawing a random lambda (`lam`) from
        a beta distribution (for each image). If zero Cutmix is deactivated.
        Defaults to 1..
      prob (float, optional): Probability of augmenting the batch. Defaults to
        1.0.
      switch_prob (float, optional): Probability of applying Cutmix for the
        batch. Defaults to 0.5.
      label_smoothing (float, optional): Constant for label smoothing. Defaults
        to 0.1.
      num_classes (int, optional): Number of classes. Defaults to 1001.
    """
    self.mixup_alpha = mixup_alpha
    self.cutmix_alpha = cutmix_alpha
    self.mix_prob = prob
    self.switch_prob = switch_prob
    self.label_smoothing = label_smoothing
    self.num_classes = num_classes
    self.mode = 'batch'
    self.mixup_enabled = True

    if self.mixup_alpha and not self.cutmix_alpha:
      self.switch_prob = -1
    elif not self.mixup_alpha and self.cutmix_alpha:
      self.switch_prob = 1

  def __call__(self, images: tf.Tensor,
               labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    return self.distort(images, labels)

  def distort(self, images: tf.Tensor,
              labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Applies Mixup and/or Cutmix to a batch of images and transforms labels.

    Args:
      images (tf.Tensor): Of shape [batch_size, height, width, 3] representing
        a batch of images.
      labels (tf.Tensor): Of shape [batch_size, ] representing the class id for
        each image of the batch.

    Returns:
      Tuple[tf.Tensor, tf.Tensor]: The augmented version of `images` and
        `labels`.
    """
    augment_cond = tf.less(
        tf.random.uniform(shape=[], minval=0., maxval=1.0), self.mix_prob)
    # pylint: disable=g-long-lambda
    augment_a = lambda: self._update_labels(*tf.cond(
        tf.less(
            tf.random.uniform(shape=[], minval=0., maxval=1.0),
            self.switch_prob),
        lambda: self._cutmix(images, labels),
        lambda: self._mixup(images, labels)))
    augment_b = lambda: (images, self._smooth_labels(labels))
    # pylint: enable=g-long-lambda

    return tf.cond(augment_cond, augment_a, augment_b)

  @staticmethod
  def _sample_from_beta(alpha, beta, shape):
    # Draws Beta(alpha, beta) samples via the gamma-ratio identity:
    # X ~ Gamma(alpha, 1), Y ~ Gamma(beta, 1) => X / (X + Y) ~ Beta(alpha, beta).
    sample_alpha = tf.random.gamma(shape, alpha)
    sample_beta = tf.random.gamma(shape, beta)
    return sample_alpha / (sample_alpha + sample_beta)
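
  # The gamma-ratio identity above is handy because TensorFlow ships
  # tf.random.gamma but no direct beta sampler. A quick numeric sanity sketch
  # (illustrative only, assuming eager mode; 0.8 mirrors the default
  # `mixup_alpha`):
  #
  #   x = tf.random.gamma([100000], 0.8)
  #   y = tf.random.gamma([100000], 0.8)
  #   lam = x / (x + y)
  #   tf.reduce_mean(lam)  # ~0.5, the Beta(a, b) mean a / (a + b) at a == b.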
  def _cutmix(self, images: tf.Tensor,
              labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Applies cutmix."""
    lam = MixupAndCutmix._sample_from_beta(self.cutmix_alpha,
                                           self.cutmix_alpha, labels.shape)

    ratio = tf.math.sqrt(1 - lam)

    batch_size = tf.shape(images)[0]
    image_height, image_width = tf.shape(images)[1], tf.shape(images)[2]

    cut_height = tf.cast(
        ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32)
    # Cut width scales with image width so non-square inputs cut proportionally.
    cut_width = tf.cast(
        ratio * tf.cast(image_width, dtype=tf.float32), dtype=tf.int32)

    random_center_height = tf.random.uniform(
        shape=[batch_size], minval=0, maxval=image_height, dtype=tf.int32)
    random_center_width = tf.random.uniform(
        shape=[batch_size], minval=0, maxval=image_width, dtype=tf.int32)

    bbox_area = cut_height * cut_width
    lam = 1. - bbox_area / (image_height * image_width)
    lam = tf.cast(lam, dtype=tf.float32)

    images = tf.map_fn(
        lambda x: _fill_rectangle(*x),
        (images, random_center_width, random_center_height, cut_width // 2,
         cut_height // 2, tf.reverse(images, [0])),
        dtype=(tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.float32),
        fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=tf.float32))

    return images, labels, lam

  def _mixup(self, images: tf.Tensor,
             labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha,
                                           labels.shape)
    lam = tf.reshape(lam, [-1, 1, 1, 1])
    images = lam * images + (1. - lam) * tf.reverse(images, [0])

    return images, labels, tf.squeeze(lam)

  def _smooth_labels(self, labels: tf.Tensor) -> tf.Tensor:
    off_value = self.label_smoothing / self.num_classes
    on_value = 1. - self.label_smoothing + off_value

    smooth_labels = tf.one_hot(
        labels, self.num_classes, on_value=on_value, off_value=off_value)
    return smooth_labels

  def _update_labels(self, images: tf.Tensor, labels: tf.Tensor,
                     lam: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    labels_1 = self._smooth_labels(labels)
    labels_2 = tf.reverse(labels_1, [0])

    lam = tf.reshape(lam, [-1, 1])
    labels = lam * labels_1 + (1. - lam) * labels_2

    return images, labels
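
A minimal usage sketch for MixupAndCutmix (batch-level; sparse integer labels in, smoothed dense targets out; shapes and values here are illustrative):

    images = tf.random.normal((8, 224, 224, 3))
    labels = tf.range(8)  # Sparse class ids.
    mixer = MixupAndCutmix(mixup_alpha=.8, cutmix_alpha=1., num_classes=1001)
    aug_images, aug_labels = mixer(images, labels)
    # aug_images: (8, 224, 224, 3); aug_labels: (8, 1001) mixed soft targets.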
official/vision/ops/augment_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for autoaugment."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

from absl.testing import parameterized
import tensorflow as tf

from official.vision.ops import augment


def get_dtype_test_cases():
  return [
      ('uint8', tf.uint8),
      ('int32', tf.int32),
      ('float16', tf.float16),
      ('float32', tf.float32),
  ]


@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
  """Basic tests for fundamental transformations."""

  def test_to_from_4d(self, dtype):
    for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
      original_ndims = len(shape)
      image = tf.zeros(shape, dtype=dtype)
      image_4d = augment.to_4d(image)
      self.assertEqual(4, tf.rank(image_4d))
      self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))

  def test_transform(self, dtype):
    image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
    self.assertAllEqual(
        augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]])

  def test_translate(self, dtype):
    image = tf.constant(
        [[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype)
    translations = [-1, -1]
    translated = augment.translate(image=image, translations=translations)
    expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]]
    self.assertAllEqual(translated, expected)

  def test_translate_shapes(self, dtype):
    translation = [0, 0]
    for shape in [(3, 3), (5, 5), (224, 224, 3)]:
      image = tf.zeros(shape, dtype=dtype)
      self.assertAllEqual(image, augment.translate(image, translation))

  def test_translate_invalid_translation(self, dtype):
    image = tf.zeros((1, 1), dtype=dtype)
    invalid_translation = [[[1, 1]]]
    with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
      _ = augment.translate(image, invalid_translation)

  def test_rotate(self, dtype):
    image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
    rotation = 90.
    transformed = augment.rotate(image=image, degrees=rotation)
    expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]]
    self.assertAllEqual(transformed, expected)

  def test_rotate_shapes(self, dtype):
    degrees = 0.
    for shape in [(3, 3), (5, 5), (224, 224, 3)]:
      image = tf.zeros(shape, dtype=dtype)
      self.assertAllEqual(image, augment.rotate(image, degrees))


class AutoaugmentTest(tf.test.TestCase, parameterized.TestCase):

  AVAILABLE_POLICIES = [
      'v0',
      'test',
      'simple',
      'reduced_cifar10',
      'svhn',
      'reduced_imagenet',
      'detection_v0',
  ]

  def test_autoaugment(self):
    """Smoke test to be sure there are no syntax errors."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image = augmenter.distort(image)
      self.assertEqual((224, 224, 3), aug_image.shape)

  def test_autoaugment_with_bboxes(self):
    """Smoke test to be sure there are no syntax errors with bboxes."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
      self.assertEqual((224, 224, 3), aug_image.shape)
      self.assertEqual((2, 4), aug_bboxes.shape)

  def test_randaug(self):
    """Smoke test to be sure there are no syntax errors."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    augmenter = augment.RandAugment()
    aug_image = augmenter.distort(image)
    self.assertEqual((224, 224, 3), aug_image.shape)

  def test_randaug_with_bboxes(self):
    """Smoke test to be sure there are no syntax errors with bboxes."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    augmenter = augment.RandAugment()
    aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
    self.assertEqual((224, 224, 3), aug_image.shape)
    self.assertEqual((2, 4), aug_bboxes.shape)

  def test_randaug_build_for_detection(self):
    """Smoke test to be sure there are no syntax errors built for detection."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    augmenter = augment.RandAugment.build_for_detection()
    self.assertCountEqual(augmenter.available_ops, [
        'AutoContrast', 'Equalize', 'Invert', 'Posterize', 'Solarize', 'Color',
        'Contrast', 'Brightness', 'Sharpness', 'Cutout', 'SolarizeAdd',
        'Rotate_BBox', 'ShearX_BBox', 'ShearY_BBox', 'TranslateX_BBox',
        'TranslateY_BBox'
    ])
    aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
    self.assertEqual((224, 224, 3), aug_image.shape)
    self.assertEqual((2, 4), aug_bboxes.shape)

  def test_all_policy_ops(self):
    """Smoke test to be sure all augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((224, 224, 3), dtype=tf.uint8)
    bboxes = None
    for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      image, bboxes = func(image, bboxes, *args)

    self.assertEqual((224, 224, 3), image.shape)
    self.assertIsNone(bboxes)

  def test_all_policy_ops_with_bboxes(self):
    """Smoke test to be sure all augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    for op_name in augment.NAME_TO_FUNC:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      image, bboxes = func(image, bboxes, *args)

    self.assertEqual((224, 224, 3), image.shape)
    self.assertEqual((2, 4), bboxes.shape)

  def test_autoaugment_video(self):
    """Smoke test with video to be sure there are no syntax errors."""
    image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image = augmenter.distort(image)
      self.assertEqual((2, 224, 224, 3), aug_image.shape)

  def test_autoaugment_video_with_boxes(self):
    """Smoke test with video to be sure there are no syntax errors."""
    image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 2, 4), dtype=tf.float32)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
      self.assertEqual((2, 224, 224, 3), aug_image.shape)
      self.assertEqual((2, 2, 4), aug_bboxes.shape)

  def test_randaug_video(self):
    """Smoke test with video to be sure there are no syntax errors."""
    image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
    augmenter = augment.RandAugment()
    aug_image = augmenter.distort(image)
    self.assertEqual((2, 224, 224, 3), aug_image.shape)

  def test_all_policy_ops_video(self):
    """Smoke test to be sure all video augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((2, 224, 224, 3), dtype=tf.uint8)
    bboxes = None
    for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      image, bboxes = func(image, bboxes, *args)

    self.assertEqual((2, 224, 224, 3), image.shape)
    self.assertIsNone(bboxes)

  def test_all_policy_ops_video_with_bboxes(self):
    """Smoke test to be sure all video augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((2, 224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 2, 4), dtype=tf.float32)
    for op_name in augment.NAME_TO_FUNC:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      if op_name in {
          'Rotate_BBox',
          'ShearX_BBox',
          'ShearY_BBox',
          'TranslateX_BBox',
          'TranslateY_BBox',
          'TranslateY_Only_BBoxes',
      }:
        with self.assertRaises(ValueError):
          func(image, bboxes, *args)
      else:
        image, bboxes = func(image, bboxes, *args)

    self.assertEqual((2, 224, 224, 3), image.shape)
    self.assertEqual((2, 2, 4), bboxes.shape)

  def _generate_test_policy(self):
    """Generates a test policy at random."""
    op_list = list(augment.NAME_TO_FUNC.keys())
    size = 6
    prob = [round(random.uniform(0., 1.), 1) for _ in range(size)]
    mag = [round(random.uniform(0, 10)) for _ in range(size)]
    policy = []
    for i in range(0, size, 2):
      policy.append([(op_list[i], prob[i], mag[i]),
                     (op_list[i + 1], prob[i + 1], mag[i + 1])])
    return policy

  def test_custom_policy(self):
    """Tests autoaugment with a custom policy."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    augmenter = augment.AutoAugment(policies=self._generate_test_policy())
    aug_image = augmenter.distort(image)

    self.assertEqual((224, 224, 3), aug_image.shape)

  @parameterized.named_parameters(
      {
          'testcase_name': '_OutOfRangeProb',
          'sub_policy': ('Equalize', 1.1, 3),
          'value': '1.1'
      },
      {
          'testcase_name': '_OutOfRangeMag',
          'sub_policy': ('Equalize', 0.9, 11),
          'value': '11'
      },
  )
  def test_invalid_custom_sub_policy(self, sub_policy, value):
    """Tests autoaugment with out-of-range values in the custom policy."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    policy = self._generate_test_policy()
    policy[0][0] = sub_policy

    augmenter = augment.AutoAugment(policies=policy)
    with self.assertRaisesRegex(
        tf.errors.InvalidArgumentError,
        r'Expected \'tf.Tensor\(False, shape=\(\), dtype=bool\)\' to be true. '
        r'Summarized data: ({})'.format(value)):
      augmenter.distort(image)

  def test_invalid_custom_policy_ndim(self):
    """Tests autoaugment with wrong dimension in the custom policy."""
    policy = [[('Equalize', 0.8, 1), ('Shear', 0.8, 4)],
              [('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]]
    policy = [[policy]]

    with self.assertRaisesRegex(
        ValueError, r'Expected \(:, :, 3\) but got \(1, 1, 2, 2, 3\).'):
      augment.AutoAugment(policies=policy)

  def test_invalid_custom_policy_shape(self):
    """Tests autoaugment with wrong shape in the custom policy."""
    policy = [[('Equalize', 0.8, 1, 1), ('Shear', 0.8, 4, 1)],
              [('TranslateY', 0.6, 3, 1), ('Rotate', 0.9, 3, 1)]]

    with self.assertRaisesRegex(
        ValueError, r'Expected \(:, :, 3\) but got \(2, 2, 4\)'):
      augment.AutoAugment(policies=policy)

  def test_invalid_custom_policy_key(self):
    """Tests autoaugment with an invalid key in the custom policy."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    policy = [[('AAAAA', 0.8, 1), ('Shear', 0.8, 4)],
              [('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]]
    augmenter = augment.AutoAugment(policies=policy)

    with self.assertRaisesRegex(KeyError, '\'AAAAA\''):
      augmenter.distort(image)


class RandomErasingTest(tf.test.TestCase, parameterized.TestCase):

  def test_random_erase_replaces_some_pixels(self):
    image = tf.zeros((224, 224, 3), dtype=tf.float32)
    augmenter = augment.RandomErasing(probability=1., max_count=10)

    aug_image = augmenter.distort(image)

    self.assertEqual((224, 224, 3), aug_image.shape)
    self.assertNotEqual(0, tf.reduce_max(aug_image))


class MixupAndCutmixTest(tf.test.TestCase, parameterized.TestCase):

  def test_mixup_and_cutmix_smoothes_labels(self):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        num_classes=num_classes, label_smoothing=label_smoothing)

    aug_images, aug_labels = augmenter.distort(images, labels)

    self.assertEqual(images.shape, aug_images.shape)
    self.assertEqual(images.dtype, aug_images.dtype)
    self.assertEqual([batch_size, num_classes], aug_labels.shape)
    self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
                            2. / num_classes)  # With tolerance
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e-4)  # With tolerance

  def test_mixup_changes_image(self):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        mixup_alpha=1., cutmix_alpha=0., num_classes=num_classes)

    aug_images, aug_labels = augmenter.distort(images, labels)

    self.assertEqual(images.shape, aug_images.shape)
    self.assertEqual(images.dtype, aug_images.dtype)
    self.assertEqual([batch_size, num_classes], aug_labels.shape)
    self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
                            2. / num_classes)  # With tolerance
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e-4)  # With tolerance
    self.assertFalse(tf.math.reduce_all(images == aug_images))

  def test_cutmix_changes_image(self):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        mixup_alpha=0., cutmix_alpha=1., num_classes=num_classes)

    aug_images, aug_labels = augmenter.distort(images, labels)

    self.assertEqual(images.shape, aug_images.shape)
    self.assertEqual(images.dtype, aug_images.dtype)
    self.assertEqual([batch_size, num_classes], aug_labels.shape)
    self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
                            2. / num_classes)  # With tolerance
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e-4)  # With tolerance
    self.assertFalse(tf.math.reduce_all(images == aug_images))


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/box_matcher.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Box matcher implementation."""

import tensorflow as tf


class BoxMatcher:
  """Matcher based on highest value.

  This class computes matches from a similarity matrix. Each column is matched
  to a single row.

  To support object detection target assignment this class enables setting both
  positive_threshold (upper threshold) and negative_threshold (lower threshold)
  defining three categories of similarity which define whether examples are
  positive, negative, or ignored, for example:

  (1) thresholds=[negative_threshold, positive_threshold], and
      indicators=[negative_value, ignore_value, positive_value]: The similarity
      metrics below negative_threshold will be assigned negative_value, the
      metrics between negative_threshold and positive_threshold will be
      assigned ignore_value, and the metrics above positive_threshold will be
      assigned positive_value.
  (2) thresholds=[negative_threshold, positive_threshold], and
      indicators=[ignore_value, negative_value, positive_value]: The similarity
      metrics below negative_threshold will be assigned ignore_value, the
      metrics between negative_threshold and positive_threshold will be
      assigned negative_value, and the metrics above positive_threshold will be
      assigned positive_value.
  """

  def __init__(self, thresholds, indicators, force_match_for_each_col=False):
    """Constructs a BoxMatcher.

    Args:
      thresholds: A list of thresholds to classify boxes into different
        buckets. The list needs to be sorted, and will be prepended with -Inf
        and appended with +Inf.
      indicators: A list of values to assign for each bucket. len(`indicators`)
        must equal len(`thresholds`) + 1.
      force_match_for_each_col: If True, ensures that each column is matched to
        at least one row (which is not guaranteed otherwise if the
        positive_threshold is high). Defaults to False. If True, all force
        matched rows will be assigned `indicators[-1]`.

    Raises:
      ValueError: If `thresholds` is not sorted,
        or len(indicators) != len(thresholds) + 1.
    """
    if not all([lo <= hi for (lo, hi) in zip(thresholds[:-1], thresholds[1:])]):
      raise ValueError('`threshold` must be sorted, got {}'.format(thresholds))
    self.indicators = indicators
    if len(indicators) != len(thresholds) + 1:
      raise ValueError('len(`indicators`) must be len(`thresholds`) + 1, got '
                       'indicators {}, thresholds {}'.format(
                           indicators, thresholds))
    thresholds = thresholds[:]
    thresholds.insert(0, -float('inf'))
    thresholds.append(float('inf'))
    self.thresholds = thresholds
    self._force_match_for_each_col = force_match_for_each_col

  def __call__(self, similarity_matrix):
    """Tries to match each column of the similarity matrix to a row.

    Args:
      similarity_matrix: A float tensor of shape [N, M] representing any
        similarity metric.

    Returns:
      An integer tensor of shape [N] with corresponding match indices for each
      of M columns. For a positive match, the match result will be the
      corresponding row index; for a negative match, the match will be
      `negative_value`; for an ignored match, the match result will be
      `ignore_value`.
    """
    squeeze_result = False
    if len(similarity_matrix.shape) == 2:
      squeeze_result = True
      similarity_matrix = tf.expand_dims(similarity_matrix, axis=0)

    static_shape = similarity_matrix.shape.as_list()
    num_rows = static_shape[1] or tf.shape(similarity_matrix)[1]
    batch_size = static_shape[0] or tf.shape(similarity_matrix)[0]

    def _match_when_rows_are_empty():
      """Performs matching when the rows of the similarity matrix are empty.

      When the rows are empty, all detections are false positives. So we return
      a tensor of -1's to indicate that the columns do not match to any rows.

      Returns:
        matches: int32 tensor indicating the row each column matches to.
      """
      with tf.name_scope('empty_gt_boxes'):
        matches = tf.zeros([batch_size, num_rows], dtype=tf.int32)
        match_labels = -tf.ones([batch_size, num_rows], dtype=tf.int32)
        return matches, match_labels

    def _match_when_rows_are_non_empty():
      """Performs matching when the rows of the similarity matrix are non-empty.

      Returns:
        matches: int32 tensor indicating the row each column matches to.
      """
      # Matches for each column.
      with tf.name_scope('non_empty_gt_boxes'):
        matches = tf.argmax(similarity_matrix, axis=-1, output_type=tf.int32)

        # Get logical indices of ignored and unmatched columns as tf.int64.
        matched_vals = tf.reduce_max(similarity_matrix, axis=-1)
        matched_indicators = tf.zeros([batch_size, num_rows], tf.int32)

        match_dtype = matched_vals.dtype
        for (ind, low, high) in zip(self.indicators, self.thresholds[:-1],
                                    self.thresholds[1:]):
          low_threshold = tf.cast(low, match_dtype)
          high_threshold = tf.cast(high, match_dtype)
          mask = tf.logical_and(
              tf.greater_equal(matched_vals, low_threshold),
              tf.less(matched_vals, high_threshold))
          matched_indicators = self._set_values_using_indicator(
              matched_indicators, mask, ind)

        if self._force_match_for_each_col:
          # [batch_size, M], for each col (groundtruth_box), find the best
          # matching row (anchor).
          force_match_column_ids = tf.argmax(
              input=similarity_matrix, axis=1, output_type=tf.int32)
          # [batch_size, M, N]
          force_match_column_indicators = tf.one_hot(
              force_match_column_ids, depth=num_rows)
          # [batch_size, N], for each row (anchor), find the largest column
          # index for groundtruth box.
          force_match_row_ids = tf.argmax(
              input=force_match_column_indicators, axis=1,
              output_type=tf.int32)
          # [batch_size, N]
          force_match_column_mask = tf.cast(
              tf.reduce_max(force_match_column_indicators, axis=1), tf.bool)
          # [batch_size, N]
          final_matches = tf.where(force_match_column_mask,
                                   force_match_row_ids, matches)
          final_matched_indicators = tf.where(
              force_match_column_mask,
              self.indicators[-1] *
              tf.ones([batch_size, num_rows], dtype=tf.int32),
              matched_indicators)
          return final_matches, final_matched_indicators
        else:
          return matches, matched_indicators

    num_gt_boxes = similarity_matrix.shape.as_list()[-1] or tf.shape(
        similarity_matrix)[-1]
    result_match, result_matched_indicators = tf.cond(
        pred=tf.greater(num_gt_boxes, 0),
        true_fn=_match_when_rows_are_non_empty,
        false_fn=_match_when_rows_are_empty)

    if squeeze_result:
      result_match = tf.squeeze(result_match, axis=0)
      result_matched_indicators = tf.squeeze(result_matched_indicators, axis=0)
    return result_match, result_matched_indicators

  def _set_values_using_indicator(self, x, indicator, val):
    """Sets the indicated fields of x to val.

    Args:
      x: tensor.
      indicator: boolean with same shape as x.
      val: scalar with value to set.

    Returns:
      modified tensor.
    """
    indicator = tf.cast(indicator, x.dtype)
    return tf.add(tf.multiply(x, 1 - indicator), val * indicator)
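
A usage sketch for BoxMatcher, using the same bucket convention as the tests in the next file (thresholds carve the similarity range into four buckets; indicators label them, with the last bucket as foreground):

    iou = tf.constant([[0.04, 0., 0., 0.],
                       [0., 0., 1., 0.]], dtype=tf.float32)
    matcher = BoxMatcher(thresholds=[0.0, 0.2, 0.5], indicators=[-3, -2, -1, 1])
    match_indices, match_indicators = matcher(iou)
    # match_indices    -> [0, 2]: argmax column per row.
    # match_indicators -> [-2, 1]: background for row 0, foreground for row 1.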
official/vision/ops/box_matcher_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for box_matcher.py."""

import tensorflow as tf

from official.vision.ops import box_matcher


class BoxMatcherTest(tf.test.TestCase):

  def test_box_matcher_unbatched(self):
    sim_matrix = tf.constant(
        [[0.04, 0, 0, 0], [0, 0, 1., 0]], dtype=tf.float32)

    fg_threshold = 0.5
    bg_thresh_hi = 0.2
    bg_thresh_lo = 0.0

    matcher = box_matcher.BoxMatcher(
        thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold],
        indicators=[-3, -2, -1, 1])

    match_indices, match_indicators = matcher(sim_matrix)
    positive_matches = tf.greater_equal(match_indicators, 0)
    negative_matches = tf.equal(match_indicators, -2)

    self.assertAllEqual(positive_matches.numpy(), [False, True])
    self.assertAllEqual(negative_matches.numpy(), [True, False])
    self.assertAllEqual(match_indices.numpy(), [0, 2])
    self.assertAllEqual(match_indicators.numpy(), [-2, 1])

  def test_box_matcher_batched(self):
    sim_matrix = tf.constant(
        [[[0.04, 0, 0, 0], [0, 0, 1., 0]]], dtype=tf.float32)

    fg_threshold = 0.5
    bg_thresh_hi = 0.2
    bg_thresh_lo = 0.0

    matcher = box_matcher.BoxMatcher(
        thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold],
        indicators=[-3, -2, -1, 1])

    match_indices, match_indicators = matcher(sim_matrix)
    positive_matches = tf.greater_equal(match_indicators, 0)
    negative_matches = tf.equal(match_indicators, -2)

    self.assertAllEqual(positive_matches.numpy(), [[False, True]])
    self.assertAllEqual(negative_matches.numpy(), [[True, False]])
    self.assertAllEqual(match_indices.numpy(), [[0, 2]])
    self.assertAllEqual(match_indicators.numpy(), [[-2, 1]])


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/box_ops.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Box related ops."""

# Import libraries
import numpy as np
import tensorflow as tf

EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)


def yxyx_to_xywh(boxes):
  """Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.

  Args:
    boxes: a numpy array whose last dimension is 4 representing the coordinates
      of boxes in ymin, xmin, ymax, xmax order.

  Returns:
    boxes: a numpy array whose shape is the same as `boxes` in new format.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  boxes_ymin = boxes[..., 0]
  boxes_xmin = boxes[..., 1]
  boxes_width = boxes[..., 3] - boxes[..., 1]
  boxes_height = boxes[..., 2] - boxes[..., 0]
  new_boxes = np.stack(
      [boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=-1)

  return new_boxes


def yxyx_to_cycxhw(boxes):
  """Converts box corner coordinates to center plus height and width terms.

  Args:
    boxes: a `Tensor` with last dimension of 4, representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.

  Returns:
    boxes: a `Tensor` with the same shape as the input boxes, in the format of
      cy, cx, height, width.

  Raises:
    ValueError: if the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('Last dimension of boxes must be 4 but is {:d}'.format(
        boxes.shape[-1]))

  boxes_ycenter = (boxes[..., 0] + boxes[..., 2]) / 2
  boxes_xcenter = (boxes[..., 1] + boxes[..., 3]) / 2
  boxes_height = boxes[..., 2] - boxes[..., 0]
  boxes_width = boxes[..., 3] - boxes[..., 1]
  new_boxes = tf.stack(
      [boxes_ycenter, boxes_xcenter, boxes_height, boxes_width], axis=-1)
  return new_boxes


def cycxhw_to_yxyx(boxes):
  """Converts box center coordinates plus height and width terms to corner.

  Args:
    boxes: a numpy array whose last dimension is 4 representing the coordinates
      of boxes in cy, cx, height, width order.

  Returns:
    boxes: a numpy array whose shape is the same as `boxes` in new format.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  boxes_ymin = boxes[..., 0] - boxes[..., 2] / 2
  boxes_xmin = boxes[..., 1] - boxes[..., 3] / 2
  boxes_ymax = boxes[..., 0] + boxes[..., 2] / 2
  boxes_xmax = boxes[..., 1] + boxes[..., 3] / 2
  new_boxes = tf.stack([boxes_ymin, boxes_xmin, boxes_ymax, boxes_xmax],
                       axis=-1)
  return new_boxes
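
# The two converters above invert each other; a quick round-trip sketch
# (illustrative only):
#
#   boxes = tf.constant([[10., 20., 50., 80.]])   # ymin, xmin, ymax, xmax
#   center = yxyx_to_cycxhw(boxes)                # [[30., 50., 40., 60.]]
#   corners = cycxhw_to_yxyx(center)              # recovers the input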

def jitter_boxes(boxes, noise_scale=0.025):
  """Jitters the box coordinates by some noise distribution.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    noise_scale: a python float which specifies the magnitude of noise. The
      rule of thumb is to set this between (0, 0.1]. The default value is found
      to mimic the noisy detections best empirically.

  Returns:
    jittered_boxes: a tensor whose shape is the same as `boxes` representing
      the jittered boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('jitter_boxes'):
    bbox_jitters = tf.random.normal(tf.shape(boxes), stddev=noise_scale)
    ymin = boxes[..., 0:1]
    xmin = boxes[..., 1:2]
    ymax = boxes[..., 2:3]
    xmax = boxes[..., 3:4]
    width = xmax - xmin
    height = ymax - ymin
    new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
    new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
    new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
    new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
    jittered_boxes = tf.concat([
        new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
        new_center_y + new_height * 0.5, new_center_x + new_width * 0.5
    ], axis=-1)

    return jittered_boxes


def normalize_boxes(boxes, image_shape):
  """Converts boxes to the normalized coordinates.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].

  Returns:
    normalized_boxes: a tensor whose shape is the same as `boxes` representing
      the normalized boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('normalize_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height = image_shape[..., 0:1]
      width = image_shape[..., 1:2]

    ymin = boxes[..., 0:1] / height
    xmin = boxes[..., 1:2] / width
    ymax = boxes[..., 2:3] / height
    xmax = boxes[..., 3:4] / width

    normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
    return normalized_boxes


def denormalize_boxes(boxes, image_shape):
  """Converts boxes normalized by [height, width] to pixel coordinates.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].

  Returns:
    denormalized_boxes: a tensor whose shape is the same as `boxes`
      representing the denormalized boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  with tf.name_scope('denormalize_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height, width = tf.split(image_shape, 2, axis=-1)

    ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
    ymin = ymin * height
    xmin = xmin * width
    ymax = ymax * height
    xmax = xmax * width

    denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
    return denormalized_boxes


def clip_boxes(boxes, image_shape):
  """Clips boxes to image boundaries.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].

  Returns:
    clipped_boxes: a tensor whose shape is the same as `boxes` representing the
      clipped boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('clip_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
      max_length = [height, width, height, width]
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height, width = tf.unstack(image_shape, axis=-1)
      max_length = tf.stack([height, width, height, width], axis=-1)

    clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
    return clipped_boxes


def compute_outer_boxes(boxes, image_shape, scale=1.0):
  """Computes the outer box that encloses an object with a margin.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].
    scale: a float number specifying the scale of output outer boxes to input
      `boxes`.

  Returns:
    outer_boxes: a tensor whose shape is the same as `boxes` representing the
      outer boxes.
  """
  if scale < 1.0:
    raise ValueError(
        'scale is {}, but outer box scale must be greater than 1.0.'.format(
            scale))
  centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
  centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
  box_height = (boxes[..., 2] - boxes[..., 0]) * scale
  box_width = (boxes[..., 3] - boxes[..., 1]) * scale
  outer_boxes = tf.stack(
      [centers_y - box_height / 2.0, centers_x - box_width / 2.0,
       centers_y + box_height / 2.0, centers_x + box_width / 2.0],
      axis=1)
  outer_boxes = clip_boxes(outer_boxes, image_shape)
  return outer_boxes


def encode_boxes(boxes, anchors, weights=None):
  """Encodes boxes to targets.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    anchors: a tensor whose shape is the same as, or `broadcastable` to
      `boxes`, representing the coordinates of anchors in ymin, xmin, ymax,
      xmax order.
    weights: None or a list of four float numbers used to scale coordinates.

  Returns:
    encoded_boxes: a tensor whose shape is the same as `boxes` representing the
      encoded box targets.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('encode_boxes'):
    boxes = tf.cast(boxes, dtype=anchors.dtype)
    ymin = boxes[..., 0:1]
    xmin = boxes[..., 1:2]
    ymax = boxes[..., 2:3]
    xmax = boxes[..., 3:4]
    box_h = ymax - ymin
    box_w = xmax - xmin
    box_yc = ymin + 0.5 * box_h
    box_xc = xmin + 0.5 * box_w

    anchor_ymin = anchors[..., 0:1]
    anchor_xmin = anchors[..., 1:2]
    anchor_ymax = anchors[..., 2:3]
    anchor_xmax = anchors[..., 3:4]
    anchor_h = anchor_ymax - anchor_ymin
    anchor_w = anchor_xmax - anchor_xmin
    anchor_yc = anchor_ymin + 0.5 * anchor_h
    anchor_xc = anchor_xmin + 0.5 * anchor_w

    encoded_dy = (box_yc - anchor_yc) / anchor_h
    encoded_dx = (box_xc - anchor_xc) / anchor_w
    encoded_dh = tf.math.log(box_h / anchor_h)
    encoded_dw = tf.math.log(box_w / anchor_w)
    if weights:
      encoded_dy *= weights[0]
      encoded_dx *= weights[1]
      encoded_dh *= weights[2]
      encoded_dw *= weights[3]

    encoded_boxes = tf.concat(
        [encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1)
    return encoded_boxes


def decode_boxes(encoded_boxes, anchors, weights=None):
  """Decodes boxes.

  Args:
    encoded_boxes: a tensor whose last dimension is 4 representing the
      coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
    anchors: a tensor whose shape is the same as, or `broadcastable` to
      `boxes`, representing the coordinates of anchors in ymin, xmin, ymax,
      xmax order.
    weights: None or a list of four float numbers used to scale coordinates.

  Returns:
    decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
      representing the decoded box targets.
  """
  if encoded_boxes.shape[-1] != 4:
    raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
        encoded_boxes.shape[-1]))

  with tf.name_scope('decode_boxes'):
    encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
    dy = encoded_boxes[..., 0:1]
    dx = encoded_boxes[..., 1:2]
    dh = encoded_boxes[..., 2:3]
    dw = encoded_boxes[..., 3:4]
    if weights:
      dy /= weights[0]
      dx /= weights[1]
      dh /= weights[2]
      dw /= weights[3]
    dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
    dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)

    anchor_ymin = anchors[..., 0:1]
    anchor_xmin = anchors[..., 1:2]
    anchor_ymax = anchors[..., 2:3]
    anchor_xmax = anchors[..., 3:4]
    anchor_h = anchor_ymax - anchor_ymin
    anchor_w = anchor_xmax - anchor_xmin
    anchor_yc = anchor_ymin + 0.5 * anchor_h
    anchor_xc = anchor_xmin + 0.5 * anchor_w

    decoded_boxes_yc = dy * anchor_h + anchor_yc
    decoded_boxes_xc = dx * anchor_w + anchor_xc
    decoded_boxes_h = tf.math.exp(dh) * anchor_h
    decoded_boxes_w = tf.math.exp(dw) * anchor_w

    decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
    decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
    decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h
    decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w

    decoded_boxes = tf.concat(
        [decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax,
         decoded_boxes_xmax],
        axis=-1)
    return decoded_boxes
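
# encode_boxes/decode_boxes implement the standard Faster R-CNN box
# parameterization dy = (yc - ya) / ha, dx = (xc - xa) / wa,
# dh = log(h / ha), dw = log(w / wa), so decoding an encoding against the same
# anchor recovers the box (up to the BBOX_XFORM_CLIP cap on dh/dw). A
# round-trip sketch (illustrative only):
#
#   anchors = tf.constant([[0., 0., 10., 10.]])
#   boxes = tf.constant([[1., 2., 9., 8.]])
#   targets = encode_boxes(boxes, anchors)      # [[0., 0., log(.8), log(.6)]]
#   recovered = decode_boxes(targets, anchors)  # ~[[1., 2., 9., 8.]]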

def filter_boxes(boxes, scores, image_shape, min_size_threshold):
  """Filters and removes boxes that are too small or fall outside the image.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
      representing the original scores of the boxes.
    image_shape: a tensor whose shape is the same as, or `broadcastable` to
      `boxes` except the last dimension, which is 2, representing [height,
      width] of the scaled image.
    min_size_threshold: a float representing the minimal box size in each side
      (w.r.t. the scaled image). Boxes whose sides are smaller than it will be
      filtered out.

  Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with
      the positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with
      the positions of the filtered boxes filled with 0.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('filter_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height = image_shape[..., 0]
      width = image_shape[..., 1]

    ymin = boxes[..., 0]
    xmin = boxes[..., 1]
    ymax = boxes[..., 2]
    xmax = boxes[..., 3]

    h = ymax - ymin
    w = xmax - xmin
    yc = ymin + 0.5 * h
    xc = xmin + 0.5 * w

    min_size = tf.cast(
        tf.math.maximum(min_size_threshold, 0.0), dtype=boxes.dtype)

    filtered_size_mask = tf.math.logical_and(
        tf.math.greater(h, min_size), tf.math.greater(w, min_size))
    filtered_center_mask = tf.logical_and(
        tf.math.logical_and(
            tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
        tf.math.logical_and(
            tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
    filtered_mask = tf.math.logical_and(filtered_size_mask,
                                        filtered_center_mask)

    filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
    filtered_boxes = tf.cast(
        tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
    return filtered_boxes, filtered_scores


def filter_boxes_by_scores(boxes, scores, min_score_threshold):
  """Filters and removes boxes whose scores are smaller than the threshold.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
      representing the original scores of the boxes.
    min_score_threshold: a float representing the minimal box score threshold.
      Boxes whose score are smaller than it will be filtered out.

  Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with
      the positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with
      the positions of the filtered boxes filled with -1.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('filter_boxes_by_scores'):
    filtered_mask = tf.math.greater(scores, min_score_threshold)
    filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
    filtered_boxes = tf.cast(
        tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes

    return filtered_boxes, filtered_scores


def gather_instances(selected_indices, instances, *aux_instances):
  """Gathers instances by indices.

  Args:
    selected_indices: a Tensor of shape [batch, K] which indicates the selected
      indices in instance dimension (2nd dimension).
    instances: a Tensor of shape [batch, N, ...] where the 2nd dimension is
      the instance dimension to be selected from.
    *aux_instances: the additional Tensors whose shapes are in [batch, N, ...]
      which are the tensors to be selected from using the `selected_indices`.

  Returns:
    selected_instances: the tensor of shape [batch, K, ...] which corresponds
      to the selected instances of the `instances` tensor.
    selected_aux_instances: the additional tensors of shape [batch, K, ...]
      which correspond to the selected instances of the `aux_instances`
      tensors.
  """
  batch_size = instances.shape[0]
  if batch_size == 1:
    selected_instances = tf.squeeze(
        tf.gather(instances, selected_indices, axis=1), axis=1)
    if aux_instances:
      selected_aux_instances = [
          tf.squeeze(tf.gather(a, selected_indices, axis=1), axis=1)
          for a in aux_instances
      ]
      return tuple([selected_instances] + selected_aux_instances)
    else:
      return selected_instances
  else:
    indices_shape = tf.shape(selected_indices)
    batch_indices = (
        tf.expand_dims(tf.range(indices_shape[0]), axis=-1) *
        tf.ones([1, indices_shape[-1]], dtype=tf.int32))
    gather_nd_indices = tf.stack(
        [batch_indices, selected_indices], axis=-1)
    selected_instances = tf.gather_nd(instances, gather_nd_indices)
    if aux_instances:
      selected_aux_instances = [
          tf.gather_nd(a, gather_nd_indices) for a in aux_instances
      ]
      return tuple([selected_instances] + selected_aux_instances)
    else:
      return selected_instances


def top_k_boxes(boxes, scores, k):
  """Sorts and selects top k boxes according to the scores.

  Args:
    boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
      the boxes. N is the number of boxes per image.
    scores: a tensor of shape [batch_size, N] representing the score of the
      boxes.
    k: an integer or a tensor indicating the top k number.

  Returns:
    selected_boxes: a tensor of shape [batch_size, k, 4] representing the
      selected top k box coordinates.
    selected_scores: a tensor of shape [batch_size, k] representing the
      selected top k box scores.
  """
  with tf.name_scope('top_k_boxes'):
    selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
    selected_boxes = gather_instances(top_k_indices, boxes)
    return selected_boxes, selected_scores


def get_non_empty_box_indices(boxes):
  """Gets indices for non-empty boxes."""
  # Selects indices where both box height and width are greater than 0.
  height = boxes[:, 2] - boxes[:, 0]
  width = boxes[:, 3] - boxes[:, 1]
  indices = tf.where(
      tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
  return indices[:, 0]


def bbox_overlap(boxes, gt_boxes):
  """Calculates the overlap between proposal and ground truth boxes.

  Some `boxes` or `gt_boxes` may have been padded. The returned `iou` tensor
  for these boxes will be -1.

  Args:
    boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
      last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
    gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
      tensor might have paddings with a negative value.

  Returns:
    iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
  """
  with tf.name_scope('bbox_overlap'):
    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
        value=boxes, num_or_size_splits=4, axis=2)
    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
        value=gt_boxes, num_or_size_splits=4, axis=2)

    # Calculates the intersection area.
    i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
    i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
    i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
    i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
    i_area = (
        tf.math.maximum((i_xmax - i_xmin), 0) *
        tf.math.maximum((i_ymax - i_ymin), 0))

    # Calculates the union area.
    bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
    gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
    # Adds a small epsilon to avoid divide-by-zero.
    u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8

    # Calculates IoU.
    iou = i_area / u_area

    # Fills -1 for IoU entries between the padded ground truth boxes.
    gt_invalid_mask = tf.less(
        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
    padding_mask = tf.logical_or(
        tf.zeros_like(bb_x_min, dtype=tf.bool),
        tf.transpose(gt_invalid_mask, [0, 2, 1]))
    iou = tf.where(padding_mask, -tf.ones_like(iou), iou)

    # Fills -1 for invalid (-1) boxes.
    boxes_invalid_mask = tf.less(
        tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
    iou = tf.where(boxes_invalid_mask, -tf.ones_like(iou), iou)

    return iou
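
# A small worked example for bbox_overlap; the second ground truth row is
# padding (all -1) and therefore comes back as -1 in the IoU matrix:
#
#   boxes = tf.constant([[[0., 0., 10., 10.]]])                   # [1, 1, 4]
#   gt = tf.constant([[[0., 0., 5., 5.], [-1., -1., -1., -1.]]])  # [1, 2, 4]
#   bbox_overlap(boxes, gt)  # intersection 25 / union 100 -> [[[0.25, -1.]]]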


def bbox_generalized_overlap(boxes, gt_boxes):
  """Calculates the GIoU between proposal and ground truth boxes.

  The generalized intersection over union (GIoU) is an adjustment of the
  traditional IoU metric which provides continuous updates even for
  predictions with no overlap. This metric is defined in
  https://giou.stanford.edu/GIoU.pdf. Note, some `gt_boxes` may have been
  padded. The returned `giou` tensor for these boxes will be -1.

  Args:
    boxes: a `Tensor` with a shape of [batch_size, N, 4]. N is the number of
      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
      last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
    gt_boxes: a `Tensor` with a shape of [batch_size, max_num_instances, 4].
      This tensor may have paddings with a negative value and will also be in
      the [ymin, xmin, ymax, xmax] format.

  Returns:
    giou: a `Tensor` with a shape of [batch_size, N, max_num_instances].
  """
  with tf.name_scope('bbox_generalized_overlap'):
    assert boxes.shape.as_list()[-1] == 4, (
        'Boxes must be defined by 4 coordinates.')
    assert gt_boxes.shape.as_list()[-1] == 4, (
        'Groundtruth boxes must be defined by 4 coordinates.')

    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
        value=boxes, num_or_size_splits=4, axis=2)
    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
        value=gt_boxes, num_or_size_splits=4, axis=2)

    # Calculates the hull area for each pair of boxes, with one from
    # boxes and the other from gt_boxes.
    # Outputs for coordinates are of shape [batch_size, N, max_num_instances]
    h_xmin = tf.minimum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
    h_xmax = tf.maximum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
    h_ymin = tf.minimum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
    h_ymax = tf.maximum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
    h_area = tf.maximum((h_xmax - h_xmin), 0) * tf.maximum(
        (h_ymax - h_ymin), 0)
    # Adds a small epsilon to avoid divide-by-zero.
    h_area = h_area + 1e-8

    # Calculates the intersection area.
    i_xmin = tf.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
    i_xmax = tf.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
    i_ymin = tf.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
    i_ymax = tf.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
    i_area = tf.maximum((i_xmax - i_xmin), 0) * tf.maximum(
        (i_ymax - i_ymin), 0)

    # Calculates the union area.
    bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
    gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
    # Adds a small epsilon to avoid divide-by-zero.
    u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8

    # Calculates IoU.
    iou = i_area / u_area
    # Calculates GIoU.
    giou = iou - (h_area - u_area) / h_area

    # Fills -1 for GIoU entries between the padded ground truth boxes.
    gt_invalid_mask = tf.less(
        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
    padding_mask = tf.broadcast_to(
        tf.transpose(gt_invalid_mask, [0, 2, 1]), tf.shape(giou))
    giou = tf.where(padding_mask, -tf.ones_like(giou), giou)
    return giou
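

# Worked example (not from the original module; values are illustrative):
# for disjoint boxes GIoU goes negative while plain IoU saturates at 0,
# which is what makes it usable as a regression signal.
def _bbox_generalized_overlap_example():
  boxes = tf.constant([[[0., 0., 1., 1.]]])     # [1, 1, 4]
  gt_boxes = tf.constant([[[0., 2., 1., 3.]]])  # disjoint from `boxes`
  giou = bbox_generalized_overlap(boxes, gt_boxes)
  # iou = 0, hull area = 3, union = 2, so giou = 0 - (3 - 2) / 3 ~ -0.333.
  return giou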


def box_matching(boxes, gt_boxes, gt_classes):
  """Match boxes to groundtruth boxes.

  Given the proposal boxes and the groundtruth boxes and classes, perform the
  groundtruth matching by taking the argmax of the IoU between boxes and
  groundtruth boxes.

  Args:
    boxes: a tensor of shape of [batch_size, N, 4] representing the box
      coordinates to be matched to groundtruth boxes.
    gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
      the groundtruth box coordinates. It is padded with -1s to indicate the
      invalid boxes.
    gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
      classes. It is padded with -1s to indicate the invalid classes.

  Returns:
    matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
      the matched groundtruth box coordinates for each input box. If the box
      does not overlap with any groundtruth boxes, the matched boxes of it
      will be set to all 0s.
    matched_gt_classes: a tensor of shape of [batch_size, N], representing
      the matched groundtruth classes for each input box. If the box does not
      overlap with any groundtruth boxes, the matched box classes of it will
      be set to 0, which corresponds to the background class.
    matched_gt_indices: a tensor of shape of [batch_size, N], representing
      the indices of the matched groundtruth boxes in the original gt_boxes
      tensor. If the box does not overlap with any groundtruth boxes, the
      index of the matched groundtruth will be set to -1.
    matched_iou: a tensor of shape of [batch_size, N], representing the IoU
      between the box and its matched groundtruth box. The matched IoU is the
      maximum IoU of the box and all the groundtruth boxes.
    iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
      between boxes and the groundtruth boxes. The IoU between a box and the
      invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
  """
  # Compute IoU between boxes and gt_boxes.
  # iou <- [batch_size, N, K]
  iou = bbox_overlap(boxes, gt_boxes)

  # max_iou <- [batch_size, N]
  # 0.0 -> no match to gt, or -1.0 match to no gt
  matched_iou = tf.reduce_max(iou, axis=-1)

  # background_box_mask <- bool, [batch_size, N]
  background_box_mask = tf.less_equal(matched_iou, 0.0)

  argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)

  matched_gt_boxes, matched_gt_classes = gather_instances(
      argmax_iou_indices, gt_boxes, gt_classes)
  matched_gt_boxes = tf.where(
      tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
      tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
      matched_gt_boxes)
  matched_gt_classes = tf.where(
      background_box_mask,
      tf.zeros_like(matched_gt_classes),
      matched_gt_classes)

  matched_gt_indices = tf.where(
      background_box_mask,
      -tf.ones_like(argmax_iou_indices),
      argmax_iou_indices)

  return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
          matched_iou, iou)
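

# Usage sketch (not from the original module; values are illustrative): how
# the five outputs of `box_matching` line up for a single proposal. Class 3
# is an arbitrary foreground label; `gather_instances` is defined earlier in
# this module.
def _box_matching_example():
  boxes = tf.constant([[[0., 0., 10., 10.]]])         # [1, 1, 4]
  gt_boxes = tf.constant([[[0., 0., 10., 10.],
                           [-1., -1., -1., -1.]]])    # [1, 2, 4]
  gt_classes = tf.constant([[3, -1]])                 # [1, 2]
  (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
   iou) = box_matching(boxes, gt_boxes, gt_classes)
  # matched_gt_classes == [[3]], matched_gt_indices == [[0]],
  # matched_iou ~ [[1.0]], and iou[..., 1] == -1 for the padded column.
  return matched_gt_boxes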
official/vision/ops/iou_similarity.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Region Similarity Calculators."""
import tensorflow as tf


def area(box):
  """Computes area of boxes.

  B: batch_size
  N: number of boxes

  Args:
    box: a float Tensor with [N, 4], or [B, N, 4].

  Returns:
    a float Tensor with [N], or [B, N]
  """
  with tf.name_scope('Area'):
    y_min, x_min, y_max, x_max = tf.split(
        value=box, num_or_size_splits=4, axis=-1)
    return tf.squeeze((y_max - y_min) * (x_max - x_min), axis=-1)


def intersection(gt_boxes, boxes):
  """Compute pairwise intersection areas between boxes.

  B: batch_size
  N: number of groundtruth boxes.
  M: number of anchor boxes.

  Args:
    gt_boxes: a float Tensor with [N, 4], or [B, N, 4]
    boxes: a float Tensor with [M, 4], or [B, M, 4]

  Returns:
    a float Tensor with shape [N, M] or [B, N, M] representing pairwise
      intersections.
  """
  with tf.name_scope('Intersection'):
    y_min1, x_min1, y_max1, x_max1 = tf.split(
        value=gt_boxes, num_or_size_splits=4, axis=-1)
    y_min2, x_min2, y_max2, x_max2 = tf.split(
        value=boxes, num_or_size_splits=4, axis=-1)

    boxes_rank = len(boxes.shape)
    perm = [1, 0] if boxes_rank == 2 else [0, 2, 1]
    # [N, M] or [B, N, M]
    y_min_max = tf.minimum(y_max1, tf.transpose(y_max2, perm))
    y_max_min = tf.maximum(y_min1, tf.transpose(y_min2, perm))
    x_min_max = tf.minimum(x_max1, tf.transpose(x_max2, perm))
    x_max_min = tf.maximum(x_min1, tf.transpose(x_min2, perm))

    intersect_heights = y_min_max - y_max_min
    intersect_widths = x_min_max - x_max_min
    zeros_t = tf.cast(0, intersect_heights.dtype)
    intersect_heights = tf.maximum(zeros_t, intersect_heights)
    intersect_widths = tf.maximum(zeros_t, intersect_widths)
    return intersect_heights * intersect_widths


def iou(gt_boxes, boxes):
  """Computes pairwise intersection-over-union between box collections.

  Args:
    gt_boxes: a float Tensor with [N, 4].
    boxes: a float Tensor with [M, 4].

  Returns:
    a Tensor with shape [N, M] representing pairwise iou scores.
  """
  with tf.name_scope('IOU'):
    intersections = intersection(gt_boxes, boxes)
    gt_boxes_areas = area(gt_boxes)
    boxes_areas = area(boxes)
    boxes_rank = len(boxes_areas.shape)
    boxes_axis = 1 if (boxes_rank == 2) else 0
    gt_boxes_areas = tf.expand_dims(gt_boxes_areas, -1)
    boxes_areas = tf.expand_dims(boxes_areas, boxes_axis)
    unions = gt_boxes_areas + boxes_areas
    unions = unions - intersections
    return tf.where(
        tf.equal(intersections, 0.0), tf.zeros_like(intersections),
        tf.truediv(intersections, unions))


class IouSimilarity:
  """Class to compute similarity based on Intersection over Union (IOU) metric.
  """

  def __init__(self, mask_val=-1):
    self.mask_val = mask_val

  def __call__(self, boxes_1, boxes_2, boxes_1_masks=None, boxes_2_masks=None):
    """Compute pairwise IOU similarity between ground truth boxes and anchors.

    B: batch_size
    N: Number of groundtruth boxes.
    M: Number of anchor boxes.

    Args:
      boxes_1: a float Tensor with N or B * N groundtruth boxes.
      boxes_2: a float Tensor with M or B * M anchor boxes; its rank must be
        less than or equal to the rank of `boxes_1`.
      boxes_1_masks: a boolean Tensor with N or B * N boxes. Optional.
      boxes_2_masks: a boolean Tensor with M or B * M boxes. Optional.

    Returns:
      A Tensor with shape [N, M] or [B, N, M] representing pairwise iou
        scores, one groundtruth box per row and one anchor per column.

    Input shape:
      boxes_1: [N, 4], or [B, N, 4]
      boxes_2: [M, 4], or [B, M, 4]
      boxes_1_masks: [N, 1], or [B, N, 1]
      boxes_2_masks: [M, 1], or [B, M, 1]

    Output shape:
      [N, M], or [B, N, M]
    """
    boxes_1 = tf.cast(boxes_1, tf.float32)
    boxes_2 = tf.cast(boxes_2, tf.float32)

    boxes_1_rank = len(boxes_1.shape)
    boxes_2_rank = len(boxes_2.shape)
    if boxes_1_rank < 2 or boxes_1_rank > 3:
      raise ValueError(
          '`groundtruth_boxes` must be rank 2 or 3, got {}'.format(
              boxes_1_rank))
    if boxes_2_rank < 2 or boxes_2_rank > 3:
      raise ValueError(
          '`anchors` must be rank 2 or 3, got {}'.format(boxes_2_rank))
    if boxes_1_rank < boxes_2_rank:
      raise ValueError(
          'Unbatched `groundtruth_boxes` with batched `anchors` is not a '
          'valid use case, got groundtruth_box rank {}, and anchors rank '
          '{}'.format(boxes_1_rank, boxes_2_rank))

    result = iou(boxes_1, boxes_2)
    if boxes_1_masks is None and boxes_2_masks is None:
      return result
    background_mask = None
    mask_val_t = tf.cast(self.mask_val, result.dtype) * tf.ones_like(result)
    perm = [1, 0] if boxes_2_rank == 2 else [0, 2, 1]
    if boxes_1_masks is not None and boxes_2_masks is not None:
      background_mask = tf.logical_or(boxes_1_masks,
                                      tf.transpose(boxes_2_masks, perm))
    elif boxes_1_masks is not None:
      background_mask = boxes_1_masks
    else:
      background_mask = tf.logical_or(
          tf.zeros(tf.shape(boxes_2)[:-1], dtype=tf.bool),
          tf.transpose(boxes_2_masks, perm))
    return tf.where(background_mask, mask_val_t, result)
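

# Usage sketch (not from the original module; values are illustrative): the
# optional mask arguments, which the unit tests in the next file do not
# exercise. A True entry marks a padded box whose row is filled with
# `mask_val`.
def _iou_similarity_mask_example():
  gt_boxes = tf.constant([[0., 0., 5., 5.],
                          [0., 0., 0., 0.]])   # second row is padding
  anchors = tf.constant([[0., 0., 5., 5.]])
  gt_masks = tf.constant([[False], [True]])    # flag the padded row
  sim = IouSimilarity(mask_val=-1)(gt_boxes, anchors, boxes_1_masks=gt_masks)
  # sim ~ [[1.0], [-1.0]]: the padded ground truth row is masked out.
  return sim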
official/vision/ops/iou_similarity_test.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for iou_similarity.py."""
import tensorflow as tf

from official.vision.ops import iou_similarity


class BoxMatcherTest(tf.test.TestCase):

  def test_similarity_unbatched(self):
    boxes = tf.constant(
        [
            [0, 0, 1, 1],
            [5, 0, 10, 5],
        ],
        dtype=tf.float32)

    gt_boxes = tf.constant(
        [
            [0, 0, 5, 5],
            [0, 5, 5, 10],
            [5, 0, 10, 5],
            [5, 5, 10, 10],
        ],
        dtype=tf.float32)

    sim_calc = iou_similarity.IouSimilarity()
    sim_matrix = sim_calc(boxes, gt_boxes)

    self.assertAllClose(
        sim_matrix.numpy(),
        [[0.04, 0, 0, 0],
         [0, 0, 1., 0]])

  def test_similarity_batched(self):
    boxes = tf.constant(
        [[
            [0, 0, 1, 1],
            [5, 0, 10, 5],
        ]],
        dtype=tf.float32)

    gt_boxes = tf.constant(
        [[
            [0, 0, 5, 5],
            [0, 5, 5, 10],
            [5, 0, 10, 5],
            [5, 5, 10, 10],
        ]],
        dtype=tf.float32)

    sim_calc = iou_similarity.IouSimilarity()
    sim_matrix = sim_calc(boxes, gt_boxes)

    self.assertAllClose(
        sim_matrix.numpy(),
        [[[0.04, 0, 0, 0],
          [0, 0, 1., 0]]])


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/mask_ops.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for segmentations."""
import math

# Import libraries

import cv2
import numpy as np


def paste_instance_masks(masks, detected_boxes, image_height, image_width):
  """Paste instance masks to generate the image segmentation results.

  Args:
    masks: a numpy array of shape [N, mask_height, mask_width] representing the
      instance masks w.r.t. the `detected_boxes`.
    detected_boxes: a numpy array of shape [N, 4] representing the reference
      bounding boxes.
    image_height: an integer representing the height of the image.
    image_width: an integer representing the width of the image.

  Returns:
    segms: a numpy array of shape [N, image_height, image_width] representing
      the instance masks *pasted* on the image canvas.
  """

  def expand_boxes(boxes, scale):
    """Expands an array of boxes by a given scale."""
    # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227  # pylint: disable=line-too-long
    # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
    # whereas `boxes` here is in [x1, y1, w, h] form.
    w_half = boxes[:, 2] * .5
    h_half = boxes[:, 3] * .5
    x_c = boxes[:, 0] + w_half
    y_c = boxes[:, 1] + h_half

    w_half *= scale
    h_half *= scale

    boxes_exp = np.zeros(boxes.shape)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half

    return boxes_exp

  # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812  # pylint: disable=line-too-long
  # To work around an issue with cv2.resize (it seems to automatically pad
  # with repeated border values), we manually zero-pad the masks by 1 pixel
  # prior to resizing back to the original image resolution. This prevents
  # "top hat" artifacts. We therefore need to expand the reference boxes by an
  # appropriate factor.
  _, mask_height, mask_width = masks.shape
  scale = max((mask_width + 2.0) / mask_width,
              (mask_height + 2.0) / mask_height)

  ref_boxes = expand_boxes(detected_boxes, scale)
  ref_boxes = ref_boxes.astype(np.int32)
  padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
  segms = []
  for mask_ind, mask in enumerate(masks):
    im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
    # Process mask inside bounding boxes.
    padded_mask[1:-1, 1:-1] = mask[:, :]

    ref_box = ref_boxes[mask_ind, :]
    w = ref_box[2] - ref_box[0] + 1
    h = ref_box[3] - ref_box[1] + 1
    w = np.maximum(w, 1)
    h = np.maximum(h, 1)

    mask = cv2.resize(padded_mask, (w, h))
    mask = np.array(mask > 0.5, dtype=np.uint8)

    x_0 = min(max(ref_box[0], 0), image_width)
    x_1 = min(max(ref_box[2] + 1, 0), image_width)
    y_0 = min(max(ref_box[1], 0), image_height)
    y_1 = min(max(ref_box[3] + 1, 0), image_height)

    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - ref_box[1]):(y_1 - ref_box[1]),
        (x_0 - ref_box[0]):(x_1 - ref_box[0])]
    segms.append(im_mask)

  segms = np.array(segms)
  assert masks.shape[0] == segms.shape[0]
  return segms
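

# Usage sketch (not from the original module; values are illustrative):
# pastes one 2x2 mask into a 6x6 canvas. `detected_boxes` is in
# [x, y, w, h] pixel form, matching the expand_boxes comment above.
def _paste_instance_masks_example():
  masks = np.ones((1, 2, 2), dtype=np.float32)
  detected_boxes = np.array([[1.0, 1.0, 3.0, 3.0]])  # x, y, w, h
  segms = paste_instance_masks(masks, detected_boxes, 6, 6)
  # segms.shape == (1, 6, 6), with a solid uint8 patch around the box.
  return segms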


def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
  """Paste instance masks to generate the image segmentation (v2).

  Args:
    masks: a numpy array of shape [N, mask_height, mask_width] representing the
      instance masks w.r.t. the `detected_boxes`.
    detected_boxes: a numpy array of shape [N, 4] representing the reference
      bounding boxes.
    image_height: an integer representing the height of the image.
    image_width: an integer representing the width of the image.

  Returns:
    segms: a numpy array of shape [N, image_height, image_width] representing
      the instance masks *pasted* on the image canvas.
  """
  _, mask_height, mask_width = masks.shape

  segms = []
  for i, mask in enumerate(masks):
    box = detected_boxes[i, :]
    xmin = box[0]
    ymin = box[1]
    xmax = xmin + box[2]
    ymax = ymin + box[3]

    # Sample points of the cropped mask w.r.t. the image grid.
    # Note that these coordinates may fall beyond the image.
    # Pixel clipping will happen after warping.
    xmin_int = int(math.floor(xmin))
    xmax_int = int(math.ceil(xmax))
    ymin_int = int(math.floor(ymin))
    ymax_int = int(math.ceil(ymax))

    alpha = box[2] / (1.0 * mask_width)
    beta = box[3] / (1.0 * mask_height)
    # pylint: disable=invalid-name
    # Transformation from mask pixel indices to image coordinate.
    M_mask_to_image = np.array(
        [[alpha, 0, xmin],
         [0, beta, ymin],
         [0, 0, 1]],
        dtype=np.float32)
    # Transformation from image to cropped mask coordinate.
    M_image_to_crop = np.array(
        [[1, 0, -xmin_int],
         [0, 1, -ymin_int],
         [0, 0, 1]],
        dtype=np.float32)
    M = np.dot(M_image_to_crop, M_mask_to_image)
    # Compensate the half pixel offset that OpenCV has in the
    # warpPerspective implementation: the top-left pixel is sampled
    # at (0,0), but we want it to be at (0.5, 0.5).
    M = np.dot(
        np.dot(
            np.array([[1, 0, -0.5],
                      [0, 1, -0.5],
                      [0, 0, 1]], np.float32), M),
        np.array([[1, 0, 0.5],
                  [0, 1, 0.5],
                  [0, 0, 1]], np.float32))
    # pylint: enable=invalid-name
    cropped_mask = cv2.warpPerspective(
        mask.astype(np.float32), M,
        (xmax_int - xmin_int, ymax_int - ymin_int))
    cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)

    img_mask = np.zeros((image_height, image_width))
    x0 = max(min(xmin_int, image_width), 0)
    x1 = max(min(xmax_int, image_width), 0)
    y0 = max(min(ymin_int, image_height), 0)
    y1 = max(min(ymax_int, image_height), 0)
    img_mask[y0:y1, x0:x1] = cropped_mask[
        (y0 - ymin_int):(y1 - ymin_int),
        (x0 - xmin_int):(x1 - xmin_int)]

    segms.append(img_mask)

  segms = np.array(segms)
  return segms
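

# Worked check (not from the original module): for a box with an integral
# corner (xmin_int == xmin) and a mask the same size as the box, alpha and
# beta are 1 and the composed transform M reduces to the identity, so each
# mask pixel lands on exactly one crop pixel.
def _paste_v2_transform_example():
  alpha, beta, xmin, ymin, xmin_int, ymin_int = 1., 1., 1., 1., 1, 1
  M_mask_to_image = np.array(
      [[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]], np.float32)
  M_image_to_crop = np.array(
      [[1, 0, -xmin_int], [0, 1, -ymin_int], [0, 0, 1]], np.float32)
  M = M_image_to_crop @ M_mask_to_image
  half_out = np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32)
  half_in = np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32)
  return half_out @ M @ half_in  # ~ np.eye(3) for this configuration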
official/vision/ops/mask_ops_test.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mask_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf

from official.vision.ops import mask_ops


class MaskUtilsTest(tf.test.TestCase):

  def testPasteInstanceMasks(self):
    image_height = 10
    image_width = 10
    mask_height = 6
    mask_width = 6
    masks = np.random.randint(0, 255, (1, mask_height, mask_width))
    detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])

    _ = mask_ops.paste_instance_masks(
        masks, detected_boxes, image_height, image_width)

  def testPasteInstanceMasksV2(self):
    image_height = 10
    image_width = 10
    mask_height = 6
    mask_width = 6
    masks = np.random.randint(0, 255, (1, mask_height, mask_width))
    detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])

    image_masks = mask_ops.paste_instance_masks_v2(
        masks, detected_boxes, image_height, image_width)

    self.assertNDArrayNear(
        image_masks[:, 2:8, 0:6],
        np.array(masks > 0.5, dtype=np.uint8),
        1e-5)


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/nms.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation of non max suppression."""
# Import libraries
import tensorflow as tf

from official.vision.ops import box_ops

NMS_TILE_SIZE = 512


def _self_suppression(iou, _, iou_sum):
  batch_size = tf.shape(iou)[0]
  can_suppress_others = tf.cast(
      tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
  iou_suppressed = tf.reshape(
      tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
      [batch_size, -1, 1]) * iou
  iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
  return [
      iou_suppressed,
      tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
  ]


def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
  batch_size = tf.shape(boxes)[0]
  new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
                       [batch_size, NMS_TILE_SIZE, 4])
  iou = box_ops.bbox_overlap(new_slice, box_slice)
  ret_slice = tf.expand_dims(
      tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
      2) * box_slice
  return boxes, ret_slice, iou_threshold, inner_idx + 1


def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
  """Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).

  Args:
    boxes: a tensor with a shape of [batch_size, anchors, 4].
    iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
    output_size: an int32 tensor of size [batch_size]. Representing the number
      of selected boxes for each batch.
    idx: an integer scalar representing induction variable.

  Returns:
    boxes: updated boxes.
    iou_threshold: pass down iou_threshold to the next iteration.
    output_size: the updated output_size.
    idx: the updated induction variable.
  """
  num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
  batch_size = tf.shape(boxes)[0]

  # Iterates over tiles that can possibly suppress the current tile.
  box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
                       [batch_size, NMS_TILE_SIZE, 4])
  _, box_slice, _, _ = tf.while_loop(
      lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
      _cross_suppression,
      [boxes, box_slice, iou_threshold, tf.constant(0)])

  # Iterates over the current tile to compute self-suppression.
  iou = box_ops.bbox_overlap(box_slice, box_slice)
  mask = tf.expand_dims(
      tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
          tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
  iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
  suppressed_iou, _, _ = tf.while_loop(
      lambda _iou, loop_condition, _iou_sum: loop_condition,
      _self_suppression,
      [iou, tf.constant(True), tf.reduce_sum(iou, [1, 2])])
  suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
  box_slice *= tf.expand_dims(
      1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)

  # Uses box_slice to update the input boxes.
  mask = tf.reshape(
      tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
  boxes = tf.tile(tf.expand_dims(box_slice, [1]),
                  [1, num_tiles, 1, 1]) * mask + tf.reshape(
                      boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (
                          1 - mask)
  boxes = tf.reshape(boxes, [batch_size, -1, 4])

  # Updates output_size.
  output_size += tf.reduce_sum(
      tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
  return boxes, iou_threshold, output_size, idx + 1


def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
                                      iou_threshold):
  """A wrapper that handles non-maximum suppression.

  Assumption:
    * The boxes are sorted by scores unless the box is a dot (all coordinates
      are zero).
    * Boxes with higher scores can be used to suppress boxes with lower
      scores.

  The overall design of the algorithm is to handle boxes tile-by-tile:

  boxes = boxes.pad_to_multiple_of(tile_size)
  num_tiles = len(boxes) // tile_size
  output_boxes = []
  for i in range(num_tiles):
    box_tile = boxes[i*tile_size : (i+1)*tile_size]
    for j in range(i - 1):
      suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
      iou = bbox_overlap(box_tile, suppressing_tile)
      # if the box is suppressed in iou, clear it to a dot
      box_tile *= _update_boxes(iou)
    # Iteratively handle the diagonal tile.
    iou = _box_overlap(box_tile, box_tile)
    iou_changed = True
    while iou_changed:
      # boxes that are not suppressed by anything else
      suppressing_boxes = _get_suppressing_boxes(iou)
      # boxes that are suppressed by suppressing_boxes
      suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
      # clear iou to 0 for boxes that are suppressed, as they cannot be used
      # to suppress other boxes any more
      new_iou = _clear_iou(iou, suppressed_boxes)
      iou_changed = (new_iou != iou)
      iou = new_iou
    # remaining boxes that can still suppress others, are selected boxes.
    output_boxes.append(_get_suppressing_boxes(iou))
    if len(output_boxes) >= max_output_size:
      break

  Args:
    scores: a tensor with a shape of [batch_size, anchors].
    boxes: a tensor with a shape of [batch_size, anchors, 4].
    max_output_size: a scalar integer `Tensor` representing the maximum number
      of boxes to be selected by non max suppression.
    iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.

  Returns:
    nms_scores: a tensor with a shape of [batch_size, anchors]. It has same
      dtype as input scores.
    nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has
      same dtype as input boxes.
  """
  batch_size = tf.shape(boxes)[0]
  num_boxes = tf.shape(boxes)[1]
  pad = tf.cast(
      tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
      tf.int32) * NMS_TILE_SIZE - num_boxes
  boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
  scores = tf.pad(
      tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
  num_boxes += pad

  def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
    return tf.logical_and(
        tf.reduce_min(output_size) < max_output_size,
        idx < num_boxes // NMS_TILE_SIZE)

  selected_boxes, _, output_size, _ = tf.while_loop(
      _loop_cond, _suppression_loop_body,
      [boxes, iou_threshold,
       tf.zeros([batch_size], tf.int32),
       tf.constant(0)])
  idx = num_boxes - tf.cast(
      tf.nn.top_k(
          tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
          tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
      tf.int32)
  idx = tf.minimum(idx, num_boxes - 1)
  idx = tf.reshape(
      idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1])
  boxes = tf.reshape(
      tf.gather(tf.reshape(boxes, [-1, 4]), idx),
      [batch_size, max_output_size, 4])
  boxes = boxes * tf.cast(
      tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
          output_size, [-1, 1, 1]), boxes.dtype)
  scores = tf.reshape(
      tf.gather(tf.reshape(scores, [-1, 1]), idx),
      [batch_size, max_output_size])
  scores = scores * tf.cast(
      tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
          output_size, [-1, 1]), scores.dtype)
  return scores, boxes
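

# Usage sketch (not from the original module; values are illustrative):
# inputs must already be sorted by score, highest first; outputs are
# zero-padded past the number of surviving boxes.
def _sorted_nms_example():
  scores = tf.constant([[0.9, 0.8, 0.7]])
  boxes = tf.constant([[[0., 0., 10., 10.],
                        [0., 0., 9., 9.],        # IoU with box 0 ~ 0.81
                        [20., 20., 30., 30.]]])
  nms_scores, nms_boxes = sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)
  # Box 1 is suppressed by box 0; boxes 0 and 2 survive.
  return nms_scores, nms_boxes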
official/vision/ops/preprocess_ops.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Preprocessing ops."""
import math
from typing import Optional, Tuple, Union

from six.moves import range
import tensorflow as tf

from official.vision.ops import augment
from official.vision.ops import box_ops

CENTER_CROP_FRACTION = 0.875


def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0):
  """Pads data to a fixed length at the first dimension.

  Args:
    input_tensor: `Tensor` with any dimension.
    size: `int` number for the first dimension of output Tensor.
    constant_values: `int` value assigned to the paddings.

  Returns:
    `Tensor` with the first dimension padded to `size`.
  """
  input_shape = input_tensor.get_shape().as_list()
  padding_shape = []

  # Computes the padding length on the first dimension, clip input tensor if
  # it is longer than `size`.
  input_length = tf.shape(input_tensor)[0]
  input_length = tf.clip_by_value(input_length, 0, size)
  input_tensor = input_tensor[:input_length]

  padding_length = tf.maximum(0, size - input_length)
  padding_shape.append(padding_length)

  # Copies shapes of the rest of input shape dimensions.
  for i in range(1, len(input_shape)):
    padding_shape.append(tf.shape(input_tensor)[i])

  # Pads input tensor to the fixed first dimension.
  paddings = tf.cast(constant_values * tf.ones(padding_shape),
                     input_tensor.dtype)
  padded_tensor = tf.concat([input_tensor, paddings], axis=0)
  output_shape = input_shape
  output_shape[0] = size
  padded_tensor.set_shape(output_shape)
  return padded_tensor
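

# Usage sketch (not from the original module; values are illustrative): this
# is how variable-length box lists are batched to a fixed first dimension.
def _clip_or_pad_example():
  boxes = tf.constant([[0., 0., 1., 1.],
                       [0., 0., 2., 2.]])                     # [2, 4]
  padded = clip_or_pad_to_fixed_size(boxes, 4, constant_values=-1)
  # padded.shape == [4, 4]; rows 2 and 3 are filled with -1.
  clipped = clip_or_pad_to_fixed_size(boxes, 1)
  # clipped.shape == [1, 4]; only the first row is kept.
  return padded, clipped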


def normalize_image(image,
                    offset=(0.485, 0.456, 0.406),
                    scale=(0.229, 0.224, 0.225)):
  """Normalizes the image to zero mean and unit variance."""
  with tf.name_scope('normalize_image'):
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    offset = tf.constant(offset)
    offset = tf.expand_dims(offset, axis=0)
    offset = tf.expand_dims(offset, axis=0)
    image -= offset

    scale = tf.constant(scale)
    scale = tf.expand_dims(scale, axis=0)
    scale = tf.expand_dims(scale, axis=0)
    image /= scale
    return image


def compute_padded_size(desired_size, stride):
  """Compute the padded size given the desired size and the stride.

  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride which is larger than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
  the output padded_size = (128, 224).

  Args:
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the target output image size.
    stride: an integer, the stride of the backbone network.

  Returns:
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size.
  """
  if isinstance(desired_size, list) or isinstance(desired_size, tuple):
    padded_size = [int(math.ceil(d * 1.0 / stride) * stride)
                   for d in desired_size]
  else:
    padded_size = tf.cast(
        tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) *
        stride, tf.int32)
  return padded_size
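

# Worked check (not from the original module): mirrors the example in the
# docstring above; each side is rounded up to a multiple of the stride.
def _compute_padded_size_example():
  padded = compute_padded_size([100, 200], 32)
  # ceil(100 / 32) * 32 = 128 and ceil(200 / 32) * 32 = 224.
  assert padded == [128, 224]
  return padded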


def resize_and_crop_image(image,
                          desired_size,
                          padded_size,
                          aug_scale_min=1.0,
                          aug_scale_max=1.0,
                          seed=1,
                          method=tf.image.ResizeMethod.BILINEAR):
  """Resizes the input image to output size (RetinaNet style).

  Resize and pad images given the desired output size of the image and
  stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and rescale the image to make it
     the largest rectangle to be bounded by the rectangle specified by the
     `desired_size`.
  2. Pad the rescaled image to the padded_size.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the desired actual output image size.
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size. Padding will be applied
      after scaling the image to the desired_size.
    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
      random scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
      random scale applied to desired_size for training scale jittering.
    seed: seed for random scale jittering.
    method: function to resize input image to scaled image.

  Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals to `output_size`.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """
  with tf.name_scope('resize_and_crop_image'):
    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

    random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)

    if random_jittering:
      random_scale = tf.random.uniform(
          [], aug_scale_min, aug_scale_max, seed=seed)
      scaled_size = tf.round(random_scale * desired_size)
    else:
      scaled_size = desired_size

    scale = tf.minimum(
        scaled_size[0] / image_size[0], scaled_size[1] / image_size[1])
    scaled_size = tf.round(image_size * scale)

    # Computes 2D image_scale.
    image_scale = scaled_size / image_size

    # Selects non-zero random offset (x, y) if scaled image is larger than
    # desired_size.
    if random_jittering:
      max_offset = scaled_size - desired_size
      max_offset = tf.where(
          tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
      offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
      offset = tf.cast(offset, tf.int32)
    else:
      offset = tf.zeros((2,), tf.int32)

    scaled_image = tf.image.resize(
        image, tf.cast(scaled_size, tf.int32), method=method)

    if random_jittering:
      scaled_image = scaled_image[
          offset[0]:offset[0] + desired_size[0],
          offset[1]:offset[1] + desired_size[1], :]

    output_image = tf.image.pad_to_bounding_box(
        scaled_image, 0, 0, padded_size[0], padded_size[1])

    image_info = tf.stack([
        image_size,
        tf.constant(desired_size, dtype=tf.float32),
        image_scale,
        tf.cast(offset, tf.float32)])
    return output_image, image_info
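

# Usage sketch (not from the original module; values are illustrative): with
# no scale jittering, a 100x150 image is scaled to fit inside 256x256, then
# padded to a stride-aligned canvas.
def _resize_and_crop_image_example():
  image = tf.zeros([100, 150, 3])
  output_image, image_info = resize_and_crop_image(
      image, desired_size=[256, 256],
      padded_size=compute_padded_size([256, 256], 32))
  # scale = min(256 / 100, 256 / 150) ~ 1.707, scaled size ~ [171, 256].
  # image_info rows: original size, desired size, [y_scale, x_scale], offset.
  return output_image, image_info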


def resize_and_crop_image_v2(image,
                             short_side,
                             long_side,
                             padded_size,
                             aug_scale_min=1.0,
                             aug_scale_max=1.0,
                             seed=1,
                             method=tf.image.ResizeMethod.BILINEAR):
  """Resizes the input image to output size (Faster R-CNN style).

  Resize and pad images given the specified short / long side length and the
  stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and first try to rescale the
     short side of the original image to `short_side`.
  2. If the scaled image after 1 has a long side that exceeds `long_side`,
     keep the aspect ratio and rescale the long side of the image to
     `long_side`.
  3. Pad the rescaled image to the padded_size.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    short_side: a scalar `Tensor` or `int` representing the desired short side
      to be rescaled to.
    long_side: a scalar `Tensor` or `int` representing the desired long side
      to be rescaled to.
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size. Padding will be applied
      after scaling the image to the desired_size.
    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
      random scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
      random scale applied to desired_size for training scale jittering.
    seed: seed for random scale jittering.
    method: function to resize input image to scaled image.

  Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals to `output_size`.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """
  with tf.name_scope('resize_and_crop_image_v2'):
    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

    scale_using_short_side = (
        short_side / tf.math.minimum(image_size[0], image_size[1]))
    scale_using_long_side = (
        long_side / tf.math.maximum(image_size[0], image_size[1]))

    scaled_size = tf.math.round(image_size * scale_using_short_side)
    scaled_size = tf.where(
        tf.math.greater(
            tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
        tf.math.round(image_size * scale_using_long_side),
        scaled_size)
    desired_size = scaled_size

    random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)

    if random_jittering:
      random_scale = tf.random.uniform(
          [], aug_scale_min, aug_scale_max, seed=seed)
      scaled_size = tf.math.round(random_scale * scaled_size)

    # Computes 2D image_scale.
    image_scale = scaled_size / image_size

    # Selects non-zero random offset (x, y) if scaled image is larger than
    # desired_size.
    if random_jittering:
      max_offset = scaled_size - desired_size
      max_offset = tf.where(
          tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
      offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
      offset = tf.cast(offset, tf.int32)
    else:
      offset = tf.zeros((2,), tf.int32)

    scaled_image = tf.image.resize(
        image, tf.cast(scaled_size, tf.int32), method=method)

    if random_jittering:
      scaled_image = scaled_image[
          offset[0]:offset[0] + desired_size[0],
          offset[1]:offset[1] + desired_size[1], :]

    output_image = tf.image.pad_to_bounding_box(
        scaled_image, 0, 0, padded_size[0], padded_size[1])

    image_info = tf.stack([
        image_size,
        tf.cast(desired_size, dtype=tf.float32),
        image_scale,
        tf.cast(offset, tf.float32)])
    return output_image, image_info


def resize_image(image: tf.Tensor,
                 size: Union[Tuple[int, int], int],
                 max_size: Optional[int] = None,
                 method: tf.image.ResizeMethod = tf.image.ResizeMethod.BILINEAR):
  """Resize image with size and max_size.

  Args:
    image: the image to be resized.
    size: if a list or tuple, resize to it. If a scalar, we keep the same
      aspect ratio and resize the short side to the value.
    max_size: only used when `size` is a scalar. If the longer side would
      exceed `max_size` after resizing with `size`, `max_size` is used to set
      the longer side instead, keeping the aspect ratio.
    method: the method argument passed to tf.image.resize.

  Returns:
    the resized image and image_info to be used for downstream processing.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [resized_height, resized_width],
       [y_scale, x_scale], [0, 0]], where [resized_height, resized_width]
      is the actual scaled image size, and [y_scale, x_scale] is the
      scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """

  def get_size_with_aspect_ratio(image_size, size, max_size=None):
    h = image_size[0]
    w = image_size[1]
    if max_size is not None:
      min_original_size = tf.cast(tf.math.minimum(w, h), dtype=tf.float32)
      max_original_size = tf.cast(tf.math.maximum(w, h), dtype=tf.float32)
      if max_original_size / min_original_size * size > max_size:
        size = tf.cast(
            tf.math.floor(max_size * min_original_size / max_original_size),
            dtype=tf.int32)
      else:
        size = tf.cast(size, tf.int32)
    else:
      size = tf.cast(size, tf.int32)

    if (w <= h and w == size) or (h <= w and h == size):
      return tf.stack([h, w])

    if w < h:
      ow = size
      oh = tf.cast(
          (tf.cast(size, dtype=tf.float32) * tf.cast(h, dtype=tf.float32) /
           tf.cast(w, dtype=tf.float32)),
          dtype=tf.int32)
    else:
      oh = size
      ow = tf.cast(
          (tf.cast(size, dtype=tf.float32) * tf.cast(w, dtype=tf.float32) /
           tf.cast(h, dtype=tf.float32)),
          dtype=tf.int32)

    return tf.stack([oh, ow])

  def get_size(image_size, size, max_size=None):
    if isinstance(size, (list, tuple)):
      return size[::-1]
    else:
      return get_size_with_aspect_ratio(image_size, size, max_size)

  original_size = tf.shape(image)[0:2]
  size = get_size(original_size, size, max_size)
  rescaled_image = tf.image.resize(
      image, tf.cast(size, tf.int32), method=method)
  image_scale = size / original_size
  image_info = tf.stack([
      tf.cast(original_size, dtype=tf.float32),
      tf.cast(size, dtype=tf.float32),
      tf.cast(image_scale, tf.float32),
      tf.constant([0.0, 0.0], dtype=tf.float32)
  ])
  return rescaled_image, image_info


def center_crop_image(image):
  """Center crop a square shape slice from the input image.

  It crops a square shape slice from the image. The side of the actual crop
  is 224 / 256 = 0.875 of the short side of the original image. References:
  [1] Very Deep Convolutional Networks for Large-Scale Image Recognition
      https://arxiv.org/abs/1409.1556
  [2] Deep Residual Learning for Image Recognition
      https://arxiv.org/abs/1512.03385

  Args:
    image: a Tensor of shape [height, width, 3] representing the input image.

  Returns:
    cropped_image: a Tensor representing the center cropped image.
  """
  with tf.name_scope('center_crop_image'):
    image_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    crop_size = (
        CENTER_CROP_FRACTION * tf.math.minimum(image_size[0], image_size[1]))
    crop_offset = tf.cast((image_size - crop_size) / 2.0, dtype=tf.int32)
    crop_size = tf.cast(crop_size, dtype=tf.int32)
    cropped_image = image[
        crop_offset[0]:crop_offset[0] + crop_size,
        crop_offset[1]:crop_offset[1] + crop_size, :]
    return cropped_image


def center_crop_image_v2(image_bytes, image_shape):
  """Center crop a square shape slice from the input image.

  It crops a square shape slice from the image. The side of the actual crop
  is 224 / 256 = 0.875 of the short side of the original image. References:
  [1] Very Deep Convolutional Networks for Large-Scale Image Recognition
      https://arxiv.org/abs/1409.1556
  [2] Deep Residual Learning for Image Recognition
      https://arxiv.org/abs/1512.03385

  This is a faster version of `center_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decodes the JPEG
  bytes according to the center crop.

  Args:
    image_bytes: a Tensor of type string representing the raw image bytes.
    image_shape: a Tensor specifying the shape of the raw image.

  Returns:
    cropped_image: a Tensor representing the center cropped image.
  """
  with tf.name_scope('center_image_crop_v2'):
    image_shape = tf.cast(image_shape, tf.float32)
    crop_size = (
        CENTER_CROP_FRACTION * tf.math.minimum(image_shape[0], image_shape[1]))
    crop_offset = tf.cast((image_shape - crop_size) / 2.0, dtype=tf.int32)
    crop_size = tf.cast(crop_size, dtype=tf.int32)
    crop_window = tf.stack(
        [crop_offset[0], crop_offset[1], crop_size, crop_size])
    cropped_image = tf.image.decode_and_crop_jpeg(
        image_bytes, crop_window, channels=3)
    return cropped_image


def random_crop_image(image,
                      aspect_ratio_range=(3. / 4., 4. / 3.),
                      area_range=(0.08, 1.0),
                      max_attempts=10,
                      seed=1):
  """Randomly crop an arbitrary shaped slice from the input image.

  Args:
    image: a Tensor of shape [height, width, 3] representing the input image.
    aspect_ratio_range: a list of floats. The cropped area of the image must
      have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image of the specified constraints. After max_attempts failures, return
      the entire image.
    seed: the seed of the random generator.

  Returns:
    cropped_image: a Tensor representing the random cropped image. Can be the
      original image if max_attempts is exhausted.
  """
  with tf.name_scope('random_crop_image'):
    crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
        seed=seed,
        min_object_covered=area_range[0],
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts)

    cropped_image = tf.slice(image, crop_offset, crop_size)
    return cropped_image


def random_crop_image_v2(image_bytes,
                         image_shape,
                         aspect_ratio_range=(3. / 4., 4. / 3.),
                         area_range=(0.08, 1.0),
                         max_attempts=10,
                         seed=1):
  """Randomly crop an arbitrary shaped slice from the input image.

  This is a faster version of `random_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decodes the JPEG
  bytes according to the generated crop.

  Args:
    image_bytes: a Tensor of type string representing the raw image bytes.
    image_shape: a Tensor specifying the shape of the raw image.
    aspect_ratio_range: a list of floats. The cropped area of the image must
      have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image of the specified constraints. After max_attempts failures, return
      the entire image.
    seed: the seed of the random generator.

  Returns:
    cropped_image: a Tensor representing the random cropped image. Can be the
      original image if max_attempts is exhausted.
  """
  with tf.name_scope('random_crop_image_v2'):
    crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
        image_shape,
        tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
        seed=seed,
        min_object_covered=area_range[0],
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts)

    offset_y, offset_x, _ = tf.unstack(crop_offset)
    crop_height, crop_width, _ = tf.unstack(crop_size)
    crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width])
    cropped_image = tf.image.decode_and_crop_jpeg(
        image_bytes, crop_window, channels=3)
    return cropped_image


def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
  """Resizes boxes to output size with scale and offset.

  Args:
    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
  """
  with tf.name_scope('resize_and_crop_boxes'):
    # Adjusts box coordinates based on image_scale and offset.
    boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    # Clips the boxes.
    boxes = box_ops.clip_boxes(boxes, output_size)
    return boxes
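

# Usage sketch (not from the original module; values are illustrative): boxes
# follow the same scale-then-offset transform `resize_and_crop_image` applies
# to pixels, so both are typically driven from one `image_info`.
def _resize_and_crop_boxes_example():
  boxes = tf.constant([[10., 10., 20., 20.]])
  out = resize_and_crop_boxes(
      boxes, tf.constant([2.0, 2.0]), [64, 64], tf.constant([5.0, 5.0]))
  # Each coordinate is scaled by 2 and shifted by -5: [[15., 15., 35., 35.]].
  return out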


def resize_and_crop_masks(masks, image_scale, output_size, offset):
  """Resizes masks to output size with scale and offset.

  Args:
    masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      masks.

  Returns:
    masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
  """
  with tf.name_scope('resize_and_crop_masks'):
    mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32)
    # Pad masks to avoid empty mask annotations.
    masks = tf.concat(
        [tf.zeros([1, mask_size[0], mask_size[1], 1]), masks], axis=0)

    scaled_size = tf.cast(image_scale * mask_size, tf.int32)
    scaled_masks = tf.image.resize(
        masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    offset = tf.cast(offset, tf.int32)
    scaled_masks = scaled_masks[
        :,
        offset[0]:offset[0] + output_size[0],
        offset[1]:offset[1] + output_size[1],
        :]

    output_masks = tf.image.pad_to_bounding_box(
        scaled_masks, 0, 0, output_size[0], output_size[1])
    # Remove padding.
    output_masks = output_masks[1::]
    return output_masks


def horizontal_flip_image(image):
  """Flips image horizontally."""
  return tf.image.flip_left_right(image)


def horizontal_flip_boxes(normalized_boxes):
  """Flips normalized boxes horizontally."""
  ymin, xmin, ymax, xmax = tf.split(
      value=normalized_boxes, num_or_size_splits=4, axis=1)
  flipped_xmin = tf.subtract(1.0, xmax)
  flipped_xmax = tf.subtract(1.0, xmin)
  flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
  return flipped_boxes


def horizontal_flip_masks(masks):
  """Flips masks horizontally."""
  return masks[:, :, ::-1]


def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
  """Randomly flips input image and bounding boxes."""
  with tf.name_scope('random_horizontal_flip'):
    do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)

    image = tf.cond(
        do_flip,
        lambda: horizontal_flip_image(image),
        lambda: image)

    if normalized_boxes is not None:
      normalized_boxes = tf.cond(
          do_flip,
          lambda: horizontal_flip_boxes(normalized_boxes),
          lambda: normalized_boxes)

    if masks is not None:
      masks = tf.cond(
          do_flip,
          lambda: horizontal_flip_masks(masks),
          lambda: masks)

    return image, normalized_boxes, masks
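

# Usage sketch (not from the original module; values are illustrative): the
# flip decision is sampled once and applied consistently to image, boxes and
# masks. Boxes must be normalized, since x-coordinates are reflected as 1 - x.
def _random_horizontal_flip_example():
  image = tf.random.uniform([8, 8, 3])
  boxes = tf.constant([[0.0, 0.0, 0.5, 0.25]])  # ymin, xmin, ymax, xmax
  image, boxes, _ = random_horizontal_flip(image, boxes, seed=1)
  # If flipped, the box becomes [[0.0, 0.75, 0.5, 1.0]].
  return image, boxes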


def color_jitter(image: tf.Tensor,
                 brightness: Optional[float] = 0.,
                 contrast: Optional[float] = 0.,
                 saturation: Optional[float] = 0.,
                 seed: Optional[int] = None) -> tf.Tensor:
  """Applies color jitter to an image, similarly to torchvision's ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    brightness (float, optional): Magnitude for brightness jitter. Defaults to
      0.
    contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
    saturation (float, optional): Magnitude for saturation jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  image = tf.cast(image, dtype=tf.uint8)
  image = random_brightness(image, brightness, seed=seed)
  image = random_contrast(image, contrast, seed=seed)
  image = random_saturation(image, saturation, seed=seed)
  return image


def random_brightness(image: tf.Tensor,
                      brightness: float = 0.,
                      seed: Optional[int] = None) -> tf.Tensor:
  """Jitters brightness of an image.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    brightness (float, optional): Magnitude for brightness jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert brightness >= 0, '`brightness` must be non-negative'
  brightness = tf.random.uniform([],
                                 max(0, 1 - brightness),
                                 1 + brightness,
                                 seed=seed,
                                 dtype=tf.float32)
  return augment.brightness(image, brightness)


def random_contrast(image: tf.Tensor,
                    contrast: float = 0.,
                    seed: Optional[int] = None) -> tf.Tensor:
  """Jitters contrast of an image, similarly to torchvision's ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert contrast >= 0, '`contrast` must be non-negative'
  contrast = tf.random.uniform([],
                               max(0, 1 - contrast),
                               1 + contrast,
                               seed=seed,
                               dtype=tf.float32)
  return augment.contrast(image, contrast)


def random_saturation(image: tf.Tensor,
                      saturation: float = 0.,
                      seed: Optional[int] = None) -> tf.Tensor:
  """Jitters saturation of an image, similarly to torchvision's ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    saturation (float, optional): Magnitude for saturation jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert saturation >= 0, '`saturation` must be non-negative'
  saturation = tf.random.uniform([],
                                 max(0, 1 - saturation),
                                 1 + saturation,
                                 seed=seed,
                                 dtype=tf.float32)
  return _saturation(image, saturation)


def _saturation(image: tf.Tensor,
                saturation: Optional[float] = 0.) -> tf.Tensor:
  return augment.blend(
      tf.repeat(tf.image.rgb_to_grayscale(image), 3, axis=-1), image,
      saturation)
def
random_crop_image_with_boxes_and_labels
(
img
,
boxes
,
labels
,
min_scale
,
aspect_ratio_range
,
min_overlap_params
,
max_retry
):
"""Crops a random slice from the input image.
  The function correspondingly recomputes the bounding boxes and filters out
  boxes whose centers fall outside the cropped region, along with their labels.

  References:
  [1] End-to-End Object Detection with Transformers
  https://arxiv.org/abs/2005.12872

  The preprocessing steps:
  1. Sample a minimum IoU overlap.
  2. For each trial, sample the new image width, height, and top-left corner.
  3. Compute the IoUs of bounding boxes with the cropped image and retry if
     the maximum IoU is below the sampled threshold.
  4. Find boxes whose centers are in the cropped image.
  5. Compute new bounding boxes in the cropped region and only select those
     boxes' labels.

  Args:
    img: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax).
    labels: a 'Tensor' of shape [N,] representing the class labels of the
      boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and
      upper bound of the random aspect ratio.
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it
      is exhausted, no cropping will be performed.

  Returns:
    img: a Tensor representing the random cropped image. Can be the original
      image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image.
    labels: a Tensor representing the new bounding boxes' labels.
  """
  shape = tf.shape(img)
  original_h = shape[0]
  original_w = shape[1]

  minval, maxval, step, offset = min_overlap_params

  min_overlap = tf.math.floordiv(
      tf.random.uniform([], minval=minval, maxval=maxval),
      step) * step - offset

  min_overlap = tf.clip_by_value(min_overlap, 0.0, 1.1)

  if min_overlap > 1.0:
    return img, boxes, labels

  aspect_ratio_low = aspect_ratio_range[0]
  aspect_ratio_high = aspect_ratio_range[1]

  for _ in tf.range(max_retry):
    scale_h = tf.random.uniform([], min_scale, 1.0)
    scale_w = tf.random.uniform([], min_scale, 1.0)
    new_h = tf.cast(
        scale_h * tf.cast(original_h, dtype=tf.float32), dtype=tf.int32)
    new_w = tf.cast(
        scale_w * tf.cast(original_w, dtype=tf.float32), dtype=tf.int32)

    # Aspect ratio has to be in the prespecified range.
    aspect_ratio = new_h / new_w
    if aspect_ratio_low > aspect_ratio or aspect_ratio > aspect_ratio_high:
      continue

    left = tf.random.uniform([], 0, original_w - new_w, dtype=tf.int32)
    right = left + new_w
    top = tf.random.uniform([], 0, original_h - new_h, dtype=tf.int32)
    bottom = top + new_h

    normalized_left = tf.cast(
        left, dtype=tf.float32) / tf.cast(original_w, dtype=tf.float32)
    normalized_right = tf.cast(
        right, dtype=tf.float32) / tf.cast(original_w, dtype=tf.float32)
    normalized_top = tf.cast(
        top, dtype=tf.float32) / tf.cast(original_h, dtype=tf.float32)
    normalized_bottom = tf.cast(
        bottom, dtype=tf.float32) / tf.cast(original_h, dtype=tf.float32)

    cropped_box = tf.expand_dims(
        tf.stack([
            normalized_top,
            normalized_left,
            normalized_bottom,
            normalized_right,
        ]),
        axis=0)
    iou = box_ops.bbox_overlap(
        tf.expand_dims(cropped_box, axis=0),
        tf.expand_dims(boxes, axis=0))  # (1, 1, n_ground_truth)
    iou = tf.squeeze(iou, axis=[0, 1])

    # If not a single bounding box has a Jaccard overlap greater than the
    # minimum, try again.
    if tf.reduce_max(iou) < min_overlap:
      continue

    centroids = box_ops.yxyx_to_cycxhw(boxes)
    mask = tf.math.logical_and(
        tf.math.logical_and(centroids[:, 0] > normalized_top,
                            centroids[:, 0] < normalized_bottom),
        tf.math.logical_and(centroids[:, 1] > normalized_left,
                            centroids[:, 1] < normalized_right))

    # If not a single bounding box has its center in the crop, try again.
    if tf.reduce_sum(tf.cast(mask, dtype=tf.int32)) > 0:
      indices = tf.squeeze(tf.where(mask), axis=1)

      filtered_boxes = tf.gather(boxes, indices)

      boxes = tf.clip_by_value(
          (filtered_boxes[..., :] * tf.cast(
              tf.stack([original_h, original_w, original_h, original_w]),
              dtype=tf.float32) -
           tf.cast(tf.stack([top, left, top, left]), dtype=tf.float32)) /
          tf.cast(tf.stack([new_h, new_w, new_h, new_w]), dtype=tf.float32),
          0.0, 1.0)

      img = tf.image.crop_to_bounding_box(img, top, left, bottom - top,
                                          right - left)

      labels = tf.gather(labels, indices)
      break

  return img, boxes, labels
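To make step 5 concrete, here is a small standalone sketch (illustrative only, not part of the module; all numbers are hypothetical) that remaps one absolute box into crop-relative normalized coordinates with the same arithmetic as the loop body above:

import tensorflow as tf

# Hypothetical 100x200 image with a crop at top=20, left=50, new_h=60, new_w=100.
box = tf.constant([[0.3, 0.3, 0.7, 0.6]])      # normalized (ymin, xmin, ymax, xmax)
scale = tf.constant([100., 200., 100., 200.])  # [original_h, original_w] twice
shift = tf.constant([20., 50., 20., 50.])      # [top, left] twice
crop = tf.constant([60., 100., 60., 100.])     # [new_h, new_w] twice
# The absolute box (30, 60, 70, 120) maps to roughly (0.167, 0.1, 0.833, 0.7).
print(tf.clip_by_value((box * scale - shift) / crop, 0.0, 1.0).numpy())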
def random_crop(image,
                boxes,
                labels,
                min_scale=0.3,
                aspect_ratio_range=(0.5, 2.0),
                min_overlap_params=(0.0, 1.4, 0.2, 0.1),
                max_retry=50,
                seed=None):
  """Randomly crops the image and boxes, filtering the labels accordingly.

  Args:
    image: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax).
    labels: a 'Tensor' of shape [N,] representing the class labels of the
      boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and
      upper bound of the random aspect ratio.
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it
      is exhausted, no cropping will be performed.
    seed: an optional 'int' random seed; None for a nondeterministic seed.

  Returns:
    image: a Tensor representing the random cropped image. Can be the original
      image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image.
    labels: a Tensor representing the new bounding boxes' labels.
  """
  with tf.name_scope('random_crop'):
    do_crop = tf.greater(tf.random.uniform([], seed=seed), 0.5)
    if do_crop:
      return random_crop_image_with_boxes_and_labels(image, boxes, labels,
                                                     min_scale,
                                                     aspect_ratio_range,
                                                     min_overlap_params,
                                                     max_retry)
    else:
      return image, boxes, labels
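A minimal usage sketch, assuming this module is importable as official.vision.ops.preprocess_ops and eager execution is enabled; the shapes, boxes, and labels below are made up for illustration:

import numpy as np
import tensorflow as tf

from official.vision.ops import preprocess_ops

image = tf.convert_to_tensor(np.random.rand(480, 640, 3), tf.float32)
boxes = tf.constant([[0.1, 0.1, 0.5, 0.5], [0.4, 0.2, 0.9, 0.8]], tf.float32)
labels = tf.constant([1, 7], tf.int64)
# With probability 0.5 the inputs come back unchanged; otherwise a crop is
# returned with remapped boxes and only the labels of the surviving boxes.
image, boxes, labels = preprocess_ops.random_crop(image, boxes, labels, seed=42)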
official/vision/ops/preprocess_ops_3d.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utils for processing video dataset features."""
from typing import Optional, Tuple

import tensorflow as tf


def _sample_or_pad_sequence_indices(sequence: tf.Tensor, num_steps: int,
                                    stride: int,
                                    offset: tf.Tensor) -> tf.Tensor:
  """Returns indices to take for sampling or padding sequences to fixed size."""
  sequence_length = tf.shape(sequence)[0]
  sel_idx = tf.range(sequence_length)

  # Repeats the sequence until num_steps are available in total.
  max_length = num_steps * stride + offset
  num_repeats = tf.math.floordiv(max_length + sequence_length - 1,
                                 sequence_length)
  sel_idx = tf.tile(sel_idx, [num_repeats])

  steps = tf.range(offset, offset + num_steps * stride, stride)
  return tf.gather(sel_idx, steps)
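The helper's behavior is easiest to see on a toy input; the values below are worked out by hand from the definitions above (eager mode assumed, illustrative only):

import tensorflow as tf

seq = tf.range(4)  # a 4-step stand-in for a frame sequence
idx = _sample_or_pad_sequence_indices(
    sequence=seq, num_steps=6, stride=2, offset=tf.constant(1))
# max_length = 6 * 2 + 1 = 13, so [0, 1, 2, 3] is tiled 4 times and every
# second index starting at 1 is taken: the sampling wraps around.
print(idx.numpy())  # [1 3 1 3 1 3]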
def sample_linspace_sequence(sequence: tf.Tensor, num_windows: int,
                             num_steps: int, stride: int) -> tf.Tensor:
  """Samples `num_windows` segments from sequence with linearly spaced offsets.

  The samples are concatenated in a single `tf.Tensor` in order to have the
  same format structure per timestep (e.g. a single frame). If `num_steps` *
  `stride` is bigger than the number of timesteps, the sequence is repeated.
  This function can be used in evaluation to extract enough segments to span
  the entire sequence.

  Args:
    sequence: Any tensor where the first dimension is timesteps.
    num_windows: Number of windows retrieved from the sequence.
    num_steps: Number of steps (e.g. frames) to take.
    stride: Distance to sample between timesteps.

  Returns:
    A single `tf.Tensor` with first dimension `num_windows` * `num_steps`. The
    tensor contains the concatenated list of `num_windows` tensors whose
    offsets have been linearly spaced over the input.
  """
  sequence_length = tf.shape(sequence)[0]
  max_offset = tf.maximum(0, sequence_length - num_steps * stride)
  offsets = tf.linspace(0.0, tf.cast(max_offset, tf.float32), num_windows)
  offsets = tf.cast(offsets, tf.int32)

  all_indices = []
  for i in range(num_windows):
    all_indices.append(
        _sample_or_pad_sequence_indices(
            sequence=sequence,
            num_steps=num_steps,
            stride=stride,
            offset=offsets[i]))

  indices = tf.concat(all_indices, axis=0)
  indices.set_shape((num_windows * num_steps,))
  return tf.gather(sequence, indices)
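Worked example (eager mode assumed): 7 windows of 10 steps over a 100-step sequence gives max_offset = 90, so the offsets are linspace(0, 90, 7) = [0, 15, 30, 45, 60, 75, 90] and 70 frames come back concatenated:

import tensorflow as tf

out = sample_linspace_sequence(
    tf.range(100), num_windows=7, num_steps=10, stride=1)
print(out.shape)         # (70,)
print(out.numpy()[:12])  # [0 1 2 3 4 5 6 7 8 9 15 16]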
def sample_sequence(sequence: tf.Tensor,
                    num_steps: int,
                    random: bool,
                    stride: int,
                    seed: Optional[int] = None) -> tf.Tensor:
  """Samples a single segment of size `num_steps` from a given sequence.

  If `random` is not `True`, this function simply samples the central window
  of the sequence. Otherwise, a random offset is chosen so that the desired
  `num_steps` can be extracted from the sequence.

  Args:
    sequence: Any tensor where the first dimension is timesteps.
    num_steps: Number of steps (e.g. frames) to take.
    random: A boolean indicating whether to randomly sample the single window.
      If `True`, the offset is randomized. If `False`, the middle frame minus
      half of `num_steps` is the first frame.
    stride: Distance to sample between timesteps.
    seed: A deterministic seed to use when sampling.

  Returns:
    A single `tf.Tensor` with first dimension `num_steps` with the sampled
    segment.
  """
  sequence_length = tf.shape(sequence)[0]

  if random:
    sequence_length = tf.cast(sequence_length, tf.float32)
    frame_stride = tf.cast(stride, tf.float32)
    max_offset = tf.cond(
        sequence_length > (num_steps - 1) * frame_stride,
        lambda: sequence_length - (num_steps - 1) * frame_stride,
        lambda: sequence_length)
    offset = tf.random.uniform(
        (),
        maxval=tf.cast(max_offset, dtype=tf.int32),
        dtype=tf.int32,
        seed=seed)
  else:
    offset = (sequence_length - num_steps * stride) // 2
    offset = tf.maximum(0, offset)

  indices = _sample_or_pad_sequence_indices(
      sequence=sequence, num_steps=num_steps, stride=stride, offset=offset)
  indices.set_shape((num_steps,))

  return tf.gather(sequence, indices)
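A short sketch of the two modes, using the function defined above (eager mode assumed): the central window is deterministic, while the random window only fixes its length.

import tensorflow as tf

frames = tf.range(100)  # stand-in for a [T, H, W, C] video tensor
center = sample_sequence(frames, num_steps=10, random=False, stride=1)
print(center.numpy())   # [45 ... 54], i.e. offset (100 - 10 * 1) // 2 = 45
clip = sample_sequence(frames, num_steps=10, random=True, stride=1, seed=0)
print(clip.shape)       # (10,), a random contiguous window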
def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
  """Decodes a string of raw JPEG bytes into an RGB uint8 Tensor.

  Args:
    image_string: A `tf.Tensor` of type strings with the raw JPEG bytes where
      the first dimension is timesteps.
    channels: Number of channels of the JPEG image. Allowed values are 0, 1
      and 3. If 0, the number of channels will be calculated at runtime and no
      static shape is set.

  Returns:
    A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
  """
  return tf.map_fn(
      lambda x: tf.image.decode_jpeg(x, channels=channels),
      image_string,
      back_prop=False,
      dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
               target_height: int,
               target_width: int,
               random: bool = False,
               num_crops: int = 1,
               seed: Optional[int] = None) -> tf.Tensor:
  """Crops the images in the given sequence of images.

  If the requested size is bigger than the image size, the image is padded
  with zeros. When not random cropping, a central crop is performed if
  num_crops is 1.

  Args:
    frames: A Tensor of dimension [timesteps, in_height, in_width, channels].
    target_height: Target cropped image height.
    target_width: Target cropped image width.
    random: A boolean indicating if crop should be randomized.
    num_crops: Number of crops (support 1 for central crop and 3 for 3-crop).
    seed: A deterministic seed to use when random cropping.

  Returns:
    A Tensor of shape [timesteps, out_height, out_width, channels] of type
    uint8 with the cropped images.
  """
  if random:
    # Random spatial crop.
    shape = tf.shape(frames)
    # If a static_shape is available (e.g. when using this method from the
    # add_image method), it will be used to produce an output tensor with
    # static shape.
    static_shape = frames.shape.as_list()
    seq_len = shape[0] if static_shape[0] is None else static_shape[0]
    channels = shape[3] if static_shape[3] is None else static_shape[3]
    frames = tf.image.random_crop(
        frames, (seq_len, target_height, target_width, channels), seed)
  else:
    if num_crops == 1:
      # Central crop or pad.
      frames = tf.image.resize_with_crop_or_pad(frames, target_height,
                                                target_width)
    elif num_crops == 3:
      # Three-crop evaluation.
      shape = tf.shape(frames)
      static_shape = frames.shape.as_list()
      seq_len = shape[0] if static_shape[0] is None else static_shape[0]
      height = shape[1] if static_shape[1] is None else static_shape[1]
      width = shape[2] if static_shape[2] is None else static_shape[2]
      channels = shape[3] if static_shape[3] is None else static_shape[3]

      size = tf.convert_to_tensor(
          (seq_len, target_height, target_width, channels))

      offset_1 = tf.broadcast_to([0, 0, 0, 0], [4])
      # pylint:disable=g-long-lambda
      offset_2 = tf.cond(
          tf.greater_equal(height, width),
          true_fn=lambda: tf.broadcast_to(
              [0, tf.cast(height, tf.float32) / 2 - target_height // 2, 0, 0],
              [4]),
          false_fn=lambda: tf.broadcast_to(
              [0, 0, tf.cast(width, tf.float32) / 2 - target_width // 2, 0],
              [4]))
      offset_3 = tf.cond(
          tf.greater_equal(height, width),
          true_fn=lambda: tf.broadcast_to(
              [0, tf.cast(height, tf.float32) - target_height, 0, 0], [4]),
          false_fn=lambda: tf.broadcast_to(
              [0, 0, tf.cast(width, tf.float32) - target_width, 0], [4]))
      # pylint:enable=g-long-lambda

      crops = []
      for offset in [offset_1, offset_2, offset_3]:
        offset = tf.cast(tf.math.round(offset), tf.int32)
        crops.append(tf.slice(frames, offset, size))
      frames = tf.concat(crops, axis=0)

    else:
      raise NotImplementedError(
          f"Only 1-crop and 3-crop are supported. Found {num_crops!r}.")

  return frames
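Usage sketch for the two non-random modes (illustrative shapes): the 3-crop stacks its crops along the time axis, so the frame count triples.

import tensorflow as tf

video = tf.zeros((8, 180, 240, 3), tf.uint8)
center = crop_image(video, 160, 160)              # -> (8, 160, 160, 3)
three = crop_image(video, 160, 160, num_crops=3)  # -> (24, 160, 160, 3)
# Downstream code must treat each group of 8 frames in `three` as one crop
# (left/top, center, right/bottom).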
def resize_smallest(frames: tf.Tensor, min_resize: int) -> tf.Tensor:
  """Resizes frames so that min(`height`, `width`) is equal to `min_resize`.

  This function will not do anything if min(`height`, `width`) is already
  equal to `min_resize`, which saves compute time.

  Args:
    frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
    min_resize: Minimum size of the final image dimensions.

  Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels] of type
    frames.dtype where min(output_h, output_w) = min_resize.
  """
  shape = tf.shape(frames)
  input_h = shape[1]
  input_w = shape[2]

  output_h = tf.maximum(min_resize, (input_h * min_resize) // input_w)
  output_w = tf.maximum(min_resize, (input_w * min_resize) // input_h)

  def resize_fn():
    frames_resized = tf.image.resize(frames, (output_h, output_w))
    return tf.cast(frames_resized, frames.dtype)

  should_resize = tf.math.logical_or(
      tf.not_equal(input_w, output_w), tf.not_equal(input_h, output_h))
  frames = tf.cond(should_resize, resize_fn, lambda: frames)

  return frames
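Worked example: for 90x120 frames and min_resize=45, output_h = max(45, 90 * 45 // 120) = 45 and output_w = max(45, 120 * 45 // 90) = 60, so the aspect ratio is preserved.

import tensorflow as tf

video = tf.zeros((6, 90, 120, 3), tf.uint8)
print(resize_smallest(video, 45).shape)  # (6, 45, 60, 3)
print(resize_smallest(video, 90).shape)  # (6, 90, 120, 3): already minimal, no-op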
def random_crop_resize(frames: tf.Tensor, output_h: int, output_w: int,
                       num_frames: int, num_channels: int,
                       aspect_ratio: Tuple[float, float],
                       area_range: Tuple[float, float]) -> tf.Tensor:
  """First crops clip with jittering and then resizes to (output_h, output_w).

  Args:
    frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
    output_h: Resized image height.
    output_w: Resized image width.
    num_frames: Number of input frames per clip.
    num_channels: Number of channels of the clip.
    aspect_ratio: Float tuple with the aspect range for cropping.
    area_range: Float tuple with the area range for cropping.

  Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels] of type
    frames.dtype.
  """
  shape = tf.shape(frames)
  seq_len, _, _, channels = shape[0], shape[1], shape[2], shape[3]
  bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
  factor = output_w / output_h
  aspect_ratio = (aspect_ratio[0] * factor, aspect_ratio[1] * factor)
  sample_distorted_bbox = tf.image.sample_distorted_bounding_box(
      shape[1:],
      bounding_boxes=bbox,
      min_object_covered=0.1,
      aspect_ratio_range=aspect_ratio,
      area_range=area_range,
      max_attempts=100,
      use_image_if_no_bounding_boxes=True)
  bbox_begin, bbox_size, _ = sample_distorted_bbox
  offset_y, offset_x, _ = tf.unstack(bbox_begin)
  target_height, target_width, _ = tf.unstack(bbox_size)
  size = tf.convert_to_tensor((seq_len, target_height, target_width, channels))
  offset = tf.convert_to_tensor((0, offset_y, offset_x, 0))
  frames = tf.slice(frames, offset, size)
  frames = tf.cast(tf.image.resize(frames, (output_h, output_w)), frames.dtype)
  frames.set_shape((num_frames, output_h, output_w, num_channels))
  return frames
def random_flip_left_right(frames: tf.Tensor,
                           seed: Optional[int] = None) -> tf.Tensor:
  """Flips all the frames with a probability of 50%.

  Args:
    frames: A Tensor of shape [timesteps, input_h, input_w, channels].
    seed: A seed to use for the random sampling.

  Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels], possibly
    flipped left to right.
  """
  is_flipped = tf.random.uniform(
      (), minval=0, maxval=2, dtype=tf.int32, seed=seed)

  frames = tf.cond(
      tf.equal(is_flipped, 1),
      true_fn=lambda: tf.image.flip_left_right(frames),
      false_fn=lambda: frames)
  return frames
def normalize_image(frames: tf.Tensor,
                    zero_centering_image: bool,
                    dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
  """Normalizes images.

  Args:
    frames: A Tensor of numbers.
    zero_centering_image: If True, results are in [-1, 1]; if False, results
      are in [0, 1].
    dtype: Type of output Tensor.

  Returns:
    A Tensor of same shape as the input and of the given type.
  """
  frames = tf.cast(frames, dtype)
  if zero_centering_image:
    return frames * (2.0 / 255.0) - 1.0
  else:
    return frames / 255.0
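A quick numeric check of the two output ranges (the values below are exact):

import tensorflow as tf

frames = tf.constant([0.0, 127.5, 255.0])
print(normalize_image(frames, zero_centering_image=False).numpy())  # [0.  0.5 1. ]
print(normalize_image(frames, zero_centering_image=True).numpy())   # [-1.  0.  1.]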
official/vision/ops/preprocess_ops_3d_test.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
import io
import itertools

import numpy as np
from PIL import Image
import tensorflow as tf

from official.vision.ops import preprocess_ops_3d


class ParserUtilsTest(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    # [[0, 1, ..., 119], [1, 2, ..., 120], ..., [89, 90, ..., 208]].
    self._frames = tf.stack([tf.range(i, i + 120) for i in range(90)])
    self._frames = tf.cast(self._frames, tf.uint8)
    self._frames = self._frames[tf.newaxis, :, :, tf.newaxis]
    self._frames = tf.broadcast_to(self._frames, (6, 90, 120, 3))

    # Create an equivalent numpy array for assertions.
    self._np_frames = np.array([range(i, i + 120) for i in range(90)])
    self._np_frames = self._np_frames[np.newaxis, :, :, np.newaxis]
    self._np_frames = np.broadcast_to(self._np_frames, (6, 90, 120, 3))

  def test_sample_linspace_sequence(self):
    sequence = tf.range(100)
    sampled_seq_1 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 10, 10, 1)
    sampled_seq_2 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 7, 10, 1)
    sampled_seq_3 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 7, 5, 2)
    sampled_seq_4 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 101, 1, 1)
    self.assertAllEqual(sampled_seq_1, range(100))
    # [0, 1, 2, 3, 4, ..., 8, 9, 15, 16, ..., 97, 98, 99]
    self.assertAllEqual(
        sampled_seq_2,
        [15 * i + j for i, j in itertools.product(range(7), range(10))])
    # [0, 2, 4, 6, 8, 15, 17, 19, ..., 96, 98]
    self.assertAllEqual(
        sampled_seq_3,
        [15 * i + 2 * j for i, j in itertools.product(range(7), range(5))])
    self.assertAllEqual(sampled_seq_4, [0] + list(range(100)))

  def test_sample_sequence(self):
    sequence = tf.range(100)
    sampled_seq_1 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 1)
    sampled_seq_2 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 2)
    sampled_seq_3 = preprocess_ops_3d.sample_sequence(sequence, 10, True, 1)

    self.assertAllEqual(sampled_seq_1, range(45, 55))
    self.assertAllEqual(sampled_seq_2, range(40, 60, 2))

    offset_3 = sampled_seq_3[0]
    self.assertBetween(offset_3, 0, 99)
    self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))

  def test_decode_jpeg(self):
    # Create a random RGB JPEG image.
    random_image = np.random.randint(
        0, 256, size=(263, 320, 3), dtype=np.uint8)
    random_image = Image.fromarray(random_image)
    with io.BytesIO() as buffer:
      random_image.save(buffer, format='JPEG')
      raw_image_bytes = buffer.getvalue()

    raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
    decoded_image = preprocess_ops_3d.decode_jpeg(raw_image, 3)

    self.assertEqual(decoded_image.shape.as_list()[3], 3)
    self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))

  def test_crop_image(self):
    cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
    cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
    cropped_image_3 = preprocess_ops_3d.crop_image(self._frames, 50, 70, True)
    cropped_image_4 = preprocess_ops_3d.crop_image(
        self._frames, 90, 90, False, 3)

    self.assertAllEqual(cropped_image_1.shape, (6, 50, 70, 3))
    self.assertAllEqual(cropped_image_1, self._np_frames[:, 20:70, 25:95, :])

    self.assertAllEqual(cropped_image_2.shape, (6, 200, 200, 3))
    expected = np.pad(
        self._np_frames, ((0, 0), (55, 55), (40, 40), (0, 0)), 'constant')
    self.assertAllEqual(cropped_image_2, expected)

    self.assertAllEqual(cropped_image_3.shape, (6, 50, 70, 3))
    offset = cropped_image_3[0, 0, 0, 0]
    expected = np.array(
        [range(i, i + 70) for i in range(offset, offset + 50)])
    expected = expected[np.newaxis, :, :, np.newaxis]
    expected = np.broadcast_to(expected, (6, 50, 70, 3))
    self.assertAllEqual(cropped_image_3, expected)

    self.assertAllEqual(cropped_image_4.shape, (18, 90, 90, 3))

  def test_resize_smallest(self):
    resized_frames_1 = preprocess_ops_3d.resize_smallest(self._frames, 180)
    resized_frames_2 = preprocess_ops_3d.resize_smallest(self._frames, 45)
    resized_frames_3 = preprocess_ops_3d.resize_smallest(self._frames, 90)
    resized_frames_4 = preprocess_ops_3d.resize_smallest(
        tf.transpose(self._frames, (0, 2, 1, 3)), 45)

    self.assertAllEqual(resized_frames_1.shape, (6, 180, 240, 3))
    self.assertAllEqual(resized_frames_2.shape, (6, 45, 60, 3))
    self.assertAllEqual(resized_frames_3.shape, (6, 90, 120, 3))
    self.assertAllEqual(resized_frames_4.shape, (6, 60, 45, 3))

  def test_random_crop_resize(self):
    resized_frames_1 = preprocess_ops_3d.random_crop_resize(
        self._frames, 256, 256, 6, 3, (0.5, 2), (0.3, 1))
    resized_frames_2 = preprocess_ops_3d.random_crop_resize(
        self._frames, 224, 224, 6, 3, (0.5, 2), (0.3, 1))
    resized_frames_3 = preprocess_ops_3d.random_crop_resize(
        self._frames, 256, 256, 6, 3, (0.8, 1.2), (0.3, 1))
    resized_frames_4 = preprocess_ops_3d.random_crop_resize(
        self._frames, 256, 256, 6, 3, (0.5, 2), (0.1, 1))

    self.assertAllEqual(resized_frames_1.shape, (6, 256, 256, 3))
    self.assertAllEqual(resized_frames_2.shape, (6, 224, 224, 3))
    self.assertAllEqual(resized_frames_3.shape, (6, 256, 256, 3))
    self.assertAllEqual(resized_frames_4.shape, (6, 256, 256, 3))

  def test_random_flip_left_right(self):
    flipped_frames = preprocess_ops_3d.random_flip_left_right(self._frames)

    flipped = np.fliplr(self._np_frames[0, :, :, 0])
    flipped = flipped[np.newaxis, :, :, np.newaxis]
    flipped = np.broadcast_to(flipped, (6, 90, 120, 3))
    self.assertTrue((flipped_frames == self._np_frames).numpy().all() or
                    (flipped_frames == flipped).numpy().all())

  def test_normalize_image(self):
    normalized_images_1 = preprocess_ops_3d.normalize_image(
        self._frames, False, tf.float32)
    normalized_images_2 = preprocess_ops_3d.normalize_image(
        self._frames, True, tf.float32)

    self.assertAllClose(normalized_images_1, self._np_frames / 255)
    self.assertAllClose(normalized_images_2, self._np_frames * 2 / 255 - 1.0)


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/preprocess_ops_test.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for preprocess_ops.py."""
import io

# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf

from official.vision.ops import preprocess_ops


def _encode_image(image_array, fmt):
  image = Image.fromarray(image_array)
  with io.BytesIO() as output:
    image.save(output, format=fmt)
    return output.getvalue()


class InputUtilsTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      ([1], 10),
      ([1, 2], 10),
      ([1, 2, 3], 10),
      ([11], 10),
      ([12, 2], 10),
      ([13, 2, 3], 10),
  )
  def test_pad_to_fixed_size(self, input_shape, output_size):
    # Copies input shape to padding shape.
    clip_shape = input_shape[:]
    clip_shape[0] = min(output_size, clip_shape[0])
    padding_shape = input_shape[:]
    padding_shape[0] = max(output_size - input_shape[0], 0)
    expected_outputs = np.concatenate(
        [np.ones(clip_shape), np.zeros(padding_shape)], axis=0)

    data = tf.ones(input_shape)
    output_data = preprocess_ops.clip_or_pad_to_fixed_size(
        data, output_size, constant_values=0)
    output_data = output_data.numpy()
    self.assertAllClose(output_size, output_data.shape[0])
    self.assertAllClose(expected_outputs, output_data)

  @parameterized.parameters(
      (100, 200, 100, 200, 32, 1.0, 1.0, 128, 224),
      (100, 256, 128, 256, 32, 1.0, 1.0, 128, 256),
      (200, 512, 200, 128, 32, 0.25, 0.25, 224, 128),
  )
  def test_resize_and_crop_image_rectangluar_case(
      self, input_height, input_width, desired_height, desired_width, stride,
      scale_y, scale_x, output_height, output_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))

    desired_size = (desired_height, desired_width)
    resized_image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        desired_size=desired_size,
        padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
    resized_image_shape = tf.shape(resized_image)

    self.assertAllEqual([output_height, output_width, 3],
                        resized_image_shape.numpy())
    self.assertNDArrayNear(
        [[input_height, input_width], [desired_height, desired_width],
         [scale_y, scale_x], [0.0, 0.0]],
        image_info.numpy(), 1e-5)

  @parameterized.parameters(
      (100, 200, 220, 220, 32, 1.1, 1.1, 224, 224),
      (512, 512, 1024, 1024, 32, 2.0, 2.0, 1024, 1024),
  )
  def test_resize_and_crop_image_square_case(
      self, input_height, input_width, desired_height, desired_width, stride,
      scale_y, scale_x, output_height, output_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))

    desired_size = (desired_height, desired_width)
    resized_image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        desired_size=desired_size,
        padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
    resized_image_shape = tf.shape(resized_image)

    self.assertAllEqual([output_height, output_width, 3],
                        resized_image_shape.numpy())
    self.assertNDArrayNear(
        [[input_height, input_width], [desired_height, desired_width],
         [scale_y, scale_x], [0.0, 0.0]],
        image_info.numpy(), 1e-5)

  @parameterized.parameters(
      (100, 200, 100, 300, 32, 1.0, 1.0, 100, 200, 128, 320),
      (200, 100, 100, 300, 32, 1.0, 1.0, 200, 100, 320, 128),
      (100, 200, 80, 100, 32, 0.5, 0.5, 50, 100, 96, 128),
      (200, 100, 80, 100, 32, 0.5, 0.5, 100, 50, 128, 96),
  )
  def test_resize_and_crop_image_v2(
      self, input_height, input_width, short_side, long_side, stride, scale_y,
      scale_x, desired_height, desired_width, output_height, output_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    image_shape = tf.shape(image)[0:2]

    desired_size = tf.where(
        tf.greater(image_shape[0], image_shape[1]),
        tf.constant([long_side, short_side], dtype=tf.int32),
        tf.constant([short_side, long_side], dtype=tf.int32))
    resized_image, image_info = preprocess_ops.resize_and_crop_image_v2(
        image,
        short_side=short_side,
        long_side=long_side,
        padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
    resized_image_shape = tf.shape(resized_image)

    self.assertAllEqual([output_height, output_width, 3],
                        resized_image_shape.numpy())
    self.assertNDArrayNear(
        [[input_height, input_width], [desired_height, desired_width],
         [scale_y, scale_x], [0.0, 0.0]],
        image_info.numpy(), 1e-5)

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_center_crop_image(self, input_height, input_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    cropped_image = preprocess_ops.center_crop_image(image)
    cropped_image_shape = tf.shape(cropped_image)
    self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_center_crop_image_v2(self, input_height, input_width):
    image_bytes = tf.constant(
        _encode_image(
            np.uint8(np.random.rand(input_height, input_width, 3) * 255),
            fmt='JPEG'),
        dtype=tf.string)
    cropped_image = preprocess_ops.center_crop_image_v2(
        image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
    cropped_image_shape = tf.shape(cropped_image)
    self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_random_crop_image(self, input_height, input_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    _ = preprocess_ops.random_crop_image(image)

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_random_crop_image_v2(self, input_height, input_width):
    image_bytes = tf.constant(
        _encode_image(
            np.uint8(np.random.rand(input_height, input_width, 3) * 255),
            fmt='JPEG'),
        dtype=tf.string)
    _ = preprocess_ops.random_crop_image_v2(
        image_bytes, tf.constant([input_height, input_width, 3], tf.int32))

  @parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1.4))
  def testColorJitter(self, input_height, input_width, color_jitter):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    jittered_image = preprocess_ops.color_jitter(image, color_jitter,
                                                 color_jitter, color_jitter)
    assert jittered_image.shape == image.shape

  @parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1))
  def testSaturation(self, input_height, input_width, saturation):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    jittered_image = preprocess_ops._saturation(image, saturation)
    assert jittered_image.shape == image.shape

  @parameterized.parameters((640, 640, 20), (1280, 1280, 30))
  def test_random_crop(self, input_height, input_width, num_boxes):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    boxes_height = np.random.randint(0, input_height, size=(num_boxes, 1))
    top = np.random.randint(0, high=(input_height - boxes_height))
    down = top + boxes_height
    boxes_width = np.random.randint(0, input_width, size=(num_boxes, 1))
    left = np.random.randint(0, high=(input_width - boxes_width))
    right = left + boxes_width
    boxes = tf.constant(
        np.concatenate([top, left, down, right], axis=-1), tf.float32)
    labels = tf.constant(
        np.random.randint(low=0, high=num_boxes, size=(num_boxes,)), tf.int64)
    _ = preprocess_ops.random_crop(image, boxes, labels)

  @parameterized.parameters(
      ((640, 640, 3), (1000, 1000), None, (1000, 1000, 3)),
      ((1280, 640, 3), 320, None, (640, 320, 3)),
      ((640, 1280, 3), 320, None, (320, 640, 3)),
      ((640, 640, 3), 320, 100, (100, 100, 3)))
  def test_resize_image(self, input_shape, size, max_size, expected_shape):
    resized_img, image_info = preprocess_ops.resize_image(
        tf.zeros((input_shape)), size, max_size)
    self.assertAllEqual(tf.shape(resized_img), expected_shape)
    self.assertAllEqual(image_info[0], input_shape[:-1])
    self.assertAllEqual(image_info[1], expected_shape[:-1])
    self.assertAllEqual(
        image_info[2],
        np.array(expected_shape[:-1]) / np.array(input_shape[:-1]))
    self.assertAllEqual(image_info[3], [0, 0])


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/sampling_ops.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range
[0,1]. The class presumes there are many more negatives than positive examples:
if the desired batch_size cannot be achieved with the pre-specified positive
fraction, it fills the rest with negative examples. If this is not sufficient
for obtaining the desired batch_size, it returns fewer examples.
The main function to call is Subsample(self, indicator, labels). For convenience
one can also call SubsampleWeights(self, weights, labels) which is defined in
the minibatch_sampler base class.
When is_static is True, it implements a method that guarantees static shapes.
It also ensures that the output of the subsample always has length batch_size,
even when the number of examples set to True in indicator is less than
batch_size.
This is originally implemented in TensorFlow Object Detection API.
"""
# Import libraries
import tensorflow as tf
def combined_static_and_dynamic_shape(tensor):
  """Returns a list containing static and dynamic values for the dimensions.

  Returns a list of static and dynamic values for shape dimensions. This is
  useful to preserve static shapes when available in reshape operation.

  Args:
    tensor: A tensor of any type.

  Returns:
    A list of size tensor.shape.ndims containing integers or a scalar tensor.
  """
  static_tensor_shape = tensor.shape.as_list()
  dynamic_tensor_shape = tf.shape(input=tensor)
  combined_shape = []
  for index, dim in enumerate(static_tensor_shape):
    if dim is not None:
      combined_shape.append(dim)
    else:
      combined_shape.append(dynamic_tensor_shape[index])
  return combined_shape
def indices_to_dense_vector(indices,
                            size,
                            indices_value=1.,
                            default_value=0,
                            dtype=tf.float32):
  """Creates a dense vector with indices set to a specific value, rest zeros.

  This function exists because it is unclear if it is safe to use
  tf.sparse_to_dense(indices, [size], 1, validate_indices=False)
  with indices which are not ordered.
  This function accepts a dynamic size (e.g. tf.shape(tensor)[0]).

  Args:
    indices: 1d Tensor with integer indices which are to be set to
      indices_value.
    size: scalar with size (integer) of output Tensor.
    indices_value: values of elements specified by indices in the output
      vector.
    default_value: values of other elements in the output vector.
    dtype: data type.

  Returns:
    dense 1D Tensor of shape [size] with indices set to indices_value and the
    rest set to default_value.
  """
  size = tf.cast(size, dtype=tf.int32)
  zeros = tf.ones([size], dtype=dtype) * default_value
  values = tf.ones_like(indices, dtype=dtype) * indices_value

  return tf.dynamic_stitch(
      [tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values])
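Illustrative call (eager mode assumed): scatter 1.0 at positions 4 and 1 of a length-6 vector; the index order does not matter.

import tensorflow as tf

dense = indices_to_dense_vector(tf.constant([4, 1]), size=6)
print(dense.numpy())  # [0. 1. 0. 0. 1. 0.]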
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
  """Matrix multiplication based implementation of tf.gather on zeroth axis.

  TODO(rathodv, jonathanhuang): enable sparse matmul option.

  Args:
    params: A float32 Tensor. The tensor from which to gather values. Must be
      at least rank 1.
    indices: A Tensor. Must be one of the following types: int32, int64. Must
      be in range [0, params.shape[0])
    scope: A name for the operation (optional).

  Returns:
    A Tensor. Has the same type as params. Values from params gathered from
    indices given by indices, with shape indices.shape + params.shape[1:].
  """
  scope = scope or 'MatMulGather'
  with tf.name_scope(scope):
    params_shape = combined_static_and_dynamic_shape(params)
    indices_shape = combined_static_and_dynamic_shape(indices)
    params2d = tf.reshape(params, [params_shape[0], -1])
    indicator_matrix = tf.one_hot(indices, params_shape[0])
    gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
    return tf.reshape(gathered_result_flattened,
                      tf.stack(indices_shape + params_shape[1:]))
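A small sanity sketch: the matmul-based gather agrees with tf.gather on axis 0, including repeated indices (eager mode assumed).

import tensorflow as tf

params = tf.random.uniform([5, 3])
indices = tf.constant([2, 2, 0])
a = matmul_gather_on_zeroth_axis(params, indices)
b = tf.gather(params, indices)
print(tf.reduce_max(tf.abs(a - b)).numpy())  # 0.0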
class BalancedPositiveNegativeSampler:
  """Subsamples minibatches to a desired balance of positives and negatives."""

  def __init__(self, positive_fraction=0.5, is_static=False):
    """Constructs a minibatch sampler.

    Args:
      positive_fraction: desired fraction of positive examples (scalar in
        [0,1]) in the batch.
      is_static: If True, uses an implementation with static shape guarantees.

    Raises:
      ValueError: if positive_fraction < 0, or positive_fraction > 1
    """
    if positive_fraction < 0 or positive_fraction > 1:
      raise ValueError('positive_fraction should be in range [0,1]. '
                       'Received: %s.' % positive_fraction)
    self._positive_fraction = positive_fraction
    self._is_static = is_static
  @staticmethod
  def subsample_indicator(indicator, num_samples):
    """Subsamples an indicator vector.

    Given a boolean indicator vector with M elements set to `True`, the
    function assigns all but `num_samples` of these previously `True` elements
    to `False`. If `num_samples` is greater than M, the original indicator
    vector is returned.

    Args:
      indicator: a 1-dimensional boolean tensor indicating which elements are
        allowed to be sampled and which are not.
      num_samples: int32 scalar tensor.

    Returns:
      A boolean tensor with the same shape as the input (indicator) tensor.
    """
    indices = tf.where(indicator)
    indices = tf.random.shuffle(indices)
    indices = tf.reshape(indices, [-1])

    num_samples = tf.minimum(tf.size(input=indices), num_samples)
    selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))

    selected_indicator = indices_to_dense_vector(
        selected_indices,
        tf.shape(input=indicator)[0])

    return tf.equal(selected_indicator, 1)
  def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size):
    """Counts the number of positive and negative examples to be sampled.

    Args:
      sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains
        the signed indices of the examples where the sign is based on the
        label value. The examples that cannot be sampled are set to 0. It
        samples at most sample_size*positive_fraction positive examples and
        the remaining from negative examples.
      sample_size: Size of subsamples.

    Returns:
      A tuple containing the number of positive and negative labels in the
      subsample.
    """
    input_length = tf.shape(input=sorted_indices_tensor)[0]
    valid_positive_index = tf.greater(sorted_indices_tensor,
                                      tf.zeros(input_length, tf.int32))
    num_sampled_pos = tf.reduce_sum(
        input_tensor=tf.cast(valid_positive_index, tf.int32))
    max_num_positive_samples = tf.constant(
        int(sample_size * self._positive_fraction), tf.int32)
    num_positive_samples = tf.minimum(max_num_positive_samples,
                                      num_sampled_pos)
    num_negative_samples = tf.constant(sample_size,
                                       tf.int32) - num_positive_samples

    return num_positive_samples, num_negative_samples
  def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
                                     num_end_samples, total_num_samples):
    """Slices the first num_start_samples and last num_end_samples.

    Args:
      input_tensor: An int32 tensor of shape [N] to be sliced.
      num_start_samples: Number of examples to be sliced from the beginning
        of the input tensor.
      num_end_samples: Number of examples to be sliced from the end of the
        input tensor.
      total_num_samples: Sum of num_start_samples and num_end_samples. This
        should be a scalar.

    Returns:
      A tensor containing the first num_start_samples and last num_end_samples
      from input_tensor.
    """
    input_length = tf.shape(input=input_tensor)[0]
    start_positions = tf.less(tf.range(input_length), num_start_samples)
    end_positions = tf.greater_equal(
        tf.range(input_length), input_length - num_end_samples)
    selected_positions = tf.logical_or(start_positions, end_positions)
    selected_positions = tf.cast(selected_positions, tf.float32)
    indexed_positions = tf.multiply(tf.cumsum(selected_positions),
                                    selected_positions)
    one_hot_selector = tf.one_hot(
        tf.cast(indexed_positions, tf.int32) - 1,
        total_num_samples,
        dtype=tf.float32)
    return tf.cast(
        tf.tensordot(
            tf.cast(input_tensor, tf.float32), one_hot_selector,
            axes=[0, 0]),
        tf.int32)
  def _static_subsample(self, indicator, batch_size, labels):
    """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be
        sampled. N should be a compile-time constant.
      batch_size: desired batch size. This scalar cannot be None.
      labels: boolean tensor of shape [N] denoting positive(=True) and
        negative (=False) examples. N should be a compile-time constant.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries
        which are sampled. It ensures the length of the output of the
        subsample is always batch_size, even when the number of examples set
        to True in indicator is less than batch_size.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
    # Check if indicator and labels have a static size.
    if not indicator.shape.is_fully_defined():
      raise ValueError('indicator must be static in shape when is_static is '
                       'True')
    if not labels.shape.is_fully_defined():
      raise ValueError('labels must be static in shape when is_static is '
                       'True')
    if not isinstance(batch_size, int):
      raise ValueError('batch_size has to be an integer when is_static is '
                       'True.')

    input_length = tf.shape(input=indicator)[0]

    # Set the number of examples set True in indicator to be at least
    # batch_size.
    num_true_sampled = tf.reduce_sum(
        input_tensor=tf.cast(indicator, tf.float32))
    additional_false_sample = tf.less_equal(
        tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
        batch_size - num_true_sampled)
    indicator = tf.logical_or(indicator, additional_false_sample)

    # Shuffle indicator and label. Need to store the permutation to restore
    # the order post sampling.
    permutation = tf.random.shuffle(tf.range(input_length))
    indicator = matmul_gather_on_zeroth_axis(
        tf.cast(indicator, tf.float32), permutation)
    labels = matmul_gather_on_zeroth_axis(
        tf.cast(labels, tf.float32), permutation)

    # index (starting from 1) when indicator is True, 0 when False
    indicator_idx = tf.where(
        tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
        tf.zeros(input_length, tf.int32))

    # Replace -1 for negative, +1 for positive labels
    signed_label = tf.where(
        tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
        tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
    # negative of index for negative label, positive index for positive label,
    # 0 when indicator is False.
    signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
    sorted_signed_indicator_idx = tf.nn.top_k(
        signed_indicator_idx, input_length, sorted=True).values

    [num_positive_samples,
     num_negative_samples] = self._get_num_pos_neg_samples(
         sorted_signed_indicator_idx, batch_size)

    sampled_idx = self._get_values_from_start_and_end(
        sorted_signed_indicator_idx, num_positive_samples,
        num_negative_samples, batch_size)

    # Shift the indices to start from 0 and remove any samples that are set
    # as False.
    sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
    sampled_idx = tf.multiply(
        tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
        sampled_idx)

    sampled_idx_indicator = tf.cast(
        tf.reduce_sum(
            input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
        tf.bool)

    # Project back the order based on the stored permutation.
    reprojections = tf.one_hot(
        permutation, depth=input_length, dtype=tf.float32)
    return tf.cast(
        tf.tensordot(
            tf.cast(sampled_idx_indicator, tf.float32),
            reprojections,
            axes=[0, 0]), tf.bool)
  def subsample(self, indicator, batch_size, labels, scope=None):
    """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be
        sampled.
      batch_size: desired batch size. If None, keeps all positive samples and
        randomly selects negative samples so that the positive sample fraction
        matches self._positive_fraction. It cannot be None if is_static is
        True.
      labels: boolean tensor of shape [N] denoting positive(=True) and
        negative (=False) examples.
      scope: name scope.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries
        which are sampled.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
    if len(indicator.get_shape().as_list()) != 1:
      raise ValueError('indicator must be 1 dimensional, got a tensor of '
                       'shape %s' % indicator.get_shape())
    if len(labels.get_shape().as_list()) != 1:
      raise ValueError('labels must be 1 dimensional, got a tensor of '
                       'shape %s' % labels.get_shape())
    if labels.dtype != tf.bool:
      raise ValueError('labels should be of type bool. Received: %s' %
                       labels.dtype)
    if indicator.dtype != tf.bool:
      raise ValueError('indicator should be of type bool. Received: %s' %
                       indicator.dtype)
    scope = scope or 'BalancedPositiveNegativeSampler'
    with tf.name_scope(scope):
      if self._is_static:
        return self._static_subsample(indicator, batch_size, labels)
      else:
        # Only sample from indicated samples.
        negative_idx = tf.logical_not(labels)
        positive_idx = tf.logical_and(labels, indicator)
        negative_idx = tf.logical_and(negative_idx, indicator)

        # Sample positive and negative samples separately.
        if batch_size is None:
          max_num_pos = tf.reduce_sum(
              input_tensor=tf.cast(positive_idx, dtype=tf.int32))
        else:
          max_num_pos = int(self._positive_fraction * batch_size)
        sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos)
        num_sampled_pos = tf.reduce_sum(
            input_tensor=tf.cast(sampled_pos_idx, tf.int32))
        if batch_size is None:
          negative_positive_ratio = (
              1 - self._positive_fraction) / self._positive_fraction
          max_num_neg = tf.cast(
              negative_positive_ratio *
              tf.cast(num_sampled_pos, dtype=tf.float32),
              dtype=tf.int32)
        else:
          max_num_neg = batch_size - num_sampled_pos
        sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg)
        return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
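A minimal end-to-end sketch (eager mode assumed; the labels below are made up): sample an 8-element minibatch at a 0.25 positive fraction from 3 positives and 7 negatives.

import tensorflow as tf

sampler = BalancedPositiveNegativeSampler(positive_fraction=0.25)
labels = tf.constant([True] * 3 + [False] * 7)
indicator = tf.ones([10], tf.bool)  # every example is eligible
picked = sampler.subsample(indicator, batch_size=8, labels=labels)
# At most int(0.25 * 8) = 2 positives are kept; negatives fill the rest.
print(tf.reduce_sum(tf.cast(picked, tf.int32)).numpy())  # 8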