Commit cc748b2a authored by Abdullah Rashwan, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 329754787
parent 2f788e1d
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for maskrcnn_model.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.modeling import maskrcnn_model
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import fpn
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.heads import instance_heads
from official.vision.beta.modeling.layers import detection_generator
from official.vision.beta.modeling.layers import mask_sampler
from official.vision.beta.modeling.layers import roi_aligner
from official.vision.beta.modeling.layers import roi_generator
from official.vision.beta.modeling.layers import roi_sampler
from official.vision.beta.ops import anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(3, 3, 7, 3, [1.0], 50, False, False, 41953246),
)
def test_num_params(self,
num_classes,
min_level,
max_level,
num_scales,
aspect_ratios,
resnet_model_id,
use_separable_conv,
include_mask,
expected_num_params):
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=1)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
_ = model(images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=True)
self.assertEqual(expected_num_params, model.count_params())
@parameterized.parameters(
(False, False,),
(False, True,),
(True, False,),
(True, True,),
)
def test_forward(self, include_mask, training):
num_classes = 3
min_level = 3
max_level = 4
num_scales = 3
aspect_ratios = [1.0]
image_size = (256, 256)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array([[224, 100], [100, 224]])
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=image_size).multilevel_boxes
num_anchors_per_location = len(aspect_ratios) * num_scales
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=min_level,
max_level=max_level,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
results = model(images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=training)
self.assertIn('rpn_boxes', results)
self.assertIn('rpn_scores', results)
if training:
self.assertIn('class_targets', results)
self.assertIn('box_targets', results)
self.assertIn('class_outputs', results)
self.assertIn('box_outputs', results)
if include_mask:
self.assertIn('mask_outputs', results)
else:
self.assertIn('detection_boxes', results)
self.assertIn('detection_scores', results)
self.assertIn('detection_classes', results)
self.assertIn('num_detections', results)
if include_mask:
self.assertIn('detection_masks', results)
@parameterized.parameters(
(False,),
(True,),
)
def test_serialize_deserialize(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3,
max_level=7,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3,
max_level=7,
num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(
num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj)
config = model.get_config()
new_model = maskrcnn_model.MaskRCNNModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""RetinaNet."""
# Import libraries
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
"""The RetinaNet model class."""
def __init__(self,
backbone,
decoder,
head,
detection_generator,
**kwargs):
"""Classification initialization function.
Args:
backbone: `tf.keras.Model` a backbone network.
decoder: `tf.keras.Model` a decoder network.
head: `RetinaNetHead`, the RetinaNet head.
detection_generator: the detection generator.
**kwargs: keyword arguments to be passed.
"""
super(RetinaNetModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator,
}
self._backbone = backbone
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
def call(self,
images,
image_shape=None,
anchor_boxes=None,
training=None):
"""Forward pass of the RetinaNet model.
Args:
images: `Tensor`, the input batched images, whose shape is
[batch, height, width, 3].
image_shape: `Tensor`, the actual shape of the input images, whose shape
is [batch, 2] where the last dimension is [height, width]. Note that
this is the actual image shape excluding paddings. For example, images
in the batch may be resized into different shapes before padding to the
fixed size.
anchor_boxes: a dict of tensors which includes multilevel anchors.
- key: `int`, the level of the multilevel predictions.
- values: `Tensor`, the anchor coordinates of a particular feature
level, whose shape is [height_l, width_l, num_anchors_per_location * 4].
training: `bool`, indicating whether it is in training mode.
Returns:
a dictionary of tensors. In training mode it contains:
cls_outputs: a dict of tensors which includes scores of the predictions.
- key: `int`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
box_outputs: a dict of tensors which includes coordinates of the
predictions.
- key: `int`, the level of the multilevel predictions.
- values: `Tensor`, the box coordinates predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
In eval mode it additionally contains the post-processed
'detection_boxes', 'detection_scores', 'detection_classes' and
'num_detections'.
"""
# Feature extraction.
features = self.backbone(images)
if self.decoder:
features = self.decoder(features)
# Dense prediction.
raw_scores, raw_boxes = self.head(features)
if training:
return {
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
}
else:
# Post-processing.
final_results = self.detection_generator(
raw_boxes, raw_scores, anchor_boxes, image_shape)
return {
'detection_boxes': final_results['detection_boxes'],
'detection_scores': final_results['detection_scores'],
'detection_classes': final_results['detection_classes'],
'num_detections': final_results['num_detections'],
'cls_outputs': raw_scores,
'box_outputs': raw_boxes
}
@property
def checkpoint_items(self):
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
return items
@property
def backbone(self):
return self._backbone
@property
def decoder(self):
return self._decoder
@property
def head(self):
return self._head
@property
def detection_generator(self):
return self._detection_generator
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
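# Usage sketch (illustrative, not part of the library): building and calling a
# small RetinaNetModel in eval mode. The builder arguments below are
# assumptions that mirror the test file that follows; in eval mode the output
# dict adds post-processed detections to the raw 'cls_outputs'/'box_outputs'.
if __name__ == '__main__':
  import numpy as np
  from official.vision.beta.modeling.backbones import resnet
  from official.vision.beta.modeling.decoders import fpn
  from official.vision.beta.modeling.heads import dense_prediction_heads
  from official.vision.beta.modeling.layers import detection_generator
  from official.vision.beta.ops import anchor

  backbone = resnet.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=3, max_level=7)
  head = dense_prediction_heads.RetinaNetHead(
      min_level=3, max_level=7, num_classes=3, num_anchors_per_location=3)
  generator = detection_generator.MultilevelDetectionGenerator(
      max_num_detections=10)
  model = RetinaNetModel(
      backbone=backbone, decoder=decoder, head=head,
      detection_generator=generator)

  images = np.random.rand(2, 128, 128, 3)
  image_shape = np.array([[128, 128], [128, 128]])
  anchor_boxes = anchor.build_anchor_generator(
      min_level=3, max_level=7, num_scales=3, aspect_ratios=[1.0],
      anchor_size=3)((128, 128))
  # Tile anchors to the batch dimension, as the test below does.
  anchor_boxes = {
      level: tf.tile(tf.expand_dims(boxes, axis=0), [2, 1, 1, 1])
      for level, boxes in anchor_boxes.items()
  }
  outputs = model(images, image_shape, anchor_boxes, training=False)
  print(sorted(outputs.keys()))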
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.beta.modeling import retinanet_model
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import fpn
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.layers import detection_generator
from official.vision.beta.ops import anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(3, 3, 7, 3, [1.0], 50, False, 256, 4, 256, 32244949),
)
def test_num_params(self,
num_classes,
min_level,
max_level,
num_scales,
aspect_ratios,
resnet_model_id,
use_separable_conv,
fpn_num_filters,
head_num_convs,
head_num_filters,
expected_num_params):
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
num_filters=fpn_num_filters,
use_separable_conv=use_separable_conv)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location,
use_separable_conv=use_separable_conv,
num_convs=head_num_convs,
num_filters=head_num_filters)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
_ = model(images, image_shape, anchor_boxes, training=True)
self.assertEqual(expected_num_params, model.count_params())
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
image_size=[(128, 128),],
training=[True, False],
)
)
def test_forward(self, strategy, image_size, training):
"""Test for creation of a R50-FPN RetinaNet."""
tf.keras.backend.set_image_data_format('channels_last')
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array(
[[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
with strategy.scope():
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
anchor_boxes = anchor_gen(image_size)
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
model_outputs = model(
images,
image_shape,
anchor_boxes,
training=training)
if training:
cls_outputs = model_outputs['cls_outputs']
box_outputs = model_outputs['box_outputs']
for level in range(min_level, max_level + 1):
self.assertIn(level, cls_outputs)
self.assertIn(level, box_outputs)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
num_classes * num_anchors_per_location
], cls_outputs[level].numpy().shape)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
4 * num_anchors_per_location
], box_outputs[level].numpy().shape)
else:
self.assertIn('detection_boxes', model_outputs)
self.assertIn('detection_scores', model_outputs)
self.assertIn('detection_classes', model_outputs)
self.assertIn('num_detections', model_outputs)
self.assertAllEqual(
[2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_scores'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_classes'].numpy().shape)
self.assertAllEqual(
[2,], model_outputs['num_detections'].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
config = model.get_config()
new_model = retinanet_model.RetinaNetModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Build video classification models."""
# Import libraries
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
"""A video classification class builder."""
def __init__(self,
backbone,
num_classes,
input_specs=layers.InputSpec(shape=[None, None, None, None, 3]),
dropout_rate=0.0,
kernel_initializer='random_uniform',
kernel_regularizer=None,
bias_regularizer=None,
add_head_batch_norm=False,
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
**kwargs):
"""Video Classification initialization function.
Args:
backbone: a 3d backbone network.
num_classes: `int` number of classes in classification task.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
dropout_rate: `float` rate for dropout regularization.
kernel_initializer: kernel initializer for the dense layer.
kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to
None.
bias_regularizer: tf.keras.regularizers.Regularizer object. Default to
None.
add_head_batch_norm: `bool` whether to add a batch normalization layer
before pooling.
use_sync_bn: `bool` if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
self._self_setattr_tracking = False
self._config_dict = {
'backbone': backbone,
'num_classes': num_classes,
'input_specs': input_specs,
'dropout_rate': dropout_rate,
'kernel_initializer': kernel_initializer,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'add_head_batch_norm': add_head_batch_norm,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
}
self._input_specs = input_specs
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._backbone = backbone
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
axis = -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1
inputs = tf.keras.Input(shape=input_specs.shape[1:])
endpoints = backbone(inputs)
x = endpoints[max(endpoints.keys())]
if add_head_batch_norm:
x = self._norm(axis=axis, momentum=norm_momentum, epsilon=norm_epsilon)(x)
x = tf.keras.layers.GlobalAveragePooling3D()(x)
x = tf.keras.layers.Dropout(dropout_rate)(x)
x = tf.keras.layers.Dense(
num_classes, kernel_initializer=kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
super(VideoClassificationModel, self).__init__(
inputs=inputs, outputs=x, **kwargs)
@property
def checkpoint_items(self):
"""Returns a dictionary of items to be additionally checkpointed."""
return dict(backbone=self.backbone)
@property
def backbone(self):
return self._backbone
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu'),
(50, 8, 112, 'swish'),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.VideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs=input_specs,
dropout_rate=0.2,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
def test_serialize_deserialize(self):
"""Validate the classification network can be serialized and deserialized."""
model_id = 50
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes)
model = video_classification_model.VideoClassificationModel(
backbone=backbone, num_classes=1000)
config = model.get_config()
new_model = video_classification_model.VideoClassificationModel.from_config(
config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Anchor box and labeler definition."""
import collections
# Import libraries
import tensorflow as tf
from official.vision.beta.ops.experimental import anchor_generator
from official.vision.detection.utils.object_detection import argmax_matcher
from official.vision.detection.utils.object_detection import balanced_positive_negative_sampler
from official.vision.detection.utils.object_detection import box_list
from official.vision.detection.utils.object_detection import faster_rcnn_box_coder
from official.vision.detection.utils.object_detection import region_similarity_calculator
from official.vision.detection.utils.object_detection import target_assigner
class Anchor(object):
"""Anchor class for anchor-based object detectors."""
def __init__(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
image_size):
"""Constructs multiscale anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: list of float numbers representing the aspect ratio anchors
added on each level. The number indicates the ratio of width to height.
For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
scale level.
anchor_size: float number representing the scale of the base anchor size
relative to the feature stride 2^level.
image_size: a list of integer numbers or Tensors representing
[height, width] of the input image size. The image_size should be
divisible by the largest feature stride 2^max_level.
"""
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
self.anchor_size = anchor_size
self.image_size = image_size
self.boxes = self._generate_boxes()
def _generate_boxes(self):
"""Generates multiscale anchor boxes.
Returns:
a Tensor of shape [N, 4], representing anchor boxes of all levels
concatenated together.
"""
boxes_all = []
for level in range(self.min_level, self.max_level + 1):
boxes_l = []
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2 ** level
intermediate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio ** 0.5
aspect_y = aspect_ratio ** -0.5
half_anchor_size_x = base_anchor_size * aspect_x / 2.0
half_anchor_size_y = base_anchor_size * aspect_y / 2.0
x = tf.range(stride / 2, self.image_size[1], stride)
y = tf.range(stride / 2, self.image_size[0], stride)
xv, yv = tf.meshgrid(x, y)
xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
# Tensor shape Nx4.
boxes = tf.stack([yv - half_anchor_size_y, xv - half_anchor_size_x,
yv + half_anchor_size_y, xv + half_anchor_size_x],
axis=1)
boxes_l.append(boxes)
# Concat anchors on the same level to tensor shape NxAx4.
boxes_l = tf.stack(boxes_l, axis=1)
boxes_l = tf.reshape(boxes_l, [-1, 4])
boxes_all.append(boxes_l)
return tf.concat(boxes_all, axis=0)
def unpack_labels(self, labels):
"""Unpacks an array of labels into multiscales labels."""
unpacked_labels = collections.OrderedDict()
count = 0
for level in range(self.min_level, self.max_level + 1):
feat_size_y = tf.cast(self.image_size[0] / 2 ** level, tf.int32)
feat_size_x = tf.cast(self.image_size[1] / 2 ** level, tf.int32)
steps = feat_size_y * feat_size_x * self.anchors_per_location
unpacked_labels[level] = tf.reshape(
labels[count:count + steps], [feat_size_y, feat_size_x, -1])
count += steps
return unpacked_labels
@property
def anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
@property
def multilevel_boxes(self):
return self.unpack_labels(self.boxes)
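# Sanity-check sketch (illustrative, not part of the library): the number of
# boxes produced by Anchor above is the sum over levels of
# (image_h / 2**l) * (image_w / 2**l) locations, each holding
# num_scales * len(aspect_ratios) anchors, assuming the image size is
# divisible by 2**max_level. The helper name below is hypothetical.
def _expected_total_anchors(min_level, max_level, num_scales,
                            num_aspect_ratios, image_size):
  """Counts anchors the way Anchor._generate_boxes lays them out."""
  anchors_per_location = num_scales * num_aspect_ratios
  total = 0
  for level in range(min_level, max_level + 1):
    feat_h = image_size[0] // 2 ** level
    feat_w = image_size[1] // 2 ** level
    total += feat_h * feat_w * anchors_per_location
  return total
# For min_level=3, max_level=7, num_scales=3, three aspect ratios and a
# 640x640 image this gives 76725, which should equal
# Anchor(3, 7, 3, [1.0, 2.0, 0.5], 4, (640, 640)).boxes.shape[0].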
class AnchorLabeler(object):
"""Labeler for dense object detector."""
def __init__(self,
match_threshold=0.5,
unmatched_threshold=0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
"""
similarity_calc = region_similarity_calculator.IouSimilarity()
matcher = argmax_matcher.ArgMaxMatcher(
match_threshold,
unmatched_threshold=unmatched_threshold,
negatives_lower_than_unmatched=True,
force_match_for_each_row=True)
box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
self._target_assigner = target_assigner.TargetAssigner(
similarity_calc, matcher, box_coder)
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: an ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are float tensors of
multilevel anchor boxes; each anchor stores [y0, x0, y1, x1] in its last
4 values.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
cls_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location]. The height_l and
width_l represent the dimension of class logits at the l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
and width_l represent the dimension of bounding box regression output at
the l-th level.
cls_weights: A flattened Tensor with shape [batch_size, num_anchors], that
serves as masking / sample weight for classification loss. Its value
is 1.0 for positive and negative matched anchors, and 0.0 for ignored
anchors.
box_weights: A flattened Tensor with shape [batch_size, num_anchors], that
serves as masking / sample weight for regression loss. Its value is
1.0 for positive matched anchors, and 0.0 for negative and ignored
anchors.
"""
gt_box_list = box_list.BoxList(gt_boxes)
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
# cls_weights and box_weights are returned unchanged; matches is used below.
(cls_targets, cls_weights, box_targets, box_weights,
matches) = self._target_assigner.assign(anchor_box_list, gt_box_list,
gt_labels)
# Labels definition in matches.match_results:
# (1) match_results[i]>=0, meaning that column i is matched with row
# match_results[i].
# (2) match_results[i]=-1, meaning that column i is not matched.
# (3) match_results[i]=-2, meaning that column i is ignored.
match_results = tf.expand_dims(matches.match_results, axis=1)
cls_targets = tf.cast(cls_targets, tf.int32)
cls_targets = tf.where(
tf.equal(match_results, -1), -tf.ones_like(cls_targets), cls_targets)
cls_targets = tf.where(
tf.equal(match_results, -2), -2 * tf.ones_like(cls_targets),
cls_targets)
# Unpacks labels into multi-level representations.
cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
return cls_targets_dict, box_targets_dict, cls_weights, box_weights
class RpnAnchorLabeler(AnchorLabeler):
"""Labeler for Region Proposal Network."""
def __init__(self,
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5):
AnchorLabeler.__init__(self, match_threshold=match_threshold,
unmatched_threshold=unmatched_threshold)
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
def _get_rpn_samples(self, match_results):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
match_results: An integer tensor with shape [N] representing the
matching results of anchors. (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Returns:
score_targets: an integer tensor with shape [N].
(1) score_targets[i]=1, the anchor is a positive sample.
(2) score_targets[i]=0, the anchor is negative. (3) score_targets[i]=-1,
the anchor is ignored (don't care). A small arithmetic sketch of this
convention follows the class definition.
"""
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=self._rpn_fg_fraction, is_static=False))
# indicator includes both positive and negative labels.
# labels includes only positive labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator = tf.greater(match_results, -2)
labels = tf.greater(match_results, -1)
samples = sampler.subsample(
indicator, self._rpn_batch_size_per_im, labels)
positive_labels = tf.where(
tf.logical_and(samples, labels),
tf.constant(2, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
negative_labels = tf.where(
tf.logical_and(samples, tf.logical_not(labels)),
tf.constant(1, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
ignore_labels = tf.fill(match_results.shape, -1)
return (ignore_labels + positive_labels + negative_labels,
positive_labels, negative_labels)
def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: an ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are float tensors of
multilevel anchor boxes; each anchor stores [y0, x0, y1, x1] in its last
4 values.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at the l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
the l-th level.
"""
gt_box_list = box_list.BoxList(gt_boxes)
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
# cls_targets, cls_weights, box_weights are not used.
_, _, box_targets, _, matches = self._target_assigner.assign(
anchor_box_list, gt_box_list, gt_labels)
# score_targets contains the subsampled positive and negative anchors.
score_targets, _, _ = self._get_rpn_samples(matches.match_results)
# Unpacks labels.
score_targets_dict = unpack_targets(score_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
return score_targets_dict, box_targets_dict
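# Arithmetic sketch (illustrative, not part of the library) of the labeling
# convention used by RpnAnchorLabeler._get_rpn_samples above: every anchor
# starts at -1 (ignore), a sampled positive adds +2 and a sampled negative
# adds +1, so the summed score targets are 1 (positive), 0 (negative) and
# -1 (ignored).
if __name__ == '__main__':
  import numpy as np
  ignore = np.full(4, -1)                # base value for every anchor
  positive = np.array([2, 0, 0, 0])      # first anchor sampled as positive
  negative = np.array([0, 1, 0, 0])      # second anchor sampled as negative
  print(ignore + positive + negative)    # [ 1  0 -1 -1]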
def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
anchor_size):
"""Build anchor generator from levels."""
anchor_sizes = collections.OrderedDict()
strides = collections.OrderedDict()
scales = []
for scale in range(num_scales):
scales.append(2**(scale / float(num_scales)))
for level in range(min_level, max_level + 1):
stride = 2**level
strides[level] = stride
anchor_sizes[level] = anchor_size * stride
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=scales,
aspect_ratios=aspect_ratios,
strides=strides)
return anchor_gen
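# Usage sketch (illustrative): building multilevel anchors via
# build_anchor_generator and inspecting the per-level shapes, which are
# expected to be [image_h / 2**l, image_w / 2**l, anchors_per_location * 4].
# The parameter values below are arbitrary examples.
if __name__ == '__main__':
  example_gen = build_anchor_generator(
      min_level=3, max_level=7, num_scales=3, aspect_ratios=[1.0, 2.0, 0.5],
      anchor_size=4)
  example_boxes = example_gen((512, 512))
  for example_level, level_boxes in example_boxes.items():
    print(example_level, level_boxes.shape)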
def unpack_targets(targets, anchor_boxes_dict):
"""Unpacks an array of labels into multiscales labels."""
unpacked_targets = collections.OrderedDict()
count = 0
for level, anchor_boxes in anchor_boxes_dict.items():
feat_size_shape = anchor_boxes.shape.as_list()
feat_size_y = feat_size_shape[0]
feat_size_x = feat_size_shape[1]
anchors_per_location = int(feat_size_shape[2] / 4)
steps = feat_size_y * feat_size_x * anchors_per_location
unpacked_targets[level] = tf.reshape(targets[count:count + steps],
[feat_size_y, feat_size_x, -1])
count += steps
return unpacked_targets
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for anchor.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import anchor
class AnchorTest(parameterized.TestCase, tf.test.TestCase):
# The set of parameters is tailored for the MLPerf configuration, where
# the number of anchors is 495132, rpn_batch_size_per_im=256, and
# rpn_fg_fraction=0.5.
@parameterized.parameters(
(512, 25, 25, 25, 25, (512, 512)),
(512, 25, 25, 25, 25, (512, 640)),
(512, 25, 25, 25, 25, (640, 512)),
(495132, 100, 100, 100, 100, (512, 512)),
(495132, 200, 100, 128, 100, (512, 512)),
(495132, 100, 120, 100, 120, (512, 512)),
(495132, 100, 200, 100, 156, (512, 512)),
(495132, 200, 200, 128, 128, (512, 512)),
)
def testAnchorRpnSample(self, num_anchors, num_positives,
num_negatives, expected_positives,
expected_negatives, image_size):
match_results_np = np.empty([num_anchors])
match_results_np.fill(-2)
match_results_np[:num_positives] = 0
match_results_np[num_positives:num_positives + num_negatives] = -1
match_results = tf.convert_to_tensor(value=match_results_np, dtype=tf.int32)
anchor_labeler = anchor.RpnAnchorLabeler(
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5)
rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
labels = [v.numpy() for v in rpn_sample_op]
self.assertLen(labels[0], num_anchors)
positives = np.sum(np.array(labels[0]) == 1)
negatives = np.sum(np.array(labels[0]) == 0)
self.assertEqual(positives, expected_positives)
self.assertEqual(negatives, expected_negatives)
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGeneration(self, min_level, max_level, num_scales,
aspect_ratios, anchor_size, expected_boxes):
image_size = [64, 64]
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGenerationWithImageSizeAsTensor(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
expected_boxes):
image_size = tf.constant([64, 64], tf.int32)
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
(3, 6, 2, [1.0], 2.0),
)
def testLabelAnchors(self, min_level, max_level, num_scales,
aspect_ratios, anchor_size):
input_size = [512, 512]
ground_truth_class_id = 2
# The matched anchors are the anchors used as ground truth and the anchors
# at the next octave scale at the same location.
expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
anchor_gen = anchor.build_anchor_generator(min_level, max_level, num_scales,
aspect_ratios, anchor_size)
anchor_boxes = anchor_gen(input_size)
anchor_labeler = anchor.AnchorLabeler()
# Uses the first anchors as ground truth. The ground truth should map to
# two anchors with two intermediate scales at the same location.
gt_boxes = anchor_boxes[3][0:1, 0, 0:4]
gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
(cls_targets, box_targets, _,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, gt_boxes, gt_classes)
for k, v in cls_targets.items():
cls_targets[k] = v.numpy()
for k, v in box_targets.items():
box_targets[k] = v.numpy()
box_weights = box_weights.numpy()
anchor_locations = np.vstack(
np.where(cls_targets[min_level] > -1)).transpose()
self.assertAllClose(expected_anchor_locations, anchor_locations)
# Two anchor boxes on min_level got matched to the gt_boxes.
self.assertAllClose(tf.reduce_sum(box_weights), 2)
@parameterized.parameters(
(3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
(3, 8, [1.], 3, 32, (512, 512)),
(3, 3, [1.], 2, 4, (32, 32)),
)
def testEquivalentResult(self, min_level, max_level, aspect_ratios,
num_scales, anchor_size, image_size):
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
anchors = anchor_gen(image_size)
expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
aspect_ratios, anchor_size, image_size)
expected_anchors = expected_anchor_gen.multilevel_boxes
for k in expected_anchors.keys():
self.assertAllClose(expected_anchors[k], anchors[k])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Box related ops."""
# Import libraries
import numpy as np
import tensorflow as tf
EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack(
[boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=-1)
return new_boxes
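# Tiny worked example (illustrative): the box (ymin, xmin, ymax, xmax) =
# (10, 20, 50, 100) converts to (xmin, ymin, width, height) = (20, 10, 80, 40).
if __name__ == '__main__':
  print(yxyx_to_xywh(np.array([[10., 20., 50., 100.]])))  # [[20. 10. 80. 40.]]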
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float which specifies the magnitude of noise. A rule
of thumb is to set this within (0, 0.1]. The default value was found
empirically to best mimic noisy detections.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(tf.shape(boxes), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat(
[new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5],
axis=-1)
return jittered_boxes
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
normalized_boxes: a tensor whose shape is the same as `boxes` representing
the normalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
ymin = boxes[..., 0:1] / height
xmin = boxes[..., 1:2] / width
ymax = boxes[..., 2:3] / height
xmax = boxes[..., 3:4] / width
normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return normalized_boxes
def denormalize_boxes(boxes, image_shape):
"""Converts boxes normalized by [height, width] to pixel coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
the denormalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
with tf.name_scope('denormalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.split(image_shape, 2, axis=-1)
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
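# Round-trip sketch (illustrative): normalize_boxes and denormalize_boxes are
# inverses for a fixed [height, width].
if __name__ == '__main__':
  pixel_boxes = tf.constant([[10., 20., 50., 100.]])
  unit_boxes = normalize_boxes(pixel_boxes, [200, 400])
  # unit_boxes    -> [[0.05 0.05 0.25 0.25]]
  # round-tripped -> [[10. 20. 50. 100.]]
  print(unit_boxes, denormalize_boxes(unit_boxes, [200, 400]))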
def clip_boxes(boxes, image_shape):
"""Clips boxes to image boundaries.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
clipped boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height, width, height, width]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack([height, width, height, width], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Compute outer box encloses an object with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
"""
if scale < 1.0:
raise ValueError(
'scale is {}, but outer box scale must be no less than 1.0.'.format(
scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack(
[centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
encoded box targets.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
box_h = ymax - ymin
box_w = xmax - xmin
box_yc = ymin + 0.5 * box_h
box_xc = xmin + 0.5 * box_w
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
encoded_dw = tf.math.log(box_w / anchor_w)
if weights:
encoded_dy *= weights[0]
encoded_dx *= weights[1]
encoded_dh *= weights[2]
encoded_dw *= weights[3]
encoded_boxes = tf.concat(
[encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1)
return encoded_boxes
def decode_boxes(encoded_boxes, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
representing the decoded boxes.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError(
'encoded_boxes.shape[-1] is {:d}, but must be 4.'
.format(encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
dx = encoded_boxes[..., 1:2]
dh = encoded_boxes[..., 2:3]
dw = encoded_boxes[..., 3:4]
if weights:
dy /= weights[0]
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h
decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w
decoded_boxes = tf.concat(
[decoded_boxes_ymin, decoded_boxes_xmin,
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes
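# Consistency sketch (illustrative): encoding boxes against anchors and then
# decoding the result should recover the original boxes up to float error.
if __name__ == '__main__':
  demo_anchors = tf.constant([[0., 0., 32., 32.]])
  demo_boxes = tf.constant([[4., 2., 28., 30.]])
  demo_encoded = encode_boxes(demo_boxes, demo_anchors)
  print(decode_boxes(demo_encoded, demo_anchors))  # ~[[ 4.  2. 28. 30.]]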
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, (list, tuple)):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin
w = xmax - xmin
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 0.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(
filtered_size_mask, filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
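# Example (illustrative sketch; the values below are arbitrary): remove boxes
# that are smaller than 2 pixels per side or whose centers fall outside an
# 8x8 image.
#
#   boxes = tf.constant([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5]]])
#   scores = tf.constant([[0.9, 0.7]])
#   filtered_boxes, filtered_scores = filter_boxes(
#       boxes, scores, [8.0, 8.0], min_size_threshold=2.0)
#   # The first box has 0.5-pixel sides, so it and its score are zeroed out.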
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose scores are smaller than it will be filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with -1.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
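# Example (illustrative sketch; the values below are arbitrary): keep only
# boxes scoring above 0.5.
#
#   boxes = tf.constant([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5]]])
#   scores = tf.constant([[0.3, 0.7]])
#   kept_boxes, kept_scores = filter_boxes_by_scores(
#       boxes, scores, min_score_threshold=0.5)
#   # The first box is zeroed out and its score is replaced with -1.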
def gather_instances(selected_indices, instances, *aux_instances):
"""Gather instances by indices.
Args:
selected_indices: a Tensor of shape [batch, K] which indicates the selected
indices in instance dimension (2nd dimension).
instances: a Tensor of shape [batch, N, ...] where the 2nd dimension is
the instance dimension to be selected from.
*aux_instances: the additional Tensors whose shapes are in [batch, N, ...]
which are the tensors to be selected from using the `selected_indices`.
Returns:
selected_instances: the tensor of shape [batch, K, ...] which corresponds to
the selected instances of the `instances` tensor.
selected_aux_instances: the additional tensors of shape [batch, K, ...]
which correspond to the selected instances of the `aux_instances`
tensors.
"""
batch_size = instances.shape[0]
if batch_size == 1:
selected_instances = tf.squeeze(
tf.gather(instances, selected_indices, axis=1), axis=1)
if aux_instances:
selected_aux_instances = [
tf.squeeze(
tf.gather(a, selected_indices, axis=1), axis=1)
for a in aux_instances
]
return tuple([selected_instances] + selected_aux_instances)
else:
return selected_instances
else:
indices_shape = tf.shape(selected_indices)
batch_indices = (
tf.expand_dims(tf.range(indices_shape[0]), axis=-1) *
tf.ones([1, indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack(
[batch_indices, selected_indices], axis=-1)
selected_instances = tf.gather_nd(instances, gather_nd_indices)
if aux_instances:
selected_aux_instances = [
tf.gather_nd(a, gather_nd_indices) for a in aux_instances
]
return tuple([selected_instances] + selected_aux_instances)
else:
return selected_instances
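# Example (illustrative sketch; the values below are arbitrary): select
# instances 2 and 0 from a batch of three boxes, together with their classes.
#
#   boxes = tf.constant([[[0., 0., 1., 1.], [1., 1., 2., 2.], [2., 2., 3., 3.]]])
#   classes = tf.constant([[5, 7, 9]])
#   indices = tf.constant([[2, 0]])
#   sel_boxes, sel_classes = gather_instances(indices, boxes, classes)
#   # sel_boxes -> [[[2., 2., 3., 3.], [0., 0., 1., 1.]]], sel_classes -> [[9, 5]]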
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
the boxes. N is the number of boxes per image.
scores: a tensor of shape [batch_size, N] representing the score of the
boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
selected_boxes = gather_instances(top_k_indices, boxes)
return selected_boxes, selected_scores
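# Example (illustrative sketch; the values below are arbitrary): select the
# two highest scoring boxes.
#
#   boxes = tf.constant([[[0., 0., 1., 1.], [1., 1., 2., 2.], [2., 2., 3., 3.]]])
#   scores = tf.constant([[0.2, 0.9, 0.5]])
#   top_boxes, top_scores = top_k_boxes(boxes, scores, k=2)
#   # top_scores -> [[0.9, 0.5]]; top_boxes holds the corresponding boxes.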
def get_non_empty_box_indices(boxes):
"""Get indices for non-empty boxes."""
# Selects indices where both box height and width are greater than 0.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(tf.logical_and(tf.greater(height, 0),
tf.greater(width, 0)))
return indices[:, 0]
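# Example (illustrative sketch; the values below are arbitrary): only the
# first box has positive height and width, so only index 0 is returned.
#
#   boxes = tf.constant([[0., 0., 2., 2.], [1., 1., 1., 3.]])
#   get_non_empty_box_indices(boxes)  # -> [0]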
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `boxes` or `gt_boxes` may have been padded. The returned `iou` tensor
for these boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = (
tf.math.maximum((i_xmax - i_xmin), 0) *
tf.math.maximum((i_ymax - i_ymin), 0))
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries against padded (invalid) groundtruth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
# Fills -1 for invalid (-1) boxes.
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
iou = tf.where(boxes_invalid_mask, -tf.ones_like(iou), iou)
return iou
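# Example (illustrative sketch; the values below are arbitrary): two unit
# boxes that share half their area have IoU = 0.5 / 1.5 = 1/3; padded (-1)
# groundtruth boxes map to IoU -1.
#
#   boxes = tf.constant([[[0., 0., 1., 1.]]])
#   gt_boxes = tf.constant([[[0., 0.5, 1., 1.5], [-1., -1., -1., -1.]]])
#   iou = bbox_overlap(boxes, gt_boxes)
#   # iou -> approximately [[[0.3333, -1.]]]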
def box_matching(boxes, gt_boxes, gt_classes):
"""Match boxes to groundtruth boxes.
Given the proposal boxes and the groundtruth boxes and classes, perform the
groundtruth matching by taking the argmax of the IoU between boxes and
groundtruth boxes.
Args:
boxes: a tensor of shape of [batch_size, N, 4] representing the box
coordinates to be matched to groundtruth boxes.
gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
the groundtruth box coordinates. It is padded with -1s to indicate the
invalid boxes.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
Returns:
matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
the matched groundtruth box coordinates for each input box. If the box
does not overlap with any groundtruth boxes, the matched boxes of it
will be set to all 0s.
matched_gt_classes: a tensor of shape of [batch_size, N], representing
the matched groundtruth classes for each input box. If the box does not
overlap with any groundtruth boxes, the matched box classes of it will
be set to 0, which corresponds to the background class.
matched_gt_indices: a tensor of shape of [batch_size, N], representing
the indices of the matched groundtruth boxes in the original gt_boxes
tensor. If the box does not overlap with any groundtruth boxes, the
index of the matched groundtruth will be set to -1.
matched_iou: a tensor of shape of [batch_size, N], representing the IoU
between the box and its matched groundtruth box. The matched IoU is the
maximum IoU of the box and all the groundtruth boxes.
iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
between boxes and the groundtruth boxes. The IoU between a box and the
invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
"""
# Compute IoU between boxes and gt_boxes.
# iou <- [batch_size, N, K]
iou = bbox_overlap(boxes, gt_boxes)
# max_iou <- [batch_size, N]
# 0.0 means no overlap with any gt box; -1.0 means only padded (invalid) gt.
matched_iou = tf.reduce_max(iou, axis=-1)
# background_box_mask <- bool, [batch_size, N]
background_box_mask = tf.less_equal(matched_iou, 0.0)
argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)
matched_gt_boxes, matched_gt_classes = gather_instances(
argmax_iou_indices, gt_boxes, gt_classes)
matched_gt_boxes = tf.where(
tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
matched_gt_boxes)
matched_gt_classes = tf.where(
background_box_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(
background_box_mask,
-tf.ones_like(argmax_iou_indices),
argmax_iou_indices)
return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
matched_iou, iou)
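# Example (illustrative sketch; the values below are arbitrary): each proposal
# is matched to the groundtruth box with the highest IoU; proposals with no
# positive overlap get the background class 0 and index -1.
#
#   boxes = tf.constant([[[0., 0., 5., 5.], [20., 20., 25., 25.]]])
#   gt_boxes = tf.constant([[[0., 0., 5., 5.], [-1., -1., -1., -1.]]])
#   gt_classes = tf.constant([[7, -1]])
#   (matched_boxes, matched_classes, matched_indices,
#    matched_iou, iou) = box_matching(boxes, gt_boxes, gt_classes)
#   # matched_classes -> [[7, 0]], matched_indices -> [[0, -1]]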
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for box_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import box_ops
def _transform_boxes_on_tpu_and_cpu(transform_fn, boxes, *args):
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
transformed_op_tpu = transform_fn(boxes, *args)
transformed_boxes_tpu = tf.nest.map_structure(lambda x: x.numpy(),
transformed_op_tpu)
# Runs on CPU.
transformed_op_cpu = transform_fn(boxes, *args)
transformed_boxes_cpu = tf.nest.map_structure(lambda x: x.numpy(),
transformed_op_cpu)
return transformed_boxes_tpu, transformed_boxes_cpu
class ConvertBoxesTest(tf.test.TestCase):
def testConvertBoxes(self):
# y1, x1, y2, x2.
boxes = np.array([[0, 0, 1, 2], [0.2, 0.1, 1.2, 1.1]])
# x1, y1, width, height
target = np.array([[0, 0, 2, 1], [0.1, 0.2, 1, 1]])
outboxes = box_ops.yxyx_to_xywh(boxes)
self.assertNDArrayNear(outboxes, target, 1e-7)
class JitterBoxesTest(tf.test.TestCase):
def testJitterBoxes(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
boxes_np = np.array(boxes_data, dtype=np.float32)
max_size = max(
np.amax(boxes_np[:, 3] - boxes_np[:, 1]),
np.amax(boxes_np[:, 2] - boxes_np[:, 0]))
noise_scale = 0.025
boxes = tf.constant(boxes_np)
def jitter_fn(input_boxes, arg_noise_scale):
return box_ops.jitter_boxes(input_boxes, arg_noise_scale)
jittered_boxes_tpu, jittered_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
jitter_fn, boxes, noise_scale)
# Test that the jittered box is within 10 stds from the inputs.
self.assertNDArrayNear(jittered_boxes_tpu, boxes_np,
noise_scale * max_size * 10)
self.assertNDArrayNear(jittered_boxes_cpu, boxes_np,
noise_scale * max_size * 10)
class NormalizeBoxesTest(tf.test.TestCase):
def testNormalizeBoxes1DWithImageShapeAsList(self):
boxes = tf.constant([10, 30, 40, 90], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [0.2, 0.3, 0.8, 0.9], 1e-5)
def testNormalizeBoxes1DWithImageShapeAsTensor(self):
boxes = tf.constant([10, 30, 40, 90], tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [0.2, 0.3, 0.8, 0.9], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsList(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsVector(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = tf.constant([50, 100], dtype=tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = tf.constant([[50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = tf.constant([[50, 100], [50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsList(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsVector(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = tf.constant([[[50, 100]], [[500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(
normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = tf.constant(
[[[50, 100], [50, 100]], [[500, 1000], [500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(
normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]], 1e-5)
class DenormalizeBoxesTest(tf.test.TestCase):
def testDenormalizeBoxes1DWithImageShapeAsList(self):
boxes = tf.constant([0.2, 0.3, 0.8, 0.9], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [10, 30, 40, 90], 1e-5)
def testDenormalizeBoxes1DWithImageShapeAsTensor(self):
boxes = tf.constant([0.2, 0.3, 0.8, 0.9], tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [10, 30, 40, 90], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsList(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsVector(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = tf.constant([50, 100], dtype=tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = tf.constant([[50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = tf.constant([[50, 100], [50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsList(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]],
tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsVector(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]],
tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]],
tf.float32)
image_shape = tf.constant([[[50, 100]], [[500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]],
tf.float32)
image_shape = tf.constant(
[[[50, 100], [50, 100]], [[500, 1000], [500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
class ClipBoxesTest(tf.test.TestCase):
def testClipBoxesImageShapeAsList(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
image_shape = [3, 3]
boxes = tf.constant(boxes_data)
clipped_boxes_tpu, clipped_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
box_ops.clip_boxes, boxes, image_shape)
self.assertAllClose(clipped_boxes_tpu, clipped_boxes_cpu)
self.assertAllClose(clipped_boxes_tpu,
[[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]])
def testClipBoxesImageShapeAsVector(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
boxes = tf.constant(boxes_data)
image_shape = np.array([3, 3], dtype=np.float32)
clipped_boxes_tpu, clipped_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
box_ops.clip_boxes, boxes, image_shape)
self.assertAllClose(clipped_boxes_tpu, clipped_boxes_cpu)
self.assertAllClose(clipped_boxes_tpu,
[[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]])
def testClipBoxesImageShapeAsTensor(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
boxes = tf.constant(boxes_data)
image_shape = tf.constant([[3, 3], [3, 3], [3, 3], [3, 3], [3, 3], [3, 3]],
dtype=tf.float32)
clipped_boxes_tpu, clipped_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
box_ops.clip_boxes, boxes, image_shape)
self.assertAllClose(clipped_boxes_tpu, clipped_boxes_cpu)
self.assertAllClose(clipped_boxes_tpu,
[[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]])
class EncodeDecodeBoxesTest(tf.test.TestCase):
def test_encode_decode_boxes(self):
boxes_np = np.array([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0]],
[[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
anchors = tf.constant([[[1.5, 2.5, 3.5, 4.5], [2.5, 3.5, 4.5, 5.5]],
[[1.5, 2.5, 3.5, 4.5], [2.5, 3.5, 4.5, 5.5]]],
dtype=tf.float32)
weights = [1.0, 1.0, 1.0, 1.0]
def test_fn(boxes, anchors):
encoded_boxes = box_ops.encode_boxes(boxes, anchors, weights)
decoded_boxes = box_ops.decode_boxes(encoded_boxes, anchors, weights)
return decoded_boxes
decoded_boxes_tpu, decoded_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, anchors)
self.assertNDArrayNear(decoded_boxes_tpu, decoded_boxes_cpu, 1e-5)
self.assertNDArrayNear(decoded_boxes_tpu, boxes_np, 1e-5)
def test_encode_decode_boxes_batch_broadcast(self):
boxes_np = np.array([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0]],
[[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
anchors = tf.constant([[[1.5, 2.5, 3.5, 4.5], [2.5, 3.5, 4.5, 5.5]]],
dtype=tf.float32)
weights = [1.0, 1.0, 1.0, 1.0]
def test_fn(boxes, anchors):
encoded_boxes = box_ops.encode_boxes(boxes, anchors, weights)
decoded_boxes = box_ops.decode_boxes(encoded_boxes, anchors, weights)
return decoded_boxes
decoded_boxes_tpu, decoded_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, anchors)
self.assertNDArrayNear(decoded_boxes_tpu, decoded_boxes_cpu, 1e-5)
self.assertNDArrayNear(decoded_boxes_tpu, boxes_np, 1e-5)
class FilterBoxesTest(tf.test.TestCase):
def test_filter_boxes_batch(self):
# boxes -> [[small, good, outside], [outside, small, good]]
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
filtered_boxes_np = np.array([[[0.0, 0.0, 0.0, 0.0], [2.0, 3.0, 4.5, 5.5],
[0.0, 0.0, 0.0, 0.0]],
[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
[4.0, 1.0, 7.0, 4.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.9, 0.7, 0.5], [0.11, 0.22, 0.33]])
filtered_scores_np = np.array([[0.0, 0.7, 0.0], [0.0, 0.0, 0.33]])
scores = tf.constant(scores_np, dtype=tf.float32)
image_shape = tf.expand_dims(
tf.constant([[8, 8], [8, 8]], dtype=tf.int32), axis=1)
min_size_threshold = 2.0
def test_fn(boxes, scores, image_shape):
filtered_boxes, filtered_scores = box_ops.filter_boxes(
boxes, scores, image_shape, min_size_threshold)
return filtered_boxes, filtered_scores
filtered_results_tpu, filtered_results_cpu = (
_transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores, image_shape))
filtered_boxes_tpu, filtered_scores_tpu = filtered_results_tpu
filtered_boxes_cpu, filtered_scores_cpu = filtered_results_cpu
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_cpu, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_cpu, 1e-5)
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_np, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_np, 1e-5)
class FilterBoxesByScoresTest(tf.test.TestCase):
def test_filter_boxes_by_scores_batch(self):
# boxes -> [[small, good, outside], [outside, small, good]]
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
filtered_boxes_np = np.array([[[0.0, 0.0, 0.0, 0.0], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
[4.0, 1.0, 7.0, 4.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.1, 0.7, 0.6], [0.11, 0.22, 0.53]])
filtered_scores_np = np.array([[-1.0, 0.7, 0.6], [-1.0, -1.0, 0.53]])
scores = tf.constant(scores_np, dtype=tf.float32)
min_score_threshold = 0.5
def test_fn(boxes, scores):
filtered_boxes, filtered_scores = box_ops.filter_boxes_by_scores(
boxes, scores, min_score_threshold)
return filtered_boxes, filtered_scores
filtered_results_tpu, filtered_results_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores)
filtered_boxes_tpu, filtered_scores_tpu = filtered_results_tpu
filtered_boxes_cpu, filtered_scores_cpu = filtered_results_cpu
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_cpu, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_cpu, 1e-5)
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_np, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_np, 1e-5)
class GatherInstancesTest(tf.test.TestCase):
def test_gather_instances(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
indices_np = np.array([[2, 0], [0, 1]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
indices = tf.constant(indices_np, dtype=tf.int32)
selected_boxes = box_ops.gather_instances(indices, boxes)
expected_selected_boxes = np.array(
[[[7.0, 4.0, 9.5, 6.5], [1.0, 2.0, 1.5, 2.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0]]])
self.assertNDArrayNear(expected_selected_boxes, selected_boxes, 1e-5)
def test_gather_instances_with_multiple_inputs(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
classes_np = np.array([[1, 2, 3], [20, 30, 40]])
indices_np = np.array([[2, 0], [0, 1]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
classes = tf.constant(classes_np, dtype=tf.int32)
indices = tf.constant(indices_np, dtype=tf.int32)
selected_boxes, selected_classes = box_ops.gather_instances(
indices, boxes, classes)
expected_selected_boxes = np.array(
[[[7.0, 4.0, 9.5, 6.5], [1.0, 2.0, 1.5, 2.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0]]])
expected_selected_classes = np.array(
[[3, 1], [20, 30]])
self.assertNDArrayNear(expected_selected_boxes, selected_boxes, 1e-5)
self.assertAllEqual(expected_selected_classes, selected_classes)
class TopKBoxesTest(tf.test.TestCase):
def test_top_k_boxes_batch1(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.9, 0.5, 0.7]])
scores = tf.constant(scores_np, dtype=tf.float32)
top_k_boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [7.0, 4.0, 9.5, 6.5]]])
top_k_scores_np = np.array([[0.9, 0.7]])
def test_fn(boxes, scores):
top_k_boxes, top_k_scores = box_ops.top_k_boxes(boxes, scores, k=2)
return top_k_boxes, top_k_scores
top_k_results_tpu, top_k_results_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores)
top_k_boxes_tpu, top_k_scores_tpu = top_k_results_tpu
top_k_boxes_cpu, top_k_scores_cpu = top_k_results_cpu
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_cpu, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_cpu, 1e-5)
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_np, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_np, 1e-5)
def test_top_k_boxes_batch2(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.9, 0.7, 0.5], [0.11, 0.22, 0.33]])
scores = tf.constant(scores_np, dtype=tf.float32)
top_k_boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5]],
[[4.0, 1.0, 7.0, 4.0], [5.0, 6.0, 5.1, 6.0]]])
top_k_scores_np = np.array([[0.9, 0.7], [0.33, 0.22]])
def test_fn(boxes, scores):
top_k_boxes, top_k_scores = box_ops.top_k_boxes(boxes, scores, k=2)
return top_k_boxes, top_k_scores
top_k_results_tpu, top_k_results_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores)
top_k_boxes_tpu, top_k_scores_tpu = top_k_results_tpu
top_k_boxes_cpu, top_k_scores_cpu = top_k_results_cpu
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_cpu, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_cpu, 1e-5)
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_np, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_np, 1e-5)
class BboxOverlapTest(tf.test.TestCase):
def testBBoxOverlapOpCorrectness(self):
boxes_data = [[[0, 0, 0.1, 1], [0, 0.2, 0.2, 1.2], [0, 0.3, 0.3, 1.3],
[0, 0.5, 0.4, 1.5], [0, 0.7, 0.5, 1.7], [0, 0.9, 0.6, 1.9],
[0, 0.1, 0.1, 1.1], [0, 0.3, 0.7, 1.3], [0, 0.9, 2, 1.9]],
[[0, 0, 1, 0.2], [0, 0.2, 0.5, 1.2], [0, 0.4, 0.9, 1.4],
[0, 0.6, 1.1, 1.6], [0, 0.8, 1.2, 1.8], [0, 1, 1.5, 2],
[0, 0.5, 1, 1], [0.5, 0.8, 1, 1.8], [-1, -1, -1, -1]]]
boxes_np = np.array(boxes_data, dtype=np.float32)
gt_boxes_data = [[[0, 0.1, 0.1, 1.1], [0, 0.3, 0.7, 1.3], [0, 0.9, 2, 1.9]],
[[0, 0.5, 1, 1], [0.5, 0.8, 1, 1.8], [-1, -1, -1, -1]]]
gt_boxes_np = np.array(gt_boxes_data, dtype=np.float32)
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
boxes = tf.constant(boxes_np)
gt_boxes = tf.constant(gt_boxes_np)
iou = box_ops.bbox_overlap(boxes=boxes, gt_boxes=gt_boxes)
iou = iou.numpy()
self.assertEqual(iou.shape, (2, 9, 3))
self.assertAllEqual(
np.argmax(iou, axis=2),
[[0, 0, 1, 1, 1, 2, 0, 1, 2], [0, 0, 0, 0, 1, 1, 0, 1, 0]])
def testBBoxOverlapOpCheckShape(self):
batch_size = 2
rpn_post_nms_topn = 2000
gt_max_instances = 100
boxes_np = np.random.rand(batch_size, rpn_post_nms_topn,
4).astype(np.float32)
gt_boxes_np = np.random.rand(batch_size, gt_max_instances,
4).astype(np.float32)
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
boxes = tf.constant(boxes_np)
gt_boxes = tf.constant(gt_boxes_np)
iou = box_ops.bbox_overlap(boxes=boxes, gt_boxes=gt_boxes)
iou = iou.numpy()
self.assertEqual(iou.shape,
(batch_size, rpn_post_nms_topn, gt_max_instances))
def testBBoxOverlapOpCorrectnessWithNegativeData(self):
boxes_data = [[[0, -0.01, 0.1, 1.1], [0, 0.2, 0.2, 5.0],
[0, -0.01, 0.1, 1.], [-1, -1, -1, -1]]]
boxes_np = np.array(boxes_data, dtype=np.float32)
gt_boxes_np = boxes_np
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
boxes = tf.constant(boxes_np)
gt_boxes = tf.constant(gt_boxes_np)
iou = box_ops.bbox_overlap(boxes=boxes, gt_boxes=gt_boxes)
iou = iou.numpy()
expected = np.array([[[0.99999994, 0.0917431, 0.9099099, -1.],
[0.0917431, 1., 0.08154944, -1.],
[0.9099099, 0.08154944, 1., -1.],
[-1., -1., -1., -1.]]])
self.assertAllClose(expected, iou)
class BoxMatchingTest(tf.test.TestCase):
def test_box_matching_single(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5],
[-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[2, 10, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
matched_gt_boxes_np = np.array(
[[[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[10, 10, 15, 15]]])
matched_gt_classes_np = np.array([[10, 10, 10, 2]])
matched_gt_indices_np = np.array([[1, 1, 1, 0]])
matched_iou_np = np.array(
[[0.142857142857, 1.0, 0.142857142857, 0.142857142857]])
iou_np = np.array(
[[[0, 0.142857142857, -1.0],
[0, 1.0, -1.0],
[0, 0.142857142857, -1.0],
[0.142857142857, 0, -1.0]]])
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
(matched_gt_boxes_tpu, matched_gt_classes_tpu,
matched_gt_indices_tpu, matched_iou_tpu, iou_tpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# Runs on CPU.
(matched_gt_boxes_cpu, matched_gt_classes_cpu,
matched_gt_indices_cpu, matched_iou_cpu, iou_cpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# consistency.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_cpu.numpy(), 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_cpu.numpy())
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_cpu.numpy())
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_cpu.numpy(), 1e-5)
# correctness.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_np, 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_np)
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_np)
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_np, 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_np, 1e-5)
def test_box_matching_single_no_gt(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[-1, -1, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
matched_gt_boxes_np = np.array(
[[[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]]])
matched_gt_classes_np = np.array([[0, 0, 0, 0]])
matched_gt_indices_np = np.array([[-1, -1, -1, -1]])
matched_iou_np = np.array([[-1, -1, -1, -1]])
iou_np = np.array(
[[[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]])
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
(matched_gt_boxes_tpu, matched_gt_classes_tpu,
matched_gt_indices_tpu, matched_iou_tpu, iou_tpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# Runs on CPU.
(matched_gt_boxes_cpu, matched_gt_classes_cpu,
matched_gt_indices_cpu, matched_iou_cpu, iou_cpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# consistency.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_cpu.numpy(), 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_cpu.numpy())
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_cpu.numpy())
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_cpu.numpy(), 1e-5)
# correctness.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_np, 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_np)
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_np)
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_np, 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_np, 1e-5)
def test_box_matching_batch(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]],
[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[-1, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[2, 10, -1], [-1, -1, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
matched_gt_boxes_np = np.array(
[[[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[10, 10, 15, 15]],
[[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]]])
matched_gt_classes_np = np.array(
[[10, 10, 10, 2],
[0, 0, 0, 0]])
matched_gt_indices_np = np.array(
[[1, 1, 1, 0],
[-1, -1, -1, -1]])
matched_iou_np = np.array(
[[0.142857142857, 1.0, 0.142857142857, 0.142857142857],
[-1, -1, -1, -1]])
iou_np = np.array(
[[[0, 0.142857142857, -1.0],
[0, 1.0, -1.0],
[0, 0.142857142857, -1.0],
[0.142857142857, 0, -1.0]],
[[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]])
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
(matched_gt_boxes_tpu, matched_gt_classes_tpu,
matched_gt_indices_tpu, matched_iou_tpu, iou_tpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# Runs on CPU.
(matched_gt_boxes_cpu, matched_gt_classes_cpu,
matched_gt_indices_cpu, matched_iou_cpu, iou_cpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# consistency.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_cpu.numpy(), 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_cpu.numpy())
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_cpu.numpy())
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_cpu.numpy(), 1e-5)
# correctness.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_np, 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_np)
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_np)
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_np, 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_np, 1e-5)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Multi scale anchor generator definition."""
import tensorflow as tf
# TODO(tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
"""Utility to generate anchors for a single feature map.
Example:
```python
anchor_gen = _SingleAnchorGenerator(32, scales=[1.], aspect_ratios=[.5, 1., 2.], stride=16)
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_size,
scales,
aspect_ratios,
stride,
clip_boxes=False):
"""Constructs single scale anchor.
Args:
anchor_size: A single int representing the base anchor size. The anchor
height will be `anchor_size / sqrt(aspect_ratio)`, anchor width will be
`anchor_size * sqrt(aspect_ratio)`.
scales: A list/tuple of positive floats representing the scale multipliers
applied to the base `anchor_size`.
aspect_ratios: A list/tuple of positive floats representing the ratio of
anchor width to anchor height.
stride: A single int representing the anchor stride, i.e. the distance
between the centers of adjacent anchors.
clip_boxes: Boolean representing whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
Input shape: the size of the image, `[H, W, C]`
Output shape: the size of anchors,
`[H / stride, W / stride, len(scales) * len(aspect_ratios) * 4]`
"""
self.anchor_size = anchor_size
self.scales = scales
self.aspect_ratios = aspect_ratios
self.stride = stride
self.clip_boxes = clip_boxes
def __call__(self, image_size):
image_height = tf.cast(image_size[0], tf.float32)
image_width = tf.cast(image_size[1], tf.float32)
k = len(self.scales) * len(self.aspect_ratios)
aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
anchor_size = tf.cast(self.anchor_size, tf.float32)
# [K]
anchor_heights = []
anchor_widths = []
for scale in self.scales:
anchor_size_t = anchor_size * scale
anchor_height = anchor_size_t / aspect_ratios_sqrt
anchor_width = anchor_size_t * aspect_ratios_sqrt
anchor_heights.append(anchor_height)
anchor_widths.append(anchor_width)
anchor_heights = tf.concat(anchor_heights, axis=0)
anchor_widths = tf.concat(anchor_widths, axis=0)
half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])
stride = tf.cast(self.stride, tf.float32)
# [W]
cx = tf.range(0.5 * stride, image_width, stride)
# [H]
cy = tf.range(0.5 * stride, image_height, stride)
# [H, W]
cx_grid, cy_grid = tf.meshgrid(cx, cy)
# [H, W, 1]
cx_grid = tf.expand_dims(cx_grid, axis=-1)
cy_grid = tf.expand_dims(cy_grid, axis=-1)
# [H, W, K, 1]
y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)
if self.clip_boxes:
y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)
# [H, W, K, 4]
result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
shape = result.shape.as_list()
# [H, W, K * 4]
return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])
class AnchorGenerator():
"""Utility to generate anchors for a multiple feature maps.
Example:
```python
anchor_gen = AnchorGenerator([32, 64], scales=[1.],
aspect_ratios=[.5, 1., 2.], strides=[16, 32])
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_sizes,
scales,
aspect_ratios,
strides,
clip_boxes=False):
"""Constructs multiscale anchors.
Args:
anchor_sizes: A list of ints representing the anchor size for each scale.
The anchor height will be `anchor_size / sqrt(aspect_ratio)`, anchor
width will be `anchor_size * sqrt(aspect_ratio)` for each scale.
scales: A list/tuple of positive floats, or a list/tuple of such
lists/tuples (one per level), representing the scale multipliers applied
to the base `anchor_size`.
aspect_ratios: A list/tuple of positive floats, or a list/tuple of such
lists/tuples (one per level), representing the ratio of anchor width to
anchor height.
strides: A list/tuple of ints representing the anchor stride at each
scale, i.e. the distance between the centers of adjacent anchors.
clip_boxes: Boolean representing whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
Input shape: the size of the image, `[H, W, C]`
Output shape: a list or dict of per-level anchors, each of shape
`[H / stride, W / stride, K * 4]`, where `K` is the number of anchors per
location at that level.
"""
# aspect_ratio is a single list that is the same across all levels.
aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
if isinstance(anchor_sizes, dict):
self.anchor_generators = {}
for k in anchor_sizes.keys():
self.anchor_generators[k] = _SingleAnchorGenerator(
anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
clip_boxes)
elif isinstance(anchor_sizes, (list, tuple)):
self.anchor_generators = []
for anchor_size, scale_list, ar_list, stride in zip(
anchor_sizes, scales, aspect_ratios, strides):
self.anchor_generators.append(
_SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
clip_boxes))
def __call__(self, image_size):
anchor_generators = tf.nest.flatten(self.anchor_generators)
results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
return tf.nest.pack_sequence_as(self.anchor_generators, results)
def maybe_map_structure_for_anchor(params, anchor_sizes):
"""broadcast the params to match anchor_sizes."""
if all(isinstance(param, (int, float)) for param in params):
if isinstance(anchor_sizes, (tuple, list)):
return [params] * len(anchor_sizes)
elif isinstance(anchor_sizes, dict):
return tf.nest.map_structure(lambda _: params, anchor_sizes)
else:
raise ValueError("the structure of `anchor_sizes` must be a tuple, "
"list, or dict, given {}".format(anchor_sizes))
else:
return params
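# Example (illustrative sketch; the values below are arbitrary): a single flat
# list of aspect ratios is broadcast to every level, while per-level lists are
# passed through unchanged.
#
#   maybe_map_structure_for_anchor([0.5, 1.0, 2.0], anchor_sizes=[32, 64])
#   # -> [[0.5, 1.0, 2.0], [0.5, 1.0, 2.0]]
#   maybe_map_structure_for_anchor([[1.0], [1.0]], anchor_sizes=[32, 64])
#   # -> [[1.0], [1.0]] (unchanged)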
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for anchor_generator.py."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.beta.ops.experimental import anchor_generator
class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
# Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25],
[[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
)
def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=False)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
[[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
# Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25],
[[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
)
def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=True)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
@combinations.generate(
combinations.combine(distribution=strategy_combinations.all_strategies))
def testAnchorGenerationDistributed(self, distribution):
image_size = [64, 64]
anchor_size = 64
stride = 32
aspect_ratios = [1.0]
with distribution.scope():
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=False)
anchors = anchor_gen(image_size).numpy()
expected_boxes = [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]]
self.assertAllClose(expected_boxes, anchors)
class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [1.0], {
5: [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]],
6: [[[-32, -32, 96, 96]]]
}),)
def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = dict((level, 2**(level + 1)) for level in levels)
strides = dict((level, 2**level) for level in levels)
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
for k in expected_boxes.keys():
self.assertAllClose(expected_boxes[k], anchors[k].numpy())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for segmentations."""
import math
# Import libraries
from cvx2 import latest as cv2
import numpy as np
def paste_instance_masks(masks,
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation results.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = min(max(ref_box[0], 0), image_width)
x_1 = min(max(ref_box[2] + 1, 0), image_width)
y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])
]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
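# Example (illustrative sketch; the mask and box values below are arbitrary):
# paste a single 6x6 mask, predicted for a box given in [x, y, width, height]
# form, onto a 10x10 image canvas.
#
#   masks = np.random.randint(0, 2, (1, 6, 6)).astype(np.float32)
#   detected_boxes = np.array([[2.0, 3.0, 6.0, 6.0]])  # x, y, w, h
#   segms = paste_instance_masks(masks, detected_boxes, 10, 10)
#   # segms has shape (1, 10, 10) with the binarized mask pasted at the box.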
def paste_instance_masks_v2(masks,
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation (v2).
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
_, mask_height, mask_width = masks.shape
segms = []
for i, mask in enumerate(masks):
box = detected_boxes[i, :]
xmin = box[0]
ymin = box[1]
xmax = xmin + box[2]
ymax = ymin + box[3]
# Sample points of the cropped mask w.r.t. the image grid.
# Note that these coordinates may fall beyond the image.
# Pixel clipping will happen after warping.
xmin_int = int(math.floor(xmin))
xmax_int = int(math.ceil(xmax))
ymin_int = int(math.floor(ymin))
ymax_int = int(math.ceil(ymax))
alpha = box[2] / (1.0 * mask_width)
beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array(
[[alpha, 0, xmin],
[0, beta, ymin],
[0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array(
[[1, 0, -xmin_int],
[0, 1, -ymin_int],
[0, 0, 1]],
dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot(
np.dot(
np.array([[1, 0, -0.5],
[0, 1, -0.5],
[0, 0, 1]], np.float32),
M),
np.array([[1, 0, 0.5],
[0, 1, 0.5],
[0, 0, 1]], np.float32))
# pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M,
(xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width))
x0 = max(min(xmin_int, image_width), 0)
x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[
(y0 - ymin_int):(y1 - ymin_int),
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask)
segms = np.array(segms)
return segms
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for mask_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import mask_ops
class MaskUtilsTest(tf.test.TestCase):
def testPasteInstanceMasks(self):
image_height = 10
image_width = 10
mask_height = 6
mask_width = 6
masks = np.random.randint(0, 255, (1, mask_height, mask_width))
detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])
_ = mask_ops.paste_instance_masks(
masks, detected_boxes, image_height, image_width)
def testPasteInstanceMasksV2(self):
image_height = 10
image_width = 10
mask_height = 6
mask_width = 6
masks = np.random.randint(0, 255, (1, mask_height, mask_width))
detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])
image_masks = mask_ops.paste_instance_masks_v2(
masks, detected_boxes, image_height, image_width)
self.assertNDArrayNear(
image_masks[:, 2:8, 0:6],
np.array(masks > 0.5, dtype=np.uint8),
1e-5)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow implementation of non max suppression."""
# Import libraries
import tensorflow as tf
from official.vision.beta.ops import box_ops
NMS_TILE_SIZE = 512
def _self_suppression(iou, _, iou_sum):
batch_size = tf.shape(iou)[0]
can_suppress_others = tf.cast(
tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
iou_suppressed = tf.reshape(
tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
[batch_size, -1, 1]) * iou
iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
return [
iou_suppressed,
tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
]
def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
batch_size = tf.shape(boxes)[0]
new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
iou = box_ops.bbox_overlap(new_slice, box_slice)
ret_slice = tf.expand_dims(
tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
2) * box_slice
return boxes, ret_slice, iou_threshold, inner_idx + 1
def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
"""Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
Args:
boxes: a tensor with a shape of [batch_size, anchors, 4].
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
    output_size: an int32 tensor of size [batch_size] representing the number
      of selected boxes for each batch.
    idx: an integer scalar representing the induction variable.
Returns:
boxes: updated boxes.
iou_threshold: pass down iou_threshold to the next iteration.
output_size: the updated output_size.
idx: the updated induction variable.
"""
num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
batch_size = tf.shape(boxes)[0]
# Iterates over tiles that can possibly suppress the current tile.
box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
_, box_slice, _, _ = tf.while_loop(
lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
_cross_suppression, [boxes, box_slice, iou_threshold,
tf.constant(0)])
# Iterates over the current tile to compute self-suppression.
iou = box_ops.bbox_overlap(box_slice, box_slice)
mask = tf.expand_dims(
tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
suppressed_iou, _, _ = tf.while_loop(
lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
[iou, tf.constant(True),
tf.reduce_sum(iou, [1, 2])])
suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
# Uses box_slice to update the input boxes.
mask = tf.reshape(
tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
boxes = tf.tile(tf.expand_dims(
box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
boxes = tf.reshape(boxes, [batch_size, -1, 4])
# Updates output_size.
output_size += tf.reduce_sum(
tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
return boxes, iou_threshold, output_size, idx + 1
def sorted_non_max_suppression_padded(scores,
boxes,
max_output_size,
iou_threshold):
"""A wrapper that handles non-maximum suppression.
Assumption:
* The boxes are sorted by scores unless the box is a dot (all coordinates
are zero).
* Boxes with higher scores can be used to suppress boxes with lower scores.
  The overall design of the algorithm is to handle boxes tile-by-tile:
  boxes = boxes.pad_to_multiple_of(tile_size)
num_tiles = len(boxes) // tile_size
output_boxes = []
for i in range(num_tiles):
box_tile = boxes[i*tile_size : (i+1)*tile_size]
    for j in range(i):
suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
iou = bbox_overlap(box_tile, suppressing_tile)
# if the box is suppressed in iou, clear it to a dot
box_tile *= _update_boxes(iou)
    # Iteratively handle the diagonal tile.
iou = _box_overlap(box_tile, box_tile)
iou_changed = True
while iou_changed:
# boxes that are not suppressed by anything else
suppressing_boxes = _get_suppressing_boxes(iou)
# boxes that are suppressed by suppressing_boxes
suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
# clear iou to 0 for boxes that are suppressed, as they cannot be used
# to suppress other boxes any more
new_iou = _clear_iou(iou, suppressed_boxes)
iou_changed = (new_iou != iou)
iou = new_iou
    # remaining boxes that can still suppress others are the selected boxes.
output_boxes.append(_get_suppressing_boxes(iou))
if len(output_boxes) >= max_output_size:
break
Args:
scores: a tensor with a shape of [batch_size, anchors].
boxes: a tensor with a shape of [batch_size, anchors, 4].
max_output_size: a scalar integer `Tensor` representing the maximum number
of boxes to be selected by non max suppression.
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
Returns:
    nms_scores: a tensor with a shape of [batch_size, max_output_size]. It has
      the same dtype as the input scores.
    nms_proposals: a tensor with a shape of [batch_size, max_output_size, 4].
      It has the same dtype as the input boxes.
"""
batch_size = tf.shape(boxes)[0]
num_boxes = tf.shape(boxes)[1]
pad = tf.cast(
tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
tf.int32) * NMS_TILE_SIZE - num_boxes
boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
scores = tf.pad(
tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
num_boxes += pad
def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
return tf.logical_and(
tf.reduce_min(output_size) < max_output_size,
idx < num_boxes // NMS_TILE_SIZE)
selected_boxes, _, output_size, _ = tf.while_loop(
_loop_cond, _suppression_loop_body, [
boxes, iou_threshold,
tf.zeros([batch_size], tf.int32),
tf.constant(0)
])
idx = num_boxes - tf.cast(
tf.nn.top_k(
tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
tf.int32)
idx = tf.minimum(idx, num_boxes - 1)
idx = tf.reshape(
idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1])
boxes = tf.reshape(
tf.gather(tf.reshape(boxes, [-1, 4]), idx),
[batch_size, max_output_size, 4])
boxes = boxes * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
output_size, [-1, 1, 1]), boxes.dtype)
scores = tf.reshape(
tf.gather(tf.reshape(scores, [-1, 1]), idx),
[batch_size, max_output_size])
scores = scores * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
output_size, [-1, 1]), scores.dtype)
return scores, boxes
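# Illustrative usage sketch (not part of the original file): the input scores
# must already be sorted in descending order per batch element; the values
# below are hypothetical.
def _example_sorted_non_max_suppression_padded():
  scores = tf.constant([[0.9, 0.8, 0.1]])
  boxes = tf.constant(
      [[[0., 0., 1., 1.], [0., 0., 1., 1.], [0., 2., 1., 3.]]])
  nms_scores, nms_boxes = sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)
  # nms_scores has shape [1, 2] and nms_boxes has shape [1, 2, 4].
  return nms_scores, nms_boxes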
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for nms.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import nms
class SortedNonMaxSuppressionTest(tf.test.TestCase):
def setUp(self):
super(SortedNonMaxSuppressionTest, self).setUp()
self.boxes_data = [[[0, 0, 1, 1], [0, 0.2, 1, 1.2], [0, 0.4, 1, 1.4],
[0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]],
[[0, 2, 1, 2], [0, 0.8, 1, 1.8], [0, 0.6, 1, 1.6],
[0, 0.4, 1, 1.4], [0, 0.2, 1, 1.2], [0, 0, 1, 1]]]
self.scores_data = [[0.9, 0.7, 0.6, 0.5, 0.4, 0.3],
[0.8, 0.7, 0.6, 0.5, 0.4, 0.3]]
self.max_output_size = 6
self.iou_threshold = 0.5
def testSortedNonMaxSuppressionOnTPU(self):
boxes_np = np.array(self.boxes_data, dtype=np.float32)
scores_np = np.array(self.scores_data, dtype=np.float32)
iou_threshold_np = np.array(self.iou_threshold, dtype=np.float32)
boxes = tf.constant(boxes_np)
scores = tf.constant(scores_np)
iou_threshold = tf.constant(iou_threshold_np)
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
scores_tpu, boxes_tpu = nms.sorted_non_max_suppression_padded(
boxes=boxes,
scores=scores,
max_output_size=self.max_output_size,
iou_threshold=iou_threshold)
self.assertEqual(boxes_tpu.numpy().shape, (2, self.max_output_size, 4))
self.assertAllClose(scores_tpu.numpy(),
[[0.9, 0.6, 0.4, 0.3, 0., 0.],
[0.8, 0.7, 0.5, 0.3, 0., 0.]])
def testSortedNonMaxSuppressionOnCPU(self):
boxes_np = np.array(self.boxes_data, dtype=np.float32)
scores_np = np.array(self.scores_data, dtype=np.float32)
iou_threshold_np = np.array(self.iou_threshold, dtype=np.float32)
boxes = tf.constant(boxes_np)
scores = tf.constant(scores_np)
iou_threshold = tf.constant(iou_threshold_np)
# Runs on CPU.
scores_cpu, boxes_cpu = nms.sorted_non_max_suppression_padded(
boxes=boxes,
scores=scores,
max_output_size=self.max_output_size,
iou_threshold=iou_threshold)
self.assertEqual(boxes_cpu.numpy().shape, (2, self.max_output_size, 4))
self.assertAllClose(scores_cpu.numpy(),
[[0.9, 0.6, 0.4, 0.3, 0., 0.],
[0.8, 0.7, 0.5, 0.3, 0., 0.]])
def testSortedNonMaxSuppressionOnTPUSpeed(self):
boxes_np = np.random.rand(2, 12000, 4).astype(np.float32)
scores_np = np.random.rand(2, 12000).astype(np.float32)
iou_threshold_np = np.array(0.7, dtype=np.float32)
boxes = tf.constant(boxes_np)
scores = tf.constant(scores_np)
iou_threshold = tf.constant(iou_threshold_np)
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
scores_tpu, boxes_tpu = nms.sorted_non_max_suppression_padded(
boxes=boxes,
scores=scores,
max_output_size=2000,
iou_threshold=iou_threshold)
self.assertEqual(scores_tpu.numpy().shape, (2, 2000))
self.assertEqual(boxes_tpu.numpy().shape, (2, 2000, 4))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preprocessing ops."""
import math
from six.moves import range
import tensorflow as tf
from official.vision.beta.ops import box_ops
CENTER_CROP_FRACTION = 0.875
def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0):
"""Pads data to a fixed length at the first dimension.
Args:
input_tensor: `Tensor` with any dimension.
size: `int` number for the first dimension of output Tensor.
constant_values: `int` value assigned to the paddings.
Returns:
`Tensor` with the first dimension padded to `size`.
"""
input_shape = input_tensor.get_shape().as_list()
padding_shape = []
  # Computes the padding length on the first dimension and clips the input
  # tensor if it is longer than `size`.
input_length = tf.shape(input_tensor)[0]
input_length = tf.clip_by_value(input_length, 0, size)
input_tensor = input_tensor[:input_length]
padding_length = tf.maximum(0, size - input_length)
padding_shape.append(padding_length)
# Copies shapes of the rest of input shape dimensions.
for i in range(1, len(input_shape)):
padding_shape.append(tf.shape(input_tensor)[i])
# Pads input tensor to the fixed first dimension.
paddings = tf.cast(constant_values * tf.ones(padding_shape),
input_tensor.dtype)
padded_tensor = tf.concat([input_tensor, paddings], axis=0)
output_shape = input_shape
output_shape[0] = size
padded_tensor.set_shape(output_shape)
return padded_tensor
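# Illustrative usage sketch (not part of the original file): a [2, 3] tensor
# is zero-padded along the first dimension to a fixed size of 4.
def _example_clip_or_pad_to_fixed_size():
  data = tf.ones([2, 3])
  padded = clip_or_pad_to_fixed_size(data, size=4, constant_values=0)
  # padded has shape [4, 3]; the last two rows are all zeros.
  return padded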
def normalize_image(image,
offset=(0.485, 0.456, 0.406),
scale=(0.229, 0.224, 0.225)):
"""Normalizes the image to zero mean and unit variance."""
with tf.name_scope('normalize_image'):
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
offset = tf.constant(offset)
offset = tf.expand_dims(offset, axis=0)
offset = tf.expand_dims(offset, axis=0)
image -= offset
scale = tf.constant(scale)
scale = tf.expand_dims(scale, axis=0)
scale = tf.expand_dims(scale, axis=0)
image /= scale
return image
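# Illustrative usage sketch (not part of the original file): uint8 images are
# first scaled to [0, 1] and then shifted/scaled channel-wise with the
# ImageNet-style defaults above.
def _example_normalize_image():
  image = tf.zeros([4, 4, 3], dtype=tf.uint8)
  normalized = normalize_image(image)
  # Each channel c now holds (0.0 - offset[c]) / scale[c].
  return normalized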
def compute_padded_size(desired_size, stride):
"""Compute the padded size given the desired size and the stride.
  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride which is no less than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
  the output padded_size = (128, 224).
Args:
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the target output image size.
stride: an integer, the stride of the backbone network.
Returns:
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size.
"""
  if isinstance(desired_size, (list, tuple)):
padded_size = [int(math.ceil(d * 1.0 / stride) * stride)
for d in desired_size]
else:
padded_size = tf.cast(
tf.math.ceil(
tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32)
return padded_size
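# Illustrative usage sketch (not part of the original file), matching the
# example in the docstring above.
def _example_compute_padded_size():
  padded = compute_padded_size((100, 200), 32)
  assert padded == [128, 224]
  return padded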
def resize_and_crop_image(image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (RetinaNet style).
Resize and pad images given the desired output size of the image and
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and rescale the image to make it
the largest rectangle to be bounded by the rectangle specified by the
`desired_size`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the desired actual output image size.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
    aug_scale_min: a `float` in range [0, 1.0] representing the minimum random
      scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` in range [1.0, inf) representing the maximum
      random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(
scaled_size[0] / image_size[0], scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.constant(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resize and pad images given the specified short / long side length and the
stride size.
Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and first try to rescale the
     short side of the original image to `short_side`.
  2. If the scaled image after step 1 has a long side that exceeds `long_side`,
     keep the aspect ratio and rescale the long side of the image to
     `long_side`.
  3. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
    aug_scale_min: a `float` in range [0, 1.0] representing the minimum random
      scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` in range [1.0, inf) representing the maximum
      random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side),
scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
def center_crop_image(image):
"""Center crop a square shape slice from the input image.
It crops a square shape slice from the image. The side of the actual crop
is 224 / 256 = 0.875 of the short side of the original image. References:
[1] Very Deep Convolutional Networks for Large-Scale Image Recognition
https://arxiv.org/abs/1409.1556
[2] Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
Args:
image: a Tensor of shape [height, width, 3] representing the input image.
Returns:
cropped_image: a Tensor representing the center cropped image.
"""
with tf.name_scope('center_crop_image'):
image_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
crop_size = (
CENTER_CROP_FRACTION * tf.math.minimum(image_size[0], image_size[1]))
crop_offset = tf.cast((image_size - crop_size) / 2.0, dtype=tf.int32)
crop_size = tf.cast(crop_size, dtype=tf.int32)
cropped_image = image[
crop_offset[0]:crop_offset[0] + crop_size,
crop_offset[1]:crop_offset[1] + crop_size, :]
return cropped_image
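# Illustrative usage sketch (not part of the original file): for a 400x600
# image the crop side is 0.875 * 400 = 350.
def _example_center_crop_image():
  image = tf.zeros([400, 600, 3])
  cropped = center_crop_image(image)
  # cropped has shape [350, 350, 3].
  return cropped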
def center_crop_image_v2(image_bytes, image_shape):
"""Center crop a square shape slice from the input image.
It crops a square shape slice from the image. The side of the actual crop
is 224 / 256 = 0.875 of the short side of the original image. References:
[1] Very Deep Convolutional Networks for Large-Scale Image Recognition
https://arxiv.org/abs/1409.1556
[2] Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
This is a faster version of `center_crop_image` which takes the original
image bytes and image size as the inputs, and partially decode the JPEG
bytes according to the center crop.
Args:
image_bytes: a Tensor of type string representing the raw image bytes.
image_shape: a Tensor specifying the shape of the raw image.
Returns:
cropped_image: a Tensor representing the center cropped image.
"""
with tf.name_scope('center_image_crop_v2'):
image_shape = tf.cast(image_shape, tf.float32)
crop_size = (
CENTER_CROP_FRACTION * tf.math.minimum(image_shape[0], image_shape[1]))
crop_offset = tf.cast((image_shape - crop_size) / 2.0, dtype=tf.int32)
crop_size = tf.cast(crop_size, dtype=tf.int32)
crop_window = tf.stack(
[crop_offset[0], crop_offset[1], crop_size, crop_size])
cropped_image = tf.image.decode_and_crop_jpeg(
image_bytes, crop_window, channels=3)
return cropped_image
def random_crop_image(image,
aspect_ratio_range=(3. / 4., 4. / 3.),
area_range=(0.08, 1.0),
max_attempts=10,
seed=1):
"""Randomly crop an arbitrary shaped slice from the input image.
Args:
image: a Tensor of shape [height, width, 3] representing the input image.
aspect_ratio_range: a list of floats. The cropped area of the image must
have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image that satisfies the specified constraints. After max_attempts
      failures, the entire image is returned.
seed: the seed of the random generator.
Returns:
cropped_image: a Tensor representing the random cropped image. Can be the
original image if max_attempts is exhausted.
"""
with tf.name_scope('random_crop_image'):
crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
tf.shape(image),
tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
seed=seed,
min_object_covered=area_range[0],
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts)
cropped_image = tf.slice(image, crop_offset, crop_size)
return cropped_image
def random_crop_image_v2(image_bytes,
image_shape,
aspect_ratio_range=(3. / 4., 4. / 3.),
area_range=(0.08, 1.0),
max_attempts=10,
seed=1):
"""Randomly crop an arbitrary shaped slice from the input image.
This is a faster version of `random_crop_image` which takes the original
image bytes and image size as the inputs, and partially decode the JPEG
bytes according to the generated crop.
Args:
image_bytes: a Tensor of type string representing the raw image bytes.
image_shape: a Tensor specifying the shape of the raw image.
aspect_ratio_range: a list of floats. The cropped area of the image must
have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image that satisfies the specified constraints. After max_attempts
      failures, the entire image is returned.
seed: the seed of the random generator.
Returns:
cropped_image: a Tensor representing the random cropped image. Can be the
original image if max_attempts is exhausted.
"""
with tf.name_scope('random_crop_image_v2'):
crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
image_shape,
tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
seed=seed,
min_object_covered=area_range[0],
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts)
offset_y, offset_x, _ = tf.unstack(crop_offset)
crop_height, crop_width, _ = tf.unstack(crop_size)
crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width])
cropped_image = tf.image.decode_and_crop_jpeg(
image_bytes, crop_window, channels=3)
return cropped_image
def resize_and_crop_boxes(boxes,
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset.
Args:
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
"""
with tf.name_scope('resize_and_crop_boxes'):
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = box_ops.clip_boxes(boxes, output_size)
return boxes
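# Illustrative usage sketch (not part of the original file): boxes in
# [ymin, xmin, ymax, xmax] order are scaled, shifted by the crop offset and
# clipped; all values below are hypothetical.
def _example_resize_and_crop_boxes():
  boxes = tf.constant([[10., 20., 30., 40.]])
  scaled_boxes = resize_and_crop_boxes(
      boxes,
      image_scale=tf.constant([0.5, 0.5]),
      output_size=[100, 100],
      offset=tf.constant([0., 0.]))
  # scaled_boxes is [[5., 10., 15., 20.]].
  return scaled_boxes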
def resize_and_crop_masks(masks,
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset.
Args:
masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
"""
with tf.name_scope('resize_and_crop_masks'):
mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32)
# Pad masks to avoid empty mask annotations.
masks = tf.concat(
[tf.zeros([1, mask_size[0], mask_size[1], 1]), masks], axis=0)
scaled_size = tf.cast(image_scale * mask_size, tf.int32)
scaled_masks = tf.image.resize(
masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
offset = tf.cast(offset, tf.int32)
scaled_masks = scaled_masks[
:,
offset[0]:offset[0] + output_size[0],
offset[1]:offset[1] + output_size[1],
:]
output_masks = tf.image.pad_to_bounding_box(
scaled_masks, 0, 0, output_size[0], output_size[1])
    # Removes the dummy mask that was prepended above.
    output_masks = output_masks[1:]
return output_masks
def horizontal_flip_image(image):
"""Flips image horizontally."""
return tf.image.flip_left_right(image)
def horizontal_flip_boxes(normalized_boxes):
"""Flips normalized boxes horizontally."""
ymin, xmin, ymax, xmax = tf.split(
value=normalized_boxes, num_or_size_splits=4, axis=1)
flipped_xmin = tf.subtract(1.0, xmax)
flipped_xmax = tf.subtract(1.0, xmin)
flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
return flipped_boxes
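# Illustrative usage sketch (not part of the original file): boxes are in
# normalized [ymin, xmin, ymax, xmax] order, so only the x coordinates are
# mirrored around the vertical center line.
def _example_horizontal_flip_boxes():
  boxes = tf.constant([[0.1, 0.2, 0.5, 0.6]])
  flipped = horizontal_flip_boxes(boxes)
  # flipped is [[0.1, 0.4, 0.5, 0.8]].
  return flipped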
def horizontal_flip_masks(masks):
"""Flips masks horizontally."""
return masks[:, :, ::-1]
def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
"""Randomly flips input image and bounding boxes."""
with tf.name_scope('random_horizontal_flip'):
do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)
image = tf.cond(
do_flip,
lambda: horizontal_flip_image(image),
lambda: image)
if normalized_boxes is not None:
normalized_boxes = tf.cond(
do_flip,
lambda: horizontal_flip_boxes(normalized_boxes),
lambda: normalized_boxes)
if masks is not None:
masks = tf.cond(
do_flip,
lambda: horizontal_flip_masks(masks),
lambda: masks)
return image, normalized_boxes, masks
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for processing video dataset features."""
from typing import Optional
import tensorflow as tf
def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
num_steps: int,
stride: int,
offset: tf.Tensor) -> tf.Tensor:
"""Returns indices to take for sampling or padding sequences to fixed size."""
sequence_length = tf.shape(sequence)[0]
sel_idx = tf.range(sequence_length)
# Repeats sequence until num_steps are available in total.
max_length = num_steps * stride + offset
num_repeats = tf.math.floordiv(
max_length + sequence_length - 1, sequence_length)
sel_idx = tf.tile(sel_idx, [num_repeats])
steps = tf.range(offset, offset + num_steps * stride, stride)
return tf.gather(sel_idx, steps)
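# Illustrative sketch (not part of the original file): for a sequence of
# length 5 with num_steps=4, stride=2 and offset=1, the sequence is repeated
# so the returned frame indices wrap around: [1, 3, 0, 2].
def _example_sample_or_pad_sequence_indices():
  indices = _sample_or_pad_sequence_indices(
      sequence=tf.range(5), num_steps=4, stride=2, offset=tf.constant(1))
  return indices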
def sample_linspace_sequence(sequence: tf.Tensor,
num_windows: int,
num_steps: int,
stride: int) -> tf.Tensor:
"""Samples `num_windows` segments from sequence with linearly spaced offsets.
The samples are concatenated in a single `tf.Tensor` in order to have the same
format structure per timestep (e.g. a single frame). If `num_steps` * `stride`
is bigger than the number of timesteps, the sequence is repeated. This
function can be used in evaluation in order to extract enough segments to span
the entire sequence.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_windows: Number of windows retrieved from the sequence.
num_steps: Number of steps (e.g. frames) to take.
stride: Distance to sample between timesteps.
Returns:
    A single `tf.Tensor` with first dimension `num_windows` * `num_steps`. The
    tensor contains the concatenated list of `num_windows` tensors whose
    offsets have been linearly spaced over the input sequence.
"""
sequence_length = tf.shape(sequence)[0]
max_offset = tf.maximum(0, sequence_length - num_steps * stride)
offsets = tf.linspace(0.0, tf.cast(max_offset, tf.float32), num_windows)
offsets = tf.cast(offsets, tf.int32)
all_indices = []
for i in range(num_windows):
all_indices.append(_sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offsets[i]))
indices = tf.concat(all_indices, axis=0)
indices.set_shape((num_windows * num_steps,))
return tf.gather(sequence, indices)
def sample_sequence(sequence: tf.Tensor,
num_steps: int,
random: bool,
stride: int,
seed: Optional[int] = None) -> tf.Tensor:
"""Samples a single segment of size `num_steps` from a given sequence.
  If `random` is not `True`, this function will simply sample the central
  window of the sequence. Otherwise, a random offset is chosen such that a
  window of `num_steps` frames can be extracted from the sequence.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_steps: Number of steps (e.g. frames) to take.
random: A boolean indicating whether to random sample the single window. If
`True`, the offset is randomized. If `False`, the middle frame minus half
of `num_steps` is the first frame.
stride: Distance to sample between timesteps.
seed: A deterministic seed to use when sampling.
Returns:
A single `tf.Tensor` with first dimension `num_steps` with the sampled
segment.
"""
sequence_length = tf.shape(sequence)[0]
if random:
sequence_length = tf.cast(sequence_length, tf.float32)
max_offset = tf.cond(
sequence_length > (num_steps - 1) * stride,
lambda: sequence_length - (num_steps - 1) * stride,
lambda: sequence_length)
offset = tf.random.uniform(
(),
maxval=tf.cast(max_offset, dtype=tf.int32),
dtype=tf.int32,
seed=seed)
else:
offset = (sequence_length - num_steps * stride) // 2
offset = tf.maximum(0, offset)
indices = _sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offset)
indices.set_shape((num_steps,))
return tf.gather(sequence, indices)
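# Illustrative sketch (not part of the original file): without random
# sampling, the central window is taken, e.g. frames 3..6 of a 10-frame
# sequence.
def _example_sample_sequence():
  segment = sample_sequence(tf.range(10), num_steps=4, random=False, stride=1)
  # segment is [3, 4, 5, 6].
  return segment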
def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""Decodes JPEG raw bytes string into a RGB uint8 Tensor.
Args:
image_string: A `tf.Tensor` of type strings with the raw JPEG bytes where
the first dimension is timesteps.
channels: Number of channels of the JPEG image. Allowed values are 0, 1 and
3. If 0, the number of channels will be calculated at runtime and no
static shape is set.
Returns:
A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
"""
return tf.map_fn(
lambda x: tf.image.decode_jpeg(x, channels=channels),
image_string, back_prop=False, dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
height: int,
width: int,
random: bool = False,
seed: Optional[int] = None) -> tf.Tensor:
"""Crops the image sequence of images.
If requested size is bigger than image size, image is padded with 0. If not
random cropping, a central crop is performed.
Args:
frames: A Tensor of dimension [timesteps, in_height, in_width, channels].
height: Cropped image height.
width: Cropped image width.
random: A boolean indicating if crop should be randomized.
seed: A deterministic seed to use when random cropping.
Returns:
A Tensor of shape [timesteps, out_height, out_width, channels] of type uint8
with the cropped images.
"""
if random:
# Random spatial crop.
shape = tf.shape(frames)
# If a static_shape is available (e.g. when using this method from add_image
# method), it will be used to have an output tensor with static shape.
static_shape = frames.shape.as_list()
seq_len = shape[0] if static_shape[0] is None else static_shape[0]
channels = shape[3] if static_shape[3] is None else static_shape[3]
frames = tf.image.random_crop(frames, (seq_len, height, width, channels),
seed)
else:
# Central crop or pad.
frames = tf.image.resize_with_crop_or_pad(frames, height, width)
return frames
def resize_smallest(frames: tf.Tensor,
min_resize: int) -> tf.Tensor:
"""Resizes frames so that min(`height`, `width`) is equal to `min_resize`.
  This function is a no-op if min(`height`, `width`) is already equal to
  `min_resize`, which saves compute time.
Args:
frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
min_resize: Minimum size of the final image dimensions.
Returns:
A Tensor of shape [timesteps, output_h, output_w, channels] of type
frames.dtype where min(output_h, output_w) = min_resize.
"""
shape = tf.shape(frames)
input_h = shape[1]
input_w = shape[2]
output_h = tf.maximum(min_resize, (input_h * min_resize) // input_w)
output_w = tf.maximum(min_resize, (input_w * min_resize) // input_h)
def resize_fn():
frames_resized = tf.image.resize(frames, (output_h, output_w))
return tf.cast(frames_resized, frames.dtype)
should_resize = tf.math.logical_or(tf.not_equal(input_w, output_w),
tf.not_equal(input_h, output_h))
frames = tf.cond(should_resize, resize_fn, lambda: frames)
return frames
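# Illustrative sketch (not part of the original file): the short side is
# resized to `min_resize` while the aspect ratio is preserved, e.g.
# (6, 90, 120, 3) frames become (6, 45, 60, 3) with min_resize=45.
def _example_resize_smallest():
  frames = tf.zeros([6, 90, 120, 3], dtype=tf.uint8)
  resized = resize_smallest(frames, min_resize=45)
  return resized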
def random_flip_left_right(
frames: tf.Tensor,
seed: Optional[int] = None) -> tf.Tensor:
"""Flips all the frames with a probability of 50%.
Args:
frames: A Tensor of shape [timesteps, input_h, input_w, channels].
seed: A seed to use for the random sampling.
Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels], possibly
    flipped left-right.
"""
is_flipped = tf.random.uniform(
(), minval=0, maxval=2, dtype=tf.int32, seed=seed)
frames = tf.cond(tf.equal(is_flipped, 1),
true_fn=lambda: tf.image.flip_left_right(frames),
false_fn=lambda: frames)
return frames
def normalize_image(frames: tf.Tensor,
zero_centering_image: bool,
dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
"""Normalizes images.
Args:
frames: A Tensor of numbers.
zero_centering_image: If True, results are in [-1, 1], if False, results are
in [0, 1].
dtype: Type of output Tensor.
Returns:
A Tensor of same shape as the input and of the given type.
"""
frames = tf.cast(frames, dtype)
if zero_centering_image:
return frames * (2.0 / 255.0) - 1.0
else:
return frames / 255.0
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for preprocess_ops_3d.py."""
import io
import itertools
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops_3d
class ParserUtilsTest(tf.test.TestCase):
def setUp(self):
super().setUp()
# [[0, 1, ..., 119], [1, 2, ..., 120], ..., [119, 120, ..., 218]].
self._frames = tf.stack([tf.range(i, i + 120) for i in range(90)])
self._frames = tf.cast(self._frames, tf.uint8)
self._frames = self._frames[tf.newaxis, :, :, tf.newaxis]
self._frames = tf.broadcast_to(self._frames, (6, 90, 120, 3))
# Create an equivalent numpy array for assertions.
self._np_frames = np.array([range(i, i + 120) for i in range(90)])
self._np_frames = self._np_frames[np.newaxis, :, :, np.newaxis]
self._np_frames = np.broadcast_to(self._np_frames, (6, 90, 120, 3))
def test_sample_linspace_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 10, 10, 1)
sampled_seq_2 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 7, 10, 1)
sampled_seq_3 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 7, 5, 2)
sampled_seq_4 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 101, 1, 1)
self.assertAllEqual(sampled_seq_1, range(100))
# [0, 1, 2, 3, 4, ..., 8, 9, 15, 16, ..., 97, 98, 99]
self.assertAllEqual(
sampled_seq_2,
[15 * i + j for i, j in itertools.product(range(7), range(10))])
# [0, 2, 4, 6, 8, 15, 17, 19, ..., 96, 98]
self.assertAllEqual(
sampled_seq_3,
[15 * i + 2 * j for i, j in itertools.product(range(7), range(5))])
self.assertAllEqual(sampled_seq_4, [0] + list(range(100)))
def test_sample_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 1)
sampled_seq_2 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 2)
sampled_seq_3 = preprocess_ops_3d.sample_sequence(sequence, 10, True, 1)
self.assertAllEqual(sampled_seq_1, range(45, 55))
self.assertAllEqual(sampled_seq_2, range(40, 60, 2))
offset_3 = sampled_seq_3[0]
self.assertBetween(offset_3, 0, 99)
self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))
def test_decode_jpeg(self):
# Create a random RGB JPEG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_jpeg(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_crop_image(self):
cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
cropped_image_3 = preprocess_ops_3d.crop_image(self._frames, 50, 70, True)
self.assertAllEqual(cropped_image_1.shape, (6, 50, 70, 3))
self.assertAllEqual(cropped_image_1, self._np_frames[:, 20:70, 25:95, :])
self.assertAllEqual(cropped_image_2.shape, (6, 200, 200, 3))
expected = np.pad(
self._np_frames, ((0, 0), (55, 55), (40, 40), (0, 0)), 'constant')
self.assertAllEqual(cropped_image_2, expected)
self.assertAllEqual(cropped_image_3.shape, (6, 50, 70, 3))
offset = cropped_image_3[0, 0, 0, 0]
expected = np.array([range(i, i + 70) for i in range(offset, offset + 50)])
expected = expected[np.newaxis, :, :, np.newaxis]
expected = np.broadcast_to(expected, (6, 50, 70, 3))
self.assertAllEqual(cropped_image_3, expected)
def test_resize_smallest(self):
resized_frames_1 = preprocess_ops_3d.resize_smallest(self._frames, 180)
resized_frames_2 = preprocess_ops_3d.resize_smallest(self._frames, 45)
resized_frames_3 = preprocess_ops_3d.resize_smallest(self._frames, 90)
resized_frames_4 = preprocess_ops_3d.resize_smallest(
tf.transpose(self._frames, (0, 2, 1, 3)), 45)
self.assertAllEqual(resized_frames_1.shape, (6, 180, 240, 3))
self.assertAllEqual(resized_frames_2.shape, (6, 45, 60, 3))
self.assertAllEqual(resized_frames_3.shape, (6, 90, 120, 3))
self.assertAllEqual(resized_frames_4.shape, (6, 60, 45, 3))
def test_random_flip_left_right(self):
flipped_frames = preprocess_ops_3d.random_flip_left_right(self._frames)
flipped = np.fliplr(self._np_frames[0, :, :, 0])
flipped = flipped[np.newaxis, :, :, np.newaxis]
flipped = np.broadcast_to(flipped, (6, 90, 120, 3))
self.assertTrue((flipped_frames == self._np_frames).numpy().all() or (
flipped_frames == flipped).numpy().all())
def test_normalize_image(self):
normalized_images_1 = preprocess_ops_3d.normalize_image(
self._frames, False, tf.float32)
normalized_images_2 = preprocess_ops_3d.normalize_image(
self._frames, True, tf.float32)
self.assertAllClose(normalized_images_1, self._np_frames / 255)
self.assertAllClose(normalized_images_2, self._np_frames * 2 / 255 - 1.0)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for preprocess_ops.py."""
import io
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class InputUtilsTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
([1], 10),
([1, 2], 10),
([1, 2, 3], 10),
([11], 10),
([12, 2], 10),
([13, 2, 3], 10),
)
def testPadToFixedSize(self, input_shape, output_size):
# Copies input shape to padding shape.
clip_shape = input_shape[:]
clip_shape[0] = min(output_size, clip_shape[0])
padding_shape = input_shape[:]
padding_shape[0] = max(output_size - input_shape[0], 0)
expected_outputs = np.concatenate(
[np.ones(clip_shape), np.zeros(padding_shape)], axis=0)
data = tf.ones(input_shape)
output_data = preprocess_ops.clip_or_pad_to_fixed_size(
data, output_size, constant_values=0)
output_data = output_data.numpy()
self.assertAllClose(output_size, output_data.shape[0])
self.assertAllClose(expected_outputs, output_data)
@parameterized.parameters(
(100, 200, 100, 200, 32, 1.0, 1.0, 128, 224),
(100, 256, 128, 256, 32, 1.0, 1.0, 128, 256),
(200, 512, 200, 128, 32, 0.25, 0.25, 224, 128),
)
  def testResizeAndCropImageRectangularCase(self,
input_height,
input_width,
desired_height,
desired_width,
stride,
scale_y,
scale_x,
output_height,
output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
desired_size = (desired_height, desired_width)
resized_image, image_info = preprocess_ops.resize_and_crop_image(
image,
desired_size=desired_size,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(100, 200, 220, 220, 32, 1.1, 1.1, 224, 224),
(512, 512, 1024, 1024, 32, 2.0, 2.0, 1024, 1024),
)
def testResizeAndCropImageSquareCase(self,
input_height,
input_width,
desired_height,
desired_width,
stride,
scale_y,
scale_x,
output_height,
output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
desired_size = (desired_height, desired_width)
resized_image, image_info = preprocess_ops.resize_and_crop_image(
image,
desired_size=desired_size,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(100, 200, 100, 300, 32, 1.0, 1.0, 100, 200, 128, 320),
(200, 100, 100, 300, 32, 1.0, 1.0, 200, 100, 320, 128),
(100, 200, 80, 100, 32, 0.5, 0.5, 50, 100, 96, 128),
(200, 100, 80, 100, 32, 0.5, 0.5, 100, 50, 128, 96),
)
def testResizeAndCropImageV2(self,
input_height,
input_width,
short_side,
long_side,
stride,
scale_y,
scale_x,
desired_height,
desired_width,
output_height,
output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
image_shape = tf.shape(image)[0:2]
desired_size = tf.where(
tf.greater(image_shape[0], image_shape[1]),
tf.constant([long_side, short_side], dtype=tf.int32),
tf.constant([short_side, long_side], dtype=tf.int32))
resized_image, image_info = preprocess_ops.resize_and_crop_image_v2(
image,
short_side=short_side,
long_side=long_side,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(400, 600), (600, 400),
)
def testCenterCropImage(self,
input_height,
input_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
cropped_image = preprocess_ops.center_crop_image(image)
cropped_image_shape = tf.shape(cropped_image)
self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())
@parameterized.parameters(
(400, 600), (600, 400),
)
def testCenterCropImageV2(self,
input_height,
input_width):
image_bytes = tf.constant(
_encode_image(
np.uint8(np.random.rand(input_height, input_width, 3) * 255),
fmt='JPEG'),
dtype=tf.string)
cropped_image = preprocess_ops.center_crop_image_v2(
image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
cropped_image_shape = tf.shape(cropped_image)
self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())
@parameterized.parameters(
(400, 600), (600, 400),
)
def testRandomCropImage(self,
input_height,
input_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
_ = preprocess_ops.random_crop_image(image)
@parameterized.parameters(
(400, 600), (600, 400),
)
def testRandomCropImageV2(self,
input_height,
input_width):
image_bytes = tf.constant(
_encode_image(
np.uint8(np.random.rand(input_height, input_width, 3) * 255),
fmt='JPEG'),
dtype=tf.string)
_ = preprocess_ops.random_crop_image_v2(
image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range
[0,1]. The class presumes there are many more negatives than positive examples:
if the desired batch_size cannot be achieved with the pre-specified positive
fraction, it fills the rest with negative examples. If this is not sufficient
for obtaining the desired batch_size, it returns fewer examples.
The main function to call is Subsample(self, indicator, labels). For convenience
one can also call SubsampleWeights(self, weights, labels) which is defined in
the minibatch_sampler base class.
When is_static is True, it implements a method that guarantees static shapes.
It also ensures that the length of the subsample output is always batch_size,
even when the number of examples set to True in the indicator is less than
batch_size.
This is originally implemented in TensorFlow Object Detection API.
"""
# Import libraries
import tensorflow as tf
def combined_static_and_dynamic_shape(tensor):
"""Returns a list containing static and dynamic values for the dimensions.
Returns a list of static and dynamic values for shape dimensions. This is
useful to preserve static shapes when available in reshape operation.
Args:
tensor: A tensor of any type.
Returns:
A list of size tensor.shape.ndims containing integers or a scalar tensor.
"""
static_tensor_shape = tensor.shape.as_list()
dynamic_tensor_shape = tf.shape(input=tensor)
combined_shape = []
for index, dim in enumerate(static_tensor_shape):
if dim is not None:
combined_shape.append(dim)
else:
combined_shape.append(dynamic_tensor_shape[index])
return combined_shape
def indices_to_dense_vector(indices,
size,
indices_value=1.,
default_value=0,
dtype=tf.float32):
"""Creates dense vector with indices set to specific value and rest to zeros.
This function exists because it is unclear if it is safe to use
tf.sparse_to_dense(indices, [size], 1, validate_indices=False)
with indices which are not ordered.
This function accepts a dynamic size (e.g. tf.shape(tensor)[0])
Args:
indices: 1d Tensor with integer indices which are to be set to
indices_values.
size: scalar with size (integer) of output Tensor.
indices_value: values of elements specified by indices in the output vector
default_value: values of other elements in the output vector.
dtype: data type.
Returns:
dense 1D Tensor of shape [size] with indices set to indices_values and the
rest set to default_value.
"""
size = tf.cast(size, dtype=tf.int32)
zeros = tf.ones([size], dtype=dtype) * default_value
values = tf.ones_like(indices, dtype=dtype) * indices_value
return tf.dynamic_stitch(
[tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values])
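# Illustrative sketch (not part of the original file): scattering the value
# 1.0 at positions 1 and 3 of a length-5 vector.
def _example_indices_to_dense_vector():
  dense = indices_to_dense_vector(tf.constant([1, 3]), size=5)
  # dense is [0., 1., 0., 1., 0.].
  return dense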
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
"""Matrix multiplication based implementation of tf.gather on zeroth axis.
TODO(rathodv, jonathanhuang): enable sparse matmul option.
Args:
params: A float32 Tensor. The tensor from which to gather values.
Must be at least rank 1.
indices: A Tensor. Must be one of the following types: int32, int64.
Must be in range [0, params.shape[0])
scope: A name for the operation (optional).
Returns:
A Tensor. Has the same type as params. Values from params gathered
from indices given by indices, with shape indices.shape + params.shape[1:].
"""
scope = scope or 'MatMulGather'
with tf.name_scope(scope):
params_shape = combined_static_and_dynamic_shape(params)
indices_shape = combined_static_and_dynamic_shape(indices)
params2d = tf.reshape(params, [params_shape[0], -1])
indicator_matrix = tf.one_hot(indices, params_shape[0])
gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
return tf.reshape(gathered_result_flattened,
tf.stack(indices_shape + params_shape[1:]))
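# Illustrative example (assumed values): this is numerically equivalent to
# tf.gather(params, indices, axis=0) but expressed as a dense matmul, which
# preserves static shapes, e.g.
#   params = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
#   matmul_gather_on_zeroth_axis(params, tf.constant([2, 0]))
#   -> [[5., 6.], [1., 2.]]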
class BalancedPositiveNegativeSampler:
"""Subsamples minibatches to a desired balance of positives and negatives."""
def __init__(self, positive_fraction=0.5, is_static=False):
"""Constructs a minibatch sampler.
Args:
positive_fraction: desired fraction of positive examples (scalar in [0,1])
in the batch.
is_static: If True, uses an implementation with static shape guarantees.
Raises:
ValueError: if positive_fraction < 0, or positive_fraction > 1
"""
if positive_fraction < 0 or positive_fraction > 1:
raise ValueError('positive_fraction should be in range [0,1]. '
'Received: %s.' % positive_fraction)
self._positive_fraction = positive_fraction
self._is_static = is_static
@staticmethod
def subsample_indicator(indicator, num_samples):
"""Subsample indicator vector.
Given a boolean indicator vector with M elements set to `True`, the function
assigns all but `num_samples` of these previously `True` elements to
`False`. If `num_samples` is greater than M, the original indicator vector
is returned.
Args:
indicator: a 1-dimensional boolean tensor indicating which elements
are allowed to be sampled and which are not.
num_samples: int32 scalar tensor
Returns:
a boolean tensor with the same shape as input (indicator) tensor
"""
indices = tf.where(indicator)
indices = tf.random.shuffle(indices)
indices = tf.reshape(indices, [-1])
num_samples = tf.minimum(tf.size(input=indices), num_samples)
selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))
selected_indicator = indices_to_dense_vector(
selected_indices,
tf.shape(input=indicator)[0])
return tf.equal(selected_indicator, 1)
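  # Illustrative example (assumed values): with
  #   indicator = tf.constant([True, True, False, True]) and num_samples = 2,
  # subsample_indicator returns a boolean vector in which exactly two of the
  # three True positions are kept (chosen at random) and everything else is
  # False.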
def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size):
"""Counts the number of positives and negatives numbers to be sampled.
Args:
sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains
the signed indices of the examples where the sign is based on the label
value. The examples that cannot be sampled are set to 0. It samples
        at most sample_size * positive_fraction positive examples and fills the
        rest with negative examples.
sample_size: Size of subsamples.
Returns:
A tuple containing the number of positive and negative labels in the
subsample.
"""
input_length = tf.shape(input=sorted_indices_tensor)[0]
valid_positive_index = tf.greater(sorted_indices_tensor,
tf.zeros(input_length, tf.int32))
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(valid_positive_index, tf.int32))
max_num_positive_samples = tf.constant(
int(sample_size * self._positive_fraction), tf.int32)
num_positive_samples = tf.minimum(max_num_positive_samples, num_sampled_pos)
num_negative_samples = tf.constant(sample_size,
tf.int32) - num_positive_samples
return num_positive_samples, num_negative_samples
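  # Illustrative example (assumed values): with positive_fraction = 0.5,
  # sample_size = 4 and a sorted signed index tensor containing one positive
  # entry, the helper above returns (1, 3): one positive and three negatives.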
def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
num_end_samples, total_num_samples):
"""slices num_start_samples and last num_end_samples from input_tensor.
Args:
input_tensor: An int32 tensor of shape [N] to be sliced.
num_start_samples: Number of examples to be sliced from the beginning
of the input tensor.
num_end_samples: Number of examples to be sliced from the end of the
input tensor.
      total_num_samples: Sum of num_start_samples and num_end_samples. This
        should be a scalar.
Returns:
A tensor containing the first num_start_samples and last num_end_samples
from input_tensor.
"""
input_length = tf.shape(input=input_tensor)[0]
start_positions = tf.less(tf.range(input_length), num_start_samples)
end_positions = tf.greater_equal(
tf.range(input_length), input_length - num_end_samples)
selected_positions = tf.logical_or(start_positions, end_positions)
selected_positions = tf.cast(selected_positions, tf.float32)
indexed_positions = tf.multiply(tf.cumsum(selected_positions),
selected_positions)
one_hot_selector = tf.one_hot(tf.cast(indexed_positions, tf.int32) - 1,
total_num_samples,
dtype=tf.float32)
return tf.cast(tf.tensordot(tf.cast(input_tensor, tf.float32),
one_hot_selector, axes=[0, 0]), tf.int32)
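  # Illustrative example (assumed values): with
  #   input_tensor = [9, 8, 7, 6, 5], num_start_samples = 2,
  #   num_end_samples = 1, total_num_samples = 3,
  # the helper above returns [9, 8, 5]: the first two and the last entry,
  # gathered through a one-hot selector so the output length stays static.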
def _static_subsample(self, indicator, batch_size, labels):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
        N should be a compile-time constant.
batch_size: desired batch size. This scalar cannot be None.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
        (=False) examples. N should be a compile-time constant.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled. It ensures the length of output of the subsample is always
batch_size, even when number of examples set to True in indicator is
less than batch_size.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
# Check if indicator and labels have a static size.
    if not indicator.shape.is_fully_defined():
      raise ValueError('indicator must be static in shape when is_static is '
                       'True.')
    if not labels.shape.is_fully_defined():
      raise ValueError('labels must be static in shape when is_static is '
                       'True.')
    if not isinstance(batch_size, int):
      raise ValueError('batch_size has to be an integer when is_static is '
                       'True.')
input_length = tf.shape(input=indicator)[0]
# Set the number of examples set True in indicator to be at least
# batch_size.
num_true_sampled = tf.reduce_sum(
input_tensor=tf.cast(indicator, tf.float32))
additional_false_sample = tf.less_equal(
tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
batch_size - num_true_sampled)
indicator = tf.logical_or(indicator, additional_false_sample)
# Shuffle indicator and label. Need to store the permutation to restore the
# order post sampling.
permutation = tf.random.shuffle(tf.range(input_length))
indicator = matmul_gather_on_zeroth_axis(
tf.cast(indicator, tf.float32), permutation)
labels = matmul_gather_on_zeroth_axis(
tf.cast(labels, tf.float32), permutation)
# index (starting from 1) when indicator is True, 0 when False
indicator_idx = tf.where(
tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
tf.zeros(input_length, tf.int32))
# Replace -1 for negative, +1 for positive labels
signed_label = tf.where(
tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
# negative of index for negative label, positive index for positive label,
# 0 when indicator is False.
signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
sorted_signed_indicator_idx = tf.nn.top_k(
signed_indicator_idx, input_length, sorted=True).values
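    # Worked example (assumed values): with indicator = [T, F, T, T] and
    # labels = [T, F, F, T] (ignoring the shuffle), indicator_idx = [1, 0, 3, 4]
    # and signed_label = [1, -1, -1, 1], so signed_indicator_idx = [1, 0, -3, 4].
    # Sorting in descending order gives [4, 1, 0, -3]: positives first, then
    # non-sampleable zeros, then negatives with the most negative index last.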
[num_positive_samples,
num_negative_samples] = self._get_num_pos_neg_samples(
sorted_signed_indicator_idx, batch_size)
sampled_idx = self._get_values_from_start_and_end(
sorted_signed_indicator_idx, num_positive_samples,
num_negative_samples, batch_size)
# Shift the indices to start from 0 and remove any samples that are set as
# False.
sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
sampled_idx = tf.multiply(
tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
sampled_idx)
sampled_idx_indicator = tf.cast(
tf.reduce_sum(
input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
tf.bool)
# project back the order based on stored permutations
reprojections = tf.one_hot(permutation, depth=input_length,
dtype=tf.float32)
return tf.cast(tf.tensordot(
tf.cast(sampled_idx_indicator, tf.float32),
reprojections, axes=[0, 0]), tf.bool)
def subsample(self, indicator, batch_size, labels, scope=None):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
batch_size: desired batch size. If None, keeps all positive samples and
randomly selects negative samples so that the positive sample fraction
        matches self._positive_fraction. It cannot be None if is_static is True.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples.
scope: name scope.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
if len(indicator.get_shape().as_list()) != 1:
raise ValueError('indicator must be 1 dimensional, got a tensor of '
'shape %s' % indicator.get_shape())
if len(labels.get_shape().as_list()) != 1:
raise ValueError('labels must be 1 dimensional, got a tensor of '
'shape %s' % labels.get_shape())
if labels.dtype != tf.bool:
raise ValueError('labels should be of type bool. Received: %s' %
labels.dtype)
if indicator.dtype != tf.bool:
raise ValueError('indicator should be of type bool. Received: %s' %
indicator.dtype)
scope = scope or 'BalancedPositiveNegativeSampler'
with tf.name_scope(scope):
if self._is_static:
return self._static_subsample(indicator, batch_size, labels)
else:
# Only sample from indicated samples
negative_idx = tf.logical_not(labels)
positive_idx = tf.logical_and(labels, indicator)
negative_idx = tf.logical_and(negative_idx, indicator)
# Sample positive and negative samples separately
if batch_size is None:
max_num_pos = tf.reduce_sum(
input_tensor=tf.cast(positive_idx, dtype=tf.int32))
else:
max_num_pos = int(self._positive_fraction * batch_size)
sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos)
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(sampled_pos_idx, tf.int32))
if batch_size is None:
negative_positive_ratio = (
1 - self._positive_fraction) / self._positive_fraction
max_num_neg = tf.cast(
negative_positive_ratio *
tf.cast(num_sampled_pos, dtype=tf.float32),
dtype=tf.int32)
else:
max_num_neg = batch_size - num_sampled_pos
sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg)
return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
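# Illustrative usage (hedged sketch, not part of the original file): sampling
# a small minibatch with at most 50% positives from per-anchor match labels.
# `match_labels` and `valid_anchors` are hypothetical tensors.
#
#   sampler = BalancedPositiveNegativeSampler(positive_fraction=0.5)
#   match_labels = tf.constant([True, False, False, True, False, False])
#   valid_anchors = tf.constant([True, True, True, True, True, False])
#   sampled = sampler.subsample(
#       indicator=valid_anchors, batch_size=4, labels=match_labels)
#   # `sampled` is a boolean [N] tensor marking at most 2 positives and the
#   # rest negatives, all drawn from positions where valid_anchors is True.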