Commit c8e6faf7 authored by A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 431756117
parent 13a5e4fb
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for detection_generator.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor
class SelectTopKScoresTest(tf.test.TestCase):
  """Checks `detection_generator._select_top_k_scores` on a tiny input."""

  def testSelectTopKScores(self):
    """The two best scores per column are returned along with their rows."""
    num_kept = 2
    # Shape (1, 4, 2); presumably [batch, boxes, classes] -- confirm against
    # `_select_top_k_scores`.
    raw_scores = tf.constant(
        [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]], dtype=tf.float32)

    top_k_scores, top_k_indices = detection_generator._select_top_k_scores(
        raw_scores, pre_nms_num_detections=num_kept)

    # Column 0 keeps rows 2 and 3 (0.5, 0.3); column 1 keeps rows 1 and 3
    # (0.9, 0.5).
    want_scores = np.array([[[0.5, 0.9], [0.3, 0.5]]], dtype=np.float32)
    want_indices = [[[2, 1], [3, 3]]]
    self.assertAllEqual(top_k_scores.numpy(), want_scores)
    self.assertAllEqual(top_k_indices.numpy(), want_indices)
class DetectionGeneratorTest(
    parameterized.TestCase, tf.test.TestCase):
  """Output-shape and config round-trip tests for `DetectionGenerator`."""

  @parameterized.product(
      nms_version=['batched', 'v1', 'v2'],
      use_cpu_nms=[True, False],
      soft_nms_sigma=[None, 0.1])
  def testDetectionsOutputShape(self, nms_version, use_cpu_nms, soft_nms_sigma):
    """Every output is padded to `max_num_detections` for each NMS variant."""
    num_boxes = 84
    num_classes = 4
    max_num_detections = 10
    batch_size = 1

    generator = detection_generator.DetectionGenerator(
        apply_nms=True,
        pre_nms_top_k=5000,
        pre_nms_score_threshold=0.01,
        nms_iou_threshold=0.5,
        max_num_detections=max_num_detections,
        nms_version=nms_version,
        use_cpu_nms=use_cpu_nms,
        soft_nms_sigma=soft_nms_sigma)

    def as_batched_tensor(values, depth):
      # Adds the leading batch dimension of size 1.
      return tf.reshape(
          tf.convert_to_tensor(values, dtype=tf.float32),
          [1, num_boxes, depth])

    # Random class scores in [-1.5, 1.5), one column per class.
    cls_outputs_all = (np.random.rand(num_boxes, num_classes) - 0.5) * 3
    # Random class-specific box encodings and random anchor boxes.
    box_outputs_all = np.random.rand(num_boxes, 4 * num_classes)
    anchor_boxes_all = np.random.rand(num_boxes, 4)

    class_outputs = as_batched_tensor(cls_outputs_all, num_classes)
    box_outputs = as_batched_tensor(box_outputs_all, 4 * num_classes)
    anchor_boxes = as_batched_tensor(anchor_boxes_all, 4)
    image_info = tf.constant(
        [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
        dtype=tf.float32)

    # Only row 1 of `image_info` ([100, 100]) is handed to the generator.
    results = generator(
        box_outputs, class_outputs, anchor_boxes, image_info[:, 1, :])

    expected_shapes = (
        ('detection_boxes', (batch_size, max_num_detections, 4)),
        ('detection_scores', (batch_size, max_num_detections,)),
        ('detection_classes', (batch_size, max_num_detections,)),
        ('num_detections', (batch_size,)),
    )
    for output_name, want_shape in expected_shapes:
      self.assertEqual(results[output_name].numpy().shape, want_shape)

  def test_serialize_deserialize(self):
    """`get_config` echoes the constructor args and survives `from_config`."""
    kwargs = dict(
        apply_nms=True,
        pre_nms_top_k=1000,
        pre_nms_score_threshold=0.1,
        nms_iou_threshold=0.5,
        max_num_detections=10,
        nms_version='v2',
        use_cpu_nms=False,
        soft_nms_sigma=None)
    generator = detection_generator.DetectionGenerator(**kwargs)

    self.assertEqual(generator.get_config(), dict(kwargs))

    restored = detection_generator.DetectionGenerator.from_config(
        generator.get_config())
    self.assertAllEqual(generator.get_config(), restored.get_config())
class MultilevelDetectionGeneratorTest(
    parameterized.TestCase, tf.test.TestCase):
  """Output-shape and config tests for `MultilevelDetectionGenerator`."""

  @parameterized.parameters(
      ('batched', False, True, None, None),
      ('batched', False, False, None, None),
      ('v2', False, True, None, None),
      ('v2', False, False, None, None),
      ('v1', True, True, 0.0, None),
      ('v1', True, False, 0.1, None),
      ('v1', True, False, None, None),
      ('tflite', False, False, None, True),
      ('tflite', False, False, None, False),
  )
  def testDetectionsOutputShape(self, nms_version, has_att_heads, use_cpu_nms,
                                soft_nms_sigma, use_regular_nms):
    """Checks output shapes for each NMS variant, with/without attributes."""
    min_level, max_level = 4, 6
    num_scales = 2
    aspect_ratios = [1.0, 2.0]
    anchor_scale = 2.0
    output_size = [64, 64]
    num_classes = 4
    max_num_detections = 10
    batch_size = 1
    # 8*8 + 4*4 + 2*2 feature-map locations over levels 4, 5 and 6.
    num_locations = 84
    # (level key, rows of the flat random array, feature-map side length).
    level_layout = (
        ('4', slice(0, 64), 8),
        ('5', slice(64, 80), 4),
        ('6', slice(80, 84), 2),
    )

    def split_by_level(flat_values, depth):
      # Carves the flat [84, depth] array into one [1, H, W, depth] tensor
      # per level.
      return {
          level: tf.reshape(
              tf.convert_to_tensor(flat_values[rows], dtype=tf.float32),
              [1, side, side, depth])
          for level, rows, side in level_layout
      }

    generator_kwargs = dict(
        apply_nms=True,
        pre_nms_top_k=5000,
        pre_nms_score_threshold=0.01,
        nms_iou_threshold=0.5,
        max_num_detections=max_num_detections,
        nms_version=nms_version,
        use_cpu_nms=use_cpu_nms,
        soft_nms_sigma=soft_nms_sigma,
        tflite_post_processing_config={
            'max_detections': max_num_detections,
            'max_classes_per_detection': 1,
            'use_regular_nms': use_regular_nms,
            'nms_score_threshold': 0.01,
            'nms_iou_threshold': 0.5
        })

    input_anchor = anchor.build_anchor_generator(min_level, max_level,
                                                 num_scales, aspect_ratios,
                                                 anchor_scale)
    anchor_boxes = input_anchor(output_size)

    # Random class scores in [-1.5, 1.5) and random box encodings.
    cls_outputs_all = (np.random.rand(num_locations, num_classes) - 0.5) * 3
    box_outputs_all = np.random.rand(num_locations, 4)
    class_outputs = split_by_level(cls_outputs_all, num_classes)
    box_outputs = split_by_level(box_outputs_all, 4)

    att_outputs = None
    if has_att_heads:
      # A single scalar attribute head named 'depth'.
      att_outputs_all = np.random.rand(num_locations, 1)
      att_outputs = {'depth': split_by_level(att_outputs_all, 1)}

    image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
                             dtype=tf.float32)
    generator = detection_generator.MultilevelDetectionGenerator(
        **generator_kwargs)
    # Only row 1 of `image_info` ([100, 100]) is handed to the generator.
    results = generator(box_outputs, class_outputs, anchor_boxes,
                        image_info[:, 1, :], att_outputs)

    boxes = results['detection_boxes']
    classes = results['detection_classes']
    scores = results['detection_scores']
    valid_detections = results['num_detections']

    if nms_version == 'tflite':
      # When nms_version is `tflite`, all output tensors are empty as the
      # actual post-processing happens in the TFLite model.
      for output in (boxes, scores, classes, valid_detections):
        self.assertEqual(output.numpy().shape, ())
      return

    per_detection_shape = (batch_size, max_num_detections,)
    self.assertEqual(boxes.numpy().shape, per_detection_shape + (4,))
    self.assertEqual(scores.numpy().shape, per_detection_shape)
    self.assertEqual(classes.numpy().shape, per_detection_shape)
    self.assertEqual(valid_detections.numpy().shape, (batch_size,))
    if has_att_heads:
      for att in results['detection_attributes'].values():
        self.assertEqual(att.numpy().shape, per_detection_shape + (1,))

  def test_serialize_deserialize(self):
    """`get_config` echoes the constructor args and survives `from_config`."""
    kwargs = dict(
        apply_nms=True,
        pre_nms_top_k=1000,
        pre_nms_score_threshold=0.1,
        nms_iou_threshold=0.5,
        max_num_detections=10,
        nms_version='v2',
        use_cpu_nms=False,
        soft_nms_sigma=None,
        tflite_post_processing_config={
            'max_detections': 100,
            'max_classes_per_detection': 1,
            'use_regular_nms': True,
            'nms_score_threshold': 0.01,
            'nms_iou_threshold': 0.5
        })
    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)

    self.assertEqual(generator.get_config(), dict(kwargs))

    restored = detection_generator.MultilevelDetectionGenerator.from_config(
        generator.get_config())
    self.assertAllEqual(generator.get_config(), restored.get_config())
# Run the test cases above when this file is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of mask sampler."""
# Import libraries
import tensorflow as tf
from official.vision.ops import spatial_transform_ops
def _sample_and_crop_foreground_masks(candidate_rois: tf.Tensor,
                                      candidate_gt_boxes: tf.Tensor,
                                      candidate_gt_classes: tf.Tensor,
                                      candidate_gt_indices: tf.Tensor,
                                      gt_masks: tf.Tensor,
                                      num_sampled_masks: int = 128,
                                      mask_target_size: int = 28):
  """Picks foreground RoIs and crops their groundtruth masks for training.

  RoIs are ranked with `tf.nn.top_k` on a 0/1 foreground indicator
  (`candidate_gt_classes > 0`), so foreground RoIs are selected first. When
  fewer than `num_sampled_masks` RoIs are foreground, the remaining slots are
  filled with background RoIs.

  Args:
    candidate_rois: A `tf.Tensor` of shape [batch_size, N, 4], where N is the
      number of candidate RoIs considered for mask sampling. It includes both
      positive and negative RoIs.
    candidate_gt_boxes: A `tf.Tensor` of shape [batch_size, N, 4], storing the
      groundtruth boxes matched to `candidate_rois`.
    candidate_gt_classes: A `tf.Tensor` of shape [batch_size, N], storing the
      groundtruth classes matched to `candidate_rois`. 0 is the background
      class, i.e. negative RoIs.
    candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
      groundtruth instance index of each entry of `candidate_gt_boxes`, i.e.
      gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i], where
      gt_boxes has shape [batch_size, MAX_INSTANCES, 4]. Entries equal to -1
      are replaced by 0 before the masks are gathered.
    gt_masks: A `tf.Tensor` of shape [batch_size, MAX_INSTANCES, mask_height,
      mask_width] containing all groundtruth masks that samples are drawn
      from.
    num_sampled_masks: An `int` that specifies the number of masks to sample.
    mask_target_size: An `int` that specifies the final cropped mask size
      after sampling. The output masks are resized w.r.t. the sampled RoIs.

  Returns:
    foreground_rois: A `tf.Tensor` of shape [batch_size, K, 4] storing the
      RoIs that correspond to the sampled masks, where K = num_sampled_masks.
    foreground_classes: A `tf.Tensor` of shape [batch_size, K] storing the
      classes corresponding to the sampled masks.
    cropped_foreground_masks: A `tf.Tensor` of shape
      [batch_size, K, mask_target_size, mask_target_size] storing the cropped
      foreground masks used for training.
  """

  def _pair_with_batch_index(per_batch_indices):
    """Turns [batch, K] indices into [batch, K, 2] `tf.gather_nd` indices."""
    dims = tf.shape(per_batch_indices)
    # Broadcasts the batch id down each row: [batch, 1] * [1, K].
    batch_ids = (
        tf.expand_dims(tf.range(dims[0]), axis=-1) *
        tf.ones([1, dims[-1]], dtype=tf.int32))
    return tf.stack([batch_ids, per_batch_indices], axis=-1)

  is_foreground = tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32)
  _, sampled_positions = tf.nn.top_k(is_foreground, k=num_sampled_masks)
  roi_gather_indices = _pair_with_batch_index(sampled_positions)

  foreground_rois = tf.gather_nd(candidate_rois, roi_gather_indices)
  foreground_boxes = tf.gather_nd(candidate_gt_boxes, roi_gather_indices)
  foreground_classes = tf.gather_nd(candidate_gt_classes, roi_gather_indices)
  sampled_gt_indices = tf.gather_nd(candidate_gt_indices, roi_gather_indices)

  # -1 cannot be used as a gather index; point those entries at instance 0.
  sampled_gt_indices = tf.where(
      tf.equal(sampled_gt_indices, -1),
      tf.zeros_like(sampled_gt_indices),
      sampled_gt_indices)
  foreground_masks = tf.gather_nd(
      gt_masks, _pair_with_batch_index(sampled_gt_indices))

  cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
      foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
      sample_offset=0.5)

  return foreground_rois, foreground_classes, cropped_foreground_masks
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskSampler(tf.keras.layers.Layer):
  """Keras layer that samples foreground RoIs and builds mask targets."""

  def __init__(self, mask_target_size: int, num_sampled_masks: int, **kwargs):
    """Stores the sampling configuration.

    Args:
      mask_target_size: An `int` side length of the cropped output masks.
      num_sampled_masks: An `int` number of masks to sample per image.
      **kwargs: Additional keyword arguments passed to the base layer.
    """
    # Kept as a dict so that `get_config` can return it as-is.
    self._config_dict = dict(
        mask_target_size=mask_target_size,
        num_sampled_masks=num_sampled_masks)
    super().__init__(**kwargs)

  def call(self, candidate_rois: tf.Tensor, candidate_gt_boxes: tf.Tensor,
           candidate_gt_classes: tf.Tensor, candidate_gt_indices: tf.Tensor,
           gt_masks: tf.Tensor):
    """Samples and creates mask targets for training.

    Args:
      candidate_rois: A `tf.Tensor` of shape [batch_size, N, 4], where N is
        the number of candidate RoIs considered for mask sampling. It
        includes both positive and negative RoIs.
      candidate_gt_boxes: A `tf.Tensor` of shape [batch_size, N, 4], storing
        the groundtruth boxes matched to `candidate_rois`.
      candidate_gt_classes: A `tf.Tensor` of shape [batch_size, N], storing
        the groundtruth classes matched to `candidate_rois`. 0 is the
        background class, i.e. negative RoIs.
      candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing
        the groundtruth instance index of each entry of
        `candidate_gt_boxes`, i.e.
        gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
        where gt_boxes has shape [batch_size, MAX_INSTANCES, 4].
      gt_masks: A `tf.Tensor` of shape [batch_size, MAX_INSTANCES,
        mask_height, mask_width] containing all groundtruth masks that
        samples are drawn from.

    Returns:
      foreground_rois: A `tf.Tensor` of shape [batch_size, K, 4] storing the
        RoIs that correspond to the sampled masks, where
        K = num_sampled_masks.
      foreground_classes: A `tf.Tensor` of shape [batch_size, K] storing the
        classes corresponding to the sampled masks.
      cropped_foreground_masks: A `tf.Tensor` of shape
        [batch_size, K, mask_target_size, mask_target_size] storing the
        cropped foreground masks used for training.
    """
    sampled = _sample_and_crop_foreground_masks(
        candidate_rois,
        candidate_gt_boxes,
        candidate_gt_classes,
        candidate_gt_indices,
        gt_masks,
        num_sampled_masks=self._config_dict['num_sampled_masks'],
        mask_target_size=self._config_dict['mask_target_size'])
    foreground_rois, foreground_classes, cropped_foreground_masks = sampled
    return foreground_rois, foreground_classes, cropped_foreground_masks

  def get_config(self):
    """Returns the constructor arguments stored at construction time."""
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    """Rebuilds the layer from a `get_config` dictionary."""
    return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text
# Import libraries
from absl import logging
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.modeling.layers import nn_layers
def _pad_strides(strides: int, axis: int) -> Tuple[int, int, int, int]:
"""Converts int to len 4 strides (`tf.nn.avg_pool` uses length 4)."""
if axis == 1:
return (1, 1, strides, strides)
else:
return (1, strides, strides, 1)
def _maybe_downsample(x: tf.Tensor, out_filter: int, strides: int,
                      axis: int) -> tf.Tensor:
  """Downsamples feature map and 0-pads tensor if in_filter < out_filter.

  Args:
    x: A 4-D `tf.Tensor` feature map.
    out_filter: An `int` number of channels the result should have.
    strides: An `int` used as both window size and stride of the average pool.
    axis: An `int` channel axis. 1 selects 'NCHW'; any other value selects
      'NHWC'.

  Returns:
    The average-pooled tensor, zero-padded along the channel axis up to
    `out_filter` channels when it has fewer. Tensors that already have at
    least `out_filter` channels are not padded or truncated.
  """
  data_format = 'NCHW' if axis == 1 else 'NHWC'
  strides = _pad_strides(strides, axis=axis)

  x = tf.nn.avg_pool(x, strides, strides, 'VALID', data_format=data_format)

  in_filter = x.shape[axis]
  if in_filter < out_filter:
    # Pad on channel dimension with 0s: half before, the rest after. Giving
    # the remainder to the trailing side keeps the total equal to the channel
    # gap even when that gap is odd (two floor-halves would lose a channel).
    missing = out_filter - in_filter
    pad_before = missing // 2
    pad_size = [pad_before, missing - pad_before]
    if axis == 1:
      x = tf.pad(x, [[0, 0], pad_size, [0, 0], [0, 0]])
    else:
      x = tf.pad(x, [[0, 0], [0, 0], [0, 0], pad_size])

  # NOTE(review): the `+ 0.` is kept from the original; its purpose is not
  # visible here -- confirm before removing.
  return x + 0.
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualBlock(tf.keras.layers.Layer):
  """A residual block: two 3x3 conv + BN stages added to a shortcut."""

  def __init__(self,
               filters,
               strides,
               use_projection=False,
               se_ratio=None,
               resnetd_shortcut=False,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_explicit_padding: bool = False,
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               bn_trainable=True,
               **kwargs):
    """Initializes a residual block with BN after convolutions.

    Args:
      filters: An `int` number of filters used by both 3x3 convolutions and
        by the projection shortcut.
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: A `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
        The layer is only created when the ratio lies in (0, 1].
      resnetd_shortcut: A `bool` if True, apply the resnetd style modification
        to the shortcut connection. Not implemented in residual blocks; the
        value is only stored and reported by `get_config`.
      stochastic_depth_drop_rate: A `float` or None. If not None (and
        non-zero), drop rate for the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      activation: A `str` name of the activation function.
      use_explicit_padding: Use 'VALID' padding for the first convolution, but
        prepad inputs so that the output dimensions are the same as if 'SAME'
        padding were used.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      bn_trainable: A `bool` that indicates whether batch norm layers should
        be trainable. Default to True.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)

    self._filters = filters
    self._strides = strides
    self._use_projection = use_projection
    self._se_ratio = se_ratio
    self._resnetd_shortcut = resnetd_shortcut
    self._use_explicit_padding = use_explicit_padding
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    # Only the selected normalization class is looked up.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._activation_fn = tf_utils.get_activation(activation)
    self._bn_trainable = bn_trainable

  def build(self, input_shape):
    """Creates the sub-layers; creation order matches the call order."""
    # Arguments shared by every convolution / normalization in this block.
    conv_kwargs = dict(
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    norm_kwargs = dict(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)

    if self._use_projection:
      self._shortcut = tf.keras.layers.Conv2D(
          filters=self._filters,
          kernel_size=1,
          strides=self._strides,
          **conv_kwargs)
      self._norm0 = self._norm(**norm_kwargs)

    # explicit padding here is added for centernet
    if self._use_explicit_padding:
      self._pad = tf.keras.layers.ZeroPadding2D(padding=(1, 1))
    conv1_padding = 'valid' if self._use_explicit_padding else 'same'

    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=self._strides,
        padding=conv1_padding,
        **conv_kwargs)
    self._norm1 = self._norm(**norm_kwargs)

    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=1,
        padding='same',
        **conv_kwargs)
    self._norm2 = self._norm(**norm_kwargs)

    if self._se_ratio and 0 < self._se_ratio <= 1:
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=self._filters,
          out_filters=self._filters,
          se_ratio=self._se_ratio,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
    else:
      self._squeeze_excitation = None

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None

    super().build(input_shape)

  def get_config(self):
    """Returns the base layer config extended with the constructor args."""
    block_config = {
        'filters': self._filters,
        'strides': self._strides,
        'use_projection': self._use_projection,
        'se_ratio': self._se_ratio,
        'resnetd_shortcut': self._resnetd_shortcut,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_explicit_padding': self._use_explicit_padding,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'bn_trainable': self._bn_trainable
    }
    # Block-specific entries win over base entries with the same key.
    return {**super().get_config(), **block_config}

  def call(self, inputs, training=None):
    """Applies conv-BN-act, conv-BN, optional SE/drop, then adds shortcut."""
    if self._use_projection:
      shortcut = self._norm0(self._shortcut(inputs))
    else:
      shortcut = inputs

    # The shortcut always sees the unpadded input.
    conv_input = self._pad(inputs) if self._use_explicit_padding else inputs
    x = self._activation_fn(self._norm1(self._conv1(conv_input)))
    x = self._norm2(self._conv2(x))

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)

    if self._stochastic_depth:
      x = self._stochastic_depth(x, training=training)

    return self._activation_fn(x + shortcut)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock(tf.keras.layers.Layer):
  """A standard bottleneck block: 1x1, 3x3 and 1x1 convs plus a shortcut."""

  def __init__(self,
               filters,
               strides,
               dilation_rate=1,
               use_projection=False,
               se_ratio=None,
               resnetd_shortcut=False,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               bn_trainable=True,
               **kwargs):
    """Initializes a standard bottleneck block with BN after convolutions.

    Args:
      filters: An `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      dilation_rate: An `int` dilation_rate of the 3x3 convolution. Default to
        1.
      use_projection: A `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
        The layer is only created when the ratio lies in (0, 1].
      resnetd_shortcut: A `bool`. If True, apply the resnetd style
        modification to the shortcut connection (average pooling followed by
        a stride-1 projection). Only takes effect with `use_projection`.
      stochastic_depth_drop_rate: A `float` or None. If not None (and
        non-zero), drop rate for the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      bn_trainable: A `bool` that indicates whether batch norm layers should
        be trainable. Default to True.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)

    self._filters = filters
    self._strides = strides
    self._dilation_rate = dilation_rate
    self._use_projection = use_projection
    self._se_ratio = se_ratio
    self._resnetd_shortcut = resnetd_shortcut
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    # Only the selected normalization class is looked up.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._bn_trainable = bn_trainable

  def build(self, input_shape):
    """Creates the sub-layers; creation order matches the call order."""
    expanded_filters = self._filters * 4
    # Arguments shared by every convolution / normalization in this block.
    conv_kwargs = dict(
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    norm_kwargs = dict(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)

    if self._use_projection:
      if self._resnetd_shortcut:
        # ResNet-D: the pooling layer carries the stride, so the 1x1
        # projection runs at stride 1.
        self._shortcut0 = tf.keras.layers.AveragePooling2D(
            pool_size=2, strides=self._strides, padding='same')
        self._shortcut1 = tf.keras.layers.Conv2D(
            filters=expanded_filters,
            kernel_size=1,
            strides=1,
            **conv_kwargs)
      else:
        self._shortcut = tf.keras.layers.Conv2D(
            filters=expanded_filters,
            kernel_size=1,
            strides=self._strides,
            **conv_kwargs)
      self._norm0 = self._norm(**norm_kwargs)

    # 1x1 reduction.
    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=1,
        strides=1,
        **conv_kwargs)
    self._norm1 = self._norm(**norm_kwargs)
    self._activation1 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # 3x3 convolution; carries the block stride and dilation.
    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=self._strides,
        dilation_rate=self._dilation_rate,
        padding='same',
        **conv_kwargs)
    self._norm2 = self._norm(**norm_kwargs)
    self._activation2 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # 1x1 expansion to 4x the filters.
    self._conv3 = tf.keras.layers.Conv2D(
        filters=expanded_filters,
        kernel_size=1,
        strides=1,
        **conv_kwargs)
    self._norm3 = self._norm(**norm_kwargs)
    self._activation3 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    if self._se_ratio and 0 < self._se_ratio <= 1:
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=expanded_filters,
          out_filters=expanded_filters,
          se_ratio=self._se_ratio,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
    else:
      self._squeeze_excitation = None

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None
    self._add = tf.keras.layers.Add()

    super().build(input_shape)

  def get_config(self):
    """Returns the base layer config extended with the constructor args."""
    block_config = {
        'filters': self._filters,
        'strides': self._strides,
        'dilation_rate': self._dilation_rate,
        'use_projection': self._use_projection,
        'se_ratio': self._se_ratio,
        'resnetd_shortcut': self._resnetd_shortcut,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'bn_trainable': self._bn_trainable
    }
    # Block-specific entries win over base entries with the same key.
    return {**super().get_config(), **block_config}

  def call(self, inputs, training=None):
    """Runs the three conv stages and adds the (projected) shortcut."""
    shortcut = inputs
    if self._use_projection:
      if self._resnetd_shortcut:
        shortcut = self._shortcut1(self._shortcut0(shortcut))
      else:
        shortcut = self._shortcut(shortcut)
      shortcut = self._norm0(shortcut)

    x = self._activation1(self._norm1(self._conv1(inputs)))
    x = self._activation2(self._norm2(self._conv2(x)))
    # The final activation is applied after the residual addition.
    x = self._norm3(self._conv3(x))

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)

    if self._stochastic_depth:
      x = self._stochastic_depth(x, training=training)

    return self._activation3(self._add([x, shortcut]))
@tf.keras.utils.register_keras_serializable(package='Vision')
class InvertedBottleneckBlock(tf.keras.layers.Layer):
  """An inverted bottleneck block.

  The block optionally expands the channels (when `expand_ratio > 1`),
  optionally applies a depthwise convolution and squeeze-and-excitation, and
  always ends with a 1x1 projection convolution followed by normalization.
  A residual connection is added only when it is enabled, the input and output
  filter counts match and the stride is 1.
  """

  def __init__(self,
               in_filters,
               out_filters,
               expand_ratio,
               strides,
               kernel_size=3,
               se_ratio=None,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               se_inner_activation='relu',
               se_gating_activation='sigmoid',
               se_round_down_protect=True,
               expand_se_in_filters=False,
               depthwise_activation=None,
               use_sync_bn=False,
               dilation_rate=1,
               divisible_by=1,
               regularize_depthwise=False,
               use_depthwise=True,
               use_residual=True,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               output_intermediate_endpoints=False,
               **kwargs):
    """Initializes an inverted bottleneck block with BN after convolutions.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      expand_ratio: An `int` of expand_ratio for an inverted bottleneck block.
        The expansion convolution is only created when this is greater
        than 1.
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      kernel_size: An `int` kernel_size of the depthwise conv layer.
      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
        excitation layer. The layer is only created for values in (0, 1].
      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
        the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      se_inner_activation: A `str` name of squeeze-excitation inner activation.
      se_gating_activation: A `str` name of squeeze-excitation gating
        activation.
      se_round_down_protect: A `bool` of whether round down more than 10%
        will be allowed in SE layer.
      expand_se_in_filters: A `bool` of whether or not to expand in_filter in
        squeeze and excitation layer.
      depthwise_activation: A `str` name of the activation function for
        depthwise only. Falls back to `activation` when not set.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      dilation_rate: An `int` dilation rate used by the depthwise convolution;
        the same value is applied to all spatial dimensions.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      regularize_depthwise: A `bool` of whether or not apply regularization on
        depthwise.
      use_depthwise: A `bool`. If True, the expansion is a 1x1 convolution
        followed by a depthwise convolution. If False, no depthwise
        convolution is created and the expansion convolution itself uses
        `kernel_size` and `strides` (fused convolution).
      use_residual: A `bool` of whether to include residual connection between
        input and output.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      output_intermediate_endpoints: A `bool` of whether or not output the
        intermediate endpoints.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(InvertedBottleneckBlock, self).__init__(**kwargs)

    self._in_filters = in_filters
    self._out_filters = out_filters
    self._expand_ratio = expand_ratio
    self._strides = strides
    self._kernel_size = kernel_size
    self._se_ratio = se_ratio
    self._divisible_by = divisible_by
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._dilation_rate = dilation_rate
    self._use_sync_bn = use_sync_bn
    self._regularize_depthwise = regularize_depthwise
    self._use_depthwise = use_depthwise
    self._use_residual = use_residual
    self._activation = activation
    self._se_inner_activation = se_inner_activation
    self._se_gating_activation = se_gating_activation
    self._depthwise_activation = depthwise_activation
    self._se_round_down_protect = se_round_down_protect
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._expand_se_in_filters = expand_se_in_filters
    self._output_intermediate_endpoints = output_intermediate_endpoints

    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    # The depthwise activation defaults to the block activation.
    if not depthwise_activation:
      self._depthwise_activation = activation
    if regularize_depthwise:
      self._depthsize_regularizer = kernel_regularizer
    else:
      self._depthsize_regularizer = None

  def build(self, input_shape):
    """Creates the sub-layers of the block."""
    expand_filters = self._in_filters
    if self._expand_ratio > 1:
      # First 1x1 conv for channel expansion.
      expand_filters = nn_layers.make_divisible(
          self._in_filters * self._expand_ratio, self._divisible_by)

      # Without a depthwise conv, the expansion conv carries the spatial
      # kernel and the stride itself.
      expand_kernel = 1 if self._use_depthwise else self._kernel_size
      expand_stride = 1 if self._use_depthwise else self._strides

      self._conv0 = tf.keras.layers.Conv2D(
          filters=expand_filters,
          kernel_size=expand_kernel,
          strides=expand_stride,
          padding='same',
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm0 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)
      self._activation_layer = tf_utils.get_activation(
          self._activation, use_keras_layer=True)

    if self._use_depthwise:
      # Depthwise conv.
      self._conv1 = tf.keras.layers.DepthwiseConv2D(
          kernel_size=(self._kernel_size, self._kernel_size),
          strides=self._strides,
          padding='same',
          depth_multiplier=1,
          dilation_rate=self._dilation_rate,
          use_bias=False,
          depthwise_initializer=self._kernel_initializer,
          depthwise_regularizer=self._depthsize_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm1 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)
      self._depthwise_activation_layer = tf_utils.get_activation(
          self._depthwise_activation, use_keras_layer=True)

    # Squeeze and excitation.
    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
      logging.info('Use Squeeze and excitation.')
      in_filters = self._in_filters
      if self._expand_se_in_filters:
        in_filters = expand_filters
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=in_filters,
          out_filters=expand_filters,
          se_ratio=self._se_ratio,
          divisible_by=self._divisible_by,
          round_down_protect=self._se_round_down_protect,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer,
          activation=self._se_inner_activation,
          gating_activation=self._se_gating_activation)
    else:
      self._squeeze_excitation = None

    # Last 1x1 conv.
    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None
    self._add = tf.keras.layers.Add()

    super(InvertedBottleneckBlock, self).build(input_shape)

  def get_config(self):
    """Returns every constructor argument so the block can be re-created."""
    config = {
        'in_filters': self._in_filters,
        'out_filters': self._out_filters,
        'expand_ratio': self._expand_ratio,
        'strides': self._strides,
        'kernel_size': self._kernel_size,
        'se_ratio': self._se_ratio,
        'divisible_by': self._divisible_by,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'se_inner_activation': self._se_inner_activation,
        'se_gating_activation': self._se_gating_activation,
        'se_round_down_protect': self._se_round_down_protect,
        'expand_se_in_filters': self._expand_se_in_filters,
        'depthwise_activation': self._depthwise_activation,
        'dilation_rate': self._dilation_rate,
        'use_sync_bn': self._use_sync_bn,
        'regularize_depthwise': self._regularize_depthwise,
        'use_depthwise': self._use_depthwise,
        'use_residual': self._use_residual,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        # Without this entry a config round trip silently falls back to the
        # default (False) and changes the return type of `call`.
        'output_intermediate_endpoints': self._output_intermediate_endpoints,
    }
    base_config = super(InvertedBottleneckBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    """Runs the block on `inputs`.

    Args:
      inputs: The input tensor of the block.
      training: Optional flag forwarded to the stochastic depth layer.

    Returns:
      The output tensor, or an `(output, endpoints)` tuple when
      `output_intermediate_endpoints` is set. `endpoints` is a dict that holds
      the activated depthwise output under the key 'depthwise' when a
      depthwise convolution is used.
    """
    endpoints = {}
    shortcut = inputs
    if self._expand_ratio > 1:
      x = self._conv0(inputs)
      x = self._norm0(x)
      x = self._activation_layer(x)
    else:
      x = inputs

    if self._use_depthwise:
      x = self._conv1(x)
      x = self._norm1(x)
      x = self._depthwise_activation_layer(x)
      if self._output_intermediate_endpoints:
        endpoints['depthwise'] = x

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)

    x = self._conv2(x)
    x = self._norm2(x)

    # The residual is only added when the block is configured so that the
    # input and the output have the same number of filters and the stride is 1.
    if (self._use_residual and self._in_filters == self._out_filters and
        self._strides == 1):
      if self._stochastic_depth:
        x = self._stochastic_depth(x, training=training)
      x = self._add([x, shortcut])

    if self._output_intermediate_endpoints:
      return x, endpoints
    return x
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualInner(tf.keras.layers.Layer):
  """A single inner function of a reversible residual block.

  This plays the role of the `F`/`G` functions in the RevNet paper:
    Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
    The Reversible Residual Network: Backpropagation Without Storing
    Activations.
    (https://arxiv.org/pdf/1707.04585.pdf)
  """

  def __init__(
      self,
      filters: int,
      strides: int,
      kernel_initializer: Union[str, Callable[
          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      batch_norm_first: bool = True,
      **kwargs):
    """Initializes a ResidualInner.

    Args:
      filters: An `int` number of filters produced by both convolutions.
      strides: An `int` stride of the first convolution.
      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
        instance used by the convolutions.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
      activation: A `str` or `callable` instance of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      batch_norm_first: A `bool` of whether to apply batch norm and activation
        before the first convolution.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)

    self.strides = strides
    self.filters = filters
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._kernel_regularizer = kernel_regularizer
    self._activation = tf.keras.activations.get(activation)
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._batch_norm_first = batch_norm_first

    # Normalization class and the axis it normalizes over.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape: tf.TensorShape):
    """Creates the normalization and convolution sub-layers."""

    def make_norm():
      return self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)

    def make_conv3x3(strides):
      return tf.keras.layers.Conv2D(
          filters=self.filters,
          kernel_size=3,
          strides=strides,
          use_bias=False,
          padding='same',
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer)

    # The leading batch norm only exists in the pre-activation layout.
    if self._batch_norm_first:
      self._batch_norm_0 = make_norm()
    self._conv2d_1 = make_conv3x3(self.strides)
    self._batch_norm_1 = make_norm()
    self._conv2d_2 = make_conv3x3(1)

    super().build(input_shape)

  def get_config(self) -> Dict[str, Any]:
    """Returns the constructor arguments merged over the parent's config."""
    own_config = {
        'filters': self.filters,
        'strides': self.strides,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'batch_norm_first': self._batch_norm_first,
    }
    return {**super().get_config(), **own_config}

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Applies [BN, activation,] conv, BN, activation, conv to `inputs`."""
    hidden = inputs
    if self._batch_norm_first:
      hidden = self._activation_fn(
          self._batch_norm_0(hidden, training=training))
    hidden = self._conv2d_1(hidden)
    hidden = self._activation_fn(
        self._batch_norm_1(hidden, training=training))
    return self._conv2d_2(hidden)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckResidualInner(tf.keras.layers.Layer):
  """A single inner function of a reversible bottleneck block.

  This plays the role of the `F`/`G` functions in the RevNet paper:
    Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
    The Reversible Residual Network: Backpropagation Without Storing
    Activations.
    (https://arxiv.org/pdf/1707.04585.pdf)
  """

  def __init__(
      self,
      filters: int,
      strides: int,
      kernel_initializer: Union[str, Callable[
          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      batch_norm_first: bool = True,
      **kwargs):
    """Initializes a BottleneckResidualInner.

    Args:
      filters: An `int` number of filters for the first two convolutions. The
        last convolution, and thus the output of the bottleneck block, has
        `4*filters` channels.
      strides: An `int` stride of the first convolution.
      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
        instance used by the convolutions.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
      activation: A `str` or `callable` instance of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      batch_norm_first: A `bool` of whether to apply batch norm and activation
        before the first convolution.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)

    self.strides = strides
    self.filters = filters
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._kernel_regularizer = kernel_regularizer
    self._activation = tf.keras.activations.get(activation)
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._batch_norm_first = batch_norm_first

    # Normalization class and the axis it normalizes over.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape: tf.TensorShape):
    """Creates the normalization and convolution sub-layers."""

    def make_norm():
      return self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)

    def make_conv(filters, kernel_size, strides):
      return tf.keras.layers.Conv2D(
          filters=filters,
          kernel_size=kernel_size,
          strides=strides,
          use_bias=False,
          padding='same',
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer)

    # The leading batch norm only exists in the pre-activation layout.
    if self._batch_norm_first:
      self._batch_norm_0 = make_norm()
    # 1x1 (strided) -> 3x3 -> 1x1 with four times the filters.
    self._conv2d_1 = make_conv(self.filters, 1, self.strides)
    self._batch_norm_1 = make_norm()
    self._conv2d_2 = make_conv(self.filters, 3, 1)
    self._batch_norm_2 = make_norm()
    self._conv2d_3 = make_conv(self.filters * 4, 1, 1)

    super().build(input_shape)

  def get_config(self) -> Dict[str, Any]:
    """Returns the constructor arguments merged over the parent's config."""
    own_config = {
        'filters': self.filters,
        'strides': self.strides,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'batch_norm_first': self._batch_norm_first,
    }
    return {**super().get_config(), **own_config}

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Applies [BN, act,] conv, BN, act, conv, BN, act, conv to `inputs`."""
    hidden = inputs
    if self._batch_norm_first:
      hidden = self._activation_fn(
          self._batch_norm_0(hidden, training=training))
    hidden = self._conv2d_1(hidden)
    hidden = self._activation_fn(
        self._batch_norm_1(hidden, training=training))
    hidden = self._conv2d_2(hidden)
    hidden = self._activation_fn(
        self._batch_norm_2(hidden, training=training))
    return self._conv2d_3(hidden)
@tf.keras.utils.register_keras_serializable(package='Vision')
class ReversibleLayer(tf.keras.layers.Layer):
  """Creates a reversible layer.

  Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
  layers that are stateless, which in this case are `ResidualInner` layers.

  The input is split into two equal halves [x1; x2] along the channel axis.
  Both `f` and `g` must expose a `strides` attribute, which is read to decide
  whether the layer can be inverted in the backward pass.
  """

  def __init__(self,
               f: tf.keras.layers.Layer,
               g: tf.keras.layers.Layer,
               manual_grads: bool = True,
               **kwargs):
    """Initializes a ReversibleLayer.

    Args:
      f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in
        paper. Each reversible layer consists of two inner functions. For
        example, in RevNet the reversible residual consists of two f/g inner
        (bottleneck) residual functions. Where the input to the reversible layer
        is x, the input gets partitioned in the channel dimension and the
        forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2), y2 = x2 +
        g(z1), y1 = stop_gradient(z1).
      g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in
        paper. Detailed explanation same as above as `f` arg.
      manual_grads: A `bool` [Testing Only] of whether to manually take
        gradients as in Algorithm 1 or defer to autograd.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(ReversibleLayer, self).__init__(**kwargs)

    self._f = f
    self._g = g
    self._manual_grads = manual_grads

    # Channel axis used for splitting the input and concatenating the output.
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._axis = -1
    else:
      self._axis = 1

  def get_config(self) -> Dict[str, Any]:
    """Returns the layer config; `f` and `g` are stored as layer instances."""
    config = {
        'f': self._f,
        'g': self._g,
        'manual_grads': self._manual_grads,
    }
    base_config = super(ReversibleLayer, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def _ckpt_non_trainable_vars(self):
    """Snapshots the current values of f's and g's non-trainable variables."""
    self._f_non_trainable_vars = [
        v.read_value() for v in self._f.non_trainable_variables
    ]
    self._g_non_trainable_vars = [
        v.read_value() for v in self._g.non_trainable_variables
    ]

  def _load_ckpt_non_trainable_vars(self):
    """Writes the snapshot from `_ckpt_non_trainable_vars` back to f and g."""
    for v, v_chkpt in zip(self._f.non_trainable_variables,
                          self._f_non_trainable_vars):
      v.assign(v_chkpt)

    for v, v_chkpt in zip(self._g.non_trainable_variables,
                          self._g_non_trainable_vars):
      v.assign(v_chkpt)

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Runs the reversible forward pass with a custom gradient attached.

    Args:
      inputs: The input tensor; it is split into two halves along the channel
        axis.
      training: Optional flag forwarded to `f` and `g`.

    Returns:
      The concatenation [y1; y2] along the channel axis.
    """

    @tf.custom_gradient
    def reversible(
        x: tf.Tensor
    ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor],
                                                List[tf.Tensor]]]]:
      """Implements Algorithm 1 in the RevNet paper.

         Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
         The Reversible Residual Network: Backpropagation Without Storing
         Activations.
         (https://arxiv.org/pdf/1707.04585.pdf)

      Args:
        x: An input `tf.Tensor`.

      Returns:
        y: The output [y1; y2] in Algorithm 1.
        grad_fn: A callable function that computes the gradients.
      """
      # The forward pass is recorded on a tape so that plain autograd remains
      # available as a fallback inside `grad_fn`.
      with tf.GradientTape() as fwdtape:
        fwdtape.watch(x)
        x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis)
        f_x2 = self._f(x2, training=training)
        # NOTE(review): `_maybe_downsample` is defined elsewhere in this file;
        # presumably it matches x1/x2 to the shape of the f/g output - confirm.
        x1_down = _maybe_downsample(x1, f_x2.shape[self._axis], self._f.strides,
                                    self._axis)
        z1 = f_x2 + x1_down
        g_z1 = self._g(z1, training=training)
        # NOTE(review): this call passes `self._f.strides`, not
        # `self._g.strides`, for the x2 branch - confirm this is intended.
        x2_down = _maybe_downsample(x2, g_z1.shape[self._axis], self._f.strides,
                                    self._axis)
        y2 = x2_down + g_z1

        # Equation 8: https://arxiv.org/pdf/1707.04585.pdf
        # Decouple y1 and z1 so that their derivatives are different.
        y1 = tf.identity(z1)
        y = tf.concat([y1, y2], axis=self._axis)

        # The inputs cannot be reconstructed from the outputs when either inner
        # function is strided or when the channel count changes.
        irreversible = ((self._f.strides != 1 or self._g.strides != 1) or
                        (y.shape[self._axis] != inputs.shape[self._axis]))

        # Checkpointing moving mean/variance for batch normalization layers
        # as they shouldn't be updated during the custom gradient pass of f/g.
        self._ckpt_non_trainable_vars()

      def grad_fn(
          dy: tf.Tensor,
          variables: Optional[List[tf.Variable]] = None,
      ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
        """Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
        if irreversible or not self._manual_grads:
          # Fallback: differentiate through the recorded forward pass.
          grads_combined = fwdtape.gradient(
              y, [x] + variables, output_gradients=dy)
          dx = grads_combined[0]
          grad_vars = grads_combined[1:]
        else:
          y1_nograd = tf.stop_gradient(y1)
          y2_nograd = tf.stop_gradient(y2)
          dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis)

          # Index mapping from self.f/g.trainable_variables to grad_fn
          # input `variables` kwarg so that we can reorder dwf + dwg
          # variable gradient list to match `variables` order.
          f_var_refs = [v.ref() for v in self._f.trainable_variables]
          g_var_refs = [v.ref() for v in self._g.trainable_variables]
          fg_var_refs = f_var_refs + g_var_refs
          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]

          # Algorithm 1 in paper (line # documented in-line)
          z1 = y1_nograd  # line 2
          with tf.GradientTape() as gtape:
            gtape.watch(z1)
            g_z1 = self._g(z1, training=training)
          x2 = y2_nograd - g_z1  # line 3

          with tf.GradientTape() as ftape:
            ftape.watch(x2)
            f_x2 = self._f(x2, training=training)
          x1 = z1 - f_x2  # pylint: disable=unused-variable      # line 4

          # Compute gradients
          g_grads_combined = gtape.gradient(
              g_z1, [z1] + self._g.trainable_variables, output_gradients=dy2)
          dz1 = dy1 + g_grads_combined[0]  # line 5
          dwg = g_grads_combined[1:]  # line 9

          f_grads_combined = ftape.gradient(
              f_x2, [x2] + self._f.trainable_variables, output_gradients=dz1)
          dx2 = dy2 + f_grads_combined[0]  # line 6
          dwf = f_grads_combined[1:]  # line 8
          dx1 = dz1  # line 7

          # Pack the input and variable gradients.
          dx = tf.concat([dx1, dx2], axis=self._axis)
          grad_vars = dwf + dwg
          # Reorder gradients (trainable_variables to variables kwarg order)
          grad_vars = [grad_vars[i] for i in self_to_var_index]

          # Restore batch normalization moving mean/variance for correctness.
          self._load_ckpt_non_trainable_vars()

        return dx, grad_vars  # grad_fn end

      return y, grad_fn  # reversible end

    activations = reversible(inputs)
    return activations
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
  """Creates a depthwise separable convolution block with batch normalization.

  The block applies a depthwise convolution and a 1x1 convolution, each
  followed by normalization and the activation function.
  """

  def __init__(
      self,
      filters: int,
      kernel_size: int = 3,
      strides: int = 1,
      regularize_depthwise=False,
      activation: Text = 'relu6',
      kernel_initializer: Text = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      dilation_rate: int = 1,
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      **kwargs):
    """Initializes a convolution block with batch normalization.

    Args:
      filters: An `int` number of output filters of the 1x1 convolution that
        follows the depthwise convolution.
      kernel_size: An `int` that specifies the height and width of the 2D
        convolution window of the depthwise convolution.
      strides: An `int` of block stride. If greater than 1, this block will
        ultimately downsample the input.
      regularize_depthwise: A `bool`. If True, apply `kernel_regularizer` on
        the depthwise kernel as well.
      activation: A `str` name of the activation function.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation
        rate to use for dilated convolution. Can be a single integer to specify
        the same value for all spatial dimensions.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
    self._filters = filters
    self._kernel_size = kernel_size
    self._strides = strides
    self._activation = activation
    self._regularize_depthwise = regularize_depthwise
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._dilation_rate = dilation_rate
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon

    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)
    if regularize_depthwise:
      self._depthsize_regularizer = kernel_regularizer
    else:
      self._depthsize_regularizer = None

  def get_config(self):
    """Returns every constructor argument so the block can be re-created."""
    config = {
        'filters': self._filters,
        # `kernel_size` and `dilation_rate` must be part of the config,
        # otherwise a round trip silently resets them to their defaults.
        'kernel_size': self._kernel_size,
        'strides': self._strides,
        'regularize_depthwise': self._regularize_depthwise,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'activation': self._activation,
        'dilation_rate': self._dilation_rate,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    base_config = super(DepthwiseSeparableConvBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    """Creates the depthwise conv, the 1x1 conv and their normalizations."""
    # DepthwiseConv2D names its kernel arguments `depthwise_*`. Passing the
    # `kernel_*` spellings does not configure the depthwise kernel, which
    # made `kernel_initializer` and `regularize_depthwise` ineffective here.
    self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
        kernel_size=self._kernel_size,
        strides=self._strides,
        padding='same',
        depth_multiplier=1,
        dilation_rate=self._dilation_rate,
        depthwise_initializer=self._kernel_initializer,
        depthwise_regularizer=self._depthsize_regularizer,
        use_bias=False)
    self._norm0 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    super(DepthwiseSeparableConvBlock, self).build(input_shape)

  def call(self, inputs, training=None):
    """Applies depthwise conv, norm, activation, 1x1 conv, norm, activation.

    Args:
      inputs: The input tensor of the block.
      training: Unused by this method.

    Returns:
      The activated output tensor of the block.
    """
    x = self._dwconv0(inputs)
    x = self._norm0(x)
    x = self._activation_fn(x)

    x = self._conv1(x)
    x = self._norm1(x)
    return self._activation_fn(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class TuckerConvBlock(tf.keras.layers.Layer):
"""An Tucker block (generalized bottleneck)."""
def __init__(self,
in_filters,
out_filters,
input_compression_ratio,
output_compression_ratio,
strides,
kernel_size=3,
stochastic_depth_drop_rate=None,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
divisible_by=1,
use_residual=True,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""Initializes an inverted bottleneck block with BN after convolutions.
Args:
in_filters: An `int` number of filters of the input tensor.
out_filters: An `int` number of filters of the output tensor.
input_compression_ratio: An `float` of compression ratio for
input filters.
output_compression_ratio: An `float` of compression ratio for
output filters.
strides: An `int` block stride. If greater than 1, this block will
ultimately downsample the input.
kernel_size: An `int` kernel_size of the depthwise conv layer.
stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
the stochastic depth layer.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
Default to None.
activation: A `str` name of the activation function.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
divisible_by: An `int` that ensures all inner dimensions are divisible by
this number.
use_residual: A `bool` of whether to include residual connection between
input and output.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
**kwargs: Additional keyword arguments to be passed.
"""
super(TuckerConvBlock, self).__init__(**kwargs)
self._in_filters = in_filters
self._out_filters = out_filters
self._input_compression_ratio = input_compression_ratio
self._output_compression_ratio = output_compression_ratio
self._strides = strides
self._kernel_size = kernel_size
self._divisible_by = divisible_by
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._use_sync_bn = use_sync_bn
self._use_residual = use_residual
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
def build(self, input_shape):
input_compressed_filters = nn_layers.make_divisible(
value=self._in_filters * self._input_compression_ratio,
divisor=self._divisible_by,
round_down_protect=False)
self._conv0 = tf.keras.layers.Conv2D(
filters=input_compressed_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._activation_layer0 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
output_compressed_filters = nn_layers.make_divisible(
value=self._out_filters * self._output_compression_ratio,
divisor=self._divisible_by,
round_down_protect=False)
self._conv1 = tf.keras.layers.Conv2D(
filters=output_compressed_filters,
kernel_size=self._kernel_size,
strides=self._strides,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._activation_layer1 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
# Last 1x1 conv.
self._conv2 = tf.keras.layers.Conv2D(
filters=self._out_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
if self._stochastic_depth_drop_rate:
self._stochastic_depth = nn_layers.StochasticDepth(
self._stochastic_depth_drop_rate)
else:
self._stochastic_depth = None
self._add = tf.keras.layers.Add()
super(TuckerConvBlock, self).build(input_shape)
def get_config(self):
config = {
'in_filters': self._in_filters,
'out_filters': self._out_filters,
'input_compression_ratio': self._input_compression_ratio,
'output_compression_ratio': self._output_compression_ratio,
'strides': self._strides,
'kernel_size': self._kernel_size,
'divisible_by': self._divisible_by,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'use_residual': self._use_residual,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(TuckerConvBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
  """Applies the conv/norm/activation stages and the optional residual.

  Args:
    inputs: The input tensor of the block.
    training: Forwarded to the stochastic depth layer when it is applied.

  Returns:
    The output of the last normalization layer, or, when the residual
    connection is enabled and input/output filters match with stride 1,
    that output (optionally passed through stochastic depth) added to
    `inputs`.
  """
  stages = (
      self._conv0, self._norm0, self._activation_layer0,
      self._conv1, self._norm1, self._activation_layer1,
      # The last conv is followed by normalization only, no activation.
      self._conv2, self._norm2,
  )
  x = inputs
  for stage in stages:
    x = stage(x)

  residual_applies = (
      self._use_residual and
      self._in_filters == self._out_filters and
      self._strides == 1)
  if not residual_applies:
    return x

  if self._stochastic_depth:
    x = self._stochastic_depth(x, training=training)
  return self._add([x, inputs])
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for 3D networks."""
# Import libraries
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.modeling.layers import nn_layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SelfGating(tf.keras.layers.Layer):
  """Feature gating as used in S3D-G.

  This implements the S3D-G network from:
  Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy.
  Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video
  Classification.
  (https://arxiv.org/pdf/1712.04851.pdf)
  """

  def __init__(self, filters, **kwargs):
    """Initializes a self-gating layer.

    Args:
      filters: An `int` number of filters for the convolutional layer.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(SelfGating, self).__init__(**kwargs)
    self._filters = filters

  def build(self, input_shape):
    """Creates the pooling layer and the 1x1x1 gating convolution."""
    self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D()

    # No BN and activation after conv.
    self._transformer_w = tf.keras.layers.Conv3D(
        filters=self._filters,
        kernel_size=[1, 1, 1],
        use_bias=True,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            mean=0.0, stddev=0.01))

    super(SelfGating, self).build(input_shape)

  def get_config(self):
    """Returns the layer configuration, including `filters`.

    The class is registered as Keras-serializable, so the constructor
    argument must be part of the config for the layer to be re-created from
    it.
    """
    config = {'filters': self._filters}
    base_config = super(SelfGating, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Gates `inputs` with a sigmoid weight computed from its global average.

    Args:
      inputs: The feature tensor to gate. NOTE(review): the three
        `expand_dims` calls below assume the pooled tensor is
        [batch, channels], i.e. a channels-last 5-D input -- confirm.

    Returns:
      `inputs` multiplied element-wise (with broadcasting) by the gate.
    """
    x = self._spatial_temporal_average(inputs)

    # Re-insert three singleton axes at positions 1, 2 and 3 so the pooled
    # features can go through the Conv3D layer and broadcast against `inputs`.
    x = tf.expand_dims(x, 1)
    x = tf.expand_dims(x, 2)
    x = tf.expand_dims(x, 3)

    x = self._transformer_w(x)
    x = tf.nn.sigmoid(x)

    return tf.math.multiply(x, inputs)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock3D(tf.keras.layers.Layer):
  """Creates a 3D bottleneck block.

  The main path is a temporal convolution, a 1x3x3 spatial convolution and a
  1x1x1 expansion convolution (to `4 * filters` channels), each followed by
  normalization. Optional self-gating, squeeze-and-excitation and stochastic
  depth are applied before the result is added to the shortcut.
  """

  def __init__(self,
               filters,
               temporal_kernel_size,
               temporal_strides,
               spatial_strides,
               stochastic_depth_drop_rate=0.0,
               se_ratio=None,
               use_self_gating=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
    """Initializes a 3D bottleneck block with BN after convolutions.

    Args:
      filters: An `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      temporal_kernel_size: An `int` of kernel size for the temporal
        convolutional layer.
      temporal_strides: An `int` of temporal stride for the temporal
        convolutional layer.
      spatial_strides: An `int` of spatial stride for the spatial convolutional
        layer.
      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
        the stochastic depth layer.
      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
      use_self_gating: A `bool` of whether to apply self-gating module or not.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        the convolutions. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for the
        convolutions. Default to None.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(BottleneckBlock3D, self).__init__(**kwargs)

    # Architecture hyperparameters.
    self._filters = filters
    self._temporal_kernel_size = temporal_kernel_size
    self._spatial_strides = spatial_strides
    self._temporal_strides = temporal_strides
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._use_self_gating = use_self_gating
    self._se_ratio = se_ratio

    # Normalization, activation and weight settings.
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape):
    """Creates all sublayers of the block."""
    block_strides = (
        self._temporal_strides, self._spatial_strides, self._spatial_strides)
    expanded_filters = 4 * self._filters
    # Settings shared by every convolution in the block.
    conv_kwargs = dict(
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    # Settings shared by every normalization layer in the block.
    norm_kwargs = dict(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    # Shortcut layers: a strided 1x1x1 max-pool (pure subsampling) when the
    # channel count already matches, or a strided 1x1x1 projection otherwise.
    self._shortcut_maxpool = tf.keras.layers.MaxPool3D(
        pool_size=[1, 1, 1], strides=list(block_strides))
    self._shortcut_conv = tf.keras.layers.Conv3D(
        filters=expanded_filters,
        kernel_size=1,
        strides=list(block_strides),
        **conv_kwargs)
    self._norm0 = self._norm(**norm_kwargs)

    # Main path: temporal conv -> spatial conv -> 1x1x1 expansion.
    self._temporal_conv = tf.keras.layers.Conv3D(
        filters=self._filters,
        kernel_size=[self._temporal_kernel_size, 1, 1],
        strides=[self._temporal_strides, 1, 1],
        padding='same',
        **conv_kwargs)
    self._norm1 = self._norm(**norm_kwargs)

    self._spatial_conv = tf.keras.layers.Conv3D(
        filters=self._filters,
        kernel_size=[1, 3, 3],
        strides=[1, self._spatial_strides, self._spatial_strides],
        padding='same',
        **conv_kwargs)
    self._norm2 = self._norm(**norm_kwargs)

    self._expand_conv = tf.keras.layers.Conv3D(
        filters=expanded_filters,
        kernel_size=[1, 1, 1],
        strides=[1, 1, 1],
        padding='same',
        **conv_kwargs)
    self._norm3 = self._norm(**norm_kwargs)

    # Optional modules; each attribute is None when the module is disabled.
    self._squeeze_excitation = None
    if self._se_ratio and 0 < self._se_ratio <= 1:
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=expanded_filters,
          out_filters=expanded_filters,
          se_ratio=self._se_ratio,
          use_3d_input=True,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)

    self._stochastic_depth = None
    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)

    self._self_gating = None
    if self._use_self_gating:
      self._self_gating = SelfGating(filters=expanded_filters)

    super(BottleneckBlock3D, self).build(input_shape)

  def get_config(self):
    """Returns the block hyperparameters merged over the base-layer config."""
    block_config = {
        'filters': self._filters,
        'temporal_kernel_size': self._temporal_kernel_size,
        'temporal_strides': self._temporal_strides,
        'spatial_strides': self._spatial_strides,
        'use_self_gating': self._use_self_gating,
        'se_ratio': self._se_ratio,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    merged = dict(super(BottleneckBlock3D, self).get_config())
    merged.update(block_config)
    return merged

  def call(self, inputs, training=None):
    """Runs the block on `inputs`.

    Args:
      inputs: The input tensor; its last dimension is read as the channel
        count to decide which shortcut to use.
      training: Forwarded to the stochastic depth layer when it is enabled.

    Returns:
      The activated sum of the main path and the shortcut.
    """
    in_filters = inputs.shape.as_list()[-1]
    if in_filters != 4 * self._filters:
      # Channel mismatch: project the shortcut and normalize it.
      shortcut = self._norm0(self._shortcut_conv(inputs))
    elif self._temporal_strides == 1 and self._spatial_strides == 1:
      shortcut = inputs
    else:
      # Channels match but the block is strided: subsample the shortcut.
      shortcut = self._shortcut_maxpool(inputs)

    x = self._activation_fn(self._norm1(self._temporal_conv(inputs)))
    x = self._activation_fn(self._norm2(self._spatial_conv(x)))
    x = self._norm3(self._expand_conv(x))

    # Apply self-gating, SE, stochastic depth.
    if self._self_gating:
      x = self._self_gating(x)
    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)
    if self._stochastic_depth:
      x = self._stochastic_depth(x, training=training)

    # Merge with the shortcut, then apply the final activation.
    return self._activation_fn(x + shortcut)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_blocks_3d."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.modeling.layers import nn_blocks_3d
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
  """Output-shape checks for the blocks in `nn_blocks_3d`."""

  @parameterized.parameters(
      (nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True, 0.2, 0.1),
      (nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False, 0.0, 0.0),
  )
  def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size,
                                     temporal_strides, spatial_strides,
                                     use_self_gating, se_ratio,
                                     stochastic_depth):
    """Builds the block on a symbolic input and checks the output shape."""
    num_frames, frame_size, base_filters = 16, 128, 256
    out_channels = base_filters * 4

    symbolic_input = tf.keras.Input(
        shape=(num_frames, frame_size, frame_size, out_channels),
        batch_size=1)
    block = block_fn(
        filters=base_filters,
        temporal_kernel_size=temporal_kernel_size,
        temporal_strides=temporal_strides,
        spatial_strides=spatial_strides,
        use_self_gating=use_self_gating,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth)
    result = block(symbolic_input)

    # Each strided dimension shrinks by its stride; channels stay 4x filters.
    out_frames = num_frames // temporal_strides
    out_size = frame_size // spatial_strides
    self.assertAllEqual(
        [1, out_frames, out_size, out_size, out_channels],
        result.shape.as_list())
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_blocks."""
from typing import Any, Iterable, Tuple
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling.layers import nn_blocks
def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
  """Builds the distribution-strategy combinations used by the tests below.

  Returns:
    The result of `combinations.combine` over the default strategy, the cloud
    TPU strategy and the one-device GPU strategy.
  """
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies)
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
  """Output-shape checks for the 2D blocks in `nn_blocks`."""

  @parameterized.parameters(
      (nn_blocks.ResidualBlock, 1, False, 0.0, None),
      (nn_blocks.ResidualBlock, 2, True, 0.2, 0.25),
  )
  def test_residual_block_creation(self, block_fn, strides, use_projection,
                                   stochastic_depth_drop_rate, se_ratio):
    """A residual block keeps the channel count and divides H/W by stride."""
    spatial, channels = 128, 256
    symbolic_input = tf.keras.Input(
        shape=(spatial, spatial, channels), batch_size=1)
    block = block_fn(
        channels,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate,
    )
    result = block(symbolic_input)

    out_spatial = spatial // strides
    self.assertAllEqual(
        [1, out_spatial, out_spatial, channels], result.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.BottleneckBlock, 1, False, 0.0, None),
      (nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25),
  )
  def test_bottleneck_block_creation(self, block_fn, strides, use_projection,
                                     stochastic_depth_drop_rate, se_ratio):
    """A bottleneck block outputs 4x the requested number of filters."""
    spatial, base_filters = 128, 256
    wide_channels = base_filters * 4
    symbolic_input = tf.keras.Input(
        shape=(spatial, spatial, wide_channels), batch_size=1)
    block = block_fn(
        base_filters,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)
    result = block(symbolic_input)

    out_spatial = spatial // strides
    self.assertAllEqual(
        [1, out_spatial, out_spatial, wide_channels], result.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
  )
  def test_invertedbottleneck_block_creation(self, block_fn, expand_ratio,
                                             strides, se_ratio,
                                             stochastic_depth_drop_rate):
    """An inverted bottleneck maps `in_filters` to `out_filters` channels."""
    spatial, channels_in, channels_out = 128, 24, 40
    symbolic_input = tf.keras.Input(
        shape=(spatial, spatial, channels_in), batch_size=1)
    block = block_fn(
        in_filters=channels_in,
        out_filters=channels_out,
        expand_ratio=expand_ratio,
        strides=strides,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)
    result = block(symbolic_input)

    out_spatial = spatial // strides
    self.assertAllEqual(
        [1, out_spatial, out_spatial, channels_out], result.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.TuckerConvBlock, 1, 0.25, 0.25),
      (nn_blocks.TuckerConvBlock, 2, 0.25, 0.25),
  )
  def test_tucker_conv_block(
      self, block_fn, strides,
      input_compression_ratio, output_compression_ratio):
    """A Tucker conv block with equal in/out filters keeps the channels."""
    spatial, channels_in, channels_out = 128, 24, 24
    symbolic_input = tf.keras.Input(
        shape=(spatial, spatial, channels_in), batch_size=1)
    block = block_fn(
        in_filters=channels_in,
        out_filters=channels_out,
        input_compression_ratio=input_compression_ratio,
        output_compression_ratio=output_compression_ratio,
        strides=strides)
    result = block(symbolic_input)

    out_spatial = spatial // strides
    self.assertAllEqual(
        [1, out_spatial, out_spatial, channels_out], result.shape.as_list())
class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
  """Output-shape check for `nn_blocks.ResidualInner`."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    """The layer divides H/W by the stride and outputs `filters` channels."""
    batch, height, width, channels = 8, 32, 32, 32
    num_filters, stride = 64, 2

    images = tf.random.uniform(shape=[batch, height, width, channels])
    with distribution.scope():
      layer = nn_blocks.ResidualInner(num_filters, stride)

    result = layer(images)
    self.assertEqual(
        [batch, height // stride, width // stride, num_filters],
        result.shape.as_list())
class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
  """Output-shape check for `nn_blocks.BottleneckResidualInner`."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    """The layer divides H/W by the stride and outputs 4x `filters`."""
    batch, height, width, channels = 8, 32, 32, 32
    num_filters, stride = 64, 2

    images = tf.random.uniform(shape=[batch, height, width, channels])
    with distribution.scope():
      layer = nn_blocks.BottleneckResidualInner(num_filters, stride)

    result = layer(images)
    self.assertEqual(
        [batch, height // stride, width // stride, num_filters * 4],
        result.shape.as_list())
class DepthwiseSeparableConvBlockTest(parameterized.TestCase, tf.test.TestCase):
  """Shape and config round-trip check for `DepthwiseSeparableConvBlock`."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    """The block and a copy rebuilt from its config give the same shape."""
    batch, rows, cols, channels_in = 8, 32, 32, 32
    channels_out, stride = 64, 2

    images = tf.random.normal(shape=[batch, rows, cols, channels_in])
    with distribution.scope():
      block = nn_blocks.DepthwiseSeparableConvBlock(
          channels_out, strides=stride)
      # Re-create a second block purely from the first one's configuration.
      block_copy = nn_blocks.DepthwiseSeparableConvBlock(**block.get_config())

    result = block(images)
    want_shape = [batch, rows // stride, cols // stride, channels_out]
    self.assertEqual(result.shape.as_list(), want_shape)

    result = block_copy(images)
    self.assertEqual(result.shape.as_list(), want_shape)
class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for `nn_blocks.ReversibleLayer` under distribution strategies."""

  @combinations.generate(distribution_strategy_combinations())
  def test_downsampling_non_reversible_step(self, distribution):
    """A strided layer trains for one step and halves the spatial size."""
    batch, height, width, channels = 8, 32, 32, 32
    num_filters = 64
    stride = 2

    images = tf.random.uniform(shape=[batch, height, width, channels])
    with distribution.scope():
      f_block = nn_blocks.ResidualInner(
          filters=num_filters // 2, strides=stride, batch_norm_first=True)
      g_block = nn_blocks.ResidualInner(
          filters=num_filters // 2, strides=1, batch_norm_first=True)

      layer = nn_blocks.ReversibleLayer(f_block, g_block)
      layer.build(images.shape)
      sgd = tf.keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def train_step():
      with tf.GradientTape() as tape:
        result = layer(images, training=True)
      gradients = tape.gradient(result, layer.trainable_variables)
      # Test applying gradients with optimizer works
      sgd.apply_gradients(zip(gradients, layer.trainable_variables))

      return result

    per_replica = distribution.run(train_step)
    local_results = distribution.experimental_local_results(per_replica)

    # Assert forward pass shape
    want_shape = [batch, height // stride, width // stride, num_filters]
    for local_result in local_results:
      self.assertEqual(want_shape, local_result.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_reversible_step(self, distribution):
    """A reversible layer updates its variables and keeps the input shape."""
    # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
    batch, height, width, channels = 8, 32, 32, 32
    num_filters = channels
    stride = 1

    images = tf.random.uniform(shape=[batch, height, width, channels])
    with distribution.scope():
      f_block = nn_blocks.ResidualInner(
          filters=num_filters // 2, strides=stride, batch_norm_first=False)
      g_block = nn_blocks.ResidualInner(
          filters=num_filters // 2, strides=1, batch_norm_first=False)

      layer = nn_blocks.ReversibleLayer(f_block, g_block)
      layer(images, training=False)  # init weights
      sgd = tf.keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def train_step():
      with tf.GradientTape() as tape:
        result = layer(images, training=True)
      gradients = tape.gradient(result, layer.trainable_variables)
      # Test applying gradients with optimizer works
      sgd.apply_gradients(zip(gradients, layer.trainable_variables))

      return result

    @tf.function
    def forward_only():
      layer(images)

    distribution.run(forward_only)  # Initialize variables
    # Snapshot the variables so the update can be detected afterwards.
    variables_before = tf.identity_n(layer.trainable_variables)
    per_replica = distribution.run(train_step)
    local_results = distribution.experimental_local_results(per_replica)

    # Assert variables values have changed values
    for before, after in zip(variables_before, layer.trainable_variables):
      self.assertNotAllEqual(before, after)

    # Assert forward pass shape
    want_shape = [batch, height // stride, width // stride, num_filters]
    for local_result in local_results:
      self.assertEqual(want_shape, local_result.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_manual_gradients_correctness(self, distribution):
    """Manual gradients match autograd for identically weighted layers."""
    batch, height, width, channels = 8, 32, 32, 32
    num_filters = channels
    stride = 1

    images = tf.random.uniform(
        shape=[batch, height, width, channels * 4])  # bottleneck
    with distribution.scope():
      f_manual = nn_blocks.BottleneckResidualInner(
          filters=num_filters // 2, strides=stride, batch_norm_first=False)
      g_manual = nn_blocks.BottleneckResidualInner(
          filters=num_filters // 2, strides=1, batch_norm_first=False)
      manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
      manual_grad_layer(images, training=False)  # init weights

      f_auto = nn_blocks.BottleneckResidualInner(
          filters=num_filters // 2, strides=stride, batch_norm_first=False)
      g_auto = nn_blocks.BottleneckResidualInner(
          filters=num_filters // 2, strides=1, batch_norm_first=False)
      auto_grad_layer = nn_blocks.ReversibleLayer(
          f_auto, g_auto, manual_grads=False)
      auto_grad_layer(images)  # init weights
      # Clone all weights (tf.keras.layers.Layer has no .clone())
      auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
      auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())

    @tf.function
    def manual_fn():
      with tf.GradientTape() as tape:
        result = manual_grad_layer(images, training=True)
      return tape.gradient(result, manual_grad_layer.trainable_variables)

    @tf.function
    def auto_fn():
      with tf.GradientTape() as tape:
        result = auto_grad_layer(images, training=True)
      return tape.gradient(result, auto_grad_layer.trainable_variables)

    manual_grads = distribution.run(manual_fn)
    auto_grads = distribution.run(auto_fn)

    # Assert gradients calculated manually are close to that from autograd
    for manual_grad, auto_grad in zip(manual_grads, auto_grads):
      self.assertAllClose(
          distribution.experimental_local_results(manual_grad),
          distribution.experimental_local_results(auto_grad),
          atol=5e-3,
          rtol=5e-3)

    # Verify that BN moving mean and variance is correct.
    non_trainable_pairs = zip(manual_grad_layer.non_trainable_variables,
                              auto_grad_layer.non_trainable_variables)
    for manual_var, auto_var in non_trainable_pairs:
      self.assertAllClose(manual_var, auto_var)
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa
from official.modeling import tf_utils
from official.vision.ops import spatial_transform_ops
# Type annotations.
# `States`: a dictionary from string keys to tensors.
States = Dict[str, tf.Tensor]
# `Activation`: a string (presumably an activation name -- confirm against
# users of this alias) or a callable.
Activation = Union[str, Callable]
def make_divisible(value: float,
                   divisor: int,
                   min_value: Optional[float] = None,
                   round_down_protect: bool = True,
                   ) -> int:
  """Rounds `value` to a nearby multiple of `divisor`, with a lower bound.

  Typically used so that layer channel counts are divisible by a fixed number
  (e.g. 8).

  Args:
    value: A `float` of original value.
    divisor: An `int` of the divisor that need to be checked upon.
    min_value: A `float` of minimum value threshold. Defaults to `divisor`
      when None.
    round_down_protect: A `bool`. When True, a result that falls below 90% of
      `value` is bumped up by one `divisor`.

  Returns:
    An `int`: the larger of `min_value` and the multiple of `divisor` nearest
    to `value`, possibly increased by `divisor` for round-down protection.
    Note that the result is `min_value` itself (truncated to `int`) when that
    bound is the larger one.
  """
  lower_bound = divisor if min_value is None else min_value
  nearest_multiple = int(value + divisor / 2) // divisor * divisor
  rounded = max(lower_bound, nearest_multiple)
  # Make sure that round down does not go down by more than 10%.
  if round_down_protect and rounded < 0.9 * value:
    rounded += divisor
  return int(rounded)
def round_filters(filters: int,
                  multiplier: float,
                  divisor: int = 8,
                  min_depth: Optional[int] = None,
                  round_down_protect: bool = True,
                  skip: bool = False) -> int:
  """Rounds number of filters based on width multiplier.

  Args:
    filters: An `int` of the original number of filters.
    multiplier: A `float` width multiplier. A falsy value (e.g. None or 0)
      leaves `filters` untouched.
    divisor: An `int` the scaled filter count is rounded against.
    min_depth: An optional `int` lower bound, passed to `make_divisible` as
      `min_value`.
    round_down_protect: A `bool` passed through to `make_divisible`.
    skip: A `bool`. When True, `filters` is returned unchanged.

  Returns:
    `filters` unchanged when skipping, otherwise the rounded `int` filter
    count.
  """
  if skip or not multiplier:
    return filters

  scaled_filters = make_divisible(
      value=filters * multiplier,
      divisor=divisor,
      min_value=min_depth,
      round_down_protect=round_down_protect)
  logging.info('round_filter input=%s output=%s', filters, scaled_filters)
  return int(scaled_filters)
def get_padding_for_kernel_size(kernel_size):
  """Looks up the padding pair for a supported kernel size.

  Args:
    kernel_size: The kernel size; only 7 and 3 are known.

  Returns:
    `(3, 3)` for a kernel size of 7 and `(1, 1)` for a kernel size of 3.

  Raises:
    ValueError: If `kernel_size` is neither 7 nor 3.
  """
  known_paddings = ((7, (3, 3)), (3, (1, 1)))
  for known_size, padding in known_paddings:
    if kernel_size == known_size:
      return padding
  raise ValueError('Padding for kernel size {} not known.'.format(
      kernel_size))
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
  """Creates a squeeze and excitation layer.

  The input is averaged over its spatial axes, passed through a reducing and
  an expanding 1x1 convolution, and the resulting gate is multiplied with the
  input.
  """

  def __init__(self,
               in_filters,
               out_filters,
               se_ratio,
               divisible_by=1,
               use_3d_input=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               gating_activation='sigmoid',
               round_down_protect=True,
               **kwargs):
    """Initializes a squeeze and excitation layer.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      use_3d_input: A `bool` of whether input is 2D or 3D image.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      gating_activation: A `str` name of the activation function for final
        gating function.
      round_down_protect: A `bool` of whether round down more than 10% will be
        allowed.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(SqueezeExcitation, self).__init__(**kwargs)

    self._in_filters = in_filters
    self._out_filters = out_filters
    self._se_ratio = se_ratio
    self._divisible_by = divisible_by
    self._round_down_protect = round_down_protect
    self._use_3d_input = use_3d_input
    self._activation = activation
    self._gating_activation = gating_activation
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    # Spatial axes start right after the batch axis for channels-last data
    # and after the channel axis otherwise; 3D inputs have one more of them.
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    first_spatial_axis = 1 if channels_last else 2
    num_spatial_axes = 3 if use_3d_input else 2
    self._spatial_axis = list(
        range(first_spatial_axis, first_spatial_axis + num_spatial_axes))

    self._activation_fn = tf_utils.get_activation(activation)
    self._gating_activation_fn = tf_utils.get_activation(gating_activation)

  def build(self, input_shape):
    """Creates the reducing and expanding 1x1 convolutions."""
    squeezed_filters = make_divisible(
        max(1, int(self._in_filters * self._se_ratio)),
        divisor=self._divisible_by,
        round_down_protect=self._round_down_protect)
    # Both convolutions only differ in their number of filters.
    pointwise_kwargs = dict(
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=True,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    self._se_reduce = tf.keras.layers.Conv2D(
        filters=squeezed_filters, **pointwise_kwargs)
    self._se_expand = tf.keras.layers.Conv2D(
        filters=self._out_filters, **pointwise_kwargs)

    super(SqueezeExcitation, self).build(input_shape)

  def get_config(self):
    """Returns the layer hyperparameters merged over the base-layer config."""
    layer_config = {
        'in_filters': self._in_filters,
        'out_filters': self._out_filters,
        'se_ratio': self._se_ratio,
        'divisible_by': self._divisible_by,
        'use_3d_input': self._use_3d_input,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'gating_activation': self._gating_activation,
        'round_down_protect': self._round_down_protect,
    }
    merged = dict(super(SqueezeExcitation, self).get_config())
    merged.update(layer_config)
    return merged

  def call(self, inputs):
    """Scales `inputs` by the gate computed from its spatial average."""
    squeezed = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
    reduced = self._activation_fn(self._se_reduce(squeezed))
    gate = self._gating_activation_fn(self._se_expand(reduced))
    return gate * inputs
def get_stochastic_depth_rate(init_rate, i, n):
  """Computes the drop rate of block `i`, scaled linearly by `i / n`.

  Args:
    init_rate: A `float` of initial drop rate, or None to disable.
    i: An `int` of order of the current block.
    n: An `int` total number of blocks.

  Returns:
    `init_rate * i / n`, or None when `init_rate` is None.

  Raises:
    ValueError: If `init_rate` lies outside the interval [0, 1].
  """
  if init_rate is None:
    return None
  if init_rate < 0 or init_rate > 1:
    raise ValueError('Initial drop rate must be within 0 and 1.')
  return init_rate * float(i) / n
@tf.keras.utils.register_keras_serializable(package='Vision')
class StochasticDepth(tf.keras.layers.Layer):
  """Creates a stochastic depth layer.

  During training, each example in the batch is either dropped (zeroed) with
  probability `stochastic_depth_drop_rate` or kept and scaled by
  `1 / (1 - stochastic_depth_drop_rate)`. Outside training the input is
  returned unchanged.
  """

  def __init__(self, stochastic_depth_drop_rate, **kwargs):
    """Initializes a stochastic depth layer.

    Args:
      stochastic_depth_drop_rate: A `float` of drop rate.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(StochasticDepth, self).__init__(**kwargs)
    self._drop_rate = stochastic_depth_drop_rate

  def get_config(self):
    """Returns the layer configuration.

    The key must match the `__init__` argument name; otherwise re-creating
    the layer from its config passes an unknown keyword to the constructor.
    """
    config = {'stochastic_depth_drop_rate': self._drop_rate}
    base_config = super(StochasticDepth, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    """Randomly drops whole examples of `inputs` while training.

    Args:
      inputs: The input `tf.Tensor`; its first dimension is treated as the
        batch dimension.
      training: Whether the layer runs in training mode. When None, the Keras
        learning phase is used.

    Returns:
      A `tf.Tensor` with the same shape as `inputs`.
    """
    if training is None:
      training = tf.keras.backend.learning_phase()
    if not training or self._drop_rate is None or self._drop_rate == 0:
      return inputs

    keep_prob = 1.0 - self._drop_rate
    batch_size = tf.shape(inputs)[0]
    # keep_prob + U[0, 1) floors to 1 with probability keep_prob and to 0
    # otherwise; the mask has one value per example and broadcasts over the
    # remaining dimensions.
    random_tensor = keep_prob
    random_tensor += tf.random.uniform(
        [batch_size] + [1] * (inputs.shape.rank - 1), dtype=inputs.dtype)
    binary_tensor = tf.floor(random_tensor)
    # Rescale kept examples so the expected value matches the input.
    output = tf.math.divide(inputs, keep_prob) * binary_tensor
    return output
@tf.keras.utils.register_keras_serializable(package='Vision')
def pyramid_feature_fusion(inputs, target_level):
  """Fuses all feature maps in the feature pyramid at the target level.

  Every level between the minimum and maximum key of `inputs` is bilinearly
  resized by a factor of `2**(level - target_level)` and the results are
  summed. Levels coarser than the target are upsampled; levels finer than the
  target are downsampled.

  Args:
    inputs: A dictionary containing the feature pyramid, keyed by level (keys
      are converted with `int`). The size of the input tensor needs to be
      fixed.
    target_level: An `int` of the target feature level for feature fusion.

  Returns:
    A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
    feature_channel].
  """
  # Convert keys to int.
  pyramid_feats = {int(k): v for k, v in inputs.items()}
  min_level = min(pyramid_feats.keys())
  max_level = max(pyramid_feats.keys())
  resampled_feats = []

  for l in range(min_level, max_level + 1):
    feat = pyramid_feats[l]
    if l != target_level:
      level_diff = l - target_level
      height, width = list(feat.shape[1:3])
      # Keep the target size integral: `2**level_diff` is a float for a
      # negative exponent, and a float size cannot be converted to the int32
      # `size` that `tf.image.resize` expects.
      if level_diff > 0:
        scale = 2**level_diff
        target_size = [height * scale, width * scale]
      else:
        shrink = 2**(-level_diff)
        # Ceiling division; exact whenever the size is a multiple of
        # `shrink`, and otherwise equal to the output size of repeated
        # 'same'-padded stride-2 downsampling.
        target_size = [-(-height // shrink), -(-width // shrink)]
      # Casts feat to float32 so the resize op can be run on TPU.
      feat = tf.cast(feat, tf.float32)
      feat = tf.image.resize(
          feat, size=target_size, method=tf.image.ResizeMethod.BILINEAR)
      # Casts it back to be compatible with the rest of the operations.
      feat = tf.cast(feat, pyramid_feats[l].dtype)
    resampled_feats.append(feat)

  return tf.math.add_n(resampled_feats)
class PanopticFPNFusion(tf.keras.Model):
  """Creates a Panoptic FPN feature Fusion layer.

  This implements feature fusion for semantic segmentation head from the paper:
  Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar.
  Panoptic Feature Pyramid Networks.
  (https://arxiv.org/pdf/1901.02446.pdf)

  The model is built functionally: it takes a dict of feature maps keyed by
  `str(level)` and outputs the element-wise sum of the per-level branches.
  """

  def __init__(
      self,
      min_level: int = 2,
      max_level: int = 5,
      target_level: int = 2,
      num_filters: int = 128,
      num_fpn_filters: int = 256,
      activation: str = 'relu',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes panoptic FPN feature fusion layer.

    The model output is a `float` `tf.Tensor` of shape [batch_size,
    feature_height, feature_width, feature_channel].

    Args:
      min_level: An `int` of minimum level to use in feature fusion.
      max_level: An `int` of maximum level to use in feature fusion.
      target_level: An `int` of the target feature level for feature fusion.
      num_filters: An `int` number of filters in conv2d layers.
      num_fpn_filters: An `int` number of filters in the FPN outputs
      activation: A `str` name of the activation function.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
      **kwargs: Additional keyword arguments to be passed.

    Raises:
      ValueError: If `target_level` is greater than `max_level`.
    """
    if target_level > max_level:
      raise ValueError(
          'target_level should be less than or equal to max_level')
    # NOTE(review): levels below `target_level` are still upsampled by the loop
    # below, so `target_level > min_level` looks unsupported -- confirm whether
    # that case should be rejected here as well.

    self._config_dict = {
        'min_level': min_level,
        'max_level': max_level,
        'target_level': target_level,
        'num_filters': num_filters,
        'num_fpn_filters': num_fpn_filters,
        'activation': activation,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
    }
    norm = tfa.layers.GroupNormalization
    conv2d = tf.keras.layers.Conv2D
    activation_fn = tf_utils.get_activation(activation)
    if tf.keras.backend.image_data_format() == 'channels_last':
      norm_axis = -1
    else:
      norm_axis = 1
    inputs = self._build_inputs(num_fpn_filters, min_level, max_level)

    upscaled_features = []
    for level in range(min_level, max_level + 1):
      # One conv stage per level of distance to the target, and at least one.
      num_conv_layers = max(1, level - target_level)
      x = inputs[str(level)]
      for _ in range(num_conv_layers):
        x = conv2d(
            filters=num_filters,
            kernel_size=3,
            padding='same',
            kernel_initializer=tf.keras.initializers.VarianceScaling(),
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer)(x)
        x = norm(groups=32, axis=norm_axis)(x)
        x = activation_fn(x)
        # Each stage of a non-target level doubles the spatial resolution.
        if level != target_level:
          x = spatial_transform_ops.nearest_upsampling(x, scale=2)
      upscaled_features.append(x)

    fused_features = tf.math.add_n(upscaled_features)
    self._output_specs = {str(target_level): fused_features.get_shape()}

    super(PanopticFPNFusion, self).__init__(
        inputs=inputs, outputs=fused_features, **kwargs)

  def _build_inputs(self, num_filters: int,
                    min_level: int, max_level: int):
    """Creates one `tf.keras.Input` per level, keyed by `str(level)`."""
    inputs = {}
    for level in range(min_level, max_level + 1):
      inputs[str(level)] = tf.keras.Input(shape=[None, None, num_filters])
    return inputs

  def get_config(self) -> Mapping[str, Any]:
    """Returns a copy of the constructor arguments recorded in `__init__`."""
    # Hand out a copy so that callers editing the config cannot alter the
    # configuration stored on the model.
    return dict(self._config_dict)

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Builds the model from a config as returned by `get_config`."""
    return cls(**config)

  @property
  def output_specs(self) -> Mapping[str, tf.TensorShape]:
    """A dict of {level: TensorShape} pairs for the model output."""
    return self._output_specs
@tf.keras.utils.register_keras_serializable(package='Vision')
class Scale(tf.keras.layers.Layer):
  """Multiplies the input by a single trainable scalar.

  Useful for applying ReZero to layers, which improves convergence speed. This
  implements the paper:
  ReZero is All You Need: Fast Convergence at Large Depth.
  (https://arxiv.org/pdf/2003.04887.pdf).
  """

  def __init__(
      self,
      initializer: tf.keras.initializers.Initializer = 'ones',
      regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a scale layer.

    The layer output has the same shape as its input.

    Args:
      initializer: A `str` of initializer for the scalar weight.
      regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super().__init__(**kwargs)

    self._initializer = initializer
    self._regularizer = regularizer

    # The scalar weight is created eagerly, so no `build` step is required.
    self._scale = self.add_weight(
        name='scale',
        shape=[],
        dtype=self.dtype,
        initializer=self._initializer,
        regularizer=self._regularizer,
        trainable=True)

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    return {
        **super().get_config(),
        'initializer': self._initializer,
        'regularizer': self._regularizer,
    }

  def call(self, inputs):
    """Calls the layer with the given inputs."""
    # The weight is cast to the input dtype before the multiplication.
    return tf.cast(self._scale, inputs.dtype) * inputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class TemporalSoftmaxPool(tf.keras.layers.Layer):
  """Creates a network layer corresponding to temporal softmax pooling.

  This is useful for multi-class logits (used in e.g., Charades). Modified from
  AssembleNet Charades evaluation from:
  Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova.
  AssembleNet: Searching for Multi-Stream Neural Connectivity in Video
  Architectures.
  (https://arxiv.org/pdf/1905.13209.pdf).
  """

  def call(self, inputs):
    """Weights the inputs by a softmax taken over axis 1.

    Args:
      inputs: A `tf.Tensor` of rank 3, 4 or 5.

    Returns:
      A `tf.Tensor` with the same shape as `inputs`.
    """
    assert inputs.shape.rank in (3, 4, 5)

    # Logits are scaled by 1 / sqrt(size of axis 1) before the softmax.
    num_frames = tf.cast(tf.shape(inputs)[1], inputs.dtype)
    weights = tf.nn.softmax(inputs / tf.sqrt(num_frames), axis=1)
    return inputs * weights
@tf.keras.utils.register_keras_serializable(package='Vision')
class PositionalEncoding(tf.keras.layers.Layer):
  """Creates a network layer that adds a sinusoidal positional encoding.

  Positional encoding is incremented across frames, and is added to the input.
  The positional encoding is first weighted at 0 so that the network can choose
  to ignore it. This implements:
  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
  Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
  Attention Is All You Need.
  (https://arxiv.org/pdf/1706.03762.pdf).
  """

  def __init__(self,
               initializer: tf.keras.initializers.Initializer = 'zeros',
               cache_encoding: bool = False,
               state_prefix: Optional[str] = None,
               **kwargs):
    """Initializes positional encoding.

    The layer output has the same shape as its input.

    Args:
      initializer: A `str` of initializer for weighting the positional encoding.
      cache_encoding: A `bool`. If True, cache the positional encoding tensor
        after calling build. Otherwise, rebuild the tensor for every call.
        Setting this to False can be useful when we want to input a variable
        number of frames, so the positional encoding tensor can change shape.
      state_prefix: a prefix string to identify states.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super().__init__(**kwargs)

    prefix = '' if state_prefix is None else state_prefix

    self._initializer = initializer
    self._cache_encoding = cache_encoding
    self._pos_encoding = None
    # Trainable scalar that weights the encoding before it is added.
    self._rezero = Scale(initializer=initializer, name='rezero')
    self._state_prefix = prefix
    self._frame_count_name = f'{prefix}_pos_enc_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    return {
        **super().get_config(),
        'initializer': self._initializer,
        'cache_encoding': self._cache_encoding,
        'state_prefix': self._state_prefix,
    }

  def _positional_encoding(self,
                           num_positions: Union[int, tf.Tensor],
                           hidden_size: Union[int, tf.Tensor],
                           start_position: Union[int, tf.Tensor] = 0,
                           dtype: str = 'float32') -> tf.Tensor:
    """Creates a sequence of sinusoidal positional encoding vectors.

    Args:
      num_positions: the total number of positions (frames).
      hidden_size: the number of channels used for the hidden vectors.
      start_position: the start position. A rank-1 tensor is reduced to its
        first element.
      dtype: the dtype of the output tensor.

    Returns:
      The positional encoding tensor with shape [num_positions, hidden_size].
    """
    first = start_position
    if isinstance(first, tf.Tensor) and first.shape.rank == 1:
      first = first[0]

    # Calling `tf.range` with `dtype=tf.bfloat16` results in an error,
    # so we cast afterward.
    position_ids = tf.range(first, first + num_positions)
    position_col = tf.cast(position_ids, dtype)[:, tf.newaxis]

    channel_row = tf.range(hidden_size)[tf.newaxis, :]
    exponent = tf.cast(2 * (channel_row // 2), dtype)
    exponent = exponent / tf.cast(hidden_size, dtype)
    inverse_freq = 1. / tf.math.pow(10_000., exponent)

    phase = position_col * inverse_freq
    # Sines of the even columns come first, then cosines of the odd columns.
    return tf.concat(
        [tf.math.sin(phase[:, 0::2]), tf.math.cos(phase[:, 1::2])], axis=-1)

  def _get_pos_encoding(self,
                        input_shape: tf.Tensor,
                        frame_count: int = 0) -> tf.Tensor:
    """Calculates the positional encoding from the input shape.

    Args:
      input_shape: the shape of the input.
      frame_count: a count of frames that indicates the index of the first
        frame.

    Returns:
      The positional encoding tensor reshaped to
      [1, num_positions, 1, 1, hidden_size].
    """
    num_frames, num_channels = input_shape[1], input_shape[-1]
    encoding = self._positional_encoding(
        num_frames, num_channels, start_position=frame_count, dtype=self.dtype)
    return tf.reshape(encoding, [1, num_frames, 1, 1, num_channels])

  def build(self, input_shape):
    """Builds the layer with the given input shape.

    Args:
      input_shape: The input shape.

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    if self._cache_encoding:
      # Cached once here (starting at frame 0) and reused by every call.
      self._pos_encoding = self._get_pos_encoding(input_shape)

    super().build(input_shape)

  def call(
      self,
      inputs: tf.Tensor,
      states: Optional[States] = None,
      output_states: bool = True,
  ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Calls the layer with the given inputs.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s). Expected keys
        include `state_prefix + '_pos_enc_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

    Returns:
      An output `tf.Tensor` (and optionally the states if `output_states=True`).
    """
    # Work on a copy so the caller's dict is left untouched.
    out_states = {} if states is None else dict(states)

    # Keep a count of frames encountered across input iterations in
    # num_frames to be able to accurately update the positional encoding.
    num_frames = tf.shape(inputs)[1]
    frame_count = tf.cast(
        out_states.get(self._frame_count_name, [0]), tf.int32)
    out_states[self._frame_count_name] = frame_count + num_frames

    if self._cache_encoding:
      encoding = self._pos_encoding
    else:
      encoding = self._get_pos_encoding(
          tf.shape(inputs), frame_count=frame_count)

    encoding = self._rezero(tf.cast(encoding, inputs.dtype))
    outputs = inputs + encoding

    if output_states:
      return outputs, out_states
    return outputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class GlobalAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer with causal mode.

  Implements causal mode, which runs a cumulative sum (with `tf.cumsum`) across
  frames in the time dimension, allowing the use of a stream buffer. Sums any
  valid input state with the current input to allow state to accumulate over
  several iterations.
  """

  def __init__(self,
               keepdims: bool = False,
               causal: bool = False,
               state_prefix: Optional[str] = None,
               **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      causal: A `bool` of whether to run in causal mode with a cumulative sum
        across frames.
      state_prefix: a prefix string to identify states.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super().__init__(**kwargs)

    prefix = '' if state_prefix is None else state_prefix

    self._keepdims = keepdims
    self._causal = causal
    self._state_prefix = prefix
    self._state_name = f'{prefix}_pool_buffer'
    self._frame_count_name = f'{prefix}_pool_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    return {
        **super().get_config(),
        'keepdims': self._keepdims,
        'causal': self._causal,
        'state_prefix': self._state_prefix,
    }

  def call(self,
           inputs: tf.Tensor,
           states: Optional[States] = None,
           output_states: bool = True
           ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Calls the layer with the given inputs.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s).
        Expected keys include `state_prefix + '_pool_buffer'` and
        `state_prefix + '_pool_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

    Returns:
      An output `tf.Tensor` (and optionally the states if `output_states=True`).
      If `causal=True`, the output tensor will have shape
      `[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. We keep
      the frame dimension in this case to simulate a cumulative global average
      as if we are inputting one frame at a time. If `causal=False`, the output
      is equivalent to `tf.keras.layers.GlobalAveragePooling3D` with shape
      `[batch_size, 1, 1, 1, channels]` if `keepdims=True` (plus the optional
      buffer stored in `states`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    # Work on a copy so the caller's dict is left untouched.
    out_states = {} if states is None else dict(states)

    # Shape: [batch_size, 1, 1, 1, channels]
    buffer = out_states.get(self._state_name, None)
    if buffer is None:
      buffer = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype)
      out_states[self._state_name] = buffer

    # Keep a count of frames encountered across input iterations in
    # num_frames to be able to accurately take a cumulative average across
    # all frames when running in streaming mode
    num_frames = tf.shape(inputs)[1]
    frame_count = tf.cast(
        out_states.get(self._frame_count_name, tf.constant([0])), tf.int32)
    out_states[self._frame_count_name] = frame_count + num_frames

    if self._causal:
      # Take a mean of spatial dimensions to make computation more efficient.
      spatial_mean = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True)
      running_sum = tf.cumsum(spatial_mean, axis=1) + buffer

      # The last frame will be the value of the next state
      # Shape: [batch_size, 1, 1, 1, channels]
      out_states[self._state_name] = running_sum[:, -1:]

      # In causal mode, the divisor increments by 1 for every frame to
      # calculate cumulative averages instead of one global average
      divisors = tf.range(num_frames) + frame_count + 1
      divisors = tf.reshape(divisors, [1, num_frames, 1, 1, 1])

      # Shape: [batch_size, num_frames, 1, 1, channels]
      pooled = running_sum / tf.cast(divisors, running_sum.dtype)
    else:
      # In non-causal mode, we (optionally) sum across frames to take a
      # cumulative average across input iterations rather than individual
      # frames. If no buffer state is passed, this essentially becomes
      # regular global average pooling.
      # Shape: [batch_size, 1, 1, 1, channels]
      total = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True)
      spatial_size = tf.shape(inputs)[2] * tf.shape(inputs)[3]
      total = total / tf.cast(spatial_size, total.dtype)
      total = total + buffer

      # Shape: [batch_size, 1, 1, 1, channels]
      out_states[self._state_name] = total

      pooled = total / tf.cast(frame_count + num_frames, total.dtype)

    if not self._keepdims:
      pooled = tf.squeeze(pooled, axis=(1, 2, 3))

    if output_states:
      return pooled, out_states
    return pooled
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer pooling across spatial dimensions."""

  def __init__(self, keepdims: bool = False, **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super().__init__(**kwargs)
    self._keepdims = keepdims

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    return {**super().get_config(), 'keepdims': self._keepdims}

  def build(self, input_shape):
    """Builds the layer with the given input shape.

    Args:
      input_shape: The input shape.

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')
    super().build(input_shape)

  def call(self, inputs):
    """Averages a rank-5 input over axes 2 and 3.

    Args:
      inputs: An input `tf.Tensor` of rank 5.

    Returns:
      The mean over axes 2 and 3, keeping those axes when `keepdims=True`.

    Raises:
      ValueError: If the input does not have rank 5.
    """
    rank = inputs.shape.rank
    if rank != 5:
      raise ValueError(
          'Input should have rank {}, got {}'.format(5, rank))

    return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
class CausalConvMixin:
  """Mixin class to implement CausalConv for `tf.keras.layers.Conv` layers.

  The methods read `kernel_size`, `dilation_rate`, `strides`, `rank`, `filters`
  and `groups` from the instance, as well as `_use_buffered_input`, which the
  concrete class is expected to set.
  """

  @property
  def use_buffered_input(self) -> bool:
    """Whether the time dimension of the input is padded beforehand."""
    return self._use_buffered_input

  @use_buffered_input.setter
  def use_buffered_input(self, variable: bool):
    self._use_buffered_input = variable

  def _causal_effective_kernel_size(self) -> List[int]:
    """Returns the kernel extent of each dimension with dilation applied."""
    return [
        (self.kernel_size[i] +
         (self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1))
        for i in range(self.rank)
    ]

  def _compute_buffered_causal_padding(self,
                                       inputs: tf.Tensor,
                                       use_buffered_input: bool = False,
                                       time_axis: int = 1,
                                       ) -> List[List[int]]:
    """Calculates padding for 'causal' option for conv layers.

    Args:
      inputs: An input `tf.Tensor` to be padded.
      use_buffered_input: A `bool`. If True, use 'valid' padding along the time
        dimension. This should be set when applying the stream buffer.
      time_axis: An `int` of the axis of the time dimension.

    Returns:
      A list of paddings for `tf.pad`.

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    input_shape = tf.shape(inputs)[1:-1]
    kernel_size_effective = self._causal_effective_kernel_size()

    # The first dimension takes the full kernel extent minus one; the others
    # take whatever the kernel needs beyond the stride overlap.
    pad_total = [kernel_size_effective[0] - 1]
    for i in range(1, self.rank):
      overlap = (input_shape[i] - 1) % self.strides[i] + 1
      pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))

    pad_beg = [total // 2 for total in pad_total]
    pad_end = [total - beg for total, beg in zip(pad_total, pad_beg)]
    padding = [[beg, end] for beg, end in zip(pad_beg, pad_end)]
    # No padding on the leading (batch) and trailing (channel) axes.
    padding = [[0, 0]] + padding + [[0, 0]]

    if use_buffered_input:
      padding[time_axis] = [0, 0]
    else:
      # Causal: move all of the time padding in front of the sequence.
      padding[time_axis] = [padding[time_axis][0] + padding[time_axis][1], 0]
    return padding

  def _causal_validate_init(self):
    """Validates the Conv layer initial configuration.

    Raises:
      ValueError: If `filters` is not divisible by `groups`, or if
        `kernel_size` contains a 0.
    """
    # Overriding this method is meant to circumvent unnecessary errors when
    # using causal padding.
    if self.filters is not None and self.filters % self.groups != 0:
      raise ValueError(
          'The number of filters must be evenly divisible by the number of '
          'groups. Received: groups={}, filters={}'.format(
              self.groups, self.filters))

    if not all(self.kernel_size):
      raise ValueError('The argument `kernel_size` cannot contain 0(s). '
                       'Received: %s' % (self.kernel_size,))

  def _buffered_spatial_output_shape(self, spatial_output_shape: List[int]):
    """Computes the spatial output shape from the input shape.

    Args:
      spatial_output_shape: The spatial output shape computed by the conv
        layer, with the time dimension first. It is updated in place.

    Returns:
      `spatial_output_shape`, with the causal time padding subtracted from the
      first entry when buffered input is used and that entry is known.
    """
    # When buffer padding, use 'valid' padding across time. The output shape
    # across time should be the input shape minus any padding, assuming
    # the stride across time is 1.
    if self._use_buffered_input and spatial_output_shape[0] is not None:
      # The causal padding along time is the effective kernel extent minus
      # one. It is computed directly rather than through a placeholder tensor,
      # which cannot be created when another dimension is unknown (None).
      spatial_output_shape[0] -= self._causal_effective_kernel_size()[0] - 1
    return spatial_output_shape
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin):
  """Conv2D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes conv2d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    return {
        **super().get_config(),
        'use_buffered_input': self._use_buffered_input,
    }

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    buffered = self._use_buffered_input
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=buffered)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegates to the mixin's validation.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    base_shape = super()._spatial_output_shape(spatial_input_shape)
    return self._buffered_spatial_output_shape(base_shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin):
  """DepthwiseConv2D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes depthwise conv2d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(DepthwiseConv2D, self).__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

    # Causal padding is unsupported by default for DepthwiseConv2D,
    # so we resort to valid padding internally. However, we handle
    # causal padding as a special case with `self._is_causal`, which is
    # defined by the super class.
    if self.padding == 'causal':
      self.padding = 'valid'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization.

    `__init__` replaces `padding='causal'` with `'valid'`, so the padding
    reported by the base class is restored to `'causal'` here. Without this, a
    layer rebuilt from its config would no longer apply causal padding.
    """
    config = {
        'use_buffered_input': self._use_buffered_input,
    }
    if self._is_causal:
      config['padding'] = 'causal'
    base_config = super(DepthwiseConv2D, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Calls the layer with the given inputs."""
    if self._is_causal:
      # The convolution itself runs with 'valid' padding, so the causal
      # padding has to be applied to the input explicitly.
      inputs = tf.pad(inputs, self._compute_causal_padding(inputs))
    return super(DepthwiseConv2D, self).call(inputs)

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    shape = super(DepthwiseConv2D, self)._spatial_output_shape(
        spatial_input_shape)
    return self._buffered_spatial_output_shape(shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin):
  """Conv3D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes conv3d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    return {
        **super().get_config(),
        'use_buffered_input': self._use_buffered_input,
    }

  def call(self, inputs):
    """Call the layer with the given inputs."""
    # Note: tf.nn.conv3d with depthwise kernels on CPU is currently only
    # supported when compiling with TF graph (XLA) using tf.function, so it
    # is compiled by default here (b/186463870).
    compiled_call = tf.function(super().call, jit_compile=True)
    return compiled_call(inputs)

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    buffered = self._use_buffered_input
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=buffered)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegates to the mixin's validation.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    base_shape = super()._spatial_output_shape(spatial_input_shape)
    return self._buffered_spatial_output_shape(base_shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialPyramidPooling(tf.keras.layers.Layer):
"""Implements the Atrous Spatial Pyramid Pooling.
References:
[Rethinking Atrous Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1706.05587.pdf)
[Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
"""
def __init__(
self,
output_channels: int,
dilation_rates: List[int],
pool_kernel_size: Optional[List[int]] = None,
use_sync_bn: bool = False,
batchnorm_momentum: float = 0.99,
batchnorm_epsilon: float = 0.001,
activation: str = 'relu',
dropout: float = 0.5,
kernel_initializer: str = 'GlorotUniform',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
interpolation: str = 'bilinear',
use_depthwise_convolution: bool = False,
**kwargs):
"""Initializes `SpatialPyramidPooling`.
Args:
output_channels: Number of channels produced by SpatialPyramidPooling.
dilation_rates: A list of integers for parallel dilated conv.
pool_kernel_size: A list of integers or None. If None, global average
pooling is applied, otherwise an average pooling of pool_kernel_size is
applied.
use_sync_bn: A bool, whether or not to use sync batch normalization.
batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
0.99.
batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
0.001.
activation: A `str` for type of activation to be used. Defaults to 'relu'.
dropout: A float for the dropout rate before output. Defaults to 0.5.
kernel_initializer: Kernel initializer for conv layers. Defaults to
`glorot_uniform`.
kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
interpolation: The interpolation method for upsampling. Defaults to
`bilinear`.
use_depthwise_convolution: Allows spatial pooling to be separable
depthwise convolusions. [Encoder-Decoder with Atrous Separable
Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1802.02611.pdf)
**kwargs: Other keyword arguments for the layer.
"""
super().__init__(**kwargs)
self._output_channels = output_channels
self._dilation_rates = dilation_rates
self._use_sync_bn = use_sync_bn
self._batchnorm_momentum = batchnorm_momentum
self._batchnorm_epsilon = batchnorm_epsilon
self._activation = activation
self._dropout = dropout
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._interpolation = interpolation
self._pool_kernel_size = pool_kernel_size
self._use_depthwise_convolution = use_depthwise_convolution
self._activation_fn = tf_utils.get_activation(activation)
if self._use_sync_bn:
self._bn_op = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._bn_op = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
def build(self, input_shape):
height = input_shape[1]
width = input_shape[2]
channels = input_shape[3]
self.aspp_layers = []
conv1 = tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=(1, 1),
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
use_bias=False)
norm1 = self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
self.aspp_layers.append([conv1, norm1])
for dilation_rate in self._dilation_rates:
leading_layers = []
kernel_size = (3, 3)
if self._use_depthwise_convolution:
leading_layers += [
tf.keras.layers.DepthwiseConv2D(
depth_multiplier=1,
kernel_size=kernel_size,
padding='same',
depthwise_regularizer=self._kernel_regularizer,
depthwise_initializer=self._kernel_initializer,
dilation_rate=dilation_rate,
use_bias=False)
]
kernel_size = (1, 1)
conv_dilation = leading_layers + [
tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=kernel_size,
padding='same',
kernel_regularizer=self._kernel_regularizer,
kernel_initializer=self._kernel_initializer,
dilation_rate=dilation_rate,
use_bias=False)
]
norm_dilation = self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
self.aspp_layers.append(conv_dilation + [norm_dilation])
if self._pool_kernel_size is None:
pooling = [
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Reshape((1, 1, channels))
]
else:
pooling = [tf.keras.layers.AveragePooling2D(self._pool_kernel_size)]
conv2 = tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=(1, 1),
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
use_bias=False)
norm2 = self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
self.aspp_layers.append(pooling + [conv2, norm2])
self._resizing_layer = tf.keras.layers.Resizing(
height, width, interpolation=self._interpolation, dtype=tf.float32)
self._projection = [
tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=(1, 1),
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
use_bias=False),
self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
]
self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
self._concat_layer = tf.keras.layers.Concatenate(axis=-1)
def call(self,
inputs: tf.Tensor,
training: Optional[bool] = None) -> tf.Tensor:
if training is None:
training = tf.keras.backend.learning_phase()
result = []
for i, layers in enumerate(self.aspp_layers):
x = inputs
for layer in layers:
# Apply layers sequentially.
x = layer(x, training=training)
x = self._activation_fn(x)
# Apply resize layer to the end of the last set of layers.
if i == len(self.aspp_layers) - 1:
x = self._resizing_layer(x)
result.append(tf.cast(x, inputs.dtype))
x = self._concat_layer(result)
for layer in self._projection:
x = layer(x, training=training)
x = self._activation_fn(x)
return self._dropout_layer(x)
def get_config(self):
  """Returns the parent config extended with this layer's own settings.

  Keys set by this layer override keys of the same name coming from the
  parent class.
  """
  layer_settings = {
      'output_channels': self._output_channels,
      'dilation_rates': self._dilation_rates,
      'pool_kernel_size': self._pool_kernel_size,
      'use_sync_bn': self._use_sync_bn,
      'batchnorm_momentum': self._batchnorm_momentum,
      'batchnorm_epsilon': self._batchnorm_epsilon,
      'activation': self._activation,
      'dropout': self._dropout,
      'kernel_initializer': self._kernel_initializer,
      'kernel_regularizer': self._kernel_regularizer,
      'interpolation': self._interpolation,
  }
  merged = dict(super().get_config())
  merged.update(layer_settings)
  return merged
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_layers."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.modeling.layers import nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
  """Unit tests for the layers defined in `nn_layers`."""

  def test_scale(self):
    """Checks that a `Scale` layer initialized to 10 turns 3 into 30."""
    scale = nn_layers.Scale(initializer=tf.keras.initializers.constant(10.))
    output = scale(3.)
    self.assertAllEqual(output, 30.)

  def test_temporal_softmax_pool(self):
    """Compares `TemporalSoftmaxPool` on a 4-frame ramp to golden values."""
    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    layer = nn_layers.TemporalSoftmaxPool()
    output = layer(inputs)
    self.assertAllClose(
        output,
        [[[[[0.10153633]]],
          [[[0.33481020]]],
          [[[0.82801306]]],
          [[[1.82021690]]]]])

  def test_positional_encoding(self):
    """Checks cached and uncached `PositionalEncoding` give equal outputs."""
    pos_encoding = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=False)
    pos_encoding_cached = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=True)

    inputs = tf.ones([1, 4, 1, 1, 3])
    outputs, _ = pos_encoding(inputs)
    outputs_cached, _ = pos_encoding_cached(inputs)

    expected = tf.constant(
        [[[[[1.0000000, 1.0000000, 2.0000000]]],
          [[[1.8414710, 1.0021545, 1.5403023]]],
          [[[1.9092975, 1.0043088, 0.5838531]]],
          [[[1.1411200, 1.0064633, 0.0100075]]]]])

    self.assertEqual(outputs.shape, expected.shape)
    self.assertAllClose(outputs, expected)

    self.assertEqual(outputs.shape, outputs_cached.shape)
    self.assertAllClose(outputs, outputs_cached)

    # No assertion follows: this only exercises the uncached layer with a
    # different number of frames (5 instead of 4).
    inputs = tf.ones([1, 5, 1, 1, 3])
    _ = pos_encoding(inputs)

  def test_positional_encoding_bfloat16(self):
    """Checks `PositionalEncoding` on bfloat16 inputs against golden values."""
    pos_encoding = nn_layers.PositionalEncoding(initializer='ones')

    inputs = tf.ones([1, 4, 1, 1, 3], dtype=tf.bfloat16)
    outputs, _ = pos_encoding(inputs)

    expected = tf.constant(
        [[[[[1.0000000, 1.0000000, 2.0000000]]],
          [[[1.8414710, 1.0021545, 1.5403023]]],
          [[[1.9092975, 1.0043088, 0.5838531]]],
          [[[1.1411200, 1.0064633, 0.0100075]]]]])

    self.assertEqual(outputs.shape, expected.shape)
    self.assertAllClose(outputs, expected)

  def test_global_average_pool_basic(self):
    """Checks `GlobalAveragePool3D(keepdims=True)` on an all-ones input."""
    pool = nn_layers.GlobalAveragePool3D(keepdims=True)

    inputs = tf.ones([1, 2, 3, 4, 1])
    outputs = pool(inputs, output_states=False)

    expected = tf.ones([1, 1, 1, 1, 1])

    self.assertEqual(outputs.shape, expected.shape)
    self.assertAllEqual(outputs, expected)

  def test_positional_encoding_stream(self):
    """Checks chunked `PositionalEncoding` calls match a single full call."""
    pos_encoding = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=False)

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 1, 1, 3])
    expected, _ = pos_encoding(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      # Start every split configuration from an empty state dict and thread
      # the returned states through the consecutive calls.
      states = {}
      predicted = []
      for frame in frames:
        output, states = pos_encoding(frame, states=states)
        predicted.append(output)
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(predicted, [[[[[1.0000000, 1.0000000, 2.0000000]]],
                                       [[[2.8414710, 2.0021544, 2.5403023]]],
                                       [[[3.9092975, 3.0043090, 2.5838532]]],
                                       [[[4.1411200, 4.0064630, 3.0100074]]]]])

  def test_global_average_pool_keras(self):
    """Compares `GlobalAveragePool3D` with Keras `GlobalAveragePooling3D`."""
    pool = nn_layers.GlobalAveragePool3D(keepdims=False)
    keras_pool = tf.keras.layers.GlobalAveragePooling3D()

    inputs = 10 * tf.random.normal([1, 2, 3, 4, 1])

    outputs = pool(inputs, output_states=False)
    keras_output = keras_pool(inputs)

    self.assertAllEqual(outputs.shape, keras_output.shape)
    self.assertAllClose(outputs, keras_output)

  def test_stream_global_average_pool(self):
    """Checks the last chunked non-causal pool output equals the full one."""
    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=False)

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 2, 3])
    expected, _ = gap(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = None
      # Only the output of the final chunk is kept and compared.
      for frame in frames:
        predicted, states = gap(frame, states=states)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(
          predicted,
          [[[[[2.5, 2.5, 2.5]]]]])

  def test_causal_stream_global_average_pool(self):
    """Checks chunked causal pool outputs, concatenated, equal the full one."""
    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=True)

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 2, 3])
    expected, _ = gap(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = []
      for frame in frames:
        x, states = gap(frame, states=states)
        predicted.append(x)
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(
          predicted,
          [[[[[1.0, 1.0, 1.0]]],
            [[[1.5, 1.5, 1.5]]],
            [[[2.0, 2.0, 2.0]]],
            [[[2.5, 2.5, 2.5]]]]])

  def test_spatial_average_pool(self):
    """Checks `SpatialAveragePool3D` output shape and values on a ramp."""
    pool = nn_layers.SpatialAveragePool3D(keepdims=True)

    inputs = tf.range(64, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 4, 4, 1])

    output = pool(inputs)

    self.assertEqual(output.shape, [1, 4, 1, 1, 1])
    self.assertAllClose(
        output,
        [[[[[8.50]]],
          [[[24.5]]],
          [[[40.5]]],
          [[[56.5]]]]])

  def test_conv2d_causal(self):
    """Checks causal `Conv2D` with pre-padded and with raw inputs."""
    conv2d = nn_layers.Conv2D(
        filters=3,
        kernel_size=(3, 3),
        strides=(1, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 4, 2, 3])

    # With `use_buffered_input=True` the test pads axis 1 itself (two leading
    # zero rows) before calling the layer.
    paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv2d(padded_inputs)

    expected = tf.constant(
        [[[[6.0, 6.0, 6.0]],
          [[12., 12., 12.]],
          [[18., 18., 18.]],
          [[18., 18., 18.]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Same expectation when the layer receives the unpadded input.
    conv2d.use_buffered_input = False
    predicted = conv2d(inputs)

    self.assertFalse(conv2d.use_buffered_input)
    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_depthwise_conv2d_causal(self):
    """Checks causal `DepthwiseConv2D` with pre-padded and with raw inputs."""
    conv2d = nn_layers.DepthwiseConv2D(
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='causal',
        use_buffered_input=True,
        depthwise_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 2, 2, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv2d(padded_inputs)

    expected = tf.constant(
        [[[[2., 2., 2.],
           [2., 2., 2.]],
          [[4., 4., 4.],
           [4., 4., 4.]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Same expectation when the layer receives the unpadded input.
    conv2d.use_buffered_input = False
    predicted = conv2d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_conv3d_causal(self):
    """Checks causal `Conv3D` with pre-padded and with raw inputs."""
    conv3d = nn_layers.Conv3D(
        filters=3,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 2, 4, 4, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv3d(padded_inputs)

    expected = tf.constant(
        [[[[[27., 27., 27.],
            [18., 18., 18.]],
           [[18., 18., 18.],
            [12., 12., 12.]]],
          [[[54., 54., 54.],
            [36., 36., 36.]],
           [[36., 36., 36.],
            [24., 24., 24.]]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Same expectation when the layer receives the unpadded input.
    conv3d.use_buffered_input = False
    predicted = conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_depthwise_conv3d_causal(self):
    """Checks causal `Conv3D` with `groups=3` and its spatial output shape."""
    conv3d = nn_layers.Conv3D(
        filters=3,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
        groups=3,
    )

    inputs = tf.ones([1, 2, 4, 4, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv3d(padded_inputs)

    expected = tf.constant(
        [[[[[9.0, 9.0, 9.0],
            [6.0, 6.0, 6.0]],
           [[6.0, 6.0, 6.0],
            [4.0, 4.0, 4.0]]],
          [[[18.0, 18.0, 18.0],
            [12., 12., 12.]],
           [[12., 12., 12.],
            [8., 8., 8.]]]]])

    output_shape = conv3d._spatial_output_shape([4, 4, 4])
    self.assertAllClose(output_shape, [2, 2, 2])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Same expectation when the layer receives the unpadded input.
    conv3d.use_buffered_input = False
    predicted = conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_conv3d_causal_padding_2d(self):
    """Test to ensure causal padding works like standard padding."""
    # The kernel has size 1 on the first (non-batch) axis, so the causal layer
    # is compared to a Keras `Conv3D` that uses 'same' padding.
    conv3d = nn_layers.Conv3D(
        filters=1,
        kernel_size=(1, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=False,
        kernel_initializer='ones',
        use_bias=False,
    )

    keras_conv3d = tf.keras.layers.Conv3D(
        filters=1,
        kernel_size=(1, 3, 3),
        strides=(1, 2, 2),
        padding='same',
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 1, 4, 4, 1])

    predicted = conv3d(inputs)
    expected = keras_conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    self.assertAllClose(predicted,
                        [[[[[9.],
                            [6.]],
                           [[6.],
                            [4.]]]]])

  def test_conv3d_causal_padding_1d(self):
    """Test to ensure causal padding works like standard padding."""
    # The kernel only spans the first (non-batch) axis, so the layer is
    # compared to a Keras `Conv1D` with 'causal' padding on the squeezed input.
    conv3d = nn_layers.Conv3D(
        filters=1,
        kernel_size=(3, 1, 1),
        strides=(2, 1, 1),
        padding='causal',
        use_buffered_input=False,
        kernel_initializer='ones',
        use_bias=False,
    )

    keras_conv1d = tf.keras.layers.Conv1D(
        filters=1,
        kernel_size=3,
        strides=2,
        padding='causal',
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 4, 1, 1, 1])

    predicted = conv3d(inputs)
    expected = keras_conv1d(tf.squeeze(inputs, axis=[2, 3]))
    expected = tf.reshape(expected, [1, 2, 1, 1, 1])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    self.assertAllClose(predicted,
                        [[[[[1.]]],
                          [[[3.]]]]])

  @parameterized.parameters(
      (None, []),
      (None, [6, 12, 18]),
      ([32, 32], [6, 12, 18]),
  )
  def test_aspp(self, pool_kernel_size, dilation_rates):
    """Checks the `SpatialPyramidPooling` output shape on a symbolic input.

    Args:
      pool_kernel_size: Value passed as `pool_kernel_size` to the layer.
      dilation_rates: Value passed as `dilation_rates` to the layer.
    """
    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
    layer = nn_layers.SpatialPyramidPooling(
        output_channels=256,
        dilation_rates=dilation_rates,
        pool_kernel_size=pool_kernel_size)
    output = layer(inputs)
    self.assertAllEqual([None, 64, 64, 256], output.shape)
# Run the tests of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI aligner."""
from typing import Mapping
import tensorflow as tf
from official.vision.ops import spatial_transform_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIAligner(tf.keras.layers.Layer):
  """Performs ROIAlign for the second stage processing."""

  def __init__(self, crop_size: int = 7, sample_offset: float = 0.5, **kwargs):
    """Initializes a ROI aligner.

    Args:
      crop_size: An `int` of the output size of the cropped features.
      sample_offset: A `float` in [0, 1] of the subpixel sample offset.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    # Stored before the base initializer runs; this dict is the single source
    # of truth for `call` and `get_config`.
    self._config_dict = {
        'crop_size': crop_size,
        'sample_offset': sample_offset,
    }
    super().__init__(**kwargs)

  def call(self,
           features: Mapping[str, tf.Tensor],
           boxes: tf.Tensor,
           training: bool = None):
    """Crops and resizes multilevel features for the given boxes.

    Args:
      features: A dictionary with key as pyramid level and value as features.
        The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
      training: A `bool` of whether it is in training mode. Not used by this
        layer.

    Returns:
      A 5-D `tf.Tensor` representing feature crop of shape
      [batch_size, num_boxes, crop_size, crop_size, num_filters].
    """
    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        features,
        boxes,
        output_size=self._config_dict['crop_size'],
        sample_offset=self._config_dict['sample_offset'])
    return roi_features

  def get_config(self):
    """Returns the constructor arguments of this layer as a new `dict`."""
    # Hand out a copy so that callers mutating the returned config cannot
    # change the settings this layer uses in `call`.
    return dict(self._config_dict)

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Builds a layer from `config`; `custom_objects` is accepted but unused."""
    return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_aligner.py."""
# Import libraries
import tensorflow as tf
from official.vision.modeling.layers import roi_aligner
class MultilevelROIAlignerTest(tf.test.TestCase):
  """Tests for `roi_aligner.MultilevelROIAligner`."""

  def test_serialize_deserialize(self):
    """Checks `get_config` contents and a `from_config` round trip."""
    init_args = {'crop_size': 7, 'sample_offset': 0.5}
    aligner = roi_aligner.MultilevelROIAligner(**init_args)

    # The config must be exactly the constructor arguments.
    self.assertEqual(aligner.get_config(), dict(init_args))

    # A layer rebuilt from that config must report the same config again.
    restored = roi_aligner.MultilevelROIAligner.from_config(
        aligner.get_config())
    self.assertAllEqual(aligner.get_config(), restored.get_config())
# Run the tests of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI generator."""
from typing import Optional, Mapping
# Import libraries
import tensorflow as tf
from official.vision.ops import box_ops
from official.vision.ops import nms
def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
                             raw_scores: Mapping[str, tf.Tensor],
                             anchor_boxes: Mapping[str, tf.Tensor],
                             image_shape: tf.Tensor,
                             pre_nms_top_k: int = 2000,
                             pre_nms_score_threshold: float = 0.0,
                             pre_nms_min_size_threshold: float = 0.0,
                             nms_iou_threshold: float = 0.7,
                             num_proposals: int = 1000,
                             use_batched_nms: bool = False,
                             decode_boxes: bool = True,
                             clip_boxes: bool = True,
                             apply_sigmoid_to_score: bool = True):
  """Proposes RoIs given a group of candidates from different FPN levels.

  Processing steps:
  1. For every level, in sorted key order:
    a. Optionally turn the logits into scores with a sigmoid.
    b. Optionally decode the boxes with the anchors.
    c. Optionally clip the boxes to the image.
    d. Optionally filter boxes using the minimal size threshold.
    e. Either run NMS (batched or sorted/padded, the latter preceded by score
       filtering and a pre-NMS top k), or, when `nms_iou_threshold` is not
       positive, only keep the top scoring boxes.
  2. Concatenate the per-level results along the box axis.
  3. Keep the overall top scoring boxes.

  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors]. The spatial and anchor
      dimensions are read from the static shape.
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor box tensors that can be reshaped to
      [batch_size, feature_h * feature_w * num_anchors, 4].
    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
      are [height, width] of the scaled image.
    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
      before applying NMS. Default: 2000.
    pre_nms_score_threshold: A `float` representing the minimal box score to
      keep before applying NMS. Default: 0, no filtering is applied.
    pre_nms_min_size_threshold: A `float` representing the minimal box size in
      each side (w.r.t. the scaled image) to keep before applying NMS.
      Default: 0, no filtering is applied.
    nms_iou_threshold: A `float` representing the IoU threshold used for NMS.
      If not greater than 0.0, no NMS is applied. Default: 0.7.
    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
      after applying NMS. Default: 1000.
    use_batched_nms: A `bool` indicating whether NMS is applied in batch using
      `tf.image.combined_non_max_suppression`. Default is False.
    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
      using `anchor_boxes`. Default is True.
    clip_boxes: A `bool` indicating whether boxes are first clipped to the
      scaled image size before applying NMS. Default is True.
    apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
      `raw_scores` before applying NMS. Default is True.

  Returns:
    selected_rois: A `tf.Tensor` with the box coordinates of the selected
      proposals w.r.t. the scaled image; at most `num_proposals` boxes per
      image.
    selected_roi_scores: A `tf.Tensor` with the scores of the selected
      proposals. The concatenated per-level scores are rank 2, so this is
      presumably [batch_size, num_proposals] -- TODO confirm against
      `box_ops.top_k_boxes`.
  """
  with tf.name_scope('multilevel_propose_rois'):
    rois_per_level = []
    scores_per_level = []
    # [batch_size, 2] -> [batch_size, 1, 2].
    image_shape = tf.expand_dims(image_shape, axis=1)

    for level in sorted(raw_scores.keys()):
      with tf.name_scope('level_%s' % level):
        level_logits = raw_scores[level]
        _, feature_h, feature_w, anchors_per_location = (
            level_logits.get_shape().as_list())
        num_boxes = feature_h * feature_w * anchors_per_location

        # Flatten the spatial and anchor dimensions into one box axis.
        scores = tf.reshape(level_logits, [-1, num_boxes])
        boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
        anchors = tf.reshape(anchor_boxes[level], [-1, num_boxes, 4])
        anchors = tf.cast(anchors, dtype=scores.dtype)

        if apply_sigmoid_to_score:
          scores = tf.sigmoid(scores)
        if decode_boxes:
          boxes = box_ops.decode_boxes(boxes, anchors)
        if clip_boxes:
          boxes = box_ops.clip_boxes(boxes, image_shape)
        if pre_nms_min_size_threshold > 0.0:
          boxes, scores = box_ops.filter_boxes(
              boxes, scores, image_shape, pre_nms_min_size_threshold)

        # A level cannot contribute more boxes than it has.
        pre_nms_k = min(num_boxes, pre_nms_top_k)
        post_nms_k = min(num_boxes, num_proposals)

        if not nms_iou_threshold > 0.0:
          # NMS disabled: simply keep the best scoring boxes of this level.
          level_rois, level_roi_scores = box_ops.top_k_boxes(
              boxes, scores, k=post_nms_k)
        elif use_batched_nms:
          level_rois, level_roi_scores, _, _ = (
              tf.image.combined_non_max_suppression(
                  tf.expand_dims(boxes, axis=2),
                  tf.expand_dims(scores, axis=-1),
                  max_output_size_per_class=pre_nms_k,
                  max_total_size=post_nms_k,
                  iou_threshold=nms_iou_threshold,
                  score_threshold=pre_nms_score_threshold,
                  pad_per_class=False,
                  clip_boxes=False))
        else:
          if pre_nms_score_threshold > 0.0:
            boxes, scores = box_ops.filter_boxes_by_scores(
                boxes, scores, pre_nms_score_threshold)
          boxes, scores = box_ops.top_k_boxes(boxes, scores, k=pre_nms_k)
          # Note the (scores, boxes) order of this op's arguments and outputs.
          level_roi_scores, level_rois = (
              nms.sorted_non_max_suppression_padded(
                  scores,
                  boxes,
                  max_output_size=post_nms_k,
                  iou_threshold=nms_iou_threshold))

        rois_per_level.append(level_rois)
        scores_per_level.append(level_roi_scores)

    all_rois = tf.concat(rois_per_level, axis=1)
    all_roi_scores = tf.concat(scores_per_level, axis=1)

    with tf.name_scope('top_k_rois'):
      _, num_valid_rois = all_roi_scores.get_shape().as_list()
      overall_top_k = min(num_valid_rois, num_proposals)
      selected_rois, selected_roi_scores = box_ops.top_k_boxes(
          all_rois, all_roi_scores, k=overall_top_k)

    return selected_rois, selected_roi_scores
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIGenerator(tf.keras.layers.Layer):
  """Proposes RoIs for the second stage processing."""

  def __init__(self,
               pre_nms_top_k: int = 2000,
               pre_nms_score_threshold: float = 0.0,
               pre_nms_min_size_threshold: float = 0.0,
               nms_iou_threshold: float = 0.7,
               num_proposals: int = 1000,
               test_pre_nms_top_k: int = 1000,
               test_pre_nms_score_threshold: float = 0.0,
               test_pre_nms_min_size_threshold: float = 0.0,
               test_nms_iou_threshold: float = 0.7,
               test_num_proposals: int = 1000,
               use_batched_nms: bool = False,
               **kwargs):
    """Initializes a ROI generator.

    The ROI generator transforms the raw predictions from RPN to ROIs.

    Args:
      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
        before applying NMS.
      pre_nms_score_threshold: A `float` of the score threshold to apply before
        applying NMS. Proposals whose scores are below this threshold are
        thrown away.
      pre_nms_min_size_threshold: A `float` of the threshold of each side of the
        box (w.r.t. the scaled image). Proposals whose sides are below this
        threshold are thrown away.
      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
      num_proposals: An `int` of the final number of proposals to generate.
      test_pre_nms_top_k: An `int` of the number of top scores proposals to be
        kept before applying NMS in testing.
      test_pre_nms_score_threshold: A `float` of the score threshold to apply
        before applying NMS in testing. Proposals whose scores are below this
        threshold are thrown away.
      test_pre_nms_min_size_threshold: A `float` of the threshold of each side
        of the box (w.r.t. the scaled image) in testing. Proposals whose sides
        are below this threshold are thrown away.
      test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
        testing.
      test_num_proposals: An `int` of the final number of proposals to generate
        in testing.
      use_batched_nms: A `bool` of whether or not use
        `tf.image.combined_non_max_suppression`.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    # Every training-time key has a `test_`-prefixed counterpart; `call`
    # relies on this naming to pick the right set of values.
    self._config_dict = {
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
        'nms_iou_threshold': nms_iou_threshold,
        'num_proposals': num_proposals,
        'test_pre_nms_top_k': test_pre_nms_top_k,
        'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
        'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
        'test_nms_iou_threshold': test_nms_iou_threshold,
        'test_num_proposals': test_num_proposals,
        'use_batched_nms': use_batched_nms,
    }
    super().__init__(**kwargs)

  def call(self,
           raw_boxes: Mapping[str, tf.Tensor],
           raw_scores: Mapping[str, tf.Tensor],
           anchor_boxes: Mapping[str, tf.Tensor],
           image_shape: tf.Tensor,
           training: Optional[bool] = None):
    """Proposes RoIs given a group of candidates from different FPN levels.

    The following describes the steps:
      1. For each individual level:
        a. Apply sigmoid transform.
        b. Decode boxes.
        c. Clip boxes.
        d. Filter small boxes and those fall outside image if specified.
        e. Apply pre-NMS filtering including pre-NMS top k and score
           thresholding.
        f. Apply NMS.
      2. Aggregate post-NMS boxes from each level.
      3. Apply an overall top k to generate the final selected RoIs.

    Args:
      raw_boxes: A `dict` with keys representing FPN levels and values
        representing box tensors of shape
        [batch, feature_h, feature_w, num_anchors * 4].
      raw_scores: A `dict` with keys representing FPN levels and values
        representing logit tensors of shape
        [batch, feature_h, feature_w, num_anchors].
      anchor_boxes: A `dict` with keys representing FPN levels and values
        representing anchor box tensors of shape
        [batch, feature_h * feature_w * num_anchors, 4].
      image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
        are [height, width] of the scaled image.
      training: A `bool` that indicates whether it is in training mode. Any
        falsy value, including the default `None`, selects the `test_*`
        settings.

    Returns:
      roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
        ROIs in the scaled image coordinate.
      roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
        proposed ROIs.
    """
    settings = self._config_dict
    # Training reads the plain keys, inference the `test_`-prefixed ones.
    prefix = '' if training else 'test_'
    roi_boxes, roi_scores = _multilevel_propose_rois(
        raw_boxes,
        raw_scores,
        anchor_boxes,
        image_shape,
        pre_nms_top_k=settings[prefix + 'pre_nms_top_k'],
        pre_nms_score_threshold=settings[prefix + 'pre_nms_score_threshold'],
        pre_nms_min_size_threshold=(
            settings[prefix + 'pre_nms_min_size_threshold']),
        nms_iou_threshold=settings[prefix + 'nms_iou_threshold'],
        num_proposals=settings[prefix + 'num_proposals'],
        use_batched_nms=settings['use_batched_nms'],
        decode_boxes=True,
        clip_boxes=True,
        apply_sigmoid_to_score=True)
    return roi_boxes, roi_scores

  def get_config(self):
    """Returns the constructor arguments of this layer as a new `dict`."""
    # Hand out a copy so that callers mutating the returned config cannot
    # change the settings this layer uses in `call`.
    return dict(self._config_dict)

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Builds a layer from `config`; `custom_objects` is accepted but unused."""
    return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI sampler."""
# Import libraries
import tensorflow as tf
from official.vision.modeling.layers import box_sampler
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
@tf.keras.utils.register_keras_serializable(package='Vision')
class ROISampler(tf.keras.layers.Layer):
"""Samples ROIs and assigns targets to the sampled ROIs."""
def __init__(self,
mix_gt_boxes: bool = True,
num_sampled_rois: int = 512,
foreground_fraction: float = 0.25,
foreground_iou_threshold: float = 0.5,
background_iou_high_threshold: float = 0.5,
background_iou_low_threshold: float = 0,
skip_subsampling: bool = False,
**kwargs):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
proposed ROIs.
num_sampled_rois: An `int` of the number of sampled ROIs per image.
foreground_fraction: A `float` in [0, 1], what percentage of proposed ROIs
should be sampled from the foreground boxes.
foreground_iou_threshold: A `float` that represents the IoU threshold for
a box to be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
skip_subsampling: a bool that determines if we want to skip the sampling
procedure than balances the fg/bg classes. Used for upper frcnn layers
in cascade RCNN.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'mix_gt_boxes': mix_gt_boxes,
'num_sampled_rois': num_sampled_rois,
'foreground_fraction': foreground_fraction,
'foreground_iou_threshold': foreground_iou_threshold,
'background_iou_high_threshold': background_iou_high_threshold,
'background_iou_low_threshold': background_iou_low_threshold,
'skip_subsampling': skip_subsampling,
}
self._sim_calc = iou_similarity.IouSimilarity()
self._box_matcher = box_matcher.BoxMatcher(
thresholds=[
background_iou_low_threshold, background_iou_high_threshold,
foreground_iou_threshold
],
indicators=[-3, -1, -2, 1])
self._target_gather = target_gather.TargetGather()
self._sampler = box_sampler.BoxSampler(
num_sampled_rois, foreground_fraction)
super(ROISampler, self).__init__(**kwargs)
def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor, gt_classes: tf.Tensor):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
boxes: A `tf.Tensor` of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
gt_boxes: A `tf.Tensor` of shape of [batch_size, MAX_NUM_INSTANCES, 4].
The coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
This tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: A `tf.Tensor` of shape of [batch_size, K, 4], representing
the coordinates of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: A `tf.Tensor` of shape of [batch_size, K, 4], storing
the box coordinates of the matched groundtruth boxes of the samples
RoIs.
sampled_gt_classes: A `tf.Tensor` of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: A `tf.Tensor` of shape of [batch_size, K], storing the
indices of the sampled groudntruth boxes in the original `gt_boxes`
tensor, i.e.,
gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
if self._config_dict['mix_gt_boxes']:
boxes = tf.concat([boxes, gt_boxes], axis=1)
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask,
gt_invalid_mask)
matched_gt_indices, match_indicators = self._box_matcher(similarity_matrix)
positive_matches = tf.greater_equal(match_indicators, 0)
negative_matches = tf.equal(match_indicators, -1)
ignored_matches = tf.equal(match_indicators, -2)
invalid_matches = tf.equal(match_indicators, -3)
background_mask = tf.expand_dims(
tf.logical_or(negative_matches, invalid_matches), -1)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices,
background_mask)
matched_gt_classes = tf.where(background_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_boxes = self._target_gather(gt_boxes, matched_gt_indices,
tf.tile(background_mask, [1, 1, 4]))
matched_gt_boxes = tf.where(background_mask,
tf.zeros_like(matched_gt_boxes),
matched_gt_boxes)
matched_gt_indices = tf.where(
tf.squeeze(background_mask, -1), -tf.ones_like(matched_gt_indices),
matched_gt_indices)
if self._config_dict['skip_subsampling']:
return (boxes, matched_gt_boxes, tf.squeeze(matched_gt_classes,
axis=-1), matched_gt_indices)
sampled_indices = self._sampler(
positive_matches, negative_matches, ignored_matches)
sampled_rois = self._target_gather(boxes, sampled_indices)
sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices)
sampled_gt_classes = tf.squeeze(self._target_gather(
matched_gt_classes, sampled_indices), axis=-1)
sampled_gt_indices = tf.squeeze(self._target_gather(
tf.expand_dims(matched_gt_indices, -1), sampled_indices), axis=-1)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
def get_config(self):
  """Returns the configuration this object was constructed with.

  A shallow copy of the internal `_config_dict` is returned so that callers
  adding, removing or replacing keys on the result cannot alter the
  configuration this object reads from during `call`.

  Returns:
    A `dict` with the same keys and values as `self._config_dict`.
  """
  return dict(self._config_dict)
@classmethod
def from_config(cls, config):
  """Creates an instance of `cls` from a configuration mapping.

  Args:
    config: A mapping whose items are forwarded to the constructor as keyword
      arguments.

  Returns:
    The object produced by `cls(**config)`.
  """
  instance = cls(**config)
  return instance
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from typing import Any, List, Mapping, Optional, Tuple, Union
import tensorflow as tf
from official.vision.ops import anchor
from official.vision.ops import box_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskRCNNModel(tf.keras.Model):
  """The Mask R-CNN(-RS) and Cascade RCNN-RS models."""

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               rpn_head: tf.keras.layers.Layer,
               detection_head: Union[tf.keras.layers.Layer,
                                     List[tf.keras.layers.Layer]],
               roi_generator: tf.keras.layers.Layer,
               roi_sampler: Union[tf.keras.layers.Layer,
                                  List[tf.keras.layers.Layer]],
               roi_aligner: tf.keras.layers.Layer,
               detection_generator: tf.keras.layers.Layer,
               mask_head: Optional[tf.keras.layers.Layer] = None,
               mask_sampler: Optional[tf.keras.layers.Layer] = None,
               mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
               class_agnostic_bbox_pred: bool = False,
               cascade_class_ensemble: bool = False,
               min_level: Optional[int] = None,
               max_level: Optional[int] = None,
               num_scales: Optional[int] = None,
               aspect_ratios: Optional[List[float]] = None,
               anchor_size: Optional[float] = None,
               **kwargs):
    """Initializes the R-CNN(-RS) model.

    Args:
      backbone: `tf.keras.Model`, the backbone network.
      decoder: `tf.keras.Model`, the decoder network.
      rpn_head: the RPN head.
      detection_head: the detection head or a list of heads.
      roi_generator: the ROI generator.
      roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
        detection heads.
      roi_aligner: the ROI aligner.
      detection_generator: the detection generator.
      mask_head: the mask head. Mask prediction is enabled iff this is not
        `None`.
      mask_sampler: the mask sampler.
      mask_roi_aligner: the ROI aligner for mask prediction.
      class_agnostic_bbox_pred: if True, perform class agnostic bounding box
        prediction. Needs to be `True` for Cascade RCNN models.
      cascade_class_ensemble: if True, ensemble classification scores over all
        detection heads.
      min_level: Minimum level in output feature maps.
      max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratio anchors added on each
        level. The number indicates the ratio of width to height. For
        instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
        scale level.
      anchor_size: A number representing the scale of size of the base anchor
        to the feature stride 2^level.
      **kwargs: keyword arguments to be passed.

    Raises:
      ValueError: if more than one ROI sampler is given while
        `class_agnostic_bbox_pred` is False, or if `mask_head` is given
        without `mask_sampler` or without `mask_roi_aligner`.
    """
    super().__init__(**kwargs)
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'rpn_head': rpn_head,
        'detection_head': detection_head,
        'roi_generator': roi_generator,
        'roi_sampler': roi_sampler,
        'roi_aligner': roi_aligner,
        'detection_generator': detection_generator,
        'mask_head': mask_head,
        'mask_sampler': mask_sampler,
        'mask_roi_aligner': mask_roi_aligner,
        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
        'cascade_class_ensemble': cascade_class_ensemble,
        'min_level': min_level,
        'max_level': max_level,
        'num_scales': num_scales,
        'aspect_ratios': aspect_ratios,
        'anchor_size': anchor_size,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.rpn_head = rpn_head

    # Heads and samplers are always stored as sequences so that the cascade
    # loop can index them by stage, even for a single-stage model.
    if isinstance(detection_head, (list, tuple)):
      self.detection_head = detection_head
    else:
      self.detection_head = [detection_head]
    self.roi_generator = roi_generator
    if isinstance(roi_sampler, (list, tuple)):
      self.roi_sampler = roi_sampler
    else:
      self.roi_sampler = [roi_sampler]
    if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
      raise ValueError(
          '`class_agnostic_bbox_pred` needs to be True if multiple detection '
          'heads are specified.'
      )
    self.roi_aligner = roi_aligner
    self.detection_generator = detection_generator

    self._include_mask = mask_head is not None
    self.mask_head = mask_head
    if self._include_mask and mask_sampler is None:
      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
    self.mask_sampler = mask_sampler
    if self._include_mask and mask_roi_aligner is None:
      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
    self.mask_roi_aligner = mask_roi_aligner

    # Weights for the regression losses for each FRCNN layer.
    # TODO(xianzhi): Make the weights configurable.
    self._cascade_layer_to_weights = [
        [10.0, 10.0, 5.0, 5.0],
        [20.0, 20.0, 10.0, 10.0],
        [30.0, 30.0, 15.0, 15.0],
    ]

  def call(self,
           images: tf.Tensor,
           image_shape: tf.Tensor,
           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
           gt_boxes: Optional[tf.Tensor] = None,
           gt_classes: Optional[tf.Tensor] = None,
           gt_masks: Optional[tf.Tensor] = None,
           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    """Runs the box branch and, when a mask head exists, the mask branch.

    Args:
      images: the batched input images.
      image_shape: the image shapes, forwarded to the ROI generator, box
        clipping and the detection generator.
      anchor_boxes: optional multilevel anchors. When `None`, anchors are
        generated from the static height and width of `images`.
      gt_boxes: optional groundtruth boxes, used by the ROI samplers when
        `training` is truthy.
      gt_classes: optional groundtruth classes, used together with `gt_boxes`.
      gt_masks: optional groundtruth masks, passed to the mask sampler when
        `training` is truthy and a mask head is configured.
      training: whether the model runs in training mode.

    Returns:
      A dict of model outputs. It always holds the box-branch outputs; the
      mask-branch outputs are merged in when a mask head is configured.
    """
    model_outputs, intermediate_outputs = self._call_box_outputs(
        images=images,
        image_shape=image_shape,
        anchor_boxes=anchor_boxes,
        gt_boxes=gt_boxes,
        gt_classes=gt_classes,
        training=training)
    if not self._include_mask:
      return model_outputs

    model_mask_outputs = self._call_mask_outputs(
        model_box_outputs=model_outputs,
        features=model_outputs['decoder_features'],
        current_rois=intermediate_outputs['current_rois'],
        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
        matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
        gt_masks=gt_masks,
        training=training)
    model_outputs.update(model_mask_outputs)
    return model_outputs

  def _get_backbone_and_decoder_features(self, images):
    """Returns `(backbone_features, decoder_features)` for `images`.

    When no decoder is configured the backbone features are returned for both
    entries.
    """
    backbone_features = self.backbone(images)
    # Explicit `None` check, consistent with `checkpoint_items`; the
    # truthiness of a layer object is not a reliable "is configured" signal.
    if self.decoder is not None:
      features = self.decoder(backbone_features)
    else:
      features = backbone_features
    return backbone_features, features

  def _generate_anchor_boxes(self, images: tf.Tensor):
    """Builds multilevel anchors for `images` and tiles them over the batch.

    The anchors are generated from the static height and width of `images`
    using the anchor settings stored in the model config.
    """
    _, image_height, image_width, _ = images.get_shape().as_list()
    anchor_boxes = anchor.Anchor(
        min_level=self._config_dict['min_level'],
        max_level=self._config_dict['max_level'],
        num_scales=self._config_dict['num_scales'],
        aspect_ratios=self._config_dict['aspect_ratios'],
        anchor_size=self._config_dict['anchor_size'],
        image_size=(image_height, image_width)).multilevel_boxes
    batch_size = tf.shape(images)[0]
    for level in anchor_boxes:
      anchor_boxes[level] = tf.tile(
          tf.expand_dims(anchor_boxes[level], axis=0), [batch_size, 1, 1, 1])
    return anchor_boxes

  def _call_box_outputs(
      self, images: tf.Tensor,
      image_shape: tf.Tensor,
      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
      gt_boxes: Optional[tf.Tensor] = None,
      gt_classes: Optional[tf.Tensor] = None,
      training: Optional[bool] = None) -> Tuple[
          Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
    """Implementation of the Faster-RCNN logic for boxes.

    Returns:
      A `(model_outputs, intermediate_outputs)` tuple. `model_outputs` holds
      the features, RPN outputs, per-stage head outputs and, when `training`
      is falsy, the detection generator results. `intermediate_outputs` holds
      the last stage's matched groundtruth tensors and RoIs, which the mask
      branch consumes.
    """
    model_outputs = {}

    # Feature extraction.
    (backbone_features,
     decoder_features) = self._get_backbone_and_decoder_features(images)

    # Region proposal network.
    rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

    model_outputs.update({
        'backbone_features': backbone_features,
        'decoder_features': decoder_features,
        'rpn_boxes': rpn_boxes,
        'rpn_scores': rpn_scores
    })

    # Generate anchor boxes for this batch if not provided.
    if anchor_boxes is None:
      anchor_boxes = self._generate_anchor_boxes(images)

    # Generate RoIs.
    current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                         image_shape, training)

    next_rois = current_rois
    all_class_outputs = []
    num_stages = len(self.roi_sampler)
    for cascade_num in range(num_stages):
      # In cascade RCNN we want the higher layers to have different regression
      # weights as the predicted deltas become smaller and smaller.
      regression_weights = self._cascade_layer_to_weights[cascade_num]
      current_rois = next_rois

      (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
       matched_gt_classes, matched_gt_indices,
       current_rois) = self._run_frcnn_head(
           features=decoder_features,
           rois=current_rois,
           gt_boxes=gt_boxes,
           gt_classes=gt_classes,
           training=training,
           model_outputs=model_outputs,
           cascade_num=cascade_num,
           regression_weights=regression_weights)
      all_class_outputs.append(class_outputs)

      # Generate ROIs for the next cascade head if there is any.
      if cascade_num < num_stages - 1:
        next_rois = box_ops.decode_boxes(
            tf.cast(box_outputs, tf.float32),
            current_rois,
            weights=regression_weights)
        next_rois = box_ops.clip_boxes(next_rois,
                                       tf.expand_dims(image_shape, axis=1))

    if not training:
      if self._config_dict['cascade_class_ensemble']:
        class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

      # The last stage's boxes, RoIs and regression weights are used here.
      detections = self.detection_generator(
          box_outputs,
          class_outputs,
          current_rois,
          image_shape,
          regression_weights,
          bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
      model_outputs.update({
          'cls_outputs': class_outputs,
          'box_outputs': box_outputs,
      })
      if self.detection_generator.get_config()['apply_nms']:
        model_outputs.update({
            'detection_boxes': detections['detection_boxes'],
            'detection_scores': detections['detection_scores'],
            'detection_classes': detections['detection_classes'],
            'num_detections': detections['num_detections']
        })
      else:
        model_outputs.update({
            'decoded_boxes': detections['decoded_boxes'],
            'decoded_box_scores': detections['decoded_box_scores']
        })

    intermediate_outputs = {
        'matched_gt_boxes': matched_gt_boxes,
        'matched_gt_indices': matched_gt_indices,
        'matched_gt_classes': matched_gt_classes,
        'current_rois': current_rois,
    }
    return (model_outputs, intermediate_outputs)

  def _call_mask_outputs(
      self,
      model_box_outputs: Mapping[str, tf.Tensor],
      features: tf.Tensor,
      current_rois: tf.Tensor,
      matched_gt_indices: tf.Tensor,
      matched_gt_boxes: tf.Tensor,
      matched_gt_classes: tf.Tensor,
      gt_masks: tf.Tensor,
      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    """Implementation of Mask-RCNN mask prediction logic.

    Returns:
      A copy of `model_box_outputs` extended with `mask_class_targets`,
      `mask_targets` and `mask_outputs` when `training` is truthy, or with
      `detection_masks` otherwise.
    """
    model_outputs = dict(model_box_outputs)
    if training:
      current_rois, roi_classes, roi_masks = self.mask_sampler(
          current_rois, matched_gt_boxes, matched_gt_classes,
          matched_gt_indices, gt_masks)
      roi_masks = tf.stop_gradient(roi_masks)
      model_outputs.update({
          'mask_class_targets': roi_classes,
          'mask_targets': roi_masks,
      })
    else:
      # NOTE(review): these keys are only written by `_call_box_outputs` when
      # the detection generator's `apply_nms` config is true; otherwise this
      # lookup raises KeyError.
      current_rois = model_outputs['detection_boxes']
      roi_classes = model_outputs['detection_classes']

    mask_logits, mask_probs = self._features_to_mask_outputs(
        features, current_rois, roi_classes)

    # Raw logits are exposed for training, sigmoid probabilities otherwise.
    if training:
      model_outputs['mask_outputs'] = mask_logits
    else:
      model_outputs['detection_masks'] = mask_probs
    return model_outputs

  def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
                      model_outputs, cascade_num, regression_weights):
    """Runs the frcnn head that does both class and box prediction.

    Args:
      features: `list` of features from the feature extractor.
      rois: `list` of current rois that will be used to predict bbox refinement
        and classes from.
      gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
        This tensor might have paddings with a negative value.
      gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
        classes. It is padded with -1s to indicate the invalid classes.
      training: `bool`, if model is training or being evaluated.
      model_outputs: `dict`, used for storing outputs used for eval and losses.
        It is updated in place.
      cascade_num: `int`, the current frcnn layer in the cascade.
      regression_weights: `list`, weights used for l1 loss in bounding box
        regression.

    Returns:
      class_outputs: Class predictions for rois.
      box_outputs: Box predictions for rois. These are formatted for the
        regression loss and need to be converted before being used as rois
        in the next stage.
      model_outputs: Updated dict with predictions used for losses and eval.
      matched_gt_boxes: If `training` is true and `gt_boxes` is given, the gt
        box location of each roi's match; `None` otherwise.
      matched_gt_classes: If `training` is true and `gt_boxes` is given, the gt
        class of each roi; `None` otherwise.
      matched_gt_indices: If `training` is true and `gt_boxes` is given, the
        index of the matched gt box. Used for mask prediction. `None`
        otherwise.
      rois: The sampled rois used for this layer.
    """
    # Stage 0 writes un-suffixed keys; later stages append `_<cascade_num>`.
    key_suffix = '_{}'.format(cascade_num) if cascade_num else ''

    # Only used during training.
    matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
                                                                None)
    if training and gt_boxes is not None:
      rois = tf.stop_gradient(rois)

      current_roi_sampler = self.roi_sampler[cascade_num]
      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          current_roi_sampler(rois, gt_boxes, gt_classes))
      # Create bounding box training targets.
      box_targets = box_ops.encode_boxes(
          matched_gt_boxes, rois, weights=regression_weights)
      # If the target is background, the box target is set to all 0s.
      is_background = tf.tile(
          tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
          [1, 1, 4])
      box_targets = tf.where(
          is_background, tf.zeros_like(box_targets), box_targets)
      model_outputs.update({
          'class_targets' + key_suffix: matched_gt_classes,
          'box_targets' + key_suffix: box_targets,
      })

    # Get roi features.
    roi_features = self.roi_aligner(features, rois)

    # Run frcnn head to get class and bbox predictions.
    current_detection_head = self.detection_head[cascade_num]
    class_outputs, box_outputs = current_detection_head(roi_features)

    model_outputs.update({
        'class_outputs' + key_suffix: class_outputs,
        'box_outputs' + key_suffix: box_outputs,
    })
    return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
            matched_gt_classes, matched_gt_indices, rois)

  def _features_to_mask_outputs(self, features, rois, roi_classes):
    """Returns `(mask_logits, mask_probabilities)` for the given rois."""
    # Mask RoI align.
    mask_roi_features = self.mask_roi_aligner(features, rois)

    # Mask head.
    raw_masks = self.mask_head([mask_roi_features, roi_classes])

    return raw_masks, tf.nn.sigmoid(raw_masks)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(
        backbone=self.backbone,
        rpn_head=self.rpn_head,
        detection_head=self.detection_head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self._include_mask:
      items.update(mask_head=self.mask_head)

    return items

  def get_config(self) -> Mapping[str, Any]:
    """Returns a shallow copy of the constructor arguments.

    A copy is returned so that callers editing the result cannot change the
    anchor and cascade settings this model reads from its own config.
    """
    return dict(self._config_dict)

  @classmethod
  def from_config(cls, config):
    """Creates a model by passing `config` items as constructor kwargs."""
    return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for maskrcnn_model.py."""
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import maskrcnn_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.heads import instance_heads
from official.vision.modeling.layers import detection_generator
from official.vision.modeling.layers import mask_sampler
from official.vision.modeling.layers import roi_aligner
from official.vision.modeling.layers import roi_generator
from official.vision.modeling.layers import roi_sampler
from official.vision.ops import anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
  """Build, forward-pass, serialization and checkpoint tests."""

  @staticmethod
  def _build_mask_components(include_mask, num_classes):
    """Returns `(mask_head, mask_sampler, mask_roi_aligner)` or three `None`s."""
    if not include_mask:
      return None, None, None
    mask_head = instance_heads.MaskHead(
        num_classes=num_classes, upsample_factor=2)
    mask_sampler_obj = mask_sampler.MaskSampler(
        mask_target_size=28, num_sampled_masks=1)
    mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    return mask_head, mask_sampler_obj, mask_roi_aligner_obj

  @staticmethod
  def _build_groundtruth(include_mask):
    """Returns `(gt_boxes, gt_classes, gt_masks)` for a batch of two images.

    Rows of -1 are padding; `gt_masks` is `None` unless `include_mask`.
    """
    gt_boxes = np.array(
        [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
         [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
        dtype=np.float32)
    gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
    gt_masks = np.ones((2, 3, 100, 100)) if include_mask else None
    return gt_boxes, gt_classes, gt_masks

  def _build_default_model(self, include_mask):
    """Builds the ResNet-50/FPN model shared by the config/checkpoint tests.

    Returns:
      `(model, backbone, decoder, rpn_head, detection_head, mask_head)`.
    """
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=3, max_level=7, input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=3, max_level=7, num_anchors_per_location=3)
    detection_head = instance_heads.DetectionHead(num_classes=2)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    mask_head, mask_sampler_obj, mask_roi_aligner_obj = (
        self._build_mask_components(include_mask, num_classes=2))
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=3,
        max_level=7,
        num_scales=3,
        aspect_ratios=[1.0],
        anchor_size=3)
    return model, backbone, decoder, rpn_head, detection_head, mask_head

  @combinations.generate(
      combinations.combine(
          include_mask=[True, False],
          use_separable_conv=[True, False],
          build_anchor_boxes=[True, False],
          is_training=[True, False]))
  def test_build_model(self, include_mask, use_separable_conv,
                       build_anchor_boxes, is_training):
    """Checks that the model can be built and called without raising."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    resnet_model_id = 50
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size], [image_size, image_size]])

    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      # Add and tile a leading batch dimension (batch size is 2).
      for level in anchor_boxes:
        anchor_boxes[level] = tf.tile(
            tf.expand_dims(anchor_boxes[level], axis=0), [2, 1, 1, 1])
    else:
      anchor_boxes = None

    backbone = resnet.ResNet(model_id=resnet_model_id)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        use_separable_conv=use_separable_conv)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=min_level,
        max_level=max_level,
        num_anchors_per_location=num_anchors_per_location,
        num_convs=1)
    detection_head = instance_heads.DetectionHead(num_classes=num_classes)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    mask_head, mask_sampler_obj, mask_roi_aligner_obj = (
        self._build_mask_components(include_mask, num_classes))
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    gt_boxes, gt_classes, gt_masks = self._build_groundtruth(include_mask)

    # Results will be checked in test_forward.
    _ = model(
        images,
        image_shape,
        anchor_boxes,
        gt_boxes,
        gt_classes,
        gt_masks,
        training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          include_mask=[True, False],
          build_anchor_boxes=[True, False],
          use_cascade_heads=[True, False],
          training=[True, False],
      ))
  def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
                   use_cascade_heads):
    """Checks which output keys a forward pass produces."""
    num_classes = 3
    min_level = 3
    max_level = 4
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    if use_cascade_heads:
      cascade_iou_thresholds = [0.6]
      class_agnostic_bbox_pred = True
      cascade_class_ensemble = True
    else:
      cascade_iou_thresholds = None
      class_agnostic_bbox_pred = False
      cascade_class_ensemble = False

    image_size = (256, 256)
    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array([[224, 100], [100, 224]])
    with strategy.scope():
      if build_anchor_boxes:
        anchor_boxes = anchor.Anchor(
            min_level=min_level,
            max_level=max_level,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size,
            image_size=image_size).multilevel_boxes
      else:
        anchor_boxes = None
      num_anchors_per_location = len(aspect_ratios) * num_scales

      input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
      backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
      decoder = fpn.FPN(
          min_level=min_level,
          max_level=max_level,
          input_specs=backbone.output_specs)
      rpn_head = dense_prediction_heads.RPNHead(
          min_level=min_level,
          max_level=max_level,
          num_anchors_per_location=num_anchors_per_location)
      detection_head = instance_heads.DetectionHead(
          num_classes=num_classes,
          class_agnostic_bbox_pred=class_agnostic_bbox_pred)
      roi_generator_obj = roi_generator.MultilevelROIGenerator()

      # NOTE(review): `roi_sampler_cascade` is populated but the model below
      # receives `roi_sampler_obj` (the last sampler created), so only a
      # single stage is exercised. Passing the list would also require one
      # detection head per sampler; confirm the intent before changing.
      roi_sampler_cascade = []
      roi_sampler_obj = roi_sampler.ROISampler()
      roi_sampler_cascade.append(roi_sampler_obj)
      if cascade_iou_thresholds:
        for iou in cascade_iou_thresholds:
          roi_sampler_obj = roi_sampler.ROISampler(
              mix_gt_boxes=False,
              foreground_iou_threshold=iou,
              background_iou_high_threshold=iou,
              background_iou_low_threshold=0.0,
              skip_subsampling=True)
          roi_sampler_cascade.append(roi_sampler_obj)
      roi_aligner_obj = roi_aligner.MultilevelROIAligner()
      detection_generator_obj = detection_generator.DetectionGenerator()
      mask_head, mask_sampler_obj, mask_roi_aligner_obj = (
          self._build_mask_components(include_mask, num_classes))
      model = maskrcnn_model.MaskRCNNModel(
          backbone,
          decoder,
          rpn_head,
          detection_head,
          roi_generator_obj,
          roi_sampler_obj,
          roi_aligner_obj,
          detection_generator_obj,
          mask_head,
          mask_sampler_obj,
          mask_roi_aligner_obj,
          class_agnostic_bbox_pred=class_agnostic_bbox_pred,
          cascade_class_ensemble=cascade_class_ensemble,
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size)

      gt_boxes, gt_classes, gt_masks = self._build_groundtruth(include_mask)

      results = model(
          images,
          image_shape,
          anchor_boxes,
          gt_boxes,
          gt_classes,
          gt_masks,
          training=training)

    self.assertIn('rpn_boxes', results)
    self.assertIn('rpn_scores', results)
    if training:
      self.assertIn('class_targets', results)
      self.assertIn('box_targets', results)
      self.assertIn('class_outputs', results)
      self.assertIn('box_outputs', results)
      if include_mask:
        self.assertIn('mask_outputs', results)
    else:
      self.assertIn('detection_boxes', results)
      self.assertIn('detection_scores', results)
      self.assertIn('detection_classes', results)
      self.assertIn('num_detections', results)
      if include_mask:
        self.assertIn('detection_masks', results)

  @parameterized.parameters(
      (False,),
      (True,),
  )
  def test_serialize_deserialize(self, include_mask):
    """Checks that `get_config`/`from_config` round-trip the model config."""
    model = self._build_default_model(include_mask)[0]

    config = model.get_config()
    new_model = maskrcnn_model.MaskRCNNModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())

  @parameterized.parameters(
      (False,),
      (True,),
  )
  def test_checkpoint(self, include_mask):
    """Checks `checkpoint_items` and partial restoration from a checkpoint."""
    (model, backbone, decoder, rpn_head, detection_head,
     mask_head) = self._build_default_model(include_mask)

    # The model stores a single detection head wrapped in a list.
    expect_checkpoint_items = dict(
        backbone=backbone,
        decoder=decoder,
        rpn_head=rpn_head,
        detection_head=[detection_head])
    if include_mask:
      expect_checkpoint_items['mask_head'] = mask_head
    self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)

    # Test save and load checkpoints.
    ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
    save_dir = self.create_tempdir().full_path
    ckpt.save(os.path.join(save_dir, 'ckpt'))

    partial_ckpt = tf.train.Checkpoint(backbone=backbone)
    partial_ckpt.read(tf.train.latest_checkpoint(
        save_dir)).expect_partial().assert_existing_objects_matched()

    if include_mask:
      partial_ckpt_mask = tf.train.Checkpoint(
          backbone=backbone, mask_head=mask_head)
      partial_ckpt_mask.restore(tf.train.latest_checkpoint(
          save_dir)).expect_partial().assert_existing_objects_matched()
# Hand control to the TensorFlow test runner when this file is executed as a
# script rather than imported.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RetinaNet."""
from typing import Any, Mapping, List, Optional, Union
# Import libraries
import tensorflow as tf
from official.vision.ops import anchor
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
"""The RetinaNet model class."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Classification initialization function.
Args:
backbone: `tf.keras.Model` a backbone network.
decoder: `tf.keras.Model` a decoder network.
head: `RetinaNetHead`, the RetinaNet head.
detection_generator: the detection generator.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added
on each level. For instances, num_scales=2 adds one additional
intermediate anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect raito
anchors added on each level. The number indicates the ratio of width to
height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: A number representing the scale of size of the base
anchor to the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(RetinaNetModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self._backbone = backbone
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
def call(self,
images: tf.Tensor,
image_shape: Optional[tf.Tensor] = None,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
output_intermediate_features: bool = False,
training: bool = None) -> Mapping[str, tf.Tensor]:
"""Forward pass of the RetinaNet model.
Args:
images: `Tensor`, the input batched images, whose shape is
[batch, height, width, 3].
image_shape: `Tensor`, the actual shape of the input images, whose shape
is [batch, 2] where the last dimension is [height, width]. Note that
this is the actual image shape excluding paddings. For example, images
in the batch may be resized into different shapes before padding to the
fixed size.
anchor_boxes: a dict of tensors which includes multilevel anchors.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the anchor coordinates of a particular feature
level, whose shape is [height_l, width_l, num_anchors_per_location].
output_intermediate_features: `bool` indicating whether to return the
intermediate feature maps generated by backbone and decoder.
training: `bool`, indicating whether it is in training mode.
Returns:
scores: a dict of tensors which includes scores of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
boxes: a dict of tensors which includes coordinates of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box coordinates predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
attributes: a dict of (attribute_name, attribute_predictions). Each
attribute prediction is a dict that includes:
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the attribute predictions from a particular
feature level, whose shape is
[batch, height_l, width_l, att_size * num_anchors_per_location].
"""
outputs = {}
# Feature extraction.
features = self.backbone(images)
if output_intermediate_features:
outputs.update(
{'backbone_{}'.format(k): v for k, v in features.items()})
if self.decoder:
features = self.decoder(features)
if output_intermediate_features:
outputs.update(
{'decoder_{}'.format(k): v for k, v in features.items()})
# Dense prediction. `raw_attributes` can be empty.
raw_scores, raw_boxes, raw_attributes = self.head(features)
if training:
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if raw_attributes:
outputs.update({'attribute_outputs': raw_attributes})
return outputs
else:
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Post-processing.
final_results = self.detection_generator(raw_boxes, raw_scores,
anchor_boxes, image_shape,
raw_attributes)
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if self.detection_generator.get_config()['apply_nms']:
outputs.update({
'detection_boxes': final_results['detection_boxes'],
'detection_scores': final_results['detection_scores'],
'detection_classes': final_results['detection_classes'],
'num_detections': final_results['num_detections']
})
else:
outputs.update({
'decoded_boxes': final_results['decoded_boxes'],
'decoded_box_scores': final_results['decoded_box_scores']
})
if raw_attributes:
outputs.update({
'attribute_outputs': raw_attributes,
'detection_attributes': final_results['detection_attributes'],
})
return outputs
@property
def checkpoint_items(
    self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
  """Collects the sub-networks to be checkpointed in addition to the model.

  Returns:
    A dict with the `backbone` and `head` entries, plus a `decoder` entry
    when this model's decoder is not `None`.
  """
  tracked = {'backbone': self.backbone, 'head': self.head}
  if self.decoder is not None:
    tracked['decoder'] = self.decoder
  return tracked
@property
def backbone(self) -> tf.keras.Model:
  """Returns the backbone network stored in `self._backbone`."""
  return self._backbone
@property
def decoder(self) -> tf.keras.Model:
  """Returns the decoder stored in `self._decoder`.

  The value may be `None`: other methods of this class test for that before
  using the decoder.
  """
  return self._decoder
@property
def head(self) -> tf.keras.layers.Layer:
  """Returns the prediction head stored in `self._head`."""
  return self._head
@property
def detection_generator(self) -> tf.keras.layers.Layer:
  """Returns the post-processing layer stored in `self._detection_generator`."""
  return self._detection_generator
def get_config(self) -> Mapping[str, Any]:
  """Returns the configuration mapping held in `self._config_dict`.

  The stored object itself is returned, not a copy.
  """
  return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
  """Builds an instance from a configuration mapping.

  Args:
    config: mapping of constructor keyword arguments, such as the value
      returned by `get_config`.
    custom_objects: ignored. Accepted so that this signature lines up with
      the `from_config(cls, config, custom_objects=None)` form used by the
      other model classes.

  Returns:
    A new instance created as `cls(**config)`.
  """
  del custom_objects  # Unused; kept only for signature compatibility.
  return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import retinanet_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
  """Build, forward-pass and serialization tests for `RetinaNetModel`."""

  @parameterized.parameters(
      dict(use_separable_conv=True, build_anchor_boxes=True,
           is_training=False, has_att_heads=False),
      dict(use_separable_conv=False, build_anchor_boxes=True,
           is_training=False, has_att_heads=False),
      dict(use_separable_conv=False, build_anchor_boxes=False,
           is_training=False, has_att_heads=False),
      dict(use_separable_conv=False, build_anchor_boxes=False,
           is_training=True, has_att_heads=False),
      dict(use_separable_conv=False, build_anchor_boxes=True,
           is_training=True, has_att_heads=True),
      dict(use_separable_conv=False, build_anchor_boxes=True,
           is_training=False, has_att_heads=True),
  )
  def test_build_model(self, use_separable_conv, build_anchor_boxes,
                       is_training, has_att_heads):
    """Checks that the model can be assembled and called once without error."""
    num_classes = 3
    min_level, max_level = 3, 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    fpn_num_filters = 256
    head_num_convs = 4
    head_num_filters = 256
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    batch_size = 2

    images = np.random.rand(batch_size, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size]] * batch_size)

    # Anchors are either precomputed here or left for the model to generate.
    anchor_boxes = None
    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      for level in anchor_boxes:
        batched = tf.expand_dims(anchor_boxes[level], axis=0)
        anchor_boxes[level] = tf.tile(batched, [batch_size, 1, 1, 1])

    attribute_heads = None
    if has_att_heads:
      attribute_heads = [{'name': 'depth', 'type': 'regression', 'size': 1}]

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        attribute_heads=attribute_heads,
        num_anchors_per_location=num_anchors_per_location,
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    _ = model(images, image_shape, anchor_boxes, training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          image_size=[(128, 128)],
          training=[True, False],
          has_att_heads=[True, False],
          output_intermediate_features=[True, False],
          soft_nms_sigma=[None, 0.0, 0.1],
      ))
  def test_forward(self, strategy, image_size, training, has_att_heads,
                   output_intermediate_features, soft_nms_sigma):
    """Runs a ResNet-50 + FPN RetinaNet and checks output keys and shapes."""
    tf.keras.backend.set_image_data_format('channels_last')
    num_classes = 3
    min_level, max_level = 3, 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)
    batch_size = 2
    height, width = image_size[0], image_size[1]

    images = np.random.rand(batch_size, height, width, 3)
    image_shape = np.array([[height, width]] * batch_size)

    def feature_shape(level, channels):
      # Expected NHWC shape of a feature map at pyramid level `level`.
      return [batch_size, height // 2**level, width // 2**level, channels]

    with strategy.scope():
      anchor_gen = anchor.build_anchor_generator(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3)
      anchor_boxes = anchor_gen(image_size)
      for level in anchor_boxes:
        batched = tf.expand_dims(anchor_boxes[level], axis=0)
        anchor_boxes[level] = tf.tile(batched, [batch_size, 1, 1, 1])

      backbone = resnet.ResNet(model_id=50)
      decoder = fpn.FPN(
          input_specs=backbone.output_specs,
          min_level=min_level,
          max_level=max_level)

      attribute_heads = None
      if has_att_heads:
        attribute_heads = [{'name': 'depth', 'type': 'regression', 'size': 1}]

      head = dense_prediction_heads.RetinaNetHead(
          min_level=min_level,
          max_level=max_level,
          num_classes=num_classes,
          attribute_heads=attribute_heads,
          num_anchors_per_location=num_anchors_per_location)
      generator = detection_generator.MultilevelDetectionGenerator(
          max_num_detections=10,
          nms_version='v1',
          use_cpu_nms=soft_nms_sigma is not None,
          soft_nms_sigma=soft_nms_sigma)
      model = retinanet_model.RetinaNetModel(
          backbone=backbone,
          decoder=decoder,
          head=head,
          detection_generator=generator)

      model_outputs = model(
          images,
          image_shape,
          anchor_boxes,
          output_intermediate_features=output_intermediate_features,
          training=training)

    if training:
      # Training mode: raw per-level class, box and attribute predictions.
      cls_outputs = model_outputs['cls_outputs']
      box_outputs = model_outputs['box_outputs']
      for level in range(min_level, max_level + 1):
        key = str(level)
        self.assertIn(key, cls_outputs)
        self.assertIn(key, box_outputs)
        self.assertAllEqual(
            feature_shape(level, num_classes * num_anchors_per_location),
            cls_outputs[key].numpy().shape)
        self.assertAllEqual(
            feature_shape(level, 4 * num_anchors_per_location),
            box_outputs[key].numpy().shape)
        if has_att_heads:
          att_outputs = model_outputs['attribute_outputs']
          for att in att_outputs.values():
            self.assertAllEqual(
                feature_shape(level, 1 * num_anchors_per_location),
                att[key].numpy().shape)
    else:
      # Inference mode: post-processed detections with fixed-size outputs.
      expected_shapes = (
          ('detection_boxes', [2, 10, 4]),
          ('detection_scores', [2, 10]),
          ('detection_classes', [2, 10]),
          ('num_detections', [2]),
      )
      for name, _ in expected_shapes:
        self.assertIn(name, model_outputs)
      for name, shape in expected_shapes:
        self.assertAllEqual(shape, model_outputs[name].numpy().shape)
      if has_att_heads:
        self.assertIn('detection_attributes', model_outputs)
        self.assertAllEqual(
            [2, 10, 1],
            model_outputs['detection_attributes']['depth'].numpy().shape)

    if output_intermediate_features:
      for level in range(2, 6):
        name = 'backbone_{}'.format(level)
        self.assertIn(name, model_outputs)
        channels = backbone.output_specs[str(level)].as_list()[-1]
        self.assertAllEqual(
            feature_shape(level, channels), model_outputs[name].numpy().shape)
      for level in range(min_level, max_level + 1):
        name = 'decoder_{}'.format(level)
        self.assertIn(name, model_outputs)
        channels = decoder.output_specs[str(level)].as_list()[-1]
        self.assertAllEqual(
            feature_shape(level, channels), model_outputs[name].numpy().shape)

  def test_serialize_deserialize(self):
    """Checks the model survives a get_config / from_config round trip."""
    num_classes = 3
    min_level, max_level = 3, 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        num_anchors_per_location=num_anchors_per_location)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3)

    original_config = model.get_config()
    new_model = retinanet_model.RetinaNetModel.from_config(original_config)

    # The rebuilt model must also be convertible to JSON.
    _ = new_model.to_json()

    # A successful round trip leaves both models with equal configs.
    self.assertAllEqual(model.get_config(), new_model.get_config())
# Script entry point: hand control to the TensorFlow test runner.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build segmentation models."""
from typing import Any, Mapping, Union, Optional, Dict
# Import libraries
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationModel(tf.keras.Model):
  """A Segmentation class model.

  Input images are passed through backbone first. Decoder network is then
  applied, and finally, segmentation head is applied on the output of the
  decoder network. Layers such as ASPP should be part of decoder. Any feature
  fusion is done as part of the segmentation head (i.e. deeplabv3+ feature
  fusion is not part of the decoder, instead it is part of the segmentation
  head). This way, different feature fusion techniques can be combined with
  different backbones, and decoders.
  """

  def __init__(self, backbone: tf.keras.Model, decoder: tf.keras.Model,
               head: tf.keras.layers.Layer,
               mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
               **kwargs):
    """Segmentation initialization function.

    Args:
      backbone: a backbone network.
      decoder: a decoder network. E.g. FPN. `call` and `checkpoint_items`
        both tolerate `None` here, in which case the backbone features are
        used in place of decoder features.
      head: segmentation head.
      mask_scoring_head: mask scoring head.
      **kwargs: keyword arguments to be passed.
    """
    super(SegmentationModel, self).__init__(**kwargs)
    # Keep the constructor arguments so get_config() can hand them back.
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'head': head,
        'mask_scoring_head': mask_scoring_head,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.head = head
    self.mask_scoring_head = mask_scoring_head

  def call(self, inputs: tf.Tensor, training: Optional[bool] = None
           ) -> Dict[str, tf.Tensor]:
    """Runs backbone, optional decoder, head and optional mask scoring head.

    Args:
      inputs: input tensor fed to the backbone.
      training: optional training flag. It is not referenced explicitly in
        this method.

    Returns:
      A dict with a `logits` entry (the head output) and, when a mask scoring
      head is configured, a `mask_scores` entry computed from the logits.
    """
    backbone_features = self.backbone(inputs)

    # Optional sub-layers are tested with `is not None`, the same check that
    # `checkpoint_items` uses, so the two methods always agree.
    if self.decoder is not None:
      decoder_features = self.decoder(backbone_features)
    else:
      decoder_features = backbone_features

    # The head receives both feature sets as a single tuple argument.
    logits = self.head((backbone_features, decoder_features))
    outputs = {'logits': logits}
    if self.mask_scoring_head is not None:
      mask_scores = self.mask_scoring_head(logits)
      outputs.update({'mask_scores': mask_scores})
    return outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(backbone=self.backbone, head=self.head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self.mask_scoring_head is not None:
      items.update(mask_scoring_head=self.mask_scoring_head)
    return items

  def get_config(self) -> Mapping[str, Any]:
    """Returns the constructor arguments recorded in `self._config_dict`."""
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Builds a model as `cls(**config)`; `custom_objects` is not used."""
    return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import segmentation_model
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import segmentation_heads
class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):
  """Creation and serialization tests for `SegmentationModel`."""

  @parameterized.parameters(
      (128, 2),
      (128, 3),
      (128, 4),
      (256, 2),
      (256, 3),
      (256, 4),
  )
  def test_segmentation_network_creation(self, input_size, level):
    """Builds a ResNet-50 + FPN segmentation model and checks logits shape."""
    num_classes = 10
    batch_size = 2
    inputs = np.random.rand(batch_size, input_size, input_size, 3)
    tf.keras.backend.set_image_data_format('channels_last')

    backbone = backbones.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=2, max_level=7)
    head = segmentation_heads.SegmentationHead(num_classes, level=level)
    model = segmentation_model.SegmentationModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        mask_scoring_head=None,
    )

    logits = model(inputs)['logits']

    # Logits are expected at the spatial resolution of pyramid level `level`.
    output_size = input_size // (2**level)
    self.assertAllEqual(
        [batch_size, output_size, output_size, num_classes],
        logits.numpy().shape)

  def test_serialize_deserialize(self):
    """Checks the model survives a get_config / from_config round trip."""
    num_classes = 3
    backbone = backbones.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=3, max_level=7)
    head = segmentation_heads.SegmentationHead(num_classes, level=3)
    model = segmentation_model.SegmentationModel(
        backbone=backbone, decoder=decoder, head=head)

    original_config = model.get_config()
    new_model = segmentation_model.SegmentationModel.from_config(
        original_config)

    # The rebuilt model must also be convertible to JSON.
    _ = new_model.to_json()

    # A successful round trip leaves both models with equal configs.
    self.assertAllEqual(model.get_config(), new_model.get_config())
# Script entry point: hand control to the TensorFlow test runner.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build video classification models."""
from typing import Any, Mapping, Optional, Union, List, Text
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
  """Video classifier: a backbone followed by pooling, dropout and a dense layer."""

  def __init__(
      self,
      backbone: tf.keras.Model,
      num_classes: int,
      input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
      dropout_rate: float = 0.0,
      aggregate_endpoints: bool = False,
      kernel_initializer: str = 'random_uniform',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      require_endpoints: Optional[List[Text]] = None,
      **kwargs):
    """Builds the classifier as a functional Keras model.

    Args:
      backbone: a 3d backbone network; called on the `image` input and
        expected to return a mapping of endpoints.
      num_classes: `int` number of classes in the classification task; the
        number of units of the final dense layer.
      input_specs: mapping from input name to `tf.keras.layers.InputSpec`.
        When empty or None, a single `image` input of shape
        `[None, None, None, None, 3]` is used.
      dropout_rate: `float` rate for the dropout applied before the dense
        layer.
      aggregate_endpoints: `bool`; if True, all endpoints are pooled and
        concatenated, otherwise see `require_endpoints`.
      kernel_initializer: kernel initializer for the dense layer.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for the
        dense layer kernel. Defaults to None.
      bias_regularizer: tf.keras.regularizers.Regularizer object for the
        dense layer bias. Defaults to None.
      require_endpoints: names of the endpoints to pool and concatenate when
        `aggregate_endpoints` is False. If None or empty, only the endpoint
        with the largest key is used.
      **kwargs: keyword arguments forwarded to `tf.keras.Model`.
    """
    if not input_specs:
      input_specs = {
          'image': layers.InputSpec(shape=[None, None, None, None, 3])
      }

    # Attributes below are assigned before the base-class constructor runs,
    # so attribute tracking is switched off first.
    self._self_setattr_tracking = False
    self._config_dict = {
        'backbone': backbone,
        'num_classes': num_classes,
        'input_specs': input_specs,
        'dropout_rate': dropout_rate,
        'aggregate_endpoints': aggregate_endpoints,
        'kernel_initializer': kernel_initializer,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
        'require_endpoints': require_endpoints,
    }
    self._input_specs = input_specs
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._backbone = backbone

    # One symbolic input per spec; the batch dimension is dropped.
    inputs = {
        name: tf.keras.Input(shape=spec.shape[1:])
        for name, spec in input_specs.items()
    }
    endpoints = backbone(inputs['image'])

    if aggregate_endpoints:
      # Pool every endpoint and concatenate the results.
      pooled = [
          tf.keras.layers.GlobalAveragePooling3D()(endpoint)
          for endpoint in endpoints.values()
      ]
      features = tf.concat(pooled, axis=1)
    elif require_endpoints:
      # Pool only the requested endpoints, in the requested order.
      pooled = [
          tf.keras.layers.GlobalAveragePooling3D()(endpoints[name])
          for name in require_endpoints
      ]
      features = tf.concat(pooled, axis=1)
    else:
      # Fall back to the endpoint with the largest key.
      last_endpoint = endpoints[max(endpoints.keys())]
      features = tf.keras.layers.GlobalAveragePooling3D()(last_endpoint)

    features = tf.keras.layers.Dropout(dropout_rate)(features)
    classifier = tf.keras.layers.Dense(
        num_classes,
        kernel_initializer=kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    logits = classifier(features)

    super(VideoClassificationModel, self).__init__(
        inputs=inputs, outputs=logits, **kwargs)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns the items to checkpoint additionally: only the backbone."""
    return {'backbone': self.backbone}

  @property
  def backbone(self) -> tf.keras.Model:
    """Returns the backbone passed to the constructor."""
    return self._backbone

  def get_config(self) -> Mapping[str, Any]:
    """Returns the constructor arguments recorded in `self._config_dict`."""
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Builds a model as `cls(**config)`; `custom_objects` is not used."""
    return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
  """Creation and serialization tests for `VideoClassificationModel`."""

  @parameterized.parameters(
      (50, 8, 112, 'relu', False),
      (50, 8, 112, 'swish', True),
  )
  def test_resnet3d_network_creation(self, model_id, temporal_size,
                                     spatial_size, activation,
                                     aggregate_endpoints):
    """Builds a ResNet3D-50 classifier and checks the logits shape."""
    batch_size = 2
    num_classes = 1000
    video_dims = [temporal_size, spatial_size, spatial_size, 3]
    input_specs = tf.keras.layers.InputSpec(shape=[None] + video_dims)
    temporal_strides = [1, 1, 1, 1]
    temporal_kernel_sizes = [
        (3, 3, 3),
        (3, 1, 3, 1),
        (3, 1, 3, 1, 3, 1),
        (1, 3, 1),
    ]

    tf.keras.backend.set_image_data_format('channels_last')
    backbone = backbones.ResNet3D(
        model_id=model_id,
        temporal_strides=temporal_strides,
        temporal_kernel_sizes=temporal_kernel_sizes,
        input_specs=input_specs,
        activation=activation)
    model = video_classification_model.VideoClassificationModel(
        backbone=backbone,
        num_classes=num_classes,
        input_specs={'image': input_specs},
        dropout_rate=0.2,
        aggregate_endpoints=aggregate_endpoints,
    )

    inputs = np.random.rand(batch_size, *video_dims)
    logits = model(inputs)
    self.assertAllEqual([batch_size, num_classes], logits.numpy().shape)

  def test_serialize_deserialize(self):
    """Checks the classifier survives a get_config / from_config round trip."""
    backbone = backbones.ResNet3D(
        model_id=50,
        temporal_strides=[1, 1, 1, 1],
        temporal_kernel_sizes=[
            (3, 3, 3),
            (3, 1, 3, 1),
            (3, 1, 3, 1, 3, 1),
            (1, 3, 1),
        ])
    model = video_classification_model.VideoClassificationModel(
        backbone=backbone, num_classes=1000)

    original_config = model.get_config()
    new_model = video_classification_model.VideoClassificationModel.from_config(
        original_config)

    # The rebuilt model must also be convertible to JSON.
    _ = new_model.to_json()

    # A successful round trip leaves both models with equal configs.
    self.assertAllEqual(model.get_config(), new_model.get_config())
# Script entry point: hand control to the TensorFlow test runner.
if __name__ == '__main__':
  tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment