Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
7785dec0
Commit
7785dec0
authored
Feb 01, 2022
by
Yeqing Li
Committed by
A. Unique TensorFlower
Feb 01, 2022
Browse files
Internal change
PiperOrigin-RevId: 425740068
parent
9c93f07c
Changes
72
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
6564 deletions
+0
-6564
official/vision/modeling/heads/instance_heads_test.py
official/vision/modeling/heads/instance_heads_test.py
+0
-135
official/vision/modeling/heads/segmentation_heads.py
official/vision/modeling/heads/segmentation_heads.py
+0
-441
official/vision/modeling/heads/segmentation_heads_test.py
official/vision/modeling/heads/segmentation_heads_test.py
+0
-107
official/vision/modeling/layers/__init__.py
official/vision/modeling/layers/__init__.py
+0
-44
official/vision/modeling/layers/box_sampler.py
official/vision/modeling/layers/box_sampler.py
+0
-93
official/vision/modeling/layers/box_sampler_test.py
official/vision/modeling/layers/box_sampler_test.py
+0
-69
official/vision/modeling/layers/deeplab.py
official/vision/modeling/layers/deeplab.py
+0
-211
official/vision/modeling/layers/deeplab_test.py
official/vision/modeling/layers/deeplab_test.py
+0
-53
official/vision/modeling/layers/detection_generator.py
official/vision/modeling/layers/detection_generator.py
+0
-852
official/vision/modeling/layers/detection_generator_test.py
official/vision/modeling/layers/detection_generator_test.py
+0
-249
official/vision/modeling/layers/mask_sampler.py
official/vision/modeling/layers/mask_sampler.py
+0
-166
official/vision/modeling/layers/mask_sampler_test.py
official/vision/modeling/layers/mask_sampler_test.py
+0
-137
official/vision/modeling/layers/nn_blocks.py
official/vision/modeling/layers/nn_blocks.py
+0
-1511
official/vision/modeling/layers/nn_blocks_3d.py
official/vision/modeling/layers/nn_blocks_3d.py
+0
-286
official/vision/modeling/layers/nn_blocks_3d_test.py
official/vision/modeling/layers/nn_blocks_3d_test.py
+0
-59
official/vision/modeling/layers/nn_blocks_test.py
official/vision/modeling/layers/nn_blocks_test.py
+0
-341
official/vision/modeling/layers/nn_layers.py
official/vision/modeling/layers/nn_layers.py
+0
-1277
official/vision/modeling/layers/nn_layers_test.py
official/vision/modeling/layers/nn_layers_test.py
+0
-419
official/vision/modeling/layers/roi_aligner.py
official/vision/modeling/layers/roi_aligner.py
+0
-72
official/vision/modeling/layers/roi_aligner_test.py
official/vision/modeling/layers/roi_aligner_test.py
+0
-42
No files found.
official/vision/modeling/heads/instance_heads_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for instance_heads.py."""
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.heads
import
instance_heads
class DetectionHeadTest(parameterized.TestCase, tf.test.TestCase):
  """Unit tests for `instance_heads.DetectionHead`."""

  @parameterized.parameters(
      (0, 0, False, False),
      (0, 1, False, False),
      (1, 0, False, False),
      (1, 1, False, False),
  )
  def test_forward(self, num_convs, num_fcs, use_separable_conv, use_sync_bn):
    """Checks the shapes of the class-score and box outputs."""
    head = instance_heads.DetectionHead(
        num_classes=3,
        num_convs=num_convs,
        num_filters=16,
        use_separable_conv=use_separable_conv,
        num_fcs=num_fcs,
        fc_dims=4,
        activation='relu',
        use_sync_bn=use_sync_bn,
        norm_momentum=0.99,
        norm_epsilon=0.001,
        kernel_regularizer=None,
        bias_regularizer=None,
    )
    roi_features = np.random.rand(2, 10, 128, 128, 16)
    scores, boxes = head(roi_features)
    # 3 classes -> [batch, rois, 3] scores and 4 box coords per class -> 12.
    self.assertAllEqual(scores.numpy().shape, [2, 10, 3])
    self.assertAllEqual(boxes.numpy().shape, [2, 10, 12])

  def test_serialize_deserialize(self):
    """Round-trips the head through get_config/from_config."""
    head = instance_heads.DetectionHead(
        num_classes=91,
        num_convs=0,
        num_filters=256,
        use_separable_conv=False,
        num_fcs=2,
        fc_dims=1024,
        activation='relu',
        use_sync_bn=False,
        norm_momentum=0.99,
        norm_epsilon=0.001,
        kernel_regularizer=None,
        bias_regularizer=None,
    )
    restored = instance_heads.DetectionHead.from_config(head.get_config())
    self.assertAllEqual(head.get_config(), restored.get_config())
class MaskHeadTest(parameterized.TestCase, tf.test.TestCase):
  """Unit tests for `instance_heads.MaskHead`."""

  @parameterized.parameters(
      (1, 1, False),
      (1, 2, False),
      (2, 1, False),
      (2, 2, False),
  )
  def test_forward(self, upsample_factor, num_convs, use_sync_bn):
    """Checks the predicted mask shape for several configurations."""
    head = instance_heads.MaskHead(
        num_classes=3,
        upsample_factor=upsample_factor,
        num_convs=num_convs,
        num_filters=16,
        use_separable_conv=False,
        activation='relu',
        use_sync_bn=use_sync_bn,
        norm_momentum=0.99,
        norm_epsilon=0.001,
        kernel_regularizer=None,
        bias_regularizer=None,
    )
    roi_features = np.random.rand(2, 10, 14, 14, 16)
    roi_classes = np.zeros((2, 10))
    masks = head([roi_features, roi_classes])
    # The 14x14 RoI feature is upsampled by `upsample_factor` on each side.
    expected_side = 14 * upsample_factor
    self.assertAllEqual(masks.numpy().shape,
                        [2, 10, expected_side, expected_side])

  def test_serialize_deserialize(self):
    """Round-trips the head through get_config/from_config."""
    head = instance_heads.MaskHead(
        num_classes=3,
        upsample_factor=2,
        num_convs=1,
        num_filters=256,
        use_separable_conv=False,
        activation='relu',
        use_sync_bn=False,
        norm_momentum=0.99,
        norm_epsilon=0.001,
        kernel_regularizer=None,
        bias_regularizer=None,
    )
    restored = instance_heads.MaskHead.from_config(head.get_config())
    self.assertAllEqual(head.get_config(), restored.get_config())

  def test_forward_class_agnostic(self):
    """Checks that class-agnostic mode still emits one mask per RoI."""
    head = instance_heads.MaskHead(num_classes=3, class_agnostic=True)
    roi_features = np.random.rand(2, 10, 14, 14, 16)
    roi_classes = np.zeros((2, 10))
    masks = head([roi_features, roi_classes])
    self.assertAllEqual(masks.numpy().shape, [2, 10, 28, 28])
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/heads/segmentation_heads.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of segmentation heads."""
from
typing
import
List
,
Union
,
Optional
,
Mapping
,
Tuple
,
Any
import
tensorflow
as
tf
from
official.modeling
import
tf_utils
from
official.vision.modeling.layers
import
nn_layers
from
official.vision.ops
import
spatial_transform_ops
class MaskScoring(tf.keras.Model):
  """Mask scoring layer.

  This implements the mask scoring layer from the paper:

  Zhaojin Huang, Lichao Huang, Yongchao Gong, Chang Huang, Xinggang Wang.
  Mask Scoring R-CNN.
  (https://arxiv.org/pdf/1903.00241.pdf)
  """

  def __init__(self,
               num_classes: int,
               fc_input_size: List[int],
               num_convs: int = 3,
               num_filters: int = 256,
               fc_dims: int = 1024,
               num_fcs: int = 2,
               activation: str = 'relu',
               use_sync_bn: bool = False,
               norm_momentum: float = 0.99,
               norm_epsilon: float = 0.001,
               kernel_regularizer: Optional[
                   tf.keras.regularizers.Regularizer] = None,
               bias_regularizer: Optional[
                   tf.keras.regularizers.Regularizer] = None,
               **kwargs):
    """Initializes mask scoring layer.

    Args:
      num_classes: An `int` for number of classes.
      fc_input_size: A List of `int` for the input size of the fully connected
        layers.
      num_convs: An `int` for number of conv layers.
      num_filters: An `int` for the number of filters for conv layers.
      fc_dims: An `int` number of filters for each fully connected layer.
      num_fcs: An `int` for number of fully connected layers.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`, whether or not to use sync batch normalization.
      norm_momentum: A `float` for the momentum in BatchNorm. Defaults to 0.99.
      norm_epsilon: A `float` for the epsilon value in BatchNorm. Defaults to
        0.001.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    self._config_dict = {
        'num_classes': num_classes,
        'num_convs': num_convs,
        'num_filters': num_filters,
        'fc_input_size': fc_input_size,
        'fc_dims': fc_dims,
        'num_fcs': num_fcs,
        'use_sync_bn': use_sync_bn,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'activation': activation,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
    }
    # BatchNorm normalizes over the channel axis, whose position depends on
    # the configured image data format.
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._activation = tf_utils.get_activation(activation)

  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
    """Creates the variables of the mask scoring head."""
    conv_op = tf.keras.layers.Conv2D
    conv_kwargs = {
        'filters': self._config_dict['num_filters'],
        'kernel_size': 3,
        'padding': 'same',
        'kernel_initializer': tf.keras.initializers.VarianceScaling(
            scale=2, mode='fan_out', distribution='untruncated_normal'),
        'bias_initializer': tf.zeros_initializer(),
        'kernel_regularizer': self._config_dict['kernel_regularizer'],
        'bias_regularizer': self._config_dict['bias_regularizer'],
    }
    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
             if self._config_dict['use_sync_bn']
             else tf.keras.layers.BatchNormalization)
    bn_kwargs = {
        'axis': self._bn_axis,
        'momentum': self._config_dict['norm_momentum'],
        'epsilon': self._config_dict['norm_epsilon'],
    }

    # Conv stack applied to the input segmentation logits.
    self._convs = []
    self._conv_norms = []
    for i in range(self._config_dict['num_convs']):
      self._convs.append(
          conv_op(name='mask-scoring_{}'.format(i), **conv_kwargs))
      self._conv_norms.append(
          bn_op(name='mask-scoring-bn_{}'.format(i), **bn_kwargs))

    # Fully connected stack applied to the flattened, resized features.
    self._fcs = []
    self._fc_norms = []
    for i in range(self._config_dict['num_fcs']):
      self._fcs.append(
          tf.keras.layers.Dense(
              units=self._config_dict['fc_dims'],
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=1 / 3.0, mode='fan_out', distribution='uniform'),
              kernel_regularizer=self._config_dict['kernel_regularizer'],
              bias_regularizer=self._config_dict['bias_regularizer'],
              name='mask-scoring-fc_{}'.format(i)))
      self._fc_norms.append(
          bn_op(name='mask-scoring-fc-bn_{}'.format(i), **bn_kwargs))

    # Final linear layer that emits one IoU score per class.
    self._classifier = tf.keras.layers.Dense(
        units=self._config_dict['num_classes'],
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=self._config_dict['kernel_regularizer'],
        bias_regularizer=self._config_dict['bias_regularizer'],
        name='iou-scores')

    super().build(input_shape)

  def call(self, inputs: tf.Tensor, training: bool = None):
    """Forward pass of the mask scoring head.

    Args:
      inputs: A `tf.Tensor` of the shape [batch_size, height, width,
        num_classes], representing the segmentation logits.
        NOTE(review): the original docstring said "[batch_size, width, size,
        num_classes]"; a rank-4 feature map is assumed — confirm with callers.
      training: A `bool` indicating whether it is in `training` mode.

    Returns:
      mask_scores: A `tf.Tensor` of predicted mask scores
        [batch_size, num_classes].
    """
    # The scores must not backpropagate into the segmentation logits.
    x = tf.stop_gradient(inputs)
    for conv, bn in zip(self._convs, self._conv_norms):
      x = self._activation(bn(conv(x)))

    # Casts feat to float32 so the resize op can be run on TPU.
    x = tf.cast(x, tf.float32)
    x = tf.image.resize(
        x,
        size=self._config_dict['fc_input_size'],
        method=tf.image.ResizeMethod.BILINEAR)
    # Casts it back to be compatible with the rest of the operations.
    x = tf.cast(x, inputs.dtype)

    # Flatten spatial dims and channels before the fully connected stack.
    _, h, w, filters = x.get_shape().as_list()
    x = tf.reshape(x, [-1, h * w * filters])

    for fc, bn in zip(self._fcs, self._fc_norms):
      x = self._activation(bn(fc(x)))

    return self._classifier(x)

  def get_config(self) -> Mapping[str, Any]:
    """Returns the layer configuration."""
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Creates a layer from its configuration."""
    return cls(**config)
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationHead(tf.keras.layers.Layer):
  """Creates a segmentation head."""

  def __init__(self,
               num_classes: int,
               level: Union[int, str],
               num_convs: int = 2,
               num_filters: int = 256,
               use_depthwise_convolution: bool = False,
               prediction_kernel_size: int = 1,
               upsample_factor: int = 1,
               feature_fusion: Optional[str] = None,
               decoder_min_level: Optional[int] = None,
               decoder_max_level: Optional[int] = None,
               low_level: int = 2,
               low_level_num_filters: int = 48,
               num_decoder_filters: int = 256,
               activation: str = 'relu',
               use_sync_bn: bool = False,
               norm_momentum: float = 0.99,
               norm_epsilon: float = 0.001,
               kernel_regularizer: Optional[
                   tf.keras.regularizers.Regularizer] = None,
               bias_regularizer: Optional[
                   tf.keras.regularizers.Regularizer] = None,
               **kwargs):
    """Initializes a segmentation head.

    Args:
      num_classes: An `int` number of mask classification categories. The
        number of classes does not include background class.
      level: An `int` or `str`, level to use to build segmentation head.
      num_convs: An `int` number of stacked convolution before the last
        prediction layer.
      num_filters: An `int` number to specify the number of filters used.
        Default is 256.
      use_depthwise_convolution: A bool to specify if use depthwise separable
        convolutions.
      prediction_kernel_size: An `int` number to specify the kernel size of
        the prediction layer.
      upsample_factor: An `int` number to specify the upsampling factor to
        generate finer mask. Default 1 means no upsampling is applied.
      feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`,
        `panoptic_fpn_fusion`, or None. If `deeplabv3plus`, features from
        decoder_features[level] will be fused with low level feature maps from
        backbone. If `pyramid_fusion`, multiscale features will be resized and
        fused at the target level.
      decoder_min_level: An `int` of minimum level from decoder to use in
        feature fusion. It is only used when feature_fusion is set to
        `panoptic_fpn_fusion`.
      decoder_max_level: An `int` of maximum level from decoder to use in
        feature fusion. It is only used when feature_fusion is set to
        `panoptic_fpn_fusion`.
      low_level: An `int` of backbone level to be used for feature fusion. It
        is used when feature_fusion is set to `deeplabv3plus`.
      low_level_num_filters: An `int` of reduced number of filters for the low
        level features before fusing it with higher level features. It is only
        used when feature_fusion is set to `deeplabv3plus`.
      num_decoder_filters: An `int` of number of filters in the decoder
        outputs. It is only used when feature_fusion is set to
        `panoptic_fpn_fusion`.
      activation: A `str` that indicates which activation is used, e.g.
        'relu', 'swish', etc.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
        normalization across different replicas.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    self._config_dict = {
        'num_classes': num_classes,
        'level': level,
        'num_convs': num_convs,
        'num_filters': num_filters,
        'use_depthwise_convolution': use_depthwise_convolution,
        'prediction_kernel_size': prediction_kernel_size,
        'upsample_factor': upsample_factor,
        'feature_fusion': feature_fusion,
        'decoder_min_level': decoder_min_level,
        'decoder_max_level': decoder_max_level,
        'low_level': low_level,
        'low_level_num_filters': low_level_num_filters,
        'num_decoder_filters': num_decoder_filters,
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer
    }
    # BatchNorm normalizes over the channel axis, whose position depends on
    # the configured image data format.
    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
    self._bn_axis = -1 if channels_last else 1
    self._activation = tf_utils.get_activation(activation)

  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
    """Creates the variables of the segmentation head."""
    use_depthwise_convolution = self._config_dict['use_depthwise_convolution']
    random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
    conv_op = tf.keras.layers.Conv2D
    conv_kwargs = {
        # With a depthwise conv providing the 3x3 receptive field, the
        # following pointwise conv only needs a 1x1 kernel.
        'kernel_size': 3 if not use_depthwise_convolution else 1,
        'padding': 'same',
        'use_bias': False,
        'kernel_initializer': random_initializer,
        'kernel_regularizer': self._config_dict['kernel_regularizer'],
    }
    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
             if self._config_dict['use_sync_bn']
             else tf.keras.layers.BatchNormalization)
    bn_kwargs = {
        'axis': self._bn_axis,
        'momentum': self._config_dict['norm_momentum'],
        'epsilon': self._config_dict['norm_epsilon'],
    }

    if self._config_dict['feature_fusion'] == 'deeplabv3plus':
      # Deeplabv3+ feature fusion layers.
      self._dlv3p_conv = conv_op(
          kernel_size=1,
          padding='same',
          use_bias=False,
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
          kernel_regularizer=self._config_dict['kernel_regularizer'],
          name='segmentation_head_deeplabv3p_fusion_conv',
          filters=self._config_dict['low_level_num_filters'])
      self._dlv3p_norm = bn_op(
          name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs)
    elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion':
      self._panoptic_fpn_fusion = nn_layers.PanopticFPNFusion(
          min_level=self._config_dict['decoder_min_level'],
          max_level=self._config_dict['decoder_max_level'],
          target_level=self._config_dict['level'],
          num_filters=self._config_dict['num_filters'],
          num_fpn_filters=self._config_dict['num_decoder_filters'],
          activation=self._config_dict['activation'],
          kernel_regularizer=self._config_dict['kernel_regularizer'],
          bias_regularizer=self._config_dict['bias_regularizer'])

    # Segmentation head layers.
    self._convs = []
    self._norms = []
    for i in range(self._config_dict['num_convs']):
      if use_depthwise_convolution:
        self._convs.append(
            tf.keras.layers.DepthwiseConv2D(
                name='segmentation_head_depthwise_conv_{}'.format(i),
                kernel_size=3,
                padding='same',
                use_bias=False,
                depthwise_initializer=random_initializer,
                depthwise_regularizer=self._config_dict['kernel_regularizer'],
                depth_multiplier=1))
        self._norms.append(
            bn_op(name='segmentation_head_depthwise_norm_{}'.format(i),
                  **bn_kwargs))
      self._convs.append(
          conv_op(
              name='segmentation_head_conv_{}'.format(i),
              filters=self._config_dict['num_filters'],
              **conv_kwargs))
      self._norms.append(
          bn_op(name='segmentation_head_norm_{}'.format(i), **bn_kwargs))

    # Final per-pixel classifier producing `num_classes` logits.
    self._classifier = conv_op(
        name='segmentation_output',
        filters=self._config_dict['num_classes'],
        kernel_size=self._config_dict['prediction_kernel_size'],
        padding='same',
        bias_initializer=tf.zeros_initializer(),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        kernel_regularizer=self._config_dict['kernel_regularizer'],
        bias_regularizer=self._config_dict['bias_regularizer'])

    super().build(input_shape)

  def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                               Union[tf.Tensor, Mapping[str, tf.Tensor]]]):
    """Forward pass of the segmentation head.

    It supports both a tuple of 2 tensors or 2 dictionaries. The first is
    backbone endpoints, and the second is decoder endpoints. When inputs are
    tensors, they are from a single level of feature maps. When inputs are
    dictionaries, they contain multiple levels of feature maps, where the key
    is the index of feature map.

    Args:
      inputs: A tuple of 2 feature map tensors of shape
        [batch, height_l, width_l, channels] or 2 dictionaries of tensors:
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors, whose shape is
          [batch, height_l, width_l, channels].
        The first is backbone endpoints, and the second is decoder endpoints.

    Returns:
      segmentation prediction mask: A `tf.Tensor` of the segmentation mask
        scores predicted from input features.
    """
    backbone_output = inputs[0]
    decoder_output = inputs[1]
    fusion = self._config_dict['feature_fusion']
    if fusion == 'deeplabv3plus':
      # deeplabv3+ feature fusion: project the low-level backbone feature,
      # resize the decoder feature to match it, then concatenate.
      x = (decoder_output[str(self._config_dict['level'])]
           if isinstance(decoder_output, dict) else decoder_output)
      y = (backbone_output[str(self._config_dict['low_level'])]
           if isinstance(backbone_output, dict) else backbone_output)
      y = self._dlv3p_norm(self._dlv3p_conv(y))
      y = self._activation(y)

      x = tf.image.resize(
          x, tf.shape(y)[1:3], method=tf.image.ResizeMethod.BILINEAR)
      x = tf.cast(x, dtype=y.dtype)
      x = tf.concat([x, y], axis=self._bn_axis)
    elif fusion == 'pyramid_fusion':
      if not isinstance(decoder_output, dict):
        raise ValueError('Only support dictionary decoder_output.')
      x = nn_layers.pyramid_feature_fusion(decoder_output,
                                           self._config_dict['level'])
    elif fusion == 'panoptic_fpn_fusion':
      x = self._panoptic_fpn_fusion(decoder_output)
    else:
      x = (decoder_output[str(self._config_dict['level'])]
           if isinstance(decoder_output, dict) else decoder_output)

    for conv, norm in zip(self._convs, self._norms):
      x = conv(x)
      x = norm(x)
      x = self._activation(x)
    if self._config_dict['upsample_factor'] > 1:
      x = spatial_transform_ops.nearest_upsampling(
          x, scale=self._config_dict['upsample_factor'])

    return self._classifier(x)

  def get_config(self):
    """Returns the layer configuration, merged with the base config."""
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(self._config_dict.items()))

  @classmethod
  def from_config(cls, config):
    """Creates a layer from its configuration."""
    return cls(**config)
official/vision/modeling/heads/segmentation_heads_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation_heads.py."""
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.heads
import
segmentation_heads
class SegmentationHeadTest(parameterized.TestCase, tf.test.TestCase):
  """Unit tests for `segmentation_heads.SegmentationHead`."""

  @parameterized.parameters(
      (2, 'pyramid_fusion', None, None),
      (3, 'pyramid_fusion', None, None),
      (2, 'panoptic_fpn_fusion', 2, 5),
      (2, 'panoptic_fpn_fusion', 2, 6),
      (3, 'panoptic_fpn_fusion', 3, 5),
      (3, 'panoptic_fpn_fusion', 3, 6))
  def test_forward(self, level, feature_fusion, decoder_min_level,
                   decoder_max_level):
    """Checks the output mask shape for several fusion configurations."""
    backbone_features = {
        '3': np.random.rand(2, 128, 128, 16),
        '4': np.random.rand(2, 64, 64, 16),
        '5': np.random.rand(2, 32, 32, 16),
    }
    decoder_features = {
        '3': np.random.rand(2, 128, 128, 64),
        '4': np.random.rand(2, 64, 64, 64),
        '5': np.random.rand(2, 32, 32, 64),
        '6': np.random.rand(2, 16, 16, 64),
    }

    if feature_fusion == 'panoptic_fpn_fusion':
      backbone_features['2'] = np.random.rand(2, 256, 256, 16)
      decoder_features['2'] = np.random.rand(2, 256, 256, 64)

    head = segmentation_heads.SegmentationHead(
        num_classes=10,
        level=level,
        feature_fusion=feature_fusion,
        decoder_min_level=decoder_min_level,
        decoder_max_level=decoder_max_level,
        num_decoder_filters=64)

    logits = head((backbone_features, decoder_features))

    # BUG FIX: the dict keys are strings, but `level` is an int; the original
    # `if level in decoder_features:` was always False, so the shape assertion
    # never ran. Compare against the string key instead.
    if str(level) in decoder_features:
      self.assertAllEqual(logits.numpy().shape, [
          2, decoder_features[str(level)].shape[1],
          decoder_features[str(level)].shape[2], 10
      ])

  def test_serialize_deserialize(self):
    """Round-trips the head through get_config/from_config."""
    head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
    config = head.get_config()
    new_head = segmentation_heads.SegmentationHead.from_config(config)
    self.assertAllEqual(head.get_config(), new_head.get_config())
class MaskScoringHeadTest(parameterized.TestCase, tf.test.TestCase):
  """Unit tests for `segmentation_heads.MaskScoring`."""

  @parameterized.parameters(
      (1, 1, 64, [4, 4]),
      (2, 1, 64, [4, 4]),
      (3, 1, 64, [4, 4]),
      (1, 2, 32, [8, 8]),
      (2, 2, 32, [8, 8]),
      (3, 2, 32, [8, 8]),)
  def test_forward(self, num_convs, num_fcs, num_filters, fc_input_size):
    """Checks the score shape across conv/fc depth configurations."""
    features = np.random.rand(2, 64, 64, 16)
    head = segmentation_heads.MaskScoring(
        num_classes=2,
        num_convs=num_convs,
        # BUG FIX: `num_fcs` was parameterized but never forwarded, so the
        # fc-depth half of the parameter grid exercised nothing.
        num_fcs=num_fcs,
        num_filters=num_filters,
        fc_dims=128,
        fc_input_size=fc_input_size)
    scores = head(features)
    self.assertAllEqual(scores.numpy().shape, [2, 2])

  def test_serialize_deserialize(self):
    """Round-trips the head through get_config/from_config."""
    head = segmentation_heads.MaskScoring(
        num_classes=2, fc_input_size=[4, 4], fc_dims=128)
    config = head.get_config()
    new_head = segmentation_heads.MaskScoring.from_config(config)
    self.assertAllEqual(head.get_config(), new_head.get_config())
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/__init__.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Layers package definition."""
from
official.vision.modeling.layers.box_sampler
import
BoxSampler
from
official.vision.modeling.layers.detection_generator
import
DetectionGenerator
from
official.vision.modeling.layers.detection_generator
import
MultilevelDetectionGenerator
from
official.vision.modeling.layers.mask_sampler
import
MaskSampler
from
official.vision.modeling.layers.nn_blocks
import
BottleneckBlock
from
official.vision.modeling.layers.nn_blocks
import
BottleneckResidualInner
from
official.vision.modeling.layers.nn_blocks
import
DepthwiseSeparableConvBlock
from
official.vision.modeling.layers.nn_blocks
import
InvertedBottleneckBlock
from
official.vision.modeling.layers.nn_blocks
import
ResidualBlock
from
official.vision.modeling.layers.nn_blocks
import
ResidualInner
from
official.vision.modeling.layers.nn_blocks
import
ReversibleLayer
from
official.vision.modeling.layers.nn_blocks_3d
import
BottleneckBlock3D
from
official.vision.modeling.layers.nn_blocks_3d
import
SelfGating
from
official.vision.modeling.layers.nn_layers
import
CausalConvMixin
from
official.vision.modeling.layers.nn_layers
import
Conv2D
from
official.vision.modeling.layers.nn_layers
import
Conv3D
from
official.vision.modeling.layers.nn_layers
import
DepthwiseConv2D
from
official.vision.modeling.layers.nn_layers
import
GlobalAveragePool3D
from
official.vision.modeling.layers.nn_layers
import
PositionalEncoding
from
official.vision.modeling.layers.nn_layers
import
Scale
from
official.vision.modeling.layers.nn_layers
import
SpatialAveragePool3D
from
official.vision.modeling.layers.nn_layers
import
SqueezeExcitation
from
official.vision.modeling.layers.nn_layers
import
StochasticDepth
from
official.vision.modeling.layers.nn_layers
import
TemporalSoftmaxPool
from
official.vision.modeling.layers.roi_aligner
import
MultilevelROIAligner
from
official.vision.modeling.layers.roi_generator
import
MultilevelROIGenerator
from
official.vision.modeling.layers.roi_sampler
import
ROISampler
official/vision/modeling/layers/box_sampler.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of box sampler."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
sampling_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class BoxSampler(tf.keras.layers.Layer):
  """Samples a fixed-size, class-balanced subset of boxes per image."""

  def __init__(self,
               num_samples: int = 512,
               foreground_fraction: float = 0.25,
               **kwargs):
    """Initializes a box sampler.

    Args:
      num_samples: An `int` of the number of sampled boxes per image.
      foreground_fraction: A `float` in [0, 1], what percentage of boxes
        should be sampled from the positive examples.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'num_samples': num_samples,
        'foreground_fraction': foreground_fraction,
    }
    super().__init__(**kwargs)

  def call(self, positive_matches: tf.Tensor, negative_matches: tf.Tensor,
           ignored_matches: tf.Tensor):
    """Samples and selects positive and negative instances.

    Args:
      positive_matches: A `bool` tensor of shape [batch, N], `True` where the
        instance is a positive example.
      negative_matches: A `bool` tensor of shape [batch, N], `True` where the
        instance is a negative example.
      ignored_matches: A `bool` tensor of shape [batch, N], `True` where the
        instance should be excluded from sampling.

    Returns:
      A `tf.Tensor` of shape [batch_size, K] with the indices of the sampled
      examples, where K is `num_samples`.
    """
    # A box is a sampling candidate iff it is labeled (positive or negative)
    # and not explicitly ignored.
    candidates = tf.logical_and(
        tf.logical_or(positive_matches, negative_matches),
        tf.logical_not(ignored_matches))
    balanced_sampler = sampling_ops.BalancedPositiveNegativeSampler(
        positive_fraction=self._config_dict['foreground_fraction'],
        is_static=True)
    num_samples = self._config_dict['num_samples']
    # Subsample each image independently (requires a static batch size),
    # then stack the per-image indicator vectors back into a batch.
    indicators = tf.stack([
        balanced_sampler.subsample(candidates[b], num_samples,
                                   positive_matches[b])
        for b in range(candidates.shape[0])
    ])
    # top_k over the 0/1 indicators recovers the indices of the selected
    # boxes, sorted so selected entries come first.
    _, selected_indices = tf.nn.top_k(
        tf.cast(indicators, dtype=tf.int32), k=num_samples, sorted=True)
    return selected_indices

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)
official/vision/modeling/layers/box_sampler_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_sampler.py."""
# Import libraries
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
box_sampler
class BoxSamplerTest(tf.test.TestCase):
  """Unit tests for box_sampler.BoxSampler."""

  def _assert_sample_width(self, selected_indices, expected):
    """Asserts the second (sample) dimension of the sampler output."""
    self.assertEqual(expected, tf.shape(selected_indices)[1])

  def test_box_sampler(self):
    fg_matches = np.array(
        [[True, False, False, False, True, True, False],
         [False, False, False, False, False, True, True]])
    bg_matches = np.array(
        [[False, True, True, True, False, False, False],
         [True, True, True, True, False, False, False]])
    skip_matches = np.array(
        [[False, False, False, False, False, False, True],
         [False, False, False, False, True, False, False]])
    sampler = box_sampler.BoxSampler(num_samples=2, foreground_fraction=0.5)

    # Runs on TPU.
    tpu_strategy = tf.distribute.TPUStrategy()
    with tpu_strategy.scope():
      tpu_indices = sampler(fg_matches, bg_matches, skip_matches)
    self._assert_sample_width(tpu_indices, 2)

    # Runs on CPU.
    cpu_indices = sampler(fg_matches, bg_matches, skip_matches)
    self._assert_sample_width(cpu_indices, 2)

  def test_serialize_deserialize(self):
    init_kwargs = dict(
        num_samples=512,
        foreground_fraction=0.25,
    )
    sampler = box_sampler.BoxSampler(**init_kwargs)
    # get_config must round-trip the constructor arguments exactly.
    self.assertEqual(sampler.get_config(), dict(init_kwargs))
    clone = box_sampler.BoxSampler.from_config(sampler.get_config())
    self.assertAllEqual(sampler.get_config(), clone.get_config())
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/deeplab.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layers for DeepLabV3."""
import
tensorflow
as
tf
class SpatialPyramidPooling(tf.keras.layers.Layer):
  """Implements the Atrous Spatial Pyramid Pooling.

  References:
    [Rethinking Atrous Convolution for Semantic Image Segmentation](
      https://arxiv.org/pdf/1706.05587.pdf)
    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
      Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
  """

  def __init__(
      self,
      output_channels,
      dilation_rates,
      pool_kernel_size=None,
      use_sync_bn=False,
      batchnorm_momentum=0.99,
      batchnorm_epsilon=0.001,
      activation='relu',
      dropout=0.5,
      kernel_initializer='glorot_uniform',
      kernel_regularizer=None,
      interpolation='bilinear',
      use_depthwise_convolution=False,
      **kwargs):
    """Initializes `SpatialPyramidPooling`.

    Args:
      output_channels: Number of channels produced by SpatialPyramidPooling.
      dilation_rates: A list of integers for parallel dilated conv.
      pool_kernel_size: A list of integers or None. If None, global average
        pooling is applied, otherwise an average pooling of pool_kernel_size
        is applied.
      use_sync_bn: A bool, whether or not to use sync batch normalization.
      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
        0.99.
      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults
        to 0.001.
      activation: A `str` for type of activation to be used. Defaults to
        'relu'.
      dropout: A float for the dropout rate before output. Defaults to 0.5.
      kernel_initializer: Kernel initializer for conv layers. Defaults to
        `glorot_uniform`.
      kernel_regularizer: Kernel regularizer for conv layers. Defaults to
        None.
      interpolation: The interpolation method for upsampling. Defaults to
        `bilinear`.
      use_depthwise_convolution: Allows spatial pooling to be separable
        depthwise convolusions. [Encoder-Decoder with Atrous Separable
        Convolution for Semantic Image Segmentation](
        https://arxiv.org/pdf/1802.02611.pdf)
      **kwargs: Other keyword arguments for the layer.
    """
    super(SpatialPyramidPooling, self).__init__(**kwargs)

    self.output_channels = output_channels
    self.dilation_rates = dilation_rates
    self.use_sync_bn = use_sync_bn
    self.batchnorm_momentum = batchnorm_momentum
    self.batchnorm_epsilon = batchnorm_epsilon
    self.activation = activation
    self.dropout = dropout
    self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self.interpolation = interpolation
    # A rank-4 input (NHWC) is required; a mismatched rank raises ValueError
    # at call time.
    self.input_spec = tf.keras.layers.InputSpec(ndim=4)
    self.pool_kernel_size = pool_kernel_size
    self.use_depthwise_convolution = use_depthwise_convolution

  def build(self, input_shape):
    height = input_shape[1]
    width = input_shape[2]
    channels = input_shape[3]

    self.aspp_layers = []

    if self.use_sync_bn:
      bn_op = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      bn_op = tf.keras.layers.BatchNormalization

    if tf.keras.backend.image_data_format() == 'channels_last':
      bn_axis = -1
    else:
      bn_axis = 1

    # Branch 1: plain 1x1 conv + BN + activation.
    conv_sequential = tf.keras.Sequential([
        tf.keras.layers.Conv2D(
            filters=self.output_channels,
            kernel_size=(1, 1),
            kernel_initializer=self.kernel_initializer,
            kernel_regularizer=self.kernel_regularizer,
            use_bias=False),
        bn_op(
            axis=bn_axis,
            momentum=self.batchnorm_momentum,
            epsilon=self.batchnorm_epsilon),
        tf.keras.layers.Activation(self.activation)
    ])
    self.aspp_layers.append(conv_sequential)

    # Branches 2..N: one (optionally depthwise-separable) atrous conv per
    # dilation rate.
    for dilation_rate in self.dilation_rates:
      leading_layers = []
      kernel_size = (3, 3)
      if self.use_depthwise_convolution:
        # Depthwise 3x3 followed by a pointwise 1x1 conv (separable atrous).
        leading_layers += [
            tf.keras.layers.DepthwiseConv2D(
                depth_multiplier=1,
                kernel_size=kernel_size,
                padding='same',
                depthwise_regularizer=self.kernel_regularizer,
                depthwise_initializer=self.kernel_initializer,
                dilation_rate=dilation_rate,
                use_bias=False)
        ]
        kernel_size = (1, 1)
      conv_sequential = tf.keras.Sequential(leading_layers + [
          tf.keras.layers.Conv2D(
              filters=self.output_channels,
              kernel_size=kernel_size,
              padding='same',
              kernel_regularizer=self.kernel_regularizer,
              kernel_initializer=self.kernel_initializer,
              dilation_rate=dilation_rate,
              use_bias=False),
          bn_op(
              axis=bn_axis,
              momentum=self.batchnorm_momentum,
              epsilon=self.batchnorm_epsilon),
          tf.keras.layers.Activation(self.activation)
      ])
      self.aspp_layers.append(conv_sequential)

    # Image-pooling branch: global (or windowed) average pooling, 1x1 conv,
    # then resize back to the input's spatial resolution.
    if self.pool_kernel_size is None:
      pool_sequential = tf.keras.Sequential([
          tf.keras.layers.GlobalAveragePooling2D(),
          tf.keras.layers.Reshape((1, 1, channels))
      ])
    else:
      pool_sequential = tf.keras.Sequential(
          [tf.keras.layers.AveragePooling2D(self.pool_kernel_size)])

    pool_sequential.add(
        tf.keras.Sequential([
            tf.keras.layers.Conv2D(
                filters=self.output_channels,
                kernel_size=(1, 1),
                kernel_initializer=self.kernel_initializer,
                kernel_regularizer=self.kernel_regularizer,
                use_bias=False),
            bn_op(
                axis=bn_axis,
                momentum=self.batchnorm_momentum,
                epsilon=self.batchnorm_epsilon),
            tf.keras.layers.Activation(self.activation),
            # Resize in float32 so the upsampling is numerically stable under
            # mixed precision.
            tf.keras.layers.experimental.preprocessing.Resizing(
                height,
                width,
                interpolation=self.interpolation,
                dtype=tf.float32)
        ]))

    self.aspp_layers.append(pool_sequential)

    # Fuses all branches: concat along channels, 1x1 conv, BN, activation,
    # dropout.
    self.projection = tf.keras.Sequential([
        tf.keras.layers.Conv2D(
            filters=self.output_channels,
            kernel_size=(1, 1),
            kernel_initializer=self.kernel_initializer,
            kernel_regularizer=self.kernel_regularizer,
            use_bias=False),
        bn_op(
            axis=bn_axis,
            momentum=self.batchnorm_momentum,
            epsilon=self.batchnorm_epsilon),
        tf.keras.layers.Activation(self.activation),
        tf.keras.layers.Dropout(rate=self.dropout)
    ])

  def call(self, inputs, training=None):
    """Applies all ASPP branches and projects the concatenated result."""
    if training is None:
      training = tf.keras.backend.learning_phase()
    result = []
    for layer in self.aspp_layers:
      # Cast back to the input dtype since the pooling branch upsamples in
      # float32 regardless of the compute dtype.
      result.append(tf.cast(layer(inputs, training=training), inputs.dtype))
    result = tf.concat(result, axis=-1)
    result = self.projection(result, training=training)
    return result

  def get_config(self):
    config = {
        'output_channels': self.output_channels,
        'dilation_rates': self.dilation_rates,
        'pool_kernel_size': self.pool_kernel_size,
        'use_sync_bn': self.use_sync_bn,
        'batchnorm_momentum': self.batchnorm_momentum,
        'batchnorm_epsilon': self.batchnorm_epsilon,
        'activation': self.activation,
        'dropout': self.dropout,
        'kernel_initializer': tf.keras.initializers.serialize(
            self.kernel_initializer),
        'kernel_regularizer': tf.keras.regularizers.serialize(
            self.kernel_regularizer),
        'interpolation': self.interpolation,
        # Fix: this was previously missing, so from_config() silently reset
        # use_depthwise_convolution to its default on deserialization.
        'use_depthwise_convolution': self.use_depthwise_convolution,
    }
    base_config = super(SpatialPyramidPooling, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
official/vision/modeling/layers/deeplab_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for ASPP."""
import
tensorflow
as
tf
from
tensorflow.python.keras
import
keras_parameterized
from
official.vision.modeling.layers
import
deeplab
@keras_parameterized.run_all_keras_modes
class DeeplabTest(keras_parameterized.TestCase):
  """Tests for the ASPP (SpatialPyramidPooling) layer."""

  @keras_parameterized.parameterized.parameters(
      (None,),
      ([32, 32],),
  )
  def test_aspp(self, pool_kernel_size):
    """Checks the output shape for both global and windowed pooling."""
    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
    layer = deeplab.SpatialPyramidPooling(
        output_channels=256,
        dilation_rates=[6, 12, 18],
        # Fix: forward the parameterized value instead of hard-coding None,
        # so the [32, 32] case actually exercises the AveragePooling2D path.
        pool_kernel_size=pool_kernel_size)
    output = layer(inputs)
    self.assertAllEqual([None, 64, 64, 256], output.shape)

  def test_aspp_invalid_shape(self):
    """A rank-3 input must be rejected (the layer requires NHWC rank 4)."""
    inputs = tf.keras.Input(shape=(64, 64), dtype=tf.float32)
    layer = deeplab.SpatialPyramidPooling(
        output_channels=256, dilation_rates=[6, 12, 18])
    with self.assertRaises(ValueError):
      _ = layer(inputs)

  def test_config_with_custom_name(self):
    """The layer name must survive a get_config/from_config round trip."""
    layer = deeplab.SpatialPyramidPooling(256, [5], name='aspp')
    config = layer.get_config()
    layer_1 = deeplab.SpatialPyramidPooling.from_config(config)
    self.assertEqual(layer_1.name, layer.name)
# Standard TensorFlow test entry point.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/detection_generator.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of generators to generate the final detections."""
import
contextlib
from
typing
import
List
,
Optional
,
Mapping
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
box_ops
from
official.vision.ops
import
nms
from
official.vision.ops
import
preprocess_ops
def _generate_detections_v1(boxes: tf.Tensor,
                            scores: tf.Tensor,
                            attributes: Optional[Mapping[str,
                                                         tf.Tensor]] = None,
                            pre_nms_top_k: int = 5000,
                            pre_nms_score_threshold: float = 0.05,
                            nms_iou_threshold: float = 0.5,
                            max_num_detections: int = 100,
                            soft_nms_sigma: Optional[float] = None):
  """Generates the final detections given the model outputs.

  The implementation unrolls the batch dimension and process images one by
  one. It required the batch dimension to be statically known and it is TPU
  compatible.

  Args:
    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
      `[batch_size, N, 1, 4]` for box predictions on all feature levels. The
      N is the number of total anchors on all levels.
    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
      stacks class probability on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw
      score.
    attributes: None or a dict of (attribute_name, attributes) pairs. Each
      attributes is a `tf.Tensor` with shape
      `[batch_size, N, num_classes, attribute_size]` or
      `[batch_size, N, 1, attribute_size]` for attribute predictions on all
      feature levels. The N is the number of total anchors on all levels. Can
      be None if no attribute learning is required.
    pre_nms_top_k: An `int` number of top candidate detections per class
      before NMS.
    pre_nms_score_threshold: A `float` representing the threshold for
      deciding when to remove boxes based on score.
    nms_iou_threshold: A `float` representing the threshold for deciding
      whether boxes overlap too much with respect to IOU.
    max_num_detections: A scalar representing maximum number of boxes
      retained over all classes.
    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
      When soft_nms_sigma=0.0 (which is default), we fall back to standard
      NMS.

  Returns:
    nms_boxes: A `float` type `tf.Tensor` of shape
      `[batch_size, max_num_detections, 4]` representing top detected boxes
      in `[y1, x1, y2, x2]`.
    nms_scores: A `float` type `tf.Tensor` of shape
      `[batch_size, max_num_detections]` representing sorted confidence
      scores for detected boxes. The values are between `[0, 1]`.
    nms_classes: An `int` type `tf.Tensor` of shape
      `[batch_size, max_num_detections]` representing classes for detected
      boxes.
    valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]` only
      the top `valid_detections` boxes are valid detections.
    nms_attributes: None or a dict of (attribute_name, attributes). Each
      attribute is a `float` type `tf.Tensor` of shape
      `[batch_size, max_num_detections, attribute_size]` representing
      attribute predictions for detected boxes. Can be an empty dict if no
      attribute learning is required.
  """
  with tf.name_scope('generate_detections'):
    # The batch dimension must be static because the loop below unrolls it
    # at graph-construction time (this keeps the op TPU-compatible).
    batch_size = scores.get_shape().as_list()[0]
    nmsed_boxes = []
    nmsed_classes = []
    nmsed_scores = []
    valid_detections = []
    if attributes:
      nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
    else:
      nmsed_attributes = {}

    # Process one image at a time and collect per-image results.
    for i in range(batch_size):
      (nmsed_boxes_i, nmsed_scores_i, nmsed_classes_i, valid_detections_i,
       nmsed_att_i) = _generate_detections_per_image(
           boxes[i],
           scores[i],
           attributes={
               att_name: att[i] for att_name, att in attributes.items()
           } if attributes else {},
           pre_nms_top_k=pre_nms_top_k,
           pre_nms_score_threshold=pre_nms_score_threshold,
           nms_iou_threshold=nms_iou_threshold,
           max_num_detections=max_num_detections,
           soft_nms_sigma=soft_nms_sigma)
      nmsed_boxes.append(nmsed_boxes_i)
      nmsed_scores.append(nmsed_scores_i)
      nmsed_classes.append(nmsed_classes_i)
      valid_detections.append(valid_detections_i)
      if attributes:
        for att_name in attributes.keys():
          nmsed_attributes[att_name].append(nmsed_att_i[att_name])

  # Re-assemble the per-image results into batched tensors.
  nmsed_boxes = tf.stack(nmsed_boxes, axis=0)
  nmsed_scores = tf.stack(nmsed_scores, axis=0)
  nmsed_classes = tf.stack(nmsed_classes, axis=0)
  valid_detections = tf.stack(valid_detections, axis=0)
  if attributes:
    for att_name in attributes.keys():
      nmsed_attributes[att_name] = tf.stack(
          nmsed_attributes[att_name], axis=0)
  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
def _generate_detections_per_image(
    boxes: tf.Tensor,
    scores: tf.Tensor,
    attributes: Optional[Mapping[str, tf.Tensor]] = None,
    pre_nms_top_k: int = 5000,
    pre_nms_score_threshold: float = 0.05,
    nms_iou_threshold: float = 0.5,
    max_num_detections: int = 100,
    soft_nms_sigma: Optional[float] = None):
  """Generates the final detections per image given the model outputs.

  Args:
    boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`,
      which box predictions on all feature levels. The N is the number of
      total anchors on all levels.
    scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
      probability on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by
      the model. Note that the class_outputs here is the raw score.
    attributes: If not None, a dict of `tf.Tensor`. Each value is in shape
      `[N, num_classes, attribute_size]` or `[N, 1, attribute_size]` of
      attribute predictions on all feature levels. The N is the number of
      total anchors on all levels.
    pre_nms_top_k: An `int` number of top candidate detections per class
      before NMS.
    pre_nms_score_threshold: A `float` representing the threshold for
      deciding when to remove boxes based on score.
    nms_iou_threshold: A `float` representing the threshold for deciding
      whether boxes overlap too much with respect to IOU.
    max_num_detections: A `scalar` representing maximum number of boxes
      retained over all classes.
    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
      When soft_nms_sigma=0.0, we fall back to standard NMS.
      If set to None, `tf.image.non_max_suppression_padded` is called
      instead.

  Returns:
    nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
      representing top detected boxes in `[y1, x1, y2, x2]`.
    nms_scores: A `float` tf.Tensor of shape `[max_num_detections]`
      representing sorted confidence scores for detected boxes. The values
      are between [0, 1].
    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]`
      representing classes for detected boxes.
    valid_detections: An `int` tf.Tensor of shape [1] only the top
      `valid_detections` boxes are valid detections.
    nms_attributes: None or a dict. Each value is a `float` tf.Tensor of
      shape `[max_num_detections, attribute_size]` representing attribute
      predictions for detected boxes. Can be an empty dict if `attributes` is
      None.
  """
  nmsed_boxes = []
  nmsed_scores = []
  nmsed_classes = []
  # When boxes are class-agnostic (`[N, 1, 4]`), the same box column is
  # reused for every class via min(num_classes_for_box - 1, i).
  num_classes_for_box = boxes.get_shape().as_list()[1]
  num_classes = scores.get_shape().as_list()[1]
  if attributes:
    nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
  else:
    nmsed_attributes = {}

  # Run NMS independently for each class.
  for i in range(num_classes):
    boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
    scores_i = scores[:, i]
    # Obtains pre_nms_top_k before running NMS.
    scores_i, indices = tf.nn.top_k(
        scores_i, k=tf.minimum(tf.shape(scores_i)[-1], pre_nms_top_k))
    boxes_i = tf.gather(boxes_i, indices)

    if soft_nms_sigma is not None:
      # Soft-NMS path: returns a variable number of boxes, so pad/clip to
      # max_num_detections afterwards.
      (nmsed_indices_i,
       nmsed_scores_i) = tf.image.non_max_suppression_with_scores(
           tf.cast(boxes_i, tf.float32),
           tf.cast(scores_i, tf.float32),
           max_num_detections,
           iou_threshold=nms_iou_threshold,
           score_threshold=pre_nms_score_threshold,
           soft_nms_sigma=soft_nms_sigma,
           name='nms_detections_' + str(i))
      nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
      nmsed_boxes_i = preprocess_ops.clip_or_pad_to_fixed_size(
          nmsed_boxes_i, max_num_detections, 0.0)
      nmsed_scores_i = preprocess_ops.clip_or_pad_to_fixed_size(
          nmsed_scores_i, max_num_detections, -1.0)
    else:
      # Hard-NMS path: padded output of fixed size plus a valid count.
      (nmsed_indices_i,
       nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
           tf.cast(boxes_i, tf.float32),
           tf.cast(scores_i, tf.float32),
           max_num_detections,
           iou_threshold=nms_iou_threshold,
           score_threshold=pre_nms_score_threshold,
           pad_to_max_output_size=True,
           name='nms_detections_' + str(i))
      nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
      nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
      # Sets scores of invalid boxes to -1.
      nmsed_scores_i = tf.where(
          tf.less(tf.range(max_num_detections), [nmsed_num_valid_i]),
          nmsed_scores_i, -tf.ones_like(nmsed_scores_i))
    nmsed_classes_i = tf.fill([max_num_detections], i)
    nmsed_boxes.append(nmsed_boxes_i)
    nmsed_scores.append(nmsed_scores_i)
    nmsed_classes.append(nmsed_classes_i)
    if attributes:
      for att_name, att in attributes.items():
        # Attributes may also be class-agnostic; reuse the single column.
        num_classes_for_attr = att.get_shape().as_list()[1]
        att_i = att[:, min(num_classes_for_attr - 1, i)]
        att_i = tf.gather(att_i, indices)
        nmsed_att_i = tf.gather(att_i, nmsed_indices_i)
        nmsed_att_i = preprocess_ops.clip_or_pad_to_fixed_size(
            nmsed_att_i, max_num_detections, 0.0)
        nmsed_attributes[att_name].append(nmsed_att_i)

  # Concats results from all classes and sort them.
  nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
  nmsed_scores = tf.concat(nmsed_scores, axis=0)
  nmsed_classes = tf.concat(nmsed_classes, axis=0)
  nmsed_scores, indices = tf.nn.top_k(
      nmsed_scores, k=max_num_detections, sorted=True)
  nmsed_boxes = tf.gather(nmsed_boxes, indices)
  nmsed_classes = tf.gather(nmsed_classes, indices)
  # Invalid entries were marked with score -1 above, so counting scores
  # strictly greater than -1 yields the number of valid detections.
  valid_detections = tf.reduce_sum(
      tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
  if attributes:
    for att_name in attributes.keys():
      nmsed_attributes[att_name] = tf.concat(
          nmsed_attributes[att_name], axis=0)
      nmsed_attributes[att_name] = tf.gather(
          nmsed_attributes[att_name], indices)
  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
def _select_top_k_scores(scores_in: tf.Tensor, pre_nms_num_detections: int):
  """Selects top_k scores and indices for each class.

  Args:
    scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
      stacks class logit outputs on all feature levels. The N is the number
      of total anchors on all levels. The num_classes is the number of
      classes predicted by the model.
    pre_nms_num_detections: Number of candidates before NMS.

  Returns:
    scores and indices: A `tf.Tensor` with shape
      `[batch_size, pre_nms_num_detections, num_classes]`.
  """
  batch_size, num_anchors, num_classes = scores_in.get_shape().as_list()
  if batch_size is None:
    # Batch size is dynamic; read it from the runtime shape instead.
    batch_size = tf.shape(scores_in)[0]
  # Fold (batch, class) into one leading dimension so that a single top_k
  # call processes every class of every image at once.
  per_class_scores = tf.reshape(
      tf.transpose(scores_in, perm=[0, 2, 1]), [-1, num_anchors])
  top_scores, top_indices = tf.nn.top_k(
      per_class_scores, k=pre_nms_num_detections, sorted=True)
  unfolded_shape = [batch_size, num_classes, pre_nms_num_detections]
  top_scores = tf.reshape(top_scores, unfolded_shape)
  top_indices = tf.reshape(top_indices, unfolded_shape)
  # Restore the `[batch, pre_nms_num_detections, num_classes]` layout.
  return (tf.transpose(top_scores, [0, 2, 1]),
          tf.transpose(top_indices, [0, 2, 1]))
def _generate_detections_v2(boxes: tf.Tensor,
                            scores: tf.Tensor,
                            pre_nms_top_k: int = 5000,
                            pre_nms_score_threshold: float = 0.05,
                            nms_iou_threshold: float = 0.5,
                            max_num_detections: int = 100):
  """Generates the final detections given the model outputs.

  This implementation unrolls classes dimension while using the tf.while_loop
  to implement the batched NMS, so that it can be parallelized at the batch
  dimension. It should give better performance comparing to v1 implementation.
  It is TPU compatible.

  Args:
    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
      `[batch_size, N, 1, 4]`, which box predictions on all feature levels.
      The N is the number of total anchors on all levels.
    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
      stacks class probability on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw
      score.
    pre_nms_top_k: An `int` number of top candidate detections per class
      before NMS.
    pre_nms_score_threshold: A `float` representing the threshold for
      deciding when to remove boxes based on score.
    nms_iou_threshold: A `float` representing the threshold for deciding
      whether boxes overlap too much with respect to IOU.
    max_num_detections: A `scalar` representing maximum number of boxes
      retained over all classes.

  Returns:
    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections,
      4] representing top detected boxes in [y1, x1, y2, x2].
    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
      representing sorted confidence scores for detected boxes. The values
      are between [0, 1].
    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
      representing classes for detected boxes.
    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
      `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
    nmsed_boxes = []
    nmsed_classes = []
    nmsed_scores = []
    valid_detections = []
    batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
    if batch_size is None:
      # Batch size is dynamic; read it from the runtime shape instead.
      batch_size = tf.shape(boxes)[0]
    _, total_anchors, num_classes = scores.get_shape().as_list()
    # Selects top pre_nms_num scores and indices before NMS.
    scores, indices = _select_top_k_scores(
        scores, min(total_anchors, pre_nms_top_k))
    # NMS is run once per class; class-agnostic boxes (`[..., 1, 4]`) reuse
    # the same box column for every class.
    for i in range(num_classes):
      boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
      scores_i = scores[:, :, i]
      # Obtains pre_nms_top_k before running NMS.
      boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)

      # Filter out scores.
      boxes_i, scores_i = box_ops.filter_boxes_by_scores(
          boxes_i, scores_i, min_score_threshold=pre_nms_score_threshold)

      (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
          tf.cast(scores_i, tf.float32),
          tf.cast(boxes_i, tf.float32),
          max_num_detections,
          iou_threshold=nms_iou_threshold)
      nmsed_classes_i = tf.fill([batch_size, max_num_detections], i)
      nmsed_boxes.append(nmsed_boxes_i)
      nmsed_scores.append(nmsed_scores_i)
      nmsed_classes.append(nmsed_classes_i)
  # Merge the per-class candidates and keep the overall top
  # max_num_detections by score.
  nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
  nmsed_scores = tf.concat(nmsed_scores, axis=1)
  nmsed_classes = tf.concat(nmsed_classes, axis=1)
  nmsed_scores, indices = tf.nn.top_k(
      nmsed_scores, k=max_num_detections, sorted=True)
  nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
  nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
  # Filtered-out entries carry score 0, so positive scores count as valid.
  valid_detections = tf.reduce_sum(
      input_tensor=tf.cast(tf.greater(nmsed_scores, 0.0), tf.int32), axis=1)
  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_batched(boxes: tf.Tensor, scores: tf.Tensor,
                                 pre_nms_score_threshold: float,
                                 nms_iou_threshold: float,
                                 max_num_detections: int):
  """Generates detected boxes with scores and classes for one-stage detector.

  The function takes output of multi-level ConvNets and anchor boxes and
  generates detected boxes. Note that this used batched nms, which is not
  supported on TPU currently.

  Args:
    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
      `[batch_size, N, 1, 4]`, which box predictions on all feature levels.
      The N is the number of total anchors on all levels.
    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
      stacks class probability on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw
      score.
    pre_nms_score_threshold: A `float` representing the threshold for
      deciding when to remove boxes based on score.
    nms_iou_threshold: A `float` representing the threshold for deciding
      whether boxes overlap too much with respect to IOU.
    max_num_detections: A `scalar` representing maximum number of boxes
      retained over all classes.

  Returns:
    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections,
      4] representing top detected boxes in [y1, x1, y2, x2].
    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
      representing sorted confidence scores for detected boxes. The values
      are between [0, 1].
    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
      representing classes for detected boxes.
    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
      `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
    # Delegate the entire batch to TF's fused batched-NMS kernel.
    out_boxes, out_scores, out_classes, out_valid = (
        tf.image.combined_non_max_suppression(
            boxes,
            scores,
            max_output_size_per_class=max_num_detections,
            max_total_size=max_num_detections,
            iou_threshold=nms_iou_threshold,
            score_threshold=pre_nms_score_threshold,
            pad_per_class=False,
            clip_boxes=False))
    # The fused op emits float class ids; downstream consumers expect int32.
    out_classes = tf.cast(out_classes, tf.int32)
    return out_boxes, out_scores, out_classes, out_valid
@tf.keras.utils.register_keras_serializable(package='Vision')
class DetectionGenerator(tf.keras.layers.Layer):
  """Generates the final detected boxes with scores and classes."""

  def __init__(self,
               apply_nms: bool = True,
               pre_nms_top_k: int = 5000,
               pre_nms_score_threshold: float = 0.05,
               nms_iou_threshold: float = 0.5,
               max_num_detections: int = 100,
               nms_version: str = 'v2',
               use_cpu_nms: bool = False,
               soft_nms_sigma: Optional[float] = None,
               **kwargs):
    """Initializes a detection generator.

    Args:
      apply_nms: A `bool` of whether or not apply non maximum suppression.
        If False, the decoded boxes and their scores are returned.
      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
        before applying NMS.
      pre_nms_score_threshold: A `float` of the score threshold to apply before
        applying NMS. Proposals whose scores are below this threshold are
        thrown away.
      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
      max_num_detections: An `int` of the final number of total detections to
        generate.
      nms_version: A string of `batched`, `v1` or `v2` specifies NMS version.
      use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
      soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
        When soft_nms_sigma=0.0, we fall back to standard NMS.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    # The config dict doubles as the layer's serialization payload in
    # `get_config`, so it stores the constructor arguments verbatim.
    self._config_dict = {
        'apply_nms': apply_nms,
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'nms_iou_threshold': nms_iou_threshold,
        'max_num_detections': max_num_detections,
        'nms_version': nms_version,
        'use_cpu_nms': use_cpu_nms,
        'soft_nms_sigma': soft_nms_sigma,
    }
    super(DetectionGenerator, self).__init__(**kwargs)

  def __call__(self,
               raw_boxes: tf.Tensor,
               raw_scores: tf.Tensor,
               anchor_boxes: tf.Tensor,
               image_shape: tf.Tensor,
               regression_weights: Optional[List[float]] = None,
               bbox_per_class: bool = True):
    """Generates final detections.

    Args:
      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
        representing the class-specific box coordinates relative to anchors.
      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
        representing the class logits before applying score activation.
      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
        the corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.
      regression_weights: A list of four float numbers to scale coordinates.
      bbox_per_class: A `bool`. If True, perform per-class box regression.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
        `detection_boxes`: A `float` tf.Tensor of shape
          [batch, max_num_detections, 4] representing top detected boxes in
          [y1, x1, y2, x2].
        `detection_scores`: A `float` `tf.Tensor` of shape
          [batch, max_num_detections] representing sorted confidence scores for
          detected boxes. The values are between [0, 1].
        `detection_classes`: An `int` tf.Tensor of shape
          [batch, max_num_detections] representing classes for detected boxes.
        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
          `num_detections` boxes are valid detections
      If `apply_nms` = False, the return is a dictionary with keys:
        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
        `decoded_box_scores`: A `float` tf.Tensor of shape
          [batch, num_raw_boxes] representing scores of all the decoded boxes.
    """
    box_scores = tf.nn.softmax(raw_scores, axis=-1)

    # Removes the background class (index 0 on the class axis).
    box_scores_shape = tf.shape(box_scores)
    box_scores_shape_list = box_scores.get_shape().as_list()
    batch_size = box_scores_shape[0]
    num_locations = box_scores_shape_list[1]
    num_classes = box_scores_shape_list[-1]

    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])

    if bbox_per_class:
      # Drop the background class's box predictions as well, then flatten the
      # per-class boxes (and matching tiled anchors) into a single axis of
      # `num_locations * (num_classes - 1)` candidates for decoding.
      num_detections = num_locations * (num_classes - 1)
      raw_boxes = tf.reshape(raw_boxes,
                             [batch_size, num_locations, num_classes, 4])
      raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
      anchor_boxes = tf.tile(
          tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
      raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
      anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Box decoding.
    decoded_boxes = box_ops.decode_boxes(
        raw_boxes, anchor_boxes, weights=regression_weights)

    # Box clipping to the scaled image boundary.
    decoded_boxes = box_ops.clip_boxes(
        decoded_boxes, tf.expand_dims(image_shape, axis=1))

    if bbox_per_class:
      decoded_boxes = tf.reshape(
          decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
    else:
      # Class-agnostic boxes still need a (singleton) class axis so the NMS
      # implementations see a consistent rank-4 input.
      decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)

    if not self._config_dict['apply_nms']:
      return {
          'decoded_boxes': decoded_boxes,
          'decoded_box_scores': box_scores,
      }

    # Optionally force the NMS to be run on CPU.
    if self._config_dict['use_cpu_nms']:
      nms_context = tf.device('cpu:0')
    else:
      nms_context = contextlib.nullcontext()

    with nms_context:
      if self._config_dict['nms_version'] == 'batched':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
            _generate_detections_batched(
                decoded_boxes, box_scores,
                self._config_dict['pre_nms_score_threshold'],
                self._config_dict['nms_iou_threshold'],
                self._config_dict['max_num_detections']))
      elif self._config_dict['nms_version'] == 'v1':
        # v1 also returns per-detection indices (unused here, hence `_`).
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
            _generate_detections_v1(
                decoded_boxes,
                box_scores,
                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                pre_nms_score_threshold=self
                ._config_dict['pre_nms_score_threshold'],
                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                max_num_detections=self._config_dict['max_num_detections'],
                soft_nms_sigma=self._config_dict['soft_nms_sigma']))
      elif self._config_dict['nms_version'] == 'v2':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
            _generate_detections_v2(
                decoded_boxes,
                box_scores,
                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                pre_nms_score_threshold=self
                ._config_dict['pre_nms_score_threshold'],
                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                max_num_detections=self._config_dict['max_num_detections']))
      else:
        raise ValueError('NMS version {} not supported.'.format(
            self._config_dict['nms_version']))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return {
        'num_detections': valid_detections,
        'detection_boxes': nmsed_boxes,
        'detection_classes': nmsed_classes,
        'detection_scores': nmsed_scores,
    }

  def get_config(self):
    """Returns the layer config, i.e. the constructor arguments."""
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    """Rebuilds the layer from a config produced by `get_config`."""
    return cls(**config)
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelDetectionGenerator(tf.keras.layers.Layer):
  """Generates detected boxes with scores and classes for one-stage detector."""

  def __init__(self,
               apply_nms: bool = True,
               pre_nms_top_k: int = 5000,
               pre_nms_score_threshold: float = 0.05,
               nms_iou_threshold: float = 0.5,
               max_num_detections: int = 100,
               nms_version: str = 'v1',
               use_cpu_nms: bool = False,
               soft_nms_sigma: Optional[float] = None,
               **kwargs):
    """Initializes a multi-level detection generator.

    Args:
      apply_nms: A `bool` of whether or not apply non maximum suppression. If
        False, the decoded boxes and their scores are returned.
      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
        before applying NMS.
      pre_nms_score_threshold: A `float` of the score threshold to apply before
        applying NMS. Proposals whose scores are below this threshold are thrown
        away.
      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
      max_num_detections: An `int` of the final number of total detections to
        generate.
      nms_version: A string of `batched`, `v1` or `v2` specifies NMS version
      use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
      soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
        When soft_nms_sigma=0.0, we fall back to standard NMS.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    # The config dict doubles as the serialization payload for `get_config`.
    self._config_dict = {
        'apply_nms': apply_nms,
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'nms_iou_threshold': nms_iou_threshold,
        'max_num_detections': max_num_detections,
        'nms_version': nms_version,
        'use_cpu_nms': use_cpu_nms,
        'soft_nms_sigma': soft_nms_sigma,
    }
    super(MultilevelDetectionGenerator, self).__init__(**kwargs)

  def _decode_multilevel_outputs(
      self,
      raw_boxes: Mapping[str, tf.Tensor],
      raw_scores: Mapping[str, tf.Tensor],
      anchor_boxes: tf.Tensor,
      image_shape: tf.Tensor,
      raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
    """Collects dict of multilevel boxes, scores, attributes into lists."""
    boxes = []
    scores = []
    if raw_attributes:
      attributes = {att_name: [] for att_name in raw_attributes.keys()}
    else:
      attributes = {}

    # Level keys are stringified integers (e.g. '3'..'7'); iterate them in
    # numeric order so outputs concatenate deterministically.
    levels = list(raw_boxes.keys())
    min_level = int(min(levels))
    max_level = int(max(levels))
    for i in range(min_level, max_level + 1):
      raw_boxes_i = raw_boxes[str(i)]
      raw_scores_i = raw_scores[str(i)]
      batch_size = tf.shape(raw_boxes_i)[0]
      (_, feature_h_i, feature_w_i,
       num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
      num_locations = feature_h_i * feature_w_i
      # The last feature axis packs 4 coordinates for each anchor at a
      # location; the score tensor packs `num_classes` per anchor.
      num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
      num_classes = raw_scores_i.get_shape().as_list(
      )[-1] // num_anchors_per_locations

      # Applies score transformation and remove the implicit background class.
      scores_i = tf.sigmoid(
          tf.reshape(raw_scores_i, [
              batch_size, num_locations * num_anchors_per_locations, num_classes
          ]))
      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

      # Box decoding.
      # The anchor boxes are shared for all data in a batch.
      # One stage detector only supports class agnostic box regression.
      anchor_boxes_i = tf.reshape(
          anchor_boxes[str(i)],
          [batch_size, num_locations * num_anchors_per_locations, 4])
      raw_boxes_i = tf.reshape(
          raw_boxes_i,
          [batch_size, num_locations * num_anchors_per_locations, 4])
      boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

      # Box clipping.
      boxes_i = box_ops.clip_boxes(
          boxes_i, tf.expand_dims(image_shape, axis=1))

      boxes.append(boxes_i)
      scores.append(scores_i)

      if raw_attributes:
        for att_name, raw_att in raw_attributes.items():
          attribute_size = raw_att[str(
              i)].get_shape().as_list()[-1] // num_anchors_per_locations
          att_i = tf.reshape(raw_att[str(i)], [
              batch_size, num_locations * num_anchors_per_locations,
              attribute_size
          ])
          attributes[att_name].append(att_i)

    boxes = tf.concat(boxes, axis=1)
    # Insert a singleton class axis: box regression is class-agnostic here but
    # the NMS implementations expect a rank-4 `[batch, N, classes, 4]` input.
    boxes = tf.expand_dims(boxes, axis=2)
    scores = tf.concat(scores, axis=1)

    if raw_attributes:
      for att_name in raw_attributes.keys():
        attributes[att_name] = tf.concat(attributes[att_name], axis=1)
        attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)

    return boxes, scores, attributes

  def __call__(self,
               raw_boxes: Mapping[str, tf.Tensor],
               raw_scores: Mapping[str, tf.Tensor],
               anchor_boxes: tf.Tensor,
               image_shape: tf.Tensor,
               raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
    """Generates final detections.

    Args:
      raw_boxes: A `dict` with keys representing FPN levels and values
        representing box tenors of shape `[batch, feature_h, feature_w,
        num_anchors * 4]`.
      raw_scores: A `dict` with keys representing FPN levels and values
        representing logit tensors of shape `[batch, feature_h, feature_w,
        num_anchors]`.
      anchor_boxes: A `tf.Tensor` of shape of [batch_size, K, 4] representing
        the corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.
      raw_attributes: If not None, a `dict` of (attribute_name,
        attribute_prediction) pairs. `attribute_prediction` is a dict that
        contains keys representing FPN levels and values representing tenors of
        shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
        `detection_boxes`: A `float` tf.Tensor of shape
          [batch, max_num_detections, 4] representing top detected boxes in
          [y1, x1, y2, x2].
        `detection_scores`: A `float` tf.Tensor of shape
          [batch, max_num_detections] representing sorted confidence scores for
          detected boxes. The values are between [0, 1].
        `detection_classes`: An `int` tf.Tensor of shape
          [batch, max_num_detections] representing classes for detected boxes.
        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
          `num_detections` boxes are valid detections
        `detection_attributes`: A dict. Values of the dict is a `float`
          tf.Tensor of shape [batch, max_num_detections, attribute_size]
          representing attribute predictions for detected boxes.
      If `apply_nms` = False, the return is a dictionary with keys:
        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
        `decoded_box_scores`: A `float` tf.Tensor of shape
          [batch, num_raw_boxes] representing scores of all the decoded boxes.
        `decoded_box_attributes`: A dict. Values in the dict is a
          `float` tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
          representing attribute predictions of all the decoded boxes.
    """
    boxes, scores, attributes = self._decode_multilevel_outputs(
        raw_boxes, raw_scores, anchor_boxes, image_shape, raw_attributes)

    if not self._config_dict['apply_nms']:
      return {
          'decoded_boxes': boxes,
          'decoded_box_scores': scores,
          'decoded_box_attributes': attributes,
      }

    # Optionally force the NMS to run on CPU.
    if self._config_dict['use_cpu_nms']:
      nms_context = tf.device('cpu:0')
    else:
      nms_context = contextlib.nullcontext()

    with nms_context:
      if raw_attributes and (self._config_dict['nms_version'] != 'v1'):
        # Only NMSv1 propagates the attribute tensors through suppression.
        raise ValueError(
            'Attribute learning is only supported for NMSv1 but NMS {} is used.'
            .format(self._config_dict['nms_version']))
      if self._config_dict['nms_version'] == 'batched':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
            _generate_detections_batched(
                boxes, scores, self._config_dict['pre_nms_score_threshold'],
                self._config_dict['nms_iou_threshold'],
                self._config_dict['max_num_detections']))
        # Set `nmsed_attributes` to an empty dict for batched NMS, which does
        # not support attribute propagation.
        nmsed_attributes = {}
      elif self._config_dict['nms_version'] == 'v1':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
         nmsed_attributes) = (
             _generate_detections_v1(
                 boxes,
                 scores,
                 attributes=attributes if raw_attributes else None,
                 pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                 pre_nms_score_threshold=self
                 ._config_dict['pre_nms_score_threshold'],
                 nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                 max_num_detections=self._config_dict['max_num_detections'],
                 soft_nms_sigma=self._config_dict['soft_nms_sigma']))
      elif self._config_dict['nms_version'] == 'v2':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
            _generate_detections_v2(
                boxes,
                scores,
                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                pre_nms_score_threshold=self
                ._config_dict['pre_nms_score_threshold'],
                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                max_num_detections=self._config_dict['max_num_detections']))
        # Set `nmsed_attributes` to an empty dict for v2, which does not
        # support attribute propagation.
        nmsed_attributes = {}
      else:
        raise ValueError('NMS version {} not supported.'.format(
            self._config_dict['nms_version']))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return {
        'num_detections': valid_detections,
        'detection_boxes': nmsed_boxes,
        'detection_classes': nmsed_classes,
        'detection_scores': nmsed_scores,
        'detection_attributes': nmsed_attributes,
    }

  def get_config(self):
    """Returns the layer config, i.e. the constructor arguments."""
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    """Rebuilds the layer from a config produced by `get_config`."""
    return cls(**config)
official/vision/modeling/layers/detection_generator_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for detection_generator.py."""
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
detection_generator
from
official.vision.ops
import
anchor
class SelectTopKScoresTest(tf.test.TestCase):
  """Unit test for `detection_generator._select_top_k_scores`."""

  def testSelectTopKScores(self):
    # One batch of four locations, two classes per location.
    raw_scores = tf.constant(
        [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]], dtype=tf.float32)

    selected_scores, selected_indices = (
        detection_generator._select_top_k_scores(
            raw_scores, pre_nms_num_detections=2))

    # Per class, the two best scores come from locations (2, 3) and (1, 3).
    self.assertAllEqual(
        selected_scores.numpy(),
        np.array([[[0.5, 0.9], [0.3, 0.5]]], dtype=np.float32))
    self.assertAllEqual(selected_indices.numpy(), [[[2, 1], [3, 3]]])
class DetectionGeneratorTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for `detection_generator.DetectionGenerator`."""

  @parameterized.product(
      nms_version=['batched', 'v1', 'v2'],
      use_cpu_nms=[True, False],
      soft_nms_sigma=[None, 0.1])
  def testDetectionsOutputShape(self, nms_version, use_cpu_nms,
                                soft_nms_sigma):
    """Checks output shapes for every NMS variant / device / sigma combo."""
    max_num_detections = 10
    num_classes = 4
    pre_nms_top_k = 5000
    pre_nms_score_threshold = 0.01
    batch_size = 1
    kwargs = {
        'apply_nms': True,
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'nms_iou_threshold': 0.5,
        'max_num_detections': max_num_detections,
        'nms_version': nms_version,
        'use_cpu_nms': use_cpu_nms,
        'soft_nms_sigma': soft_nms_sigma,
    }
    generator = detection_generator.DetectionGenerator(**kwargs)

    # Random logits centered on zero; 84 anchors total.
    cls_outputs_all = (np.random.rand(84, num_classes) - 0.5) * 3  # random 84x3 outputs.
    box_outputs_all = np.random.rand(84, 4 * num_classes)  # random 84 boxes.
    anchor_boxes_all = np.random.rand(84, 4)  # random 84 boxes.
    class_outputs = tf.reshape(
        tf.convert_to_tensor(cls_outputs_all, dtype=tf.float32),
        [1, 84, num_classes])
    box_outputs = tf.reshape(
        tf.convert_to_tensor(box_outputs_all, dtype=tf.float32),
        [1, 84, 4 * num_classes])
    anchor_boxes = tf.reshape(
        tf.convert_to_tensor(anchor_boxes_all, dtype=tf.float32),
        [1, 84, 4])
    # image_info rows: original size, scaled size, scale, offset; the
    # generator receives only the scaled size (row 1).
    image_info = tf.constant(
        [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]], dtype=tf.float32)
    results = generator(box_outputs, class_outputs, anchor_boxes,
                        image_info[:, 1, :])
    boxes = results['detection_boxes']
    classes = results['detection_classes']
    scores = results['detection_scores']
    valid_detections = results['num_detections']

    self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
    self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,))
    self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,))
    self.assertEqual(valid_detections.numpy().shape, (batch_size,))

  def test_serialize_deserialize(self):
    """Round-trips the layer through get_config/from_config."""
    kwargs = {
        'apply_nms': True,
        'pre_nms_top_k': 1000,
        'pre_nms_score_threshold': 0.1,
        'nms_iou_threshold': 0.5,
        'max_num_detections': 10,
        'nms_version': 'v2',
        'use_cpu_nms': False,
        'soft_nms_sigma': None,
    }
    generator = detection_generator.DetectionGenerator(**kwargs)

    expected_config = dict(kwargs)
    self.assertEqual(generator.get_config(), expected_config)

    new_generator = (
        detection_generator.DetectionGenerator.from_config(
            generator.get_config()))
    self.assertAllEqual(generator.get_config(), new_generator.get_config())
class MultilevelDetectionGeneratorTest(parameterized.TestCase,
                                       tf.test.TestCase):
  """Tests for `detection_generator.MultilevelDetectionGenerator`."""

  @parameterized.parameters(
      # (nms_version, has_att_heads, use_cpu_nms, soft_nms_sigma).
      # Attribute heads are only exercised with 'v1': the other NMS versions
      # reject attributes by design.
      ('batched', False, True, None),
      ('batched', False, False, None),
      ('v2', False, True, None),
      ('v2', False, False, None),
      ('v1', True, True, 0.0),
      ('v1', True, False, 0.1),
      ('v1', True, False, None),
  )
  def testDetectionsOutputShape(self, nms_version, has_att_heads,
                                use_cpu_nms, soft_nms_sigma):
    """Checks output shapes across NMS versions and attribute-head configs."""
    min_level = 4
    max_level = 6
    num_scales = 2
    max_num_detections = 10
    aspect_ratios = [1.0, 2.0]
    anchor_scale = 2.0
    output_size = [64, 64]
    num_classes = 4
    pre_nms_top_k = 5000
    pre_nms_score_threshold = 0.01
    batch_size = 1
    kwargs = {
        'apply_nms': True,
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'nms_iou_threshold': 0.5,
        'max_num_detections': max_num_detections,
        'nms_version': nms_version,
        'use_cpu_nms': use_cpu_nms,
        'soft_nms_sigma': soft_nms_sigma,
    }

    input_anchor = anchor.build_anchor_generator(min_level, max_level,
                                                 num_scales, aspect_ratios,
                                                 anchor_scale)
    anchor_boxes = input_anchor(output_size)
    # 84 locations total: 64 at level 4 (8x8), 16 at level 5 (4x4), 4 at
    # level 6 (2x2); the slices below split them per level.
    cls_outputs_all = (np.random.rand(84, num_classes) - 0.5) * 3  # random 84x3 outputs.
    box_outputs_all = np.random.rand(84, 4)  # random 84 boxes.
    class_outputs = {
        '4':
            tf.reshape(
                tf.convert_to_tensor(cls_outputs_all[0:64], dtype=tf.float32),
                [1, 8, 8, num_classes]),
        '5':
            tf.reshape(
                tf.convert_to_tensor(cls_outputs_all[64:80], dtype=tf.float32),
                [1, 4, 4, num_classes]),
        '6':
            tf.reshape(
                tf.convert_to_tensor(cls_outputs_all[80:84], dtype=tf.float32),
                [1, 2, 2, num_classes]),
    }
    box_outputs = {
        '4':
            tf.reshape(
                tf.convert_to_tensor(box_outputs_all[0:64], dtype=tf.float32),
                [1, 8, 8, 4]),
        '5':
            tf.reshape(
                tf.convert_to_tensor(box_outputs_all[64:80], dtype=tf.float32),
                [1, 4, 4, 4]),
        '6':
            tf.reshape(
                tf.convert_to_tensor(box_outputs_all[80:84], dtype=tf.float32),
                [1, 2, 2, 4]),
    }
    if has_att_heads:
      att_outputs_all = np.random.rand(84, 1)  # random attributes.
      att_outputs = {
          'depth': {
              '4':
                  tf.reshape(
                      tf.convert_to_tensor(
                          att_outputs_all[0:64], dtype=tf.float32),
                      [1, 8, 8, 1]),
              '5':
                  tf.reshape(
                      tf.convert_to_tensor(
                          att_outputs_all[64:80], dtype=tf.float32),
                      [1, 4, 4, 1]),
              '6':
                  tf.reshape(
                      tf.convert_to_tensor(
                          att_outputs_all[80:84], dtype=tf.float32),
                      [1, 2, 2, 1]),
          }
      }
    else:
      att_outputs = None
    # image_info rows: original size, scaled size, scale, offset; the
    # generator receives only the scaled size (row 1).
    image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
                             dtype=tf.float32)
    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
    results = generator(box_outputs, class_outputs, anchor_boxes,
                        image_info[:, 1, :], att_outputs)
    boxes = results['detection_boxes']
    classes = results['detection_classes']
    scores = results['detection_scores']
    valid_detections = results['num_detections']

    self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
    self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,))
    self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,))
    self.assertEqual(valid_detections.numpy().shape, (batch_size,))
    if has_att_heads:
      for att in results['detection_attributes'].values():
        self.assertEqual(att.numpy().shape,
                         (batch_size, max_num_detections, 1))

  def test_serialize_deserialize(self):
    """Round-trips the layer through get_config/from_config."""
    kwargs = {
        'apply_nms': True,
        'pre_nms_top_k': 1000,
        'pre_nms_score_threshold': 0.1,
        'nms_iou_threshold': 0.5,
        'max_num_detections': 10,
        'nms_version': 'v2',
        'use_cpu_nms': False,
        'soft_nms_sigma': None,
    }
    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)

    expected_config = dict(kwargs)
    self.assertEqual(generator.get_config(), expected_config)

    new_generator = (
        detection_generator.MultilevelDetectionGenerator.from_config(
            generator.get_config()))
    self.assertAllEqual(generator.get_config(), new_generator.get_config())
if __name__ == '__main__':
  # Runs all test cases in this module under the TF test runner.
  tf.test.main()
official/vision/modeling/layers/mask_sampler.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of mask sampler."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
spatial_transform_ops
def _sample_and_crop_foreground_masks(candidate_rois: tf.Tensor,
                                      candidate_gt_boxes: tf.Tensor,
                                      candidate_gt_classes: tf.Tensor,
                                      candidate_gt_indices: tf.Tensor,
                                      gt_masks: tf.Tensor,
                                      num_sampled_masks: int = 128,
                                      mask_target_size: int = 28):
  """Samples and creates cropped foreground masks for training.

  Args:
    candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is the
      number of candidate RoIs to be considered for mask sampling. It includes
      both positive and negative RoIs. The `num_mask_samples_per_image` positive
      RoIs will be sampled to create mask training targets.
    candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
      the corresponding groundtruth boxes to the `candidate_rois`.
    candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing the
      corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
      corresponds to the background class, i.e. negative RoIs.
    candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
      corresponding groundtruth instance indices to the `candidate_gt_boxes`,
      i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and
      gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is
      the superset of candidate_gt_boxes.
    gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
      mask_width] containing all the groundtruth masks which sample masks are
      drawn from.
    num_sampled_masks: An `int` that specifies the number of masks to sample.
    mask_target_size: An `int` that specifies the final cropped mask size after
      sampling. The output masks are resized w.r.t the sampled RoIs.

  Returns:
    foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
      RoI that corresponds to the sampled foreground masks, where
      K = num_mask_samples_per_image.
    foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
      classes corresponding to the sampled foreground masks.
    cropped_foreground_masks: A `tf.Tensor` of shape of
      [batch_size, K, mask_target_size, mask_target_size] storing the cropped
      foreground masks used for training.
  """
  # top_k over a 0/1 foreground indicator picks (up to) `num_sampled_masks`
  # foreground RoI positions first; background positions fill any remainder.
  _, fg_instance_indices = tf.nn.top_k(
      tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32),
      k=num_sampled_masks)

  # Pair each sampled position with its batch index so gather_nd can pull
  # per-image rows: indices have shape [batch_size, num_sampled_masks, 2].
  fg_instance_indices_shape = tf.shape(fg_instance_indices)
  batch_indices = (
      tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
      tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))

  gather_nd_instance_indices = tf.stack(
      [batch_indices, fg_instance_indices], axis=-1)
  foreground_rois = tf.gather_nd(candidate_rois, gather_nd_instance_indices)
  foreground_boxes = tf.gather_nd(
      candidate_gt_boxes, gather_nd_instance_indices)
  foreground_classes = tf.gather_nd(
      candidate_gt_classes, gather_nd_instance_indices)
  foreground_gt_indices = tf.gather_nd(
      candidate_gt_indices, gather_nd_instance_indices)
  # -1 marks "no matched groundtruth"; clamp to 0 so the gather below stays
  # in-bounds (such entries correspond to background samples anyway).
  foreground_gt_indices = tf.where(
      tf.equal(foreground_gt_indices, -1),
      tf.zeros_like(foreground_gt_indices),
      foreground_gt_indices)

  # Second gather: map the sampled groundtruth instance indices to their masks.
  foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
  batch_indices = (
      tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
      tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
  gather_nd_gt_indices = tf.stack(
      [batch_indices, foreground_gt_indices], axis=-1)
  foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)

  cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
      foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
      sample_offset=0.5)

  return foreground_rois, foreground_classes, cropped_foreground_masks
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
'Vision'
)
class
MaskSampler
(
tf
.
keras
.
layers
.
Layer
):
"""Samples and creates mask training targets."""
def
__init__
(
self
,
mask_target_size
:
int
,
num_sampled_masks
:
int
,
**
kwargs
):
self
.
_config_dict
=
{
'mask_target_size'
:
mask_target_size
,
'num_sampled_masks'
:
num_sampled_masks
,
}
super
(
MaskSampler
,
self
).
__init__
(
**
kwargs
)
def
call
(
self
,
candidate_rois
:
tf
.
Tensor
,
candidate_gt_boxes
:
tf
.
Tensor
,
candidate_gt_classes
:
tf
.
Tensor
,
candidate_gt_indices
:
tf
.
Tensor
,
gt_masks
:
tf
.
Tensor
):
"""Samples and creates mask targets for training.
Args:
candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is
the number of candidate RoIs to be considered for mask sampling. It
includes both positive and negative RoIs. The
`num_mask_samples_per_image` positive RoIs will be sampled to create
mask training targets.
candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
the corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing
the corresponding groundtruth classes to the `candidate_rois`. 0 in the
tensor corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >=
N, is the superset of candidate_gt_boxes.
gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
mask_width] containing all the groundtruth masks which sample masks are
drawn from. after sampling. The output masks are resized w.r.t the
sampled RoIs.
Returns:
foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
RoI that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
classes corresponding to the sampled foreground masks.
cropoped_foreground_masks: A `tf.Tensor` of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the
cropped foreground masks used for training.
"""
foreground_rois
,
foreground_classes
,
cropped_foreground_masks
=
(
_sample_and_crop_foreground_masks
(
candidate_rois
,
candidate_gt_boxes
,
candidate_gt_classes
,
candidate_gt_indices
,
gt_masks
,
self
.
_config_dict
[
'num_sampled_masks'
],
self
.
_config_dict
[
'mask_target_size'
]))
return
foreground_rois
,
foreground_classes
,
cropped_foreground_masks
def
get_config
(
self
):
return
self
.
_config_dict
  @classmethod
  def from_config(cls, config):
    """Creates a `MaskSampler` from a config produced by `get_config`.

    Args:
      config: A `dict` of constructor keyword arguments.

    Returns:
      A new `MaskSampler` instance.
    """
    return cls(**config)
official/vision/modeling/layers/mask_sampler_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mask_sampler.py."""
# Import libraries
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
mask_sampler
class SampleAndCropForegroundMasksTest(tf.test.TestCase):
  """Checks `_sample_and_crop_foreground_masks` output shapes on TPU and CPU."""

  def test_sample_and_crop_foreground_masks(self):
    # One image with four candidate RoIs; RoIs 0 and 3 have foreground classes
    # (4 and 2 below) and valid groundtruth indices, RoIs 1 and 2 are
    # background (class 0, index -1).
    candidate_rois_np = np.array([[[0, 0, 0.5, 0.5], [0.5, 0.5, 1, 1],
                                   [2, 2, 4, 4], [1, 1, 5, 5]]])
    candidate_rois = tf.constant(candidate_rois_np, dtype=tf.float32)
    candidate_gt_boxes_np = np.array([[[0, 0, 0.6, 0.6], [0, 0, 0, 0],
                                       [1, 1, 3, 3], [1, 1, 3, 3]]])
    candidate_gt_boxes = tf.constant(candidate_gt_boxes_np, dtype=tf.float32)
    candidate_gt_classes_np = np.array([[4, 0, 0, 2]])
    candidate_gt_classes = tf.constant(
        candidate_gt_classes_np, dtype=tf.float32)
    candidate_gt_indices_np = np.array([[10, -1, -1, 20]])
    candidate_gt_indices = tf.constant(candidate_gt_indices_np, dtype=tf.int32)
    # 100 groundtruth instance masks of size 32x32 to sample/crop from.
    gt_masks_np = np.random.rand(1, 100, 32, 32)
    gt_masks = tf.constant(gt_masks_np, dtype=tf.float32)
    num_mask_samples_per_image = 2
    mask_target_size = 28

    # Runs on TPU.
    # NOTE(review): `tf.distribute.TPUStrategy()` with no cluster resolver
    # needs TPU hardware; this test presumably only runs on TPU test targets —
    # confirm, as it will fail on CPU/GPU-only hosts.
    strategy = tf.distribute.TPUStrategy()
    with strategy.scope():
      foreground_rois, foreground_classes, cropped_foreground_masks = (
          mask_sampler._sample_and_crop_foreground_masks(
              candidate_rois, candidate_gt_boxes, candidate_gt_classes,
              candidate_gt_indices, gt_masks, num_mask_samples_per_image,
              mask_target_size))
      foreground_rois_tpu = foreground_rois.numpy()
      foreground_classes_tpu = foreground_classes.numpy()
      cropped_foreground_masks_tpu = cropped_foreground_masks.numpy()

    # Same call outside the strategy scope (default/CPU placement).
    foreground_rois, foreground_classes, cropped_foreground_masks = (
        mask_sampler._sample_and_crop_foreground_masks(
            candidate_rois, candidate_gt_boxes, candidate_gt_classes,
            candidate_gt_indices, gt_masks, num_mask_samples_per_image,
            mask_target_size))
    foreground_rois_cpu = foreground_rois.numpy()
    foreground_classes_cpu = foreground_classes.numpy()
    cropped_foreground_masks_cpu = cropped_foreground_masks.numpy()

    # consistency.
    # TPU and CPU runs must agree on output shapes.
    self.assertAllEqual(foreground_rois_tpu.shape, foreground_rois_cpu.shape)
    self.assertAllEqual(foreground_classes_tpu.shape,
                        foreground_classes_cpu.shape)
    self.assertAllEqual(cropped_foreground_masks_tpu.shape,
                        cropped_foreground_masks_cpu.shape)
    # correctnesss.
    # K = num_mask_samples_per_image = 2, masks resized to 28x28.
    self.assertAllEqual(foreground_rois_tpu.shape, [1, 2, 4])
    self.assertAllEqual(foreground_classes_tpu.shape, [1, 2])
    self.assertAllEqual(cropped_foreground_masks_tpu.shape, [1, 2, 28, 28])
class MaskSamplerTest(tf.test.TestCase):
  """Tests for the `MaskSampler` layer."""

  def test_mask_sampler(self):
    # One image, four candidate RoIs: two foreground (classes 4 and 2 with
    # valid groundtruth indices), two background (class 0, index -1).
    rois = tf.constant(
        np.array([[[0, 0, 0.5, 0.5], [0.5, 0.5, 1, 1], [2, 2, 4, 4],
                   [1, 1, 5, 5]]]),
        dtype=tf.float32)
    gt_boxes = tf.constant(
        np.array([[[0, 0, 0.6, 0.6], [0, 0, 0, 0], [1, 1, 3, 3],
                   [1, 1, 3, 3]]]),
        dtype=tf.float32)
    gt_classes = tf.constant(np.array([[4, 0, 0, 2]]), dtype=tf.float32)
    gt_indices = tf.constant(np.array([[10, -1, -1, 20]]), dtype=tf.int32)
    # 100 groundtruth instance masks of size 32x32 to draw samples from.
    gt_masks = tf.constant(np.random.rand(1, 100, 32, 32), dtype=tf.float32)

    sampler = mask_sampler.MaskSampler(28, 2)
    outputs = sampler(rois, gt_boxes, gt_classes, gt_indices, gt_masks)

    # correctnesss.
    # Expected shapes: K=2 sampled masks, mask target size 28.
    expected_shapes = ([1, 2, 4], [1, 2], [1, 2, 28, 28])
    for output, expected_shape in zip(outputs, expected_shapes):
      self.assertAllEqual(output.numpy().shape, expected_shape)

  def test_serialize_deserialize(self):
    init_kwargs = dict(
        mask_target_size=7,
        num_sampled_masks=10,
    )
    original = mask_sampler.MaskSampler(**init_kwargs)
    # get_config must round-trip the constructor arguments exactly.
    self.assertEqual(original.get_config(), dict(init_kwargs))

    restored = mask_sampler.MaskSampler.from_config(original.get_config())
    self.assertAllEqual(original.get_config(), restored.get_config())
# Entry point: discover and run all tf.test.TestCase classes in this module.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/nn_blocks.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
,
Text
# Import libraries
from
absl
import
logging
import
tensorflow
as
tf
from
official.modeling
import
tf_utils
from
official.vision.modeling.layers
import
nn_layers
def
_pad_strides
(
strides
:
int
,
axis
:
int
)
->
Tuple
[
int
,
int
,
int
,
int
]:
"""Converts int to len 4 strides (`tf.nn.avg_pool` uses length 4)."""
if
axis
==
1
:
return
(
1
,
1
,
strides
,
strides
)
else
:
return
(
1
,
strides
,
strides
,
1
)
def _maybe_downsample(x: tf.Tensor, out_filter: int, strides: int,
                      axis: int) -> tf.Tensor:
  """Downsamples feature map and 0-pads tensor if in_filter != out_filter.

  Args:
    x: A 4-D feature `tf.Tensor` in NCHW (`axis == 1`) or NHWC layout.
    out_filter: An `int` target channel count to zero-pad up to.
    strides: An `int` spatial stride for the average-pool downsample.
    axis: The channel axis of `x` (1 for NCHW, otherwise NHWC assumed).

  Returns:
    The pooled (and possibly channel-padded) tensor.
  """
  pool_strides = _pad_strides(strides, axis=axis)
  x = tf.nn.avg_pool(
      x,
      pool_strides,
      pool_strides,
      'VALID',
      data_format='NCHW' if axis == 1 else 'NHWC')

  in_filter = x.shape[axis]
  if in_filter < out_filter:
    # Pad on channel dimension with 0s: half on top half on bottom.
    half = (out_filter - in_filter) // 2
    channel_pad = [half, half]
    zero_pad = [0, 0]
    if axis == 1:
      paddings = [zero_pad, channel_pad, zero_pad, zero_pad]
    else:
      paddings = [zero_pad, zero_pad, zero_pad, channel_pad]
    x = tf.pad(x, paddings)

  # `+ 0.` kept from the original; presumably forces a new float tensor —
  # confirm before removing.
  return x + 0.
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualBlock(tf.keras.layers.Layer):
  """A residual block: two 3x3 convs with BN plus an (optional projection) shortcut."""

  def __init__(self,
               filters,
               strides,
               use_projection=False,
               se_ratio=None,
               resnetd_shortcut=False,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_explicit_padding: bool = False,
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               bn_trainable=True,
               **kwargs):
    """Initializes a residual block with BN after convolutions.

    Args:
      filters: An `int` number of output filters for both 3x3 convolutions in
        this block (and for the projection shortcut, if used).
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: A `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually `True`
        for the first block of a block group, which may change the number of
        filters and the resolution.
      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
      resnetd_shortcut: A `bool` if True, apply the resnetd style modification
        to the shortcut connection. Not implemented in residual blocks.
      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
        the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
        inputs so that the output dimensions are the same as if 'SAME' padding
        were used.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      bn_trainable: A `bool` that indicates whether batch norm layers should be
        trainable. Default to True.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(ResidualBlock, self).__init__(**kwargs)

    self._filters = filters
    self._strides = strides
    self._use_projection = use_projection
    self._se_ratio = se_ratio
    self._resnetd_shortcut = resnetd_shortcut
    self._use_explicit_padding = use_explicit_padding
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    # The norm *class* is chosen here; instances are created in build().
    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)
    self._bn_trainable = bn_trainable

  def build(self, input_shape):
    # NOTE: sub-layer creation order here fixes variable ordering; keep it
    # stable for checkpoint compatibility.
    if self._use_projection:
      # 1x1 projection shortcut matches the (possibly strided) main path.
      self._shortcut = tf.keras.layers.Conv2D(
          filters=self._filters,
          kernel_size=1,
          strides=self._strides,
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm0 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon,
          trainable=self._bn_trainable)

    conv1_padding = 'same'
    # explicit padding here is added for centernet
    if self._use_explicit_padding:
      self._pad = tf.keras.layers.ZeroPadding2D(padding=(1, 1))
      conv1_padding = 'valid'

    # First 3x3 conv carries the block stride.
    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=self._strides,
        padding=conv1_padding,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)

    # Second 3x3 conv is always stride 1.
    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=1,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)

    # SE is only enabled for ratios in (0, 1].
    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=self._filters,
          out_filters=self._filters,
          se_ratio=self._se_ratio,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
    else:
      self._squeeze_excitation = None

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None

    super(ResidualBlock, self).build(input_shape)

  def get_config(self):
    """Returns the constructor arguments needed to rebuild this layer."""
    config = {
        'filters': self._filters,
        'strides': self._strides,
        'use_projection': self._use_projection,
        'se_ratio': self._se_ratio,
        'resnetd_shortcut': self._resnetd_shortcut,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_explicit_padding': self._use_explicit_padding,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'bn_trainable': self._bn_trainable
    }
    base_config = super(ResidualBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    """Runs the block: conv-BN-act, conv-BN, optional SE/drop, add shortcut."""
    shortcut = inputs
    if self._use_projection:
      shortcut = self._shortcut(shortcut)
      shortcut = self._norm0(shortcut)

    if self._use_explicit_padding:
      inputs = self._pad(inputs)
    x = self._conv1(inputs)
    x = self._norm1(x)
    x = self._activation_fn(x)

    x = self._conv2(x)
    x = self._norm2(x)

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)

    if self._stochastic_depth:
      x = self._stochastic_depth(x, training=training)

    # Final activation is applied after the residual addition.
    return self._activation_fn(x + shortcut)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock(tf.keras.layers.Layer):
  """A standard bottleneck block: 1x1 reduce, 3x3, 1x1 expand (4x filters)."""

  def __init__(self,
               filters,
               strides,
               dilation_rate=1,
               use_projection=False,
               se_ratio=None,
               resnetd_shortcut=False,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               bn_trainable=True,
               **kwargs):
    """Initializes a standard bottleneck block with BN after convolutions.

    Args:
      filters: An `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      dilation_rate: An `int` dilation_rate of convolutions. Default to 1.
      use_projection: A `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually `True`
        for the first block of a block group, which may change the number of
        filters and the resolution.
      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
      resnetd_shortcut: A `bool`. If True, apply the resnetd style modification
        to the shortcut connection.
      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
        the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      bn_trainable: A `bool` that indicates whether batch norm layers should be
        trainable. Default to True.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(BottleneckBlock, self).__init__(**kwargs)

    self._filters = filters
    self._strides = strides
    self._dilation_rate = dilation_rate
    self._use_projection = use_projection
    self._se_ratio = se_ratio
    self._resnetd_shortcut = resnetd_shortcut
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    # The norm *class* is chosen here; instances are created in build().
    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._bn_trainable = bn_trainable

  def build(self, input_shape):
    # NOTE: sub-layer creation order here fixes variable ordering; keep it
    # stable for checkpoint compatibility.
    if self._use_projection:
      if self._resnetd_shortcut:
        # ResNet-D: downsample with avg-pool, then a stride-1 1x1 conv.
        self._shortcut0 = tf.keras.layers.AveragePooling2D(
            pool_size=2, strides=self._strides, padding='same')
        self._shortcut1 = tf.keras.layers.Conv2D(
            filters=self._filters * 4,
            kernel_size=1,
            strides=1,
            use_bias=False,
            kernel_initializer=self._kernel_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer)
      else:
        # Plain strided 1x1 projection shortcut.
        self._shortcut = tf.keras.layers.Conv2D(
            filters=self._filters * 4,
            kernel_size=1,
            strides=self._strides,
            use_bias=False,
            kernel_initializer=self._kernel_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer)

      self._norm0 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon,
          trainable=self._bn_trainable)

    # 1x1 channel-reduction conv.
    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)
    self._activation1 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # 3x3 spatial conv carries the block stride and dilation.
    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=3,
        strides=self._strides,
        dilation_rate=self._dilation_rate,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)
    self._activation2 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # 1x1 channel-expansion conv (4x filters).
    self._conv3 = tf.keras.layers.Conv2D(
        filters=self._filters * 4,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm3 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon,
        trainable=self._bn_trainable)
    self._activation3 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # SE is only enabled for ratios in (0, 1]; operates on expanded channels.
    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=self._filters * 4,
          out_filters=self._filters * 4,
          se_ratio=self._se_ratio,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
    else:
      self._squeeze_excitation = None

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None
    self._add = tf.keras.layers.Add()

    super(BottleneckBlock, self).build(input_shape)

  def get_config(self):
    """Returns the constructor arguments needed to rebuild this layer."""
    config = {
        'filters': self._filters,
        'strides': self._strides,
        'dilation_rate': self._dilation_rate,
        'use_projection': self._use_projection,
        'se_ratio': self._se_ratio,
        'resnetd_shortcut': self._resnetd_shortcut,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'bn_trainable': self._bn_trainable
    }
    base_config = super(BottleneckBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    """Runs the bottleneck: 1x1-3x3-1x1 with BN/act, SE/drop, add shortcut."""
    shortcut = inputs
    if self._use_projection:
      if self._resnetd_shortcut:
        shortcut = self._shortcut0(shortcut)
        shortcut = self._shortcut1(shortcut)
      else:
        shortcut = self._shortcut(shortcut)
      shortcut = self._norm0(shortcut)

    x = self._conv1(inputs)
    x = self._norm1(x)
    x = self._activation1(x)

    x = self._conv2(x)
    x = self._norm2(x)
    x = self._activation2(x)

    x = self._conv3(x)
    x = self._norm3(x)

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)

    if self._stochastic_depth:
      x = self._stochastic_depth(x, training=training)

    x = self._add([x, shortcut])
    # Final activation is applied after the residual addition.
    return self._activation3(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class InvertedBottleneckBlock(tf.keras.layers.Layer):
  """An inverted bottleneck block: 1x1 expand, depthwise 3x3, 1x1 project."""

  def __init__(self,
               in_filters,
               out_filters,
               expand_ratio,
               strides,
               kernel_size=3,
               se_ratio=None,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               se_inner_activation='relu',
               se_gating_activation='sigmoid',
               se_round_down_protect=True,
               expand_se_in_filters=False,
               depthwise_activation=None,
               use_sync_bn=False,
               dilation_rate=1,
               divisible_by=1,
               regularize_depthwise=False,
               use_depthwise=True,
               use_residual=True,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               output_intermediate_endpoints=False,
               **kwargs):
    """Initializes an inverted bottleneck block with BN after convolutions.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      expand_ratio: An `int` of expand_ratio for an inverted bottleneck block.
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      kernel_size: An `int` kernel_size of the depthwise conv layer.
      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
        the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      se_inner_activation: A `str` name of squeeze-excitation inner activation.
      se_gating_activation: A `str` name of squeeze-excitation gating
        activation.
      se_round_down_protect: A `bool` of whether round down more than 10%
        will be allowed in SE layer.
      expand_se_in_filters: A `bool` of whether or not to expand in_filter in
        squeeze and excitation layer.
      depthwise_activation: A `str` name of the activation function for
        depthwise only.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      dilation_rate: An `int` dilation rate for the depthwise convolution; a
        single value is used for all spatial dimensions.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      regularize_depthwise: A `bool` of whether or not apply regularization on
        depthwise.
      use_depthwise: A `bool` of whether to uses fused convolutions instead of
        depthwise.
      use_residual: A `bool` of whether to include residual connection between
        input and output.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      output_intermediate_endpoints: A `bool` of whether or not output the
        intermediate endpoints.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(InvertedBottleneckBlock, self).__init__(**kwargs)

    self._in_filters = in_filters
    self._out_filters = out_filters
    self._expand_ratio = expand_ratio
    self._strides = strides
    self._kernel_size = kernel_size
    self._se_ratio = se_ratio
    self._divisible_by = divisible_by
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._dilation_rate = dilation_rate
    self._use_sync_bn = use_sync_bn
    self._regularize_depthwise = regularize_depthwise
    self._use_depthwise = use_depthwise
    self._use_residual = use_residual
    self._activation = activation
    self._se_inner_activation = se_inner_activation
    self._se_gating_activation = se_gating_activation
    self._depthwise_activation = depthwise_activation
    self._se_round_down_protect = se_round_down_protect
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._expand_se_in_filters = expand_se_in_filters
    self._output_intermediate_endpoints = output_intermediate_endpoints
    # The norm *class* is chosen here; instances are created in build().
    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    # Depthwise activation falls back to the main activation when unset.
    if not depthwise_activation:
      self._depthwise_activation = activation
    if regularize_depthwise:
      self._depthsize_regularizer = kernel_regularizer
    else:
      self._depthsize_regularizer = None

  def build(self, input_shape):
    # NOTE: sub-layer creation order here fixes variable ordering; keep it
    # stable for checkpoint compatibility.
    expand_filters = self._in_filters
    if self._expand_ratio > 1:
      # First 1x1 conv for channel expansion.
      expand_filters = nn_layers.make_divisible(
          self._in_filters * self._expand_ratio, self._divisible_by)

      # Fused (non-depthwise) variant folds kernel size and stride into the
      # expansion conv.
      expand_kernel = 1 if self._use_depthwise else self._kernel_size
      expand_stride = 1 if self._use_depthwise else self._strides

      self._conv0 = tf.keras.layers.Conv2D(
          filters=expand_filters,
          kernel_size=expand_kernel,
          strides=expand_stride,
          padding='same',
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm0 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)
      self._activation_layer = tf_utils.get_activation(
          self._activation, use_keras_layer=True)

    if self._use_depthwise:
      # Depthwise conv.
      self._conv1 = tf.keras.layers.DepthwiseConv2D(
          kernel_size=(self._kernel_size, self._kernel_size),
          strides=self._strides,
          padding='same',
          depth_multiplier=1,
          dilation_rate=self._dilation_rate,
          use_bias=False,
          depthwise_initializer=self._kernel_initializer,
          depthwise_regularizer=self._depthsize_regularizer,
          bias_regularizer=self._bias_regularizer)
      self._norm1 = self._norm(
          axis=self._bn_axis,
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)
      self._depthwise_activation_layer = tf_utils.get_activation(
          self._depthwise_activation, use_keras_layer=True)

    # Squeeze and excitation.
    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
      logging.info('Use Squeeze and excitation.')
      in_filters = self._in_filters
      if self._expand_se_in_filters:
        in_filters = expand_filters
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=in_filters,
          out_filters=expand_filters,
          se_ratio=self._se_ratio,
          divisible_by=self._divisible_by,
          round_down_protect=self._se_round_down_protect,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer,
          activation=self._se_inner_activation,
          gating_activation=self._se_gating_activation)
    else:
      self._squeeze_excitation = None

    # Last 1x1 conv.
    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None
    self._add = tf.keras.layers.Add()

    super(InvertedBottleneckBlock, self).build(input_shape)

  def get_config(self):
    """Returns the constructor arguments needed to rebuild this layer."""
    config = {
        'in_filters': self._in_filters,
        'out_filters': self._out_filters,
        'expand_ratio': self._expand_ratio,
        'strides': self._strides,
        'kernel_size': self._kernel_size,
        'se_ratio': self._se_ratio,
        'divisible_by': self._divisible_by,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'se_inner_activation': self._se_inner_activation,
        'se_gating_activation': self._se_gating_activation,
        'se_round_down_protect': self._se_round_down_protect,
        'expand_se_in_filters': self._expand_se_in_filters,
        'depthwise_activation': self._depthwise_activation,
        'dilation_rate': self._dilation_rate,
        'use_sync_bn': self._use_sync_bn,
        'regularize_depthwise': self._regularize_depthwise,
        'use_depthwise': self._use_depthwise,
        'use_residual': self._use_residual,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    base_config = super(InvertedBottleneckBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    """Runs expand -> depthwise -> SE -> project, with optional residual."""
    endpoints = {}
    shortcut = inputs
    if self._expand_ratio > 1:
      x = self._conv0(inputs)
      x = self._norm0(x)
      x = self._activation_layer(x)
    else:
      # No expansion: the depthwise (or projection) conv consumes the input
      # directly.
      x = inputs

    if self._use_depthwise:
      x = self._conv1(x)
      x = self._norm1(x)
      x = self._depthwise_activation_layer(x)
      if self._output_intermediate_endpoints:
        endpoints['depthwise'] = x

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)

    x = self._conv2(x)
    x = self._norm2(x)

    # Residual only when shapes match: same channel count and stride 1.
    if (self._use_residual and self._in_filters == self._out_filters and
        self._strides == 1):
      if self._stochastic_depth:
        x = self._stochastic_depth(x, training=training)
      x = self._add([x, shortcut])

    if self._output_intermediate_endpoints:
      return x, endpoints
    return x
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualInner(tf.keras.layers.Layer):
  """A single inner block of a reversible residual.

  This corresponds to the `F`/`G` functions in the RevNet paper:
  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
  The Reversible Residual Network: Backpropagation Without Storing Activations.
  (https://arxiv.org/pdf/1707.04585.pdf)
  """

  def __init__(
      self,
      filters: int,
      strides: int,
      kernel_initializer: Union[str, Callable[
          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      batch_norm_first: bool = True,
      **kwargs):
    """Initializes a ResidualInner.

    Args:
      filters: An `int` of output filter size.
      strides: An `int` of stride size for convolution for the residual block.
      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
        instance for convolutional layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
      activation: A `str` or `callable` instance of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      batch_norm_first: A `bool` of whether to apply activation and batch norm
        before conv.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    # `strides` and `filters` are intentionally public: ReversibleLayer reads
    # `self._f.strides` on its inner blocks.
    self.strides = strides
    self.filters = filters
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._kernel_regularizer = kernel_regularizer
    self._activation = tf.keras.activations.get(activation)
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._batch_norm_first = batch_norm_first
    # Select the batch-norm implementation and the channel axis up front.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    self._bn_axis = (
        -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1)
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape: tf.TensorShape):
    """Creates the two conv/BN stages of the inner residual function."""
    norm_kwargs = dict(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)
    conv_kwargs = dict(
        use_bias=False,
        padding='same',
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer)

    if self._batch_norm_first:
      self._batch_norm_0 = self._norm(**norm_kwargs)
    # First 3x3 conv carries the block stride; the second is stride 1.
    self._conv2d_1 = tf.keras.layers.Conv2D(
        filters=self.filters,
        kernel_size=3,
        strides=self.strides,
        **conv_kwargs)
    self._batch_norm_1 = self._norm(**norm_kwargs)
    self._conv2d_2 = tf.keras.layers.Conv2D(
        filters=self.filters,
        kernel_size=3,
        strides=1,
        **conv_kwargs)

    super().build(input_shape)

  def get_config(self) -> Dict[str, Any]:
    """Returns the constructor arguments of this layer."""
    config = {
        'filters': self.filters,
        'strides': self.strides,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'batch_norm_first': self._batch_norm_first,
    }
    return {**super().get_config(), **config}

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Runs the forward pass: (BN-act)? -> conv -> BN-act -> conv."""
    x = inputs
    if self._batch_norm_first:
      x = self._activation_fn(self._batch_norm_0(x, training=training))
    x = self._conv2d_1(x)
    x = self._activation_fn(self._batch_norm_1(x, training=training))
    return self._conv2d_2(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckResidualInner(tf.keras.layers.Layer):
  """A single inner block of a reversible bottleneck residual.

  This corresponds to the `F`/`G` functions in the RevNet paper:
  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
  The Reversible Residual Network: Backpropagation Without Storing Activations.
  (https://arxiv.org/pdf/1707.04585.pdf)
  """

  def __init__(
      self,
      filters: int,
      strides: int,
      kernel_initializer: Union[str, Callable[
          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      batch_norm_first: bool = True,
      **kwargs):
    """Initializes a BottleneckResidualInner.

    Args:
      filters: An `int` number of filters for the first two convolutions. The
        third and final convolution — and thus the number of output channels
        of the bottleneck block — uses `4 * filters`.
      strides: An `int` of stride size for convolution for the residual block.
      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
        instance for convolutional layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
      activation: A `str` or `callable` instance of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      batch_norm_first: A `bool` of whether to apply activation and batch norm
        before conv.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    # `strides` and `filters` are intentionally public: ReversibleLayer reads
    # `self._f.strides` on its inner blocks.
    self.strides = strides
    self.filters = filters
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._kernel_regularizer = kernel_regularizer
    self._activation = tf.keras.activations.get(activation)
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._batch_norm_first = batch_norm_first
    # Select the batch-norm implementation and the channel axis up front.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    self._bn_axis = (
        -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1)
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape: tf.TensorShape):
    """Creates the 1x1 -> 3x3 -> 1x1 bottleneck conv/BN stack."""
    norm_kwargs = dict(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)
    conv_kwargs = dict(
        use_bias=False,
        padding='same',
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer)

    if self._batch_norm_first:
      self._batch_norm_0 = self._norm(**norm_kwargs)
    # 1x1 reduction conv carries the block stride.
    self._conv2d_1 = tf.keras.layers.Conv2D(
        filters=self.filters,
        kernel_size=1,
        strides=self.strides,
        **conv_kwargs)
    self._batch_norm_1 = self._norm(**norm_kwargs)
    # 3x3 spatial conv at stride 1.
    self._conv2d_2 = tf.keras.layers.Conv2D(
        filters=self.filters,
        kernel_size=3,
        strides=1,
        **conv_kwargs)
    self._batch_norm_2 = self._norm(**norm_kwargs)
    # 1x1 expansion conv to 4x filters.
    self._conv2d_3 = tf.keras.layers.Conv2D(
        filters=self.filters * 4,
        kernel_size=1,
        strides=1,
        **conv_kwargs)

    super().build(input_shape)

  def get_config(self) -> Dict[str, Any]:
    """Returns the constructor arguments of this layer."""
    config = {
        'filters': self.filters,
        'strides': self.strides,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'batch_norm_first': self._batch_norm_first,
    }
    return {**super().get_config(), **config}

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Runs the forward pass: (BN-act)? -> conv1 -> BN-act -> conv2 -> BN-act -> conv3."""
    x = inputs
    if self._batch_norm_first:
      x = self._activation_fn(self._batch_norm_0(x, training=training))
    x = self._conv2d_1(x)
    x = self._activation_fn(self._batch_norm_1(x, training=training))
    x = self._conv2d_2(x)
    x = self._activation_fn(self._batch_norm_2(x, training=training))
    return self._conv2d_3(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class ReversibleLayer(tf.keras.layers.Layer):
  """Creates a reversible layer.

  Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
  layers that are stateless, which in this case are `ResidualInner` layers.
  """

  def __init__(self,
               f: tf.keras.layers.Layer,
               g: tf.keras.layers.Layer,
               manual_grads: bool = True,
               **kwargs):
    """Initializes a ReversibleLayer.

    Args:
      f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in
        paper. Each reversible layer consists of two inner functions. For
        example, in RevNet the reversible residual consists of two f/g inner
        (bottleneck) residual functions. Where the input to the reversible layer
        is x, the input gets partitioned in the channel dimension and the
        forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2), y2 = x2 +
        g(z1), y1 = stop_gradient(z1).
      g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in
        paper. Detailed explanation same as above as `f` arg.
      manual_grads: A `bool` [Testing Only] of whether to manually take
        gradients as in Algorithm 1 or defer to autograd.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(ReversibleLayer, self).__init__(**kwargs)

    self._f = f
    self._g = g
    self._manual_grads = manual_grads

    # Channel axis used for splitting/concatenating the two halves.
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._axis = -1
    else:
      self._axis = 1

  def get_config(self) -> Dict[str, Any]:
    """Returns the constructor arguments (f/g layers and manual_grads flag)."""
    config = {
        'f': self._f,
        'g': self._g,
        'manual_grads': self._manual_grads,
    }
    base_config = super(ReversibleLayer, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def _ckpt_non_trainable_vars(self):
    """Snapshots f/g non-trainable variables (e.g. BN moving stats).

    The snapshots are restored in `_load_ckpt_non_trainable_vars` so that the
    recomputation of f/g inside the custom backward pass does not double-update
    them.
    """
    self._f_non_trainable_vars = [
        v.read_value() for v in self._f.non_trainable_variables
    ]
    self._g_non_trainable_vars = [
        v.read_value() for v in self._g.non_trainable_variables
    ]

  def _load_ckpt_non_trainable_vars(self):
    """Restores f/g non-trainable variables from the last snapshot."""
    for v, v_chkpt in zip(self._f.non_trainable_variables,
                          self._f_non_trainable_vars):
      v.assign(v_chkpt)
    for v, v_chkpt in zip(self._g.non_trainable_variables,
                          self._g_non_trainable_vars):
      v.assign(v_chkpt)

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Runs the reversible forward pass with a custom (memory-saving) gradient.

    Args:
      inputs: The input tensor; split into two halves along the channel axis.
      training: A `bool` or None; forwarded to the f/g inner blocks.

    Returns:
      The concatenated output tensor [y1; y2] of Algorithm 1.
    """

    @tf.custom_gradient
    def reversible(
        x: tf.Tensor
    ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor],
                                                List[tf.Tensor]]]]:
      """Implements Algorithm 1 in the RevNet paper.

         Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
         The Reversible Residual Network: Backpropagation Without Storing
         Activations.
         (https://arxiv.org/pdf/1707.04585.pdf)

      Args:
        x: An input `tf.Tensor.

      Returns:
        y: The output [y1; y2] in Algorithm 1.
        grad_fn: A callable function that computes the gradients.
      """
      # The forward tape is only consulted on the fallback (autograd) path in
      # grad_fn below; the manual path recomputes activations instead.
      with tf.GradientTape() as fwdtape:
        fwdtape.watch(x)
        x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis)
        f_x2 = self._f(x2, training=training)
        # When f/g downsample, match the shortcut's shape to the block output.
        x1_down = _maybe_downsample(x1, f_x2.shape[self._axis], self._f.strides,
                                    self._axis)
        z1 = f_x2 + x1_down
        g_z1 = self._g(z1, training=training)
        x2_down = _maybe_downsample(x2, g_z1.shape[self._axis], self._f.strides,
                                    self._axis)
        y2 = x2_down + g_z1

        # Equation 8: https://arxiv.org/pdf/1707.04585.pdf
        # Decouple y1 and z1 so that their derivatives are different.
        y1 = tf.identity(z1)
        y = tf.concat([y1, y2], axis=self._axis)

        # The layer is only invertible when shapes are preserved; otherwise
        # grad_fn must fall back to the recorded tape.
        irreversible = ((self._f.strides != 1 or self._g.strides != 1) or
                        (y.shape[self._axis] != inputs.shape[self._axis]))

        # Checkpointing moving mean/variance for batch normalization layers
        # as they shouldn't be updated during the custom gradient pass of f/g.
        self._ckpt_non_trainable_vars()

      def grad_fn(
          dy: tf.Tensor,
          variables: Optional[List[tf.Variable]] = None,
      ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
        """Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
        if irreversible or not self._manual_grads:
          # Fallback: ordinary autograd through the recorded forward tape.
          grads_combined = fwdtape.gradient(
              y, [x] + variables, output_gradients=dy)
          dx = grads_combined[0]
          grad_vars = grads_combined[1:]
        else:
          # Manual path: reconstruct intermediate activations from the outputs
          # instead of storing them (the memory saving of RevNet).
          y1_nograd = tf.stop_gradient(y1)
          y2_nograd = tf.stop_gradient(y2)
          dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis)

          # Index mapping from self.f/g.trainable_variables to grad_fn
          # input `variables` kwarg so that we can reorder dwf + dwg
          # variable gradient list to match `variables` order.
          f_var_refs = [v.ref() for v in self._f.trainable_variables]
          g_var_refs = [v.ref() for v in self._g.trainable_variables]
          fg_var_refs = f_var_refs + g_var_refs
          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]

          # Algorithm 1 in paper (line # documented in-line)
          z1 = y1_nograd  # line 2
          with tf.GradientTape() as gtape:
            gtape.watch(z1)
            g_z1 = self._g(z1, training=training)
          x2 = y2_nograd - g_z1  # line 3

          with tf.GradientTape() as ftape:
            ftape.watch(x2)
            f_x2 = self._f(x2, training=training)
          x1 = z1 - f_x2  # pylint: disable=unused-variable # line 4

          # Compute gradients
          g_grads_combined = gtape.gradient(
              g_z1, [z1] + self._g.trainable_variables, output_gradients=dy2)
          dz1 = dy1 + g_grads_combined[0]  # line 5
          dwg = g_grads_combined[1:]  # line 9

          f_grads_combined = ftape.gradient(
              f_x2, [x2] + self._f.trainable_variables, output_gradients=dz1)
          dx2 = dy2 + f_grads_combined[0]  # line 6
          dwf = f_grads_combined[1:]  # line 8
          dx1 = dz1  # line 7

          # Pack the input and variable gradients.
          dx = tf.concat([dx1, dx2], axis=self._axis)
          grad_vars = dwf + dwg
          # Reorder gradients (trainable_variables to variables kwarg order)
          grad_vars = [grad_vars[i] for i in self_to_var_index]

        # Restore batch normalization moving mean/variance for correctness.
        self._load_ckpt_non_trainable_vars()

        return dx, grad_vars  # grad_fn end

      return y, grad_fn  # reversible end

    activations = reversible(inputs)
    return activations
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
  """Creates a depthwise separable convolution block with batch normalization."""

  def __init__(
      self,
      filters: int,
      kernel_size: int = 3,
      strides: int = 1,
      regularize_depthwise=False,
      activation: Text = 'relu6',
      kernel_initializer: Text = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      dilation_rate: int = 1,
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      **kwargs):
    """Initializes a depthwise separable convolution block with batch norm.

    Args:
      filters: An `int` number of filters for the pointwise (1x1) convolution
        that follows the depthwise convolution.
      kernel_size: An `int` that specifies the height and width of the 2D
        depthwise convolution window.
      strides: An `int` of block stride. If greater than 1, this block will
        ultimately downsample the input.
      regularize_depthwise: A `bool`. If True, apply regularization on
        depthwise.
      activation: A `str` name of the activation function.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation
        rate to use for dilated convolution. Can be a single integer to specify
        the same value for all spatial dimensions.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
    self._filters = filters
    self._kernel_size = kernel_size
    self._strides = strides
    self._activation = activation
    self._regularize_depthwise = regularize_depthwise
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._dilation_rate = dilation_rate
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon

    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)
    # Regularize the depthwise kernel only when explicitly requested.
    if regularize_depthwise:
      self._depthsize_regularizer = kernel_regularizer
    else:
      self._depthsize_regularizer = None

  def get_config(self):
    """Returns the constructor arguments of this block.

    Fix: `kernel_size` and `dilation_rate` were previously omitted from the
    config, so serializing and re-creating a layer with non-default values
    silently reverted them to the defaults.
    """
    config = {
        'filters': self._filters,
        'kernel_size': self._kernel_size,
        'strides': self._strides,
        'regularize_depthwise': self._regularize_depthwise,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'dilation_rate': self._dilation_rate,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    base_config = super(DepthwiseSeparableConvBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    """Creates the depthwise conv, pointwise conv, and their BN layers."""
    self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
        kernel_size=self._kernel_size,
        strides=self._strides,
        padding='same',
        depth_multiplier=1,
        dilation_rate=self._dilation_rate,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._depthsize_regularizer,
        use_bias=False)
    self._norm0 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    self._conv1 = tf.keras.layers.Conv2D(
        filters=self._filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    super(DepthwiseSeparableConvBlock, self).build(input_shape)

  def call(self, inputs, training=None):
    """Runs the forward pass: depthwise conv-BN-act, then pointwise conv-BN-act."""
    x = self._dwconv0(inputs)
    x = self._norm0(x)
    x = self._activation_fn(x)

    x = self._conv1(x)
    x = self._norm1(x)
    return self._activation_fn(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class TuckerConvBlock(tf.keras.layers.Layer):
  """A Tucker convolution block (generalized bottleneck).

  The block compresses the input with a 1x1 convolution, applies a KxK spatial
  convolution on the compressed features, and projects back to `out_filters`
  with a final 1x1 convolution. An identity shortcut is added when the
  input/output shapes are guaranteed to match.
  """

  def __init__(self,
               in_filters,
               out_filters,
               input_compression_ratio,
               output_compression_ratio,
               strides,
               kernel_size=3,
               stochastic_depth_drop_rate=None,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               divisible_by=1,
               use_residual=True,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
    """Initializes a Tucker convolution block with BN after convolutions.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      input_compression_ratio: A `float` of compression ratio for input
        filters.
      output_compression_ratio: A `float` of compression ratio for output
        filters.
      strides: An `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      kernel_size: An `int` kernel_size of the spatial conv layer.
      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
        the stochastic depth layer.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      use_residual: A `bool` of whether to include residual connection between
        input and output.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)

    self._in_filters = in_filters
    self._out_filters = out_filters
    self._input_compression_ratio = input_compression_ratio
    self._output_compression_ratio = output_compression_ratio
    self._strides = strides
    self._kernel_size = kernel_size
    self._divisible_by = divisible_by
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._use_sync_bn = use_sync_bn
    self._use_residual = use_residual
    self._activation = activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    # Select the batch-norm implementation and the channel axis up front.
    self._norm = (
        tf.keras.layers.experimental.SyncBatchNormalization
        if use_sync_bn else tf.keras.layers.BatchNormalization)
    self._bn_axis = (
        -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1)

  def build(self, input_shape):
    """Creates the compression, spatial, and projection conv/BN stages."""
    norm_kwargs = dict(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)
    conv_kwargs = dict(
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    # 1x1 input-compression stage.
    input_compressed_filters = nn_layers.make_divisible(
        value=self._in_filters * self._input_compression_ratio,
        divisor=self._divisible_by,
        round_down_protect=False)
    self._conv0 = tf.keras.layers.Conv2D(
        filters=input_compressed_filters,
        kernel_size=1,
        strides=1,
        **conv_kwargs)
    self._norm0 = self._norm(**norm_kwargs)
    self._activation_layer0 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # KxK spatial stage operating on compressed features; carries the stride.
    output_compressed_filters = nn_layers.make_divisible(
        value=self._out_filters * self._output_compression_ratio,
        divisor=self._divisible_by,
        round_down_protect=False)
    self._conv1 = tf.keras.layers.Conv2D(
        filters=output_compressed_filters,
        kernel_size=self._kernel_size,
        strides=self._strides,
        **conv_kwargs)
    self._norm1 = self._norm(**norm_kwargs)
    self._activation_layer1 = tf_utils.get_activation(
        self._activation, use_keras_layer=True)

    # Last 1x1 conv; no activation afterwards (linear projection).
    self._conv2 = tf.keras.layers.Conv2D(
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        **conv_kwargs)
    self._norm2 = self._norm(**norm_kwargs)

    self._stochastic_depth = (
        nn_layers.StochasticDepth(self._stochastic_depth_drop_rate)
        if self._stochastic_depth_drop_rate else None)
    self._add = tf.keras.layers.Add()

    super().build(input_shape)

  def get_config(self):
    """Returns the constructor arguments of this block."""
    config = {
        'in_filters': self._in_filters,
        'out_filters': self._out_filters,
        'input_compression_ratio': self._input_compression_ratio,
        'output_compression_ratio': self._output_compression_ratio,
        'strides': self._strides,
        'kernel_size': self._kernel_size,
        'divisible_by': self._divisible_by,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'use_residual': self._use_residual,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    return {**super().get_config(), **config}

  def call(self, inputs, training=None):
    """Runs the forward pass of the Tucker block."""
    residual = inputs

    net = self._activation_layer0(self._norm0(self._conv0(inputs)))
    net = self._activation_layer1(self._norm1(self._conv1(net)))
    net = self._norm2(self._conv2(net))

    # Identity shortcut only when output shape is guaranteed to match input.
    if (self._use_residual and self._in_filters == self._out_filters
        and self._strides == 1):
      if self._stochastic_depth:
        net = self._stochastic_depth(net, training=training)
      net = self._add([net, residual])

    return net
official/vision/modeling/layers/nn_blocks_3d.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for 3D networks."""
# Import libraries
import
tensorflow
as
tf
from
official.modeling
import
tf_utils
from
official.vision.modeling.layers
import
nn_layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SelfGating(tf.keras.layers.Layer):
  """Feature gating as used in S3D-G.

  This implements the S3D-G network from:
  Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy.
  Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in
  Video Classification.
  (https://arxiv.org/pdf/1712.04851.pdf)
  """

  def __init__(self, filters, **kwargs):
    """Initializes a self-gating layer.

    Args:
      filters: An `int` number of filters for the convolutional layer.
      **kwargs: Additional keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    self._filters = filters

  def build(self, input_shape):
    """Creates the global pooling and the 1x1x1 gating convolution."""
    self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D()

    # No BN and activation after conv.
    self._transformer_w = tf.keras.layers.Conv3D(
        filters=self._filters,
        kernel_size=[1, 1, 1],
        use_bias=True,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            mean=0.0, stddev=0.01))

    super().build(input_shape)

  def call(self, inputs):
    """Gates `inputs` by sigmoid weights computed from its global average."""
    weights = self._spatial_temporal_average(inputs)
    # Restore the three collapsed spatio-temporal axes so the 1x1x1 conv and
    # the final broadcast multiply see a 5-D tensor.
    for axis in (1, 2, 3):
      weights = tf.expand_dims(weights, axis)
    weights = tf.nn.sigmoid(self._transformer_w(weights))
    return tf.math.multiply(weights, inputs)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock3D(tf.keras.layers.Layer):
  """Creates a 3D bottleneck block."""

  def __init__(self,
               filters,
               temporal_kernel_size,
               temporal_strides,
               spatial_strides,
               stochastic_depth_drop_rate=0.0,
               se_ratio=None,
               use_self_gating=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
    """Initializes a 3D bottleneck block with BN after convolutions.

    Args:
      filters: An `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      temporal_kernel_size: An `int` of kernel size for the temporal
        convolutional layer.
      temporal_strides: An `int` of temporal stride for the temporal
        convolutional layer.
      spatial_strides: An `int` of spatial stride for the spatial convolutional
        layer.
      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
        the stochastic depth layer.
      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
      use_self_gating: A `bool` of whether to apply self-gating module or not.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(BottleneckBlock3D, self).__init__(**kwargs)

    self._filters = filters
    self._temporal_kernel_size = temporal_kernel_size
    self._spatial_strides = spatial_strides
    self._temporal_strides = temporal_strides
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
    self._use_self_gating = use_self_gating
    self._se_ratio = se_ratio
    self._use_sync_bn = use_sync_bn
    self._activation = activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    # Store the norm layer class (not an instance); instances are created in
    # `build`, one per normalization site.
    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._norm = tf.keras.layers.BatchNormalization
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)

  def build(self, input_shape):
    # Shortcut branch: strided max-pool when channel counts already match,
    # otherwise a strided 1x1x1 projection conv (selected at call time).
    self._shortcut_maxpool = tf.keras.layers.MaxPool3D(
        pool_size=[1, 1, 1],
        strides=[
            self._temporal_strides, self._spatial_strides,
            self._spatial_strides
        ])

    self._shortcut_conv = tf.keras.layers.Conv3D(
        filters=4 * self._filters,
        kernel_size=1,
        strides=[
            self._temporal_strides, self._spatial_strides,
            self._spatial_strides
        ],
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm0 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    # Main branch, stage 1: temporal-only conv (k x 1 x 1), temporal stride.
    self._temporal_conv = tf.keras.layers.Conv3D(
        filters=self._filters,
        kernel_size=[self._temporal_kernel_size, 1, 1],
        strides=[self._temporal_strides, 1, 1],
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm1 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    # Main branch, stage 2: spatial-only conv (1 x 3 x 3), spatial stride.
    self._spatial_conv = tf.keras.layers.Conv3D(
        filters=self._filters,
        kernel_size=[1, 3, 3],
        strides=[1, self._spatial_strides, self._spatial_strides],
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm2 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    # Main branch, stage 3: 1x1x1 expansion back to 4 * filters channels.
    self._expand_conv = tf.keras.layers.Conv3D(
        filters=4 * self._filters,
        kernel_size=[1, 1, 1],
        strides=[1, 1, 1],
        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    self._norm3 = self._norm(
        axis=self._bn_axis,
        momentum=self._norm_momentum,
        epsilon=self._norm_epsilon)

    # Optional Squeeze-and-Excitation on the expanded (4 * filters) features;
    # only enabled for a valid ratio in (0, 1].
    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
          in_filters=self._filters * 4,
          out_filters=self._filters * 4,
          se_ratio=self._se_ratio,
          use_3d_input=True,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer)
    else:
      self._squeeze_excitation = None

    if self._stochastic_depth_drop_rate:
      self._stochastic_depth = nn_layers.StochasticDepth(
          self._stochastic_depth_drop_rate)
    else:
      self._stochastic_depth = None

    if self._use_self_gating:
      self._self_gating = SelfGating(filters=4 * self._filters)
    else:
      self._self_gating = None

    super(BottleneckBlock3D, self).build(input_shape)

  def get_config(self):
    """Returns the constructor kwargs needed to re-create this layer."""
    config = {
        'filters': self._filters,
        'temporal_kernel_size': self._temporal_kernel_size,
        'temporal_strides': self._temporal_strides,
        'spatial_strides': self._spatial_strides,
        'use_self_gating': self._use_self_gating,
        'se_ratio': self._se_ratio,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
    base_config = super(BottleneckBlock3D, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    # Choose the shortcut: identity when shape is preserved, max-pool when
    # only strides change, projection conv (+ BN) when channels change.
    in_filters = inputs.shape.as_list()[-1]
    if in_filters == 4 * self._filters:
      if self._temporal_strides == 1 and self._spatial_strides == 1:
        shortcut = inputs
      else:
        shortcut = self._shortcut_maxpool(inputs)
    else:
      shortcut = self._shortcut_conv(inputs)
      shortcut = self._norm0(shortcut)

    x = self._temporal_conv(inputs)
    x = self._norm1(x)
    x = self._activation_fn(x)

    x = self._spatial_conv(x)
    x = self._norm2(x)
    x = self._activation_fn(x)

    x = self._expand_conv(x)
    x = self._norm3(x)

    # Apply self-gating, SE, stochastic depth.
    if self._self_gating:
      x = self._self_gating(x)
    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)
    if self._stochastic_depth:
      x = self._stochastic_depth(x, training=training)

    # Residual add, then the final activation.
    x = self._activation_fn(x + shortcut)
    return x
official/vision/modeling/layers/nn_blocks_3d_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for resnet."""
# Import libraries
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
nn_blocks_3d
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
  """Shape tests for the 3D bottleneck block."""

  @parameterized.parameters(
      (nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True, 0.2, 0.1),
      (nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False, 0.0, 0.0),
  )
  def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size,
                                     temporal_strides, spatial_strides,
                                     use_self_gating, se_ratio,
                                     stochastic_depth):
    """Checks the output shape of a BottleneckBlock3D for given strides."""
    temporal_size = 16
    spatial_size = 128
    filters = 256
    # Input already has 4 * filters channels, matching the block's output.
    inputs = tf.keras.Input(
        shape=(temporal_size, spatial_size, spatial_size, filters * 4),
        batch_size=1)
    block = block_fn(
        filters=filters,
        temporal_kernel_size=temporal_kernel_size,
        temporal_strides=temporal_strides,
        spatial_strides=spatial_strides,
        use_self_gating=use_self_gating,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth)

    features = block(inputs)

    # Temporal and spatial dims shrink by their strides; channels stay 4x.
    self.assertAllEqual([
        1, temporal_size // temporal_strides, spatial_size // spatial_strides,
        spatial_size // spatial_strides, filters * 4
    ], features.shape.as_list())
# Run the test suite when invoked directly.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/nn_blocks_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_blocks."""
from
typing
import
Any
,
Iterable
,
Tuple
# Import libraries
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.vision.modeling.layers
import
nn_blocks
def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
  """Returns the combinations of end-to-end tests to run."""
  # Exercise each test under the default, Cloud TPU, and single-GPU
  # distribution strategies.
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies)
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
  """Output-shape tests for the 2D residual/bottleneck/mobile blocks."""

  @parameterized.parameters(
      (nn_blocks.ResidualBlock, 1, False, 0.0, None),
      (nn_blocks.ResidualBlock, 2, True, 0.2, 0.25),
  )
  def test_residual_block_creation(self, block_fn, strides, use_projection,
                                   stochastic_depth_drop_rate, se_ratio):
    """Checks ResidualBlock output shape for given strides/options."""
    input_size = 128
    filter_size = 256
    inputs = tf.keras.Input(
        shape=(input_size, input_size, filter_size), batch_size=1)
    block = block_fn(
        filter_size,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate,
    )

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, filter_size],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.BottleneckBlock, 1, False, 0.0, None),
      (nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25),
  )
  def test_bottleneck_block_creation(self, block_fn, strides, use_projection,
                                     stochastic_depth_drop_rate, se_ratio):
    """Checks BottleneckBlock output shape (4x channel expansion)."""
    input_size = 128
    filter_size = 256
    inputs = tf.keras.Input(
        shape=(input_size, input_size, filter_size * 4), batch_size=1)
    block = block_fn(
        filter_size,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, filter_size * 4],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
  )
  def test_invertedbottleneck_block_creation(self, block_fn, expand_ratio,
                                             strides, se_ratio,
                                             stochastic_depth_drop_rate):
    """Checks InvertedBottleneckBlock output shape across option combos."""
    input_size = 128
    in_filters = 24
    out_filters = 40
    inputs = tf.keras.Input(
        shape=(input_size, input_size, in_filters), batch_size=1)
    block = block_fn(
        in_filters=in_filters,
        out_filters=out_filters,
        expand_ratio=expand_ratio,
        strides=strides,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, out_filters],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.TuckerConvBlock, 1, 0.25, 0.25),
      (nn_blocks.TuckerConvBlock, 2, 0.25, 0.25),
  )
  def test_tucker_conv_block(self, block_fn, strides, input_compression_ratio,
                             output_compression_ratio):
    """Checks TuckerConvBlock output shape for given compression ratios."""
    input_size = 128
    in_filters = 24
    out_filters = 24
    inputs = tf.keras.Input(
        shape=(input_size, input_size, in_filters), batch_size=1)
    block = block_fn(
        in_filters=in_filters,
        out_filters=out_filters,
        input_compression_ratio=input_compression_ratio,
        output_compression_ratio=output_compression_ratio,
        strides=strides)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, out_filters],
        features.shape.as_list())
class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
  """Shape test for ResidualInner under several distribution strategies."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    """Checks ResidualInner downsamples spatially and sets channel count."""
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      test_layer = nn_blocks.ResidualInner(filters, strides)

    output = test_layer(input_tensor)
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    self.assertEqual(expected_output_shape, output.shape.as_list())
class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
  """Shape test for BottleneckResidualInner (4x channel expansion)."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    """Checks output is downsampled spatially with filters * 4 channels."""
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      test_layer = nn_blocks.BottleneckResidualInner(filters, strides)

    output = test_layer(input_tensor)
    expected_output_shape = [bsz, h // strides, w // strides, filters * 4]
    self.assertEqual(expected_output_shape, output.shape.as_list())
class DepthwiseSeparableConvBlockTest(parameterized.TestCase, tf.test.TestCase):
  """Shape and config round-trip tests for DepthwiseSeparableConvBlock."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    """Checks the block's output shape and that get_config re-creates it."""
    batch_size, height, width, num_channels = 8, 32, 32, 32
    num_filters = 64
    strides = 2

    input_tensor = tf.random.normal(
        shape=[batch_size, height, width, num_channels])
    with distribution.scope():
      block = nn_blocks.DepthwiseSeparableConvBlock(
          num_filters, strides=strides)
      # Round-trip through get_config to verify serializability.
      config_dict = block.get_config()
      recreate_block = nn_blocks.DepthwiseSeparableConvBlock(**config_dict)

    output_tensor = block(input_tensor)
    expected_output_shape = [
        batch_size, height // strides, width // strides, num_filters
    ]
    self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)

    output_tensor = recreate_block(input_tensor)
    self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)
class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):
  """Forward-shape and gradient-correctness tests for ReversibleLayer."""

  @combinations.generate(distribution_strategy_combinations())
  def test_downsampling_non_reversible_step(self, distribution):
    """Checks shape and that an optimizer step runs when strides > 1."""
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      f = nn_blocks.ResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=True)
      g = nn_blocks.ResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=True)
      test_layer = nn_blocks.ReversibleLayer(f, g)
      test_layer.build(input_tensor.shape)
      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def step_fn():
      with tf.GradientTape() as tape:
        output = test_layer(input_tensor, training=True)
      grads = tape.gradient(output, test_layer.trainable_variables)
      # Test applying gradients with optimizer works
      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))

      return output

    replica_output = distribution.run(step_fn)
    outputs = distribution.experimental_local_results(replica_output)

    # Assert forward pass shape
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    for output in outputs:
      self.assertEqual(expected_output_shape, output.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_reversible_step(self, distribution):
    """Checks a training step updates variables and preserves shape."""
    # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
    bsz, h, w, c = 8, 32, 32, 32
    filters = c
    strides = 1

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      f = nn_blocks.ResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g = nn_blocks.ResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      test_layer = nn_blocks.ReversibleLayer(f, g)
      test_layer(input_tensor, training=False)  # init weights
      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def step_fn():
      with tf.GradientTape() as tape:
        output = test_layer(input_tensor, training=True)
      grads = tape.gradient(output, test_layer.trainable_variables)
      # Test applying gradients with optimizer works
      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))

      return output

    @tf.function
    def fwd():
      test_layer(input_tensor)

    distribution.run(fwd)  # Initialize variables
    prev_variables = tf.identity_n(test_layer.trainable_variables)
    replica_output = distribution.run(step_fn)
    outputs = distribution.experimental_local_results(replica_output)

    # Assert variables values have changed values
    for v0, v1 in zip(prev_variables, test_layer.trainable_variables):
      self.assertNotAllEqual(v0, v1)

    # Assert forward pass shape
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    for output in outputs:
      self.assertEqual(expected_output_shape, output.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_manual_gradients_correctness(self, distribution):
    """Compares manual (recomputed) gradients against autograd gradients."""
    bsz, h, w, c = 8, 32, 32, 32
    filters = c
    strides = 1

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4])  # bottleneck
    with distribution.scope():
      f_manual = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g_manual = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
      manual_grad_layer(input_tensor, training=False)  # init weights

      f_auto = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g_auto = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      auto_grad_layer = nn_blocks.ReversibleLayer(
          f_auto, g_auto, manual_grads=False)
      auto_grad_layer(input_tensor)  # init weights
      # Clone all weights (tf.keras.layers.Layer has no .clone())
      auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
      auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())

    @tf.function
    def manual_fn():
      with tf.GradientTape() as tape:
        output = manual_grad_layer(input_tensor, training=True)
      grads = tape.gradient(output, manual_grad_layer.trainable_variables)
      return grads

    @tf.function
    def auto_fn():
      with tf.GradientTape() as tape:
        output = auto_grad_layer(input_tensor, training=True)
      grads = tape.gradient(output, auto_grad_layer.trainable_variables)
      return grads

    manual_grads = distribution.run(manual_fn)
    auto_grads = distribution.run(auto_fn)

    # Assert gradients calculated manually are close to that from autograd
    for manual_grad, auto_grad in zip(manual_grads, auto_grads):
      self.assertAllClose(
          distribution.experimental_local_results(manual_grad),
          distribution.experimental_local_results(auto_grad),
          atol=5e-3,
          rtol=5e-3)

    # Verify that BN moving mean and variance is correct.
    for manual_var, auto_var in zip(manual_grad_layer.non_trainable_variables,
                                    auto_grad_layer.non_trainable_variables):
      self.assertAllClose(manual_var, auto_var)
# Run the test suite when invoked directly.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/nn_layers.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Mapping
,
Optional
,
Tuple
,
Union
from
absl
import
logging
import
tensorflow
as
tf
import
tensorflow_addons
as
tfa
from
official.modeling
import
tf_utils
from
official.vision.ops
import
spatial_transform_ops
# Type annotations.
States
=
Dict
[
str
,
tf
.
Tensor
]
Activation
=
Union
[
str
,
Callable
]
def make_divisible(value: float,
                   divisor: int,
                   min_value: Optional[float] = None,
                   round_down_protect: bool = True,
                   ) -> int:
  """This is to ensure that all layers have channels that are divisible by 8.

  Args:
    value: A `float` of original value.
    divisor: An `int` of the divisor that need to be checked upon.
    min_value: A `float` of minimum value threshold.
    round_down_protect: A `bool` indicating whether round down more than 10%
      will be allowed.

  Returns:
    The adjusted value in `int` that is divisible against divisor.
  """
  floor = divisor if min_value is None else min_value
  # Round `value` to the nearest multiple of `divisor`, never below `floor`.
  candidate = max(floor, int(value + divisor / 2) // divisor * divisor)
  # Make sure that round down does not go down by more than 10%.
  if round_down_protect and candidate < 0.9 * value:
    candidate += divisor
  return int(candidate)
def round_filters(filters: int,
                  multiplier: float,
                  divisor: int = 8,
                  min_depth: Optional[int] = None,
                  round_down_protect: bool = True,
                  skip: bool = False) -> int:
  """Rounds number of filters based on width multiplier."""
  # A falsy multiplier (None or 0) or an explicit skip leaves the count as-is.
  if skip or not multiplier:
    return filters

  scaled = make_divisible(
      value=filters * multiplier,
      divisor=divisor,
      min_value=min_depth,
      round_down_protect=round_down_protect)
  logging.info('round_filter input=%s output=%s', filters, scaled)
  return int(scaled)
def get_padding_for_kernel_size(kernel_size):
  """Compute padding size given kernel size."""
  # Explicit (height, width) paddings for the supported kernel sizes.
  known_paddings = {7: (3, 3), 3: (1, 1)}
  if kernel_size not in known_paddings:
    raise ValueError(
        'Padding for kernel size {} not known.'.format(kernel_size))
  return known_paddings[kernel_size]
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
  """Creates a squeeze and excitation layer."""

  def __init__(self,
               in_filters,
               out_filters,
               se_ratio,
               divisible_by=1,
               use_3d_input=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               gating_activation='sigmoid',
               round_down_protect=True,
               **kwargs):
    """Initializes a squeeze and excitation layer.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      use_3d_input: A `bool` of whether input is 2D or 3D image.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      gating_activation: A `str` name of the activation function for final
        gating function.
      round_down_protect: A `bool` of whether round down more than 10% will be
        allowed.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(SqueezeExcitation, self).__init__(**kwargs)

    self._in_filters = in_filters
    self._out_filters = out_filters
    self._se_ratio = se_ratio
    self._divisible_by = divisible_by
    self._round_down_protect = round_down_protect
    self._use_3d_input = use_3d_input
    self._activation = activation
    self._gating_activation = gating_activation
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    # Axes to average over in `call` depend on data format and rank: the
    # spatial (and, for 3D input, temporal) axes, excluding batch and channel.
    if tf.keras.backend.image_data_format() == 'channels_last':
      if not use_3d_input:
        self._spatial_axis = [1, 2]
      else:
        self._spatial_axis = [1, 2, 3]
    else:
      if not use_3d_input:
        self._spatial_axis = [2, 3]
      else:
        self._spatial_axis = [2, 3, 4]
    self._activation_fn = tf_utils.get_activation(activation)
    self._gating_activation_fn = tf_utils.get_activation(gating_activation)

  def build(self, input_shape):
    # Bottleneck width of the excitation MLP, aligned to `divisible_by`.
    num_reduced_filters = make_divisible(
        max(1, int(self._in_filters * self._se_ratio)),
        divisor=self._divisible_by,
        round_down_protect=self._round_down_protect)

    self._se_reduce = tf.keras.layers.Conv2D(
        filters=num_reduced_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=True,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    self._se_expand = tf.keras.layers.Conv2D(
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=True,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    super(SqueezeExcitation, self).build(input_shape)

  def get_config(self):
    """Returns the constructor kwargs needed to re-create this layer."""
    config = {
        'in_filters': self._in_filters,
        'out_filters': self._out_filters,
        'se_ratio': self._se_ratio,
        'divisible_by': self._divisible_by,
        'use_3d_input': self._use_3d_input,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'gating_activation': self._gating_activation,
        'round_down_protect': self._round_down_protect,
    }
    base_config = super(SqueezeExcitation, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    # Squeeze: global average over spatial (and temporal) axes, keeping
    # singleton dims so the gate broadcasts back over the input.
    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
    # Excite: reduce -> activation -> expand -> gating activation.
    x = self._activation_fn(self._se_reduce(x))
    x = self._gating_activation_fn(self._se_expand(x))
    return x * inputs
def get_stochastic_depth_rate(init_rate, i, n):
  """Get drop connect rate for the ith block.

  The rate grows linearly with block depth: block i of n receives
  init_rate * i / n.

  Args:
    init_rate: A `float` of initial drop rate.
    i: An `int` of order of the current block.
    n: An `int` total number of blocks.

  Returns:
    Drop rate of the ith block, or None when init_rate is None.

  Raises:
    ValueError: If init_rate is outside [0, 1].
  """
  if init_rate is None:
    return None
  if init_rate < 0 or init_rate > 1:
    raise ValueError('Initial drop rate must be within 0 and 1.')
  return init_rate * float(i) / n
@tf.keras.utils.register_keras_serializable(package='Vision')
class StochasticDepth(tf.keras.layers.Layer):
  """Creates a stochastic depth layer.

  During training, randomly drops the entire residual branch for each sample
  (with probability `stochastic_depth_drop_rate`) and rescales the surviving
  samples by 1 / keep_prob so the expected value is unchanged. At inference
  time it is the identity.
  """

  def __init__(self, stochastic_depth_drop_rate, **kwargs):
    """Initializes a stochastic depth layer.

    Args:
      stochastic_depth_drop_rate: A `float` of drop rate.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      A output `tf.Tensor` of which should have the same shape as input.
    """
    super(StochasticDepth, self).__init__(**kwargs)
    self._drop_rate = stochastic_depth_drop_rate

  def get_config(self):
    """Returns the constructor kwargs needed to re-create this layer."""
    # Bug fix: the key must match the __init__ argument name
    # ('stochastic_depth_drop_rate', not 'drop_rate'); otherwise
    # from_config(get_config()) raises TypeError and serialized models
    # cannot be reloaded.
    config = {'stochastic_depth_drop_rate': self._drop_rate}
    base_config = super(StochasticDepth, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    if training is None:
      training = tf.keras.backend.learning_phase()
    # Identity at inference time or when dropping is disabled.
    if not training or self._drop_rate is None or self._drop_rate == 0:
      return inputs

    keep_prob = 1.0 - self._drop_rate
    batch_size = tf.shape(inputs)[0]
    # Per-sample Bernoulli mask: floor(keep_prob + U[0, 1)) is 1 with
    # probability keep_prob, 0 otherwise; singleton trailing dims broadcast
    # the decision over the whole sample.
    random_tensor = keep_prob
    random_tensor += tf.random.uniform(
        [batch_size] + [1] * (inputs.shape.rank - 1), dtype=inputs.dtype)
    binary_tensor = tf.floor(random_tensor)
    # Rescale survivors so the expectation matches inference behavior.
    output = tf.math.divide(inputs, keep_prob) * binary_tensor
    return output
@tf.keras.utils.register_keras_serializable(package='Vision')
def pyramid_feature_fusion(inputs, target_level):
  """Fuses all feature maps in the feature pyramid at the target level.

  Every level other than `target_level` is bilinearly resized to the target
  level's spatial resolution before the maps are summed element-wise.

  Args:
    inputs: A dictionary containing the feature pyramid. The size of the input
      tensor needs to be fixed.
    target_level: An `int` of the target feature level for feature fusion.

  Returns:
    A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
    feature_channel].
  """
  # Convert keys to int.
  pyramid_feats = {int(k): v for k, v in inputs.items()}
  min_level = min(pyramid_feats.keys())
  max_level = max(pyramid_feats.keys())

  resampled_feats = []
  for level in range(min_level, max_level + 1):
    feat = pyramid_feats[level]
    if level == target_level:
      resampled_feats.append(feat)
      continue
    # Each level step away from the target doubles/halves the resolution.
    scale = 2**(level - target_level)
    height, width = feat.shape[1:3]
    target_size = [height * scale, width * scale]
    # Casts feat to float32 so the resize op can be run on TPU.
    resized = tf.image.resize(
        tf.cast(feat, tf.float32),
        size=target_size,
        method=tf.image.ResizeMethod.BILINEAR)
    # Casts it back to be compatible with the rest of the operations.
    resampled_feats.append(tf.cast(resized, feat.dtype))

  return tf.math.add_n(resampled_feats)
class PanopticFPNFusion(tf.keras.Model):
  """Creates a Panoptic FPN feature Fusion layer.

  This implements feature fusion for semantic segmentation head from the paper:
  Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar.
  Panoptic Feature Pyramid Networks.
  (https://arxiv.org/pdf/1901.02446.pdf)
  """

  def __init__(
      self,
      min_level: int = 2,
      max_level: int = 5,
      target_level: int = 2,
      num_filters: int = 128,
      num_fpn_filters: int = 256,
      activation: str = 'relu',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes panoptic FPN feature fusion layer.

    Args:
      min_level: An `int` of minimum level to use in feature fusion.
      max_level: An `int` of maximum level to use in feature fusion.
      target_level: An `int` of the target feature level for feature fusion.
      num_filters: An `int` number of filters in conv2d layers.
      num_fpn_filters: An `int` number of filters in the FPN outputs
      activation: A `str` name of the activation function.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
      feature_channel].

    Raises:
      ValueError: If `target_level` is greater than `max_level`.
    """
    if target_level > max_level:
      raise ValueError('target_level should be less than max_level')

    # Config is captured before the functional-model construction below;
    # `get_config` returns this dict verbatim for serialization.
    self._config_dict = {
        'min_level': min_level,
        'max_level': max_level,
        'target_level': target_level,
        'num_filters': num_filters,
        'num_fpn_filters': num_fpn_filters,
        'activation': activation,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
    }
    norm = tfa.layers.GroupNormalization
    conv2d = tf.keras.layers.Conv2D
    activation_fn = tf_utils.get_activation(activation)
    if tf.keras.backend.image_data_format() == 'channels_last':
      norm_axis = -1
    else:
      norm_axis = 1
    # Build the model functionally: symbolic inputs for each pyramid level.
    inputs = self._build_inputs(num_fpn_filters, min_level, max_level)

    upscaled_features = []
    for level in range(min_level, max_level + 1):
      # One conv (+ 2x upsample) stage per level of distance from the target,
      # so level l reaches the target resolution after (l - target_level)
      # doublings; the target level itself still gets one conv stage.
      num_conv_layers = max(1, level - target_level)
      x = inputs[str(level)]
      for i in range(num_conv_layers):
        x = conv2d(
            filters=num_filters,
            kernel_size=3,
            padding='same',
            kernel_initializer=tf.keras.initializers.VarianceScaling(),
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer)(x)
        x = norm(groups=32, axis=norm_axis)(x)
        x = activation_fn(x)
        if level != target_level:
          x = spatial_transform_ops.nearest_upsampling(x, scale=2)
      upscaled_features.append(x)

    # All levels are now at the target resolution; fuse by summation.
    fused_features = tf.math.add_n(upscaled_features)
    self._output_specs = {str(target_level): fused_features.get_shape()}

    super(PanopticFPNFusion, self).__init__(
        inputs=inputs, outputs=fused_features, **kwargs)

  def _build_inputs(self,
                    num_filters: int,
                    min_level: int,
                    max_level: int):
    """Creates symbolic `tf.keras.Input`s keyed by stringified level."""
    inputs = {}
    for level in range(min_level, max_level + 1):
      # Spatial dims are left dynamic; only the channel count is fixed.
      inputs[str(level)] = tf.keras.Input(shape=[None, None, num_filters])
    return inputs

  def get_config(self) -> Mapping[str, Any]:
    """Returns the config dictionary used for initialization."""
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Recreates the model from its `get_config` output."""
    return cls(**config)

  @property
  def output_specs(self) -> Mapping[str, tf.TensorShape]:
    """A dict of {level: TensorShape} pairs for the model output."""
    return self._output_specs
@tf.keras.utils.register_keras_serializable(package='Vision')
class Scale(tf.keras.layers.Layer):
  """Scales the input by a trainable scalar weight.

  This is useful for applying ReZero to layers, which improves convergence
  speed. This implements the paper:
  ReZero is All You Need: Fast Convergence at Large Depth.
  (https://arxiv.org/pdf/2003.04887.pdf).
  """

  def __init__(
      self,
      initializer: tf.keras.initializers.Initializer = 'ones',
      regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a scale layer.

    Args:
      initializer: A `str` of initializer for the scalar weight.
      regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super(Scale, self).__init__(**kwargs)
    self._initializer = initializer
    self._regularizer = regularizer
    # A single trainable scalar; created eagerly since it has no shape
    # dependency on the input.
    self._scale = self.add_weight(
        name='scale',
        shape=[],
        dtype=self.dtype,
        initializer=self._initializer,
        regularizer=self._regularizer,
        trainable=True)

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    base_config = super(Scale, self).get_config()
    base_config.update({
        'initializer': self._initializer,
        'regularizer': self._regularizer,
    })
    return base_config

  def call(self, inputs):
    """Multiplies the inputs by the learned scalar, matching input dtype."""
    return tf.cast(self._scale, inputs.dtype) * inputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class TemporalSoftmaxPool(tf.keras.layers.Layer):
  """Creates a network layer corresponding to temporal softmax pooling.

  This is useful for multi-class logits (used in e.g., Charades). Modified from
  AssembleNet Charades evaluation from:
  Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova.
  AssembleNet: Searching for Multi-Stream Neural Connectivity in Video
  Architectures.
  (https://arxiv.org/pdf/1905.13209.pdf).
  """

  def call(self, inputs):
    """Reweights the inputs by a temporal softmax over the frame axis."""
    assert inputs.shape.rank in (3, 4, 5)
    num_frames = tf.shape(inputs)[1]
    # Temperature-scale the logits by sqrt(num_frames) before the softmax.
    scaled = inputs / tf.sqrt(tf.cast(num_frames, inputs.dtype))
    weights = tf.nn.softmax(scaled, axis=1)
    return inputs * weights
@tf.keras.utils.register_keras_serializable(package='Vision')
class PositionalEncoding(tf.keras.layers.Layer):
  """Creates a network layer that adds a sinusoidal positional encoding.

  Positional encoding is incremented across frames, and is added to the input.
  The positional encoding is first weighted at 0 so that the network can choose
  to ignore it. This implements:
  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
  Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
  Attention Is All You Need.
  (https://arxiv.org/pdf/1706.03762.pdf).
  """

  def __init__(self,
               initializer: tf.keras.initializers.Initializer = 'zeros',
               cache_encoding: bool = False,
               state_prefix: Optional[str] = None,
               **kwargs):
    """Initializes positional encoding.

    Args:
      initializer: A `str` of initializer for weighting the positional encoding.
      cache_encoding: A `bool`. If True, cache the positional encoding tensor
        after calling build. Otherwise, rebuild the tensor for every call.
        Setting this to False can be useful when we want to input a variable
        number of frames, so the positional encoding tensor can change shape.
      state_prefix: a prefix string to identify states.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super(PositionalEncoding, self).__init__(**kwargs)
    self._initializer = initializer
    self._cache_encoding = cache_encoding
    # Populated in `build` only when `cache_encoding` is True.
    self._pos_encoding = None
    # ReZero scale: initialized (by default) to zero so the network starts
    # out ignoring the positional encoding.
    self._rezero = Scale(initializer=initializer, name='rezero')
    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix
    # Key used in the `states` dict to track frames seen so far in streaming.
    self._frame_count_name = f'{state_prefix}_pos_enc_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'initializer': self._initializer,
        'cache_encoding': self._cache_encoding,
        'state_prefix': self._state_prefix,
    }
    base_config = super(PositionalEncoding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def _positional_encoding(self,
                           num_positions: Union[int, tf.Tensor],
                           hidden_size: Union[int, tf.Tensor],
                           start_position: Union[int, tf.Tensor] = 0,
                           dtype: str = 'float32') -> tf.Tensor:
    """Creates a sequence of sinusoidal positional encoding vectors.

    Args:
      num_positions: the total number of positions (frames).
      hidden_size: the number of channels used for the hidden vectors.
      start_position: the start position.
      dtype: the dtype of the output tensor.

    Returns:
      The positional encoding tensor with shape [num_positions, hidden_size].
    """
    # A rank-1 state tensor holds a single scalar count; unwrap it.
    if (isinstance(start_position, tf.Tensor) and
        start_position.shape.rank == 1):
      start_position = start_position[0]

    # Calling `tf.range` with `dtype=tf.bfloat16` results in an error,
    # so we cast afterward.
    positions = tf.range(start_position, start_position + num_positions)
    positions = tf.cast(positions, dtype)[:, tf.newaxis]
    idx = tf.range(hidden_size)[tf.newaxis, :]

    # Standard transformer frequencies: 1 / 10000^(2*(i//2)/hidden_size).
    power = tf.cast(2 * (idx // 2), dtype)
    power /= tf.cast(hidden_size, dtype)
    angles = 1. / tf.math.pow(10_000., power)
    radians = positions * angles

    # Even channels get sine, odd channels get cosine; concatenated along
    # the channel axis.
    sin = tf.math.sin(radians[:, 0::2])
    cos = tf.math.cos(radians[:, 1::2])
    pos_encoding = tf.concat([sin, cos], axis=-1)

    return pos_encoding

  def _get_pos_encoding(self,
                        input_shape: tf.Tensor,
                        frame_count: int = 0) -> tf.Tensor:
    """Calculates the positional encoding from the input shape.

    Args:
      input_shape: the shape of the input.
      frame_count: a count of frames that indicates the index of the first
        frame.

    Returns:
      The positional encoding tensor with shape [num_positions, hidden_size].
    """
    # assumes 5D video input [batch, frames, height, width, channels] —
    # encoding is broadcast over batch and spatial dims.
    frames = input_shape[1]
    channels = input_shape[-1]
    pos_encoding = self._positional_encoding(
        frames, channels, start_position=frame_count, dtype=self.dtype)
    pos_encoding = tf.reshape(pos_encoding, [1, frames, 1, 1, channels])
    return pos_encoding

  def build(self, input_shape):
    """Builds the layer with the given input shape.

    Args:
      input_shape: The input shape.

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    if self._cache_encoding:
      # Fixed input shape: compute the encoding once and reuse it in `call`.
      self._pos_encoding = self._get_pos_encoding(input_shape)

    super(PositionalEncoding, self).build(input_shape)

  def call(self,
           inputs: tf.Tensor,
           states: Optional[States] = None,
           output_states: bool = True,
           ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Calls the layer with the given inputs.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s). Expected keys
        include `state_prefix + '_pos_enc_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

    Returns:
      An output `tf.Tensor` (and optionally the states if `output_states=True`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    # Copy to avoid mutating the caller's dict.
    states = dict(states) if states is not None else {}

    # Keep a count of frames encountered across input iterations in
    # num_frames to be able to accurately update the positional encoding.
    num_frames = tf.shape(inputs)[1]
    frame_count = tf.cast(states.get(self._frame_count_name, [0]), tf.int32)
    states[self._frame_count_name] = frame_count + num_frames

    if self._cache_encoding:
      pos_encoding = self._pos_encoding
    else:
      # Rebuild per call so variable frame counts and streaming offsets
      # (`frame_count`) are handled.
      pos_encoding = self._get_pos_encoding(
          tf.shape(inputs), frame_count=frame_count)
    pos_encoding = tf.cast(pos_encoding, inputs.dtype)
    # ReZero gate: scaled by a trainable scalar initialized at zero.
    pos_encoding = self._rezero(pos_encoding)
    outputs = inputs + pos_encoding

    return (outputs, states) if output_states else outputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class GlobalAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer with causal mode.

  Implements causal mode, which runs a cumulative sum (with `tf.cumsum`) across
  frames in the time dimension, allowing the use of a stream buffer. Sums any
  valid input state with the current input to allow state to accumulate over
  several iterations.
  """

  def __init__(self,
               keepdims: bool = False,
               causal: bool = False,
               state_prefix: Optional[str] = None,
               **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      causal: A `bool` of whether to run in causal mode with a cumulative sum
        across frames.
      state_prefix: a prefix string to identify states.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super(GlobalAveragePool3D, self).__init__(**kwargs)
    self._keepdims = keepdims
    self._causal = causal
    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix
    # Keys into the `states` dict: running sum buffer and frame counter.
    self._state_name = f'{state_prefix}_pool_buffer'
    self._frame_count_name = f'{state_prefix}_pool_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'keepdims': self._keepdims,
        'causal': self._causal,
        'state_prefix': self._state_prefix,
    }
    base_config = super(GlobalAveragePool3D, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self,
           inputs: tf.Tensor,
           states: Optional[States] = None,
           output_states: bool = True
           ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Calls the layer with the given inputs.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s).
        Expected keys include `state_prefix + '__pool_buffer'` and
        `state_prefix + '__pool_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

    Returns:
      An output `tf.Tensor` (and optionally the states if `output_states=True`).
      If `causal=True`, the output tensor will have shape
      `[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. We keep
      the frame dimension in this case to simulate a cumulative global average
      as if we are inputting one frame at a time. If `causal=False`, the output
      is equivalent to `tf.keras.layers.GlobalAveragePooling3D` with shape
      `[batch_size, 1, 1, 1, channels]` if `keepdims=True` (plus the optional
      buffer stored in `states`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    # Copy to avoid mutating the caller's dict.
    states = dict(states) if states is not None else {}

    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    # Shape: [batch_size, 1, 1, 1, channels]
    buffer = states.get(self._state_name, None)
    if buffer is None:
      # No previous state: start accumulation from zero.
      buffer = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype)
      states[self._state_name] = buffer

    # Keep a count of frames encountered across input iterations in
    # num_frames to be able to accurately take a cumulative average across
    # all frames when running in streaming mode
    num_frames = tf.shape(inputs)[1]
    frame_count = states.get(self._frame_count_name, tf.constant([0]))
    frame_count = tf.cast(frame_count, tf.int32)
    states[self._frame_count_name] = frame_count + num_frames

    if self._causal:
      # Take a mean of spatial dimensions to make computation more efficient.
      x = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True)
      x = tf.cumsum(x, axis=1)
      # Carry over the running sum from previous iterations.
      x = x + buffer

      # The last frame will be the value of the next state
      # Shape: [batch_size, 1, 1, 1, channels]
      states[self._state_name] = x[:, -1:]

      # In causal mode, the divisor increments by 1 for every frame to
      # calculate cumulative averages instead of one global average
      mean_divisors = tf.range(num_frames) + frame_count + 1
      mean_divisors = tf.reshape(mean_divisors, [1, num_frames, 1, 1, 1])
      mean_divisors = tf.cast(mean_divisors, x.dtype)

      # Shape: [batch_size, num_frames, 1, 1, channels]
      x = x / mean_divisors
    else:
      # In non-causal mode, we (optionally) sum across frames to take a
      # cumulative average across input iterations rather than individual
      # frames. If no buffer state is passed, this essentially becomes
      # regular global average pooling.
      # Shape: [batch_size, 1, 1, 1, channels]
      x = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True)
      # Divide out the spatial extent first; the temporal divisor is applied
      # after merging with the buffered sum below.
      x = x / tf.cast(tf.shape(inputs)[2] * tf.shape(inputs)[3], x.dtype)
      x = x + buffer

      # Shape: [batch_size, 1, 1, 1, channels]
      states[self._state_name] = x

      x = x / tf.cast(frame_count + num_frames, x.dtype)

    if not self._keepdims:
      x = tf.squeeze(x, axis=(1, 2, 3))

    return (x, states) if output_states else x
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer pooling across spatial dimentions."""

  def __init__(self, keepdims: bool = False, **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super(SpatialAveragePool3D, self).__init__(**kwargs)
    self._keepdims = keepdims

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    base_config = super(SpatialAveragePool3D, self).get_config()
    base_config.update({'keepdims': self._keepdims})
    return base_config

  def build(self, input_shape):
    """Builds the layer with the given input shape."""
    # Only channels-last [batch, time, height, width, channels] is supported.
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')
    super(SpatialAveragePool3D, self).build(input_shape)

  def call(self, inputs):
    """Averages over the spatial (height, width) axes of a 5D input."""
    if inputs.shape.rank != 5:
      raise ValueError(
          'Input should have rank {}, got {}'.format(5, inputs.shape.rank))
    return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
class CausalConvMixin:
  """Mixin class to implement CausalConv for `tf.keras.layers.Conv` layers."""

  @property
  def use_buffered_input(self) -> bool:
    # When True, the caller pre-pads the input (stream buffer), so the layer
    # applies 'valid' padding along time instead of padding itself.
    return self._use_buffered_input

  @use_buffered_input.setter
  def use_buffered_input(self, variable: bool):
    self._use_buffered_input = variable

  def _compute_buffered_causal_padding(self,
                                       inputs: tf.Tensor,
                                       use_buffered_input: bool = False,
                                       time_axis: int = 1,
                                       ) -> List[List[int]]:
    """Calculates padding for 'causal' option for conv layers.

    Args:
      inputs: An optional input `tf.Tensor` to be padded.
      use_buffered_input: A `bool`. If True, use 'valid' padding along the time
        dimension. This should be set when applying the stream buffer.
      time_axis: An `int` of the axis of the time dimension.

    Returns:
      A list of paddings for `tf.pad`.
    """
    # Spatial/temporal dims only (drop batch and channel); assumes
    # channels-last layout, enforced below.
    input_shape = tf.shape(inputs)[1:-1]

    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    # Effective kernel extent after dilation: k + (k - 1) * (d - 1).
    kernel_size_effective = [
        (self.kernel_size[i] +
         (self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1))
        for i in range(self.rank)
    ]
    # Dim 0 is time: full causal padding of (effective kernel - 1).
    pad_total = [kernel_size_effective[0] - 1]
    for i in range(1, self.rank):
      # Remaining dims use 'same'-style padding that accounts for stride.
      overlap = (input_shape[i] - 1) % self.strides[i] + 1
      pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
    # Spatial dims split their padding evenly between begin and end.
    pad_beg = [pad_total[i] // 2 for i in range(self.rank)]
    pad_end = [pad_total[i] - pad_beg[i] for i in range(self.rank)]
    padding = [[pad_beg[i], pad_end[i]] for i in range(self.rank)]
    # Batch and channel dims get no padding.
    padding = [[0, 0]] + padding + [[0, 0]]

    if use_buffered_input:
      # Stream buffer already supplies the leading frames: no time padding.
      padding[time_axis] = [0, 0]
    else:
      # Causal: shift all time padding to the front so no future frames leak.
      padding[time_axis] = [padding[time_axis][0] + padding[time_axis][1], 0]
    return padding

  def _causal_validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Overriding this method is meant to circumvent unnecessary errors when
    # using causal padding.
    if (self.filters is not None and self.filters % self.groups != 0):
      raise ValueError(
          'The number of filters must be evenly divisible by the number of '
          'groups. Received: groups={}, filters={}'.format(
              self.groups, self.filters))
    if not all(self.kernel_size):
      raise ValueError('The argument `kernel_size` cannot contain 0(s). '
                       'Received: %s' % (self.kernel_size,))

  def _buffered_spatial_output_shape(self, spatial_output_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    # When buffer padding, use 'valid' padding across time. The output shape
    # across time should be the input shape minus any padding, assuming
    # the stride across time is 1.
    if self._use_buffered_input and spatial_output_shape[0] is not None:
      # Probe with a dummy tensor to reuse the padding computation above.
      padding = self._compute_buffered_causal_padding(
          tf.zeros([1] + spatial_output_shape + [1]), use_buffered_input=False)
      spatial_output_shape[0] -= sum(padding[1])
    return spatial_output_shape
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin):
  """Conv2D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes conv2d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(Conv2D, self).__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    base_config = super(Conv2D, self).get_config()
    base_config.update({'use_buffered_input': self._use_buffered_input})
    return base_config

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    # Delegate to the mixin, honoring the buffered-input mode.
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Use the mixin's relaxed validation so 'causal' padding is accepted.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    base_shape = super(Conv2D, self)._spatial_output_shape(spatial_input_shape)
    return self._buffered_spatial_output_shape(base_shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin):
  """DepthwiseConv2D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes depthwise conv2d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(DepthwiseConv2D, self).__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

    # Causal padding is unsupported by default for DepthwiseConv2D,
    # so we resort to valid padding internally. However, we handle
    # causal padding as a special case with `self._is_causal`, which is
    # defined by the super class.
    if self.padding == 'causal':
      self.padding = 'valid'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'use_buffered_input': self._use_buffered_input,
    }
    base_config = super(DepthwiseConv2D, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Calls the layer with the given inputs."""
    # Because `self.padding` was rewritten to 'valid' in __init__, the causal
    # padding must be applied manually here before the conv runs.
    if self._is_causal:
      inputs = tf.pad(inputs, self._compute_causal_padding(inputs))
    return super(DepthwiseConv2D, self).call(inputs)

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegates to the mixin's relaxed validation for causal padding.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    shape = super(DepthwiseConv2D, self)._spatial_output_shape(
        spatial_input_shape)
    return self._buffered_spatial_output_shape(shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin):
  """Conv3D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes conv3d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(Conv3D, self).__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input
    # Lazily-created compiled wrapper around the parent conv op; see `call`.
    self._conv_fn = None

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'use_buffered_input': self._use_buffered_input,
    }
    base_config = super(Conv3D, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Call the layer with the given inputs."""
    # Note: tf.nn.conv3d with depthwise kernels on CPU is currently only
    # supported when compiling with TF graph (XLA) using tf.function, so it
    # is compiled by default here (b/186463870).
    # The tf.function is created once and cached on the instance: rebuilding
    # it on every call would discard its trace cache and force a retrace
    # (and an XLA recompilation) per invocation.
    if self._conv_fn is None:
      self._conv_fn = tf.function(
          super(Conv3D, self).call, jit_compile=True)
    return self._conv_fn(inputs)

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegates to the mixin's relaxed validation for causal padding.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    shape = super(Conv3D, self)._spatial_output_shape(spatial_input_shape)
    return self._buffered_spatial_output_shape(shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialPyramidPooling(tf.keras.layers.Layer):
  """Implements the Atrous Spatial Pyramid Pooling.

  References:
    [Rethinking Atrous Convolution for Semantic Image Segmentation](
      https://arxiv.org/pdf/1706.05587.pdf)
    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
      Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
  """

  def __init__(
      self,
      output_channels: int,
      dilation_rates: List[int],
      pool_kernel_size: Optional[List[int]] = None,
      use_sync_bn: bool = False,
      batchnorm_momentum: float = 0.99,
      batchnorm_epsilon: float = 0.001,
      activation: str = 'relu',
      dropout: float = 0.5,
      kernel_initializer: str = 'GlorotUniform',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      interpolation: str = 'bilinear',
      use_depthwise_convolution: bool = False,
      **kwargs):
    """Initializes `SpatialPyramidPooling`.

    Args:
      output_channels: Number of channels produced by SpatialPyramidPooling.
      dilation_rates: A list of integers for parallel dilated conv.
      pool_kernel_size: A list of integers or None. If None, global average
        pooling is applied, otherwise an average pooling of pool_kernel_size
        is applied.
      use_sync_bn: A bool, whether or not to use sync batch normalization.
      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
        0.99.
      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults
        to 0.001.
      activation: A `str` for type of activation to be used. Defaults to
        'relu'.
      dropout: A float for the dropout rate before output. Defaults to 0.5.
      kernel_initializer: Kernel initializer for conv layers. Defaults to
        `glorot_uniform`.
      kernel_regularizer: Kernel regularizer for conv layers. Defaults to
        None.
      interpolation: The interpolation method for upsampling. Defaults to
        `bilinear`.
      use_depthwise_convolution: Allows spatial pooling to be separable
        depthwise convolutions. [Encoder-Decoder with Atrous Separable
        Convolution for Semantic Image Segmentation](
        https://arxiv.org/pdf/1802.02611.pdf)
      **kwargs: Other keyword arguments for the layer.
    """
    super().__init__(**kwargs)

    self._output_channels = output_channels
    self._dilation_rates = dilation_rates
    self._use_sync_bn = use_sync_bn
    self._batchnorm_momentum = batchnorm_momentum
    self._batchnorm_epsilon = batchnorm_epsilon
    self._activation = activation
    self._dropout = dropout
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._interpolation = interpolation
    self._pool_kernel_size = pool_kernel_size
    self._use_depthwise_convolution = use_depthwise_convolution
    self._activation_fn = tf_utils.get_activation(activation)
    if self._use_sync_bn:
      self._bn_op = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._bn_op = tf.keras.layers.BatchNormalization

    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1

  def build(self, input_shape):
    height = input_shape[1]
    width = input_shape[2]
    channels = input_shape[3]

    self.aspp_layers = []

    # Branch 1: 1x1 projection of the input features.
    conv1 = tf.keras.layers.Conv2D(
        filters=self._output_channels,
        kernel_size=(1, 1),
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        use_bias=False)
    norm1 = self._bn_op(
        axis=self._bn_axis,
        momentum=self._batchnorm_momentum,
        epsilon=self._batchnorm_epsilon)

    self.aspp_layers.append([conv1, norm1])

    # Branches 2..N: one dilated 3x3 conv per dilation rate, optionally
    # factored into a depthwise 3x3 followed by a pointwise 1x1.
    for dilation_rate in self._dilation_rates:
      leading_layers = []
      kernel_size = (3, 3)
      if self._use_depthwise_convolution:
        leading_layers += [
            tf.keras.layers.DepthwiseConv2D(
                depth_multiplier=1,
                kernel_size=kernel_size,
                padding='same',
                depthwise_regularizer=self._kernel_regularizer,
                depthwise_initializer=self._kernel_initializer,
                dilation_rate=dilation_rate,
                use_bias=False)
        ]
        kernel_size = (1, 1)
      conv_dilation = leading_layers + [
          tf.keras.layers.Conv2D(
              filters=self._output_channels,
              kernel_size=kernel_size,
              padding='same',
              kernel_regularizer=self._kernel_regularizer,
              kernel_initializer=self._kernel_initializer,
              dilation_rate=dilation_rate,
              use_bias=False)
      ]
      norm_dilation = self._bn_op(
          axis=self._bn_axis,
          momentum=self._batchnorm_momentum,
          epsilon=self._batchnorm_epsilon)

      self.aspp_layers.append(conv_dilation + [norm_dilation])

    # Final branch: image-level (or pool_kernel_size) pooled features.
    if self._pool_kernel_size is None:
      pooling = [
          tf.keras.layers.GlobalAveragePooling2D(),
          tf.keras.layers.Reshape((1, 1, channels))
      ]
    else:
      pooling = [tf.keras.layers.AveragePooling2D(self._pool_kernel_size)]

    conv2 = tf.keras.layers.Conv2D(
        filters=self._output_channels,
        kernel_size=(1, 1),
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        use_bias=False)
    norm2 = self._bn_op(
        axis=self._bn_axis,
        momentum=self._batchnorm_momentum,
        epsilon=self._batchnorm_epsilon)

    self.aspp_layers.append(pooling + [conv2, norm2])

    # Resizing runs in float32 regardless of the compute policy to avoid
    # precision issues during interpolation under mixed precision.
    self._resizing_layer = tf.keras.layers.Resizing(
        height, width, interpolation=self._interpolation, dtype=tf.float32)

    self._projection = [
        tf.keras.layers.Conv2D(
            filters=self._output_channels,
            kernel_size=(1, 1),
            kernel_initializer=self._kernel_initializer,
            kernel_regularizer=self._kernel_regularizer,
            use_bias=False),
        self._bn_op(
            axis=self._bn_axis,
            momentum=self._batchnorm_momentum,
            epsilon=self._batchnorm_epsilon)
    ]
    self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
    self._concat_layer = tf.keras.layers.Concatenate(axis=-1)

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    if training is None:
      training = tf.keras.backend.learning_phase()
    result = []
    for i, layers in enumerate(self.aspp_layers):
      x = inputs
      for layer in layers:
        # Apply layers sequentially.
        x = layer(x, training=training)
      x = self._activation_fn(x)

      # Apply resize layer to the end of the last set of layers.
      if i == len(self.aspp_layers) - 1:
        x = self._resizing_layer(x)

      result.append(tf.cast(x, inputs.dtype))
    x = self._concat_layer(result)
    for layer in self._projection:
      x = layer(x, training=training)
    x = self._activation_fn(x)
    return self._dropout_layer(x)

  def get_config(self):
    config = {
        'output_channels': self._output_channels,
        'dilation_rates': self._dilation_rates,
        'pool_kernel_size': self._pool_kernel_size,
        'use_sync_bn': self._use_sync_bn,
        'batchnorm_momentum': self._batchnorm_momentum,
        'batchnorm_epsilon': self._batchnorm_epsilon,
        'activation': self._activation,
        'dropout': self._dropout,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'interpolation': self._interpolation,
        # Bug fix: this key was previously missing, so serializing and
        # restoring the layer silently dropped the depthwise-convolution
        # option.
        'use_depthwise_convolution': self._use_depthwise_convolution,
    }
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(config.items()))
official/vision/modeling/layers/nn_layers_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_layers."""
# Import libraries
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
  # Unit tests for the custom layers in nn_layers. The streaming tests check
  # that processing a clip frame-by-frame (carrying `states` across calls)
  # matches processing the full clip at once.

  def test_scale(self):
    # Scale with a constant initializer of 10 should multiply inputs by 10.
    scale = nn_layers.Scale(initializer=tf.keras.initializers.constant(10.))
    output = scale(3.)
    self.assertAllEqual(output, 30.)

  def test_temporal_softmax_pool(self):
    # Softmax pooling over the temporal axis of a 1..4 ramp; expected values
    # are precomputed fixtures.
    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    layer = nn_layers.TemporalSoftmaxPool()
    output = layer(inputs)
    self.assertAllClose(
        output,
        [[[[[0.10153633]]],
          [[[0.33481020]]],
          [[[0.82801306]]],
          [[[1.82021690]]]]])

  def test_positional_encoding(self):
    # Cached and uncached encodings must agree; expected values are
    # precomputed sinusoidal-encoding fixtures added to an all-ones input.
    pos_encoding = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=False)
    pos_encoding_cached = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=True)

    inputs = tf.ones([1, 4, 1, 1, 3])
    outputs, _ = pos_encoding(inputs)
    outputs_cached, _ = pos_encoding_cached(inputs)

    expected = tf.constant(
        [[[[[1.0000000, 1.0000000, 2.0000000]]],
          [[[1.8414710, 1.0021545, 1.5403023]]],
          [[[1.9092975, 1.0043088, 0.5838531]]],
          [[[1.1411200, 1.0064633, 0.0100075]]]]])

    self.assertEqual(outputs.shape, expected.shape)
    self.assertAllClose(outputs, expected)

    self.assertEqual(outputs.shape, outputs_cached.shape)
    self.assertAllClose(outputs, outputs_cached)

    # A longer clip should also be accepted (encoding is rebuilt, not cached
    # at the old length).
    inputs = tf.ones([1, 5, 1, 1, 3])
    _ = pos_encoding(inputs)

  def test_positional_encoding_bfloat16(self):
    # Same fixture as test_positional_encoding but with bfloat16 inputs.
    pos_encoding = nn_layers.PositionalEncoding(initializer='ones')

    inputs = tf.ones([1, 4, 1, 1, 3], dtype=tf.bfloat16)
    outputs, _ = pos_encoding(inputs)

    expected = tf.constant(
        [[[[[1.0000000, 1.0000000, 2.0000000]]],
          [[[1.8414710, 1.0021545, 1.5403023]]],
          [[[1.9092975, 1.0043088, 0.5838531]]],
          [[[1.1411200, 1.0064633, 0.0100075]]]]])

    self.assertEqual(outputs.shape, expected.shape)
    self.assertAllClose(outputs, expected)

  def test_global_average_pool_basic(self):
    # Pooling an all-ones tensor yields ones with squeezed spatial/temporal
    # dims kept (keepdims=True).
    pool = nn_layers.GlobalAveragePool3D(keepdims=True)

    inputs = tf.ones([1, 2, 3, 4, 1])
    outputs = pool(inputs, output_states=False)
    expected = tf.ones([1, 1, 1, 1, 1])

    self.assertEqual(outputs.shape, expected.shape)
    self.assertAllEqual(outputs, expected)

  def test_positional_encoding_stream(self):
    # Streaming the clip in 1, 2, or 4 chunks must match the full-clip
    # result exactly.
    pos_encoding = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=False)

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 1, 1, 3])
    expected, _ = pos_encoding(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = []
      for frame in frames:
        output, states = pos_encoding(frame, states=states)
        predicted.append(output)
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(predicted,
                          [[[[[1.0000000, 1.0000000, 2.0000000]]],
                            [[[2.8414710, 2.0021544, 2.5403023]]],
                            [[[3.9092975, 3.0043090, 2.5838532]]],
                            [[[4.1411200, 4.0064630, 3.0100074]]]]])

  def test_global_average_pool_keras(self):
    # With keepdims=False the custom pool should match Keras'
    # GlobalAveragePooling3D on random inputs.
    pool = nn_layers.GlobalAveragePool3D(keepdims=False)
    keras_pool = tf.keras.layers.GlobalAveragePooling3D()

    inputs = 10 * tf.random.normal([1, 2, 3, 4, 1])

    outputs = pool(inputs, output_states=False)
    keras_output = keras_pool(inputs)

    self.assertAllEqual(outputs.shape, keras_output.shape)
    self.assertAllClose(outputs, keras_output)

  def test_stream_global_average_pool(self):
    # Non-causal streaming: only the final streamed output must match the
    # full-clip average (mean of 1..4 = 2.5).
    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=False)

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 2, 3])

    expected, _ = gap(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = None
      for frame in frames:
        predicted, states = gap(frame, states=states)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(predicted,
                          [[[[[2.5, 2.5, 2.5]]]]])

  def test_causal_stream_global_average_pool(self):
    # Causal streaming: each frame's output is the running mean of frames
    # seen so far (1.0, 1.5, 2.0, 2.5).
    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=True)

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 2, 3])

    expected, _ = gap(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = []
      for frame in frames:
        x, states = gap(frame, states=states)
        predicted.append(x)
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(predicted,
                          [[[[[1.0, 1.0, 1.0]]],
                            [[[1.5, 1.5, 1.5]]],
                            [[[2.0, 2.0, 2.0]]],
                            [[[2.5, 2.5, 2.5]]]]])

  def test_spatial_average_pool(self):
    # Averages over the two spatial dims only, leaving the temporal dim.
    pool = nn_layers.SpatialAveragePool3D(keepdims=True)

    inputs = tf.range(64, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 4, 4, 1])

    output = pool(inputs)

    self.assertEqual(output.shape, [1, 4, 1, 1, 1])
    self.assertAllClose(
        output,
        [[[[[8.50]]],
          [[[24.5]]],
          [[[40.5]]],
          [[[56.5]]]]])

  def test_conv2d_causal(self):
    # Buffered causal conv on pre-padded input must equal unbuffered causal
    # conv on the raw input.
    conv2d = nn_layers.Conv2D(
        filters=3,
        kernel_size=(3, 3),
        strides=(1, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 4, 2, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv2d(padded_inputs)

    expected = tf.constant(
        [[[[6.0, 6.0, 6.0]],
          [[12., 12., 12.]],
          [[18., 18., 18.]],
          [[18., 18., 18.]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    conv2d.use_buffered_input = False
    predicted = conv2d(inputs)

    self.assertFalse(conv2d.use_buffered_input)
    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_depthwise_conv2d_causal(self):
    # Same buffered-vs-unbuffered equivalence for the depthwise variant.
    conv2d = nn_layers.DepthwiseConv2D(
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='causal',
        use_buffered_input=True,
        depthwise_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 2, 2, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv2d(padded_inputs)

    expected = tf.constant(
        [[[[2., 2., 2.],
           [2., 2., 2.]],
          [[4., 4., 4.],
           [4., 4., 4.]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    conv2d.use_buffered_input = False
    predicted = conv2d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_conv3d_causal(self):
    # 3D causal conv, buffered vs unbuffered paths.
    conv3d = nn_layers.Conv3D(
        filters=3,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 2, 4, 4, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv3d(padded_inputs)

    expected = tf.constant(
        [[[[[27., 27., 27.],
            [18., 18., 18.]],
           [[18., 18., 18.],
            [12., 12., 12.]]],
          [[[54., 54., 54.],
            [36., 36., 36.]],
           [[36., 36., 36.],
            [24., 24., 24.]]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    conv3d.use_buffered_input = False
    predicted = conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_depthwise_conv3d_causal(self):
    # groups=3 makes the 3D conv depthwise; also checks the spatial output
    # shape helper.
    conv3d = nn_layers.Conv3D(
        filters=3,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
        groups=3,
    )

    inputs = tf.ones([1, 2, 4, 4, 3])

    paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv3d(padded_inputs)

    expected = tf.constant(
        [[[[[9.0, 9.0, 9.0],
            [6.0, 6.0, 6.0]],
           [[6.0, 6.0, 6.0],
            [4.0, 4.0, 4.0]]],
          [[[18.0, 18.0, 18.0],
            [12., 12., 12.]],
           [[12., 12., 12.],
            [8., 8., 8.]]]]])

    output_shape = conv3d._spatial_output_shape([4, 4, 4])

    self.assertAllClose(output_shape, [2, 2, 2])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    conv3d.use_buffered_input = False
    predicted = conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_conv3d_causal_padding_2d(self):
    """Test to ensure causal padding works like standard padding."""
    # With a temporal kernel of 1, causal padding degenerates to 'same'
    # padding in the spatial dims.
    conv3d = nn_layers.Conv3D(
        filters=1,
        kernel_size=(1, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=False,
        kernel_initializer='ones',
        use_bias=False,
    )

    keras_conv3d = tf.keras.layers.Conv3D(
        filters=1,
        kernel_size=(1, 3, 3),
        strides=(1, 2, 2),
        padding='same',
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 1, 4, 4, 1])

    predicted = conv3d(inputs)
    expected = keras_conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    self.assertAllClose(predicted,
                        [[[[[9.],
                            [6.]],
                           [[6.],
                            [4.]]]]])

  def test_conv3d_causal_padding_1d(self):
    """Test to ensure causal padding works like standard padding."""
    # With a spatial kernel of 1, causal 3D conv matches a causal Conv1D on
    # the temporal axis.
    conv3d = nn_layers.Conv3D(
        filters=1,
        kernel_size=(3, 1, 1),
        strides=(2, 1, 1),
        padding='causal',
        use_buffered_input=False,
        kernel_initializer='ones',
        use_bias=False,
    )

    keras_conv1d = tf.keras.layers.Conv1D(
        filters=1,
        kernel_size=3,
        strides=2,
        padding='causal',
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 4, 1, 1, 1])

    predicted = conv3d(inputs)
    expected = keras_conv1d(tf.squeeze(inputs, axis=[2, 3]))
    expected = tf.reshape(expected, [1, 2, 1, 1, 1])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    self.assertAllClose(predicted,
                        [[[[[1.]]],
                          [[[3.]]]]])

  @parameterized.parameters(
      (None, []),
      (None, [6, 12, 18]),
      ([32, 32], [6, 12, 18]),
  )
  def test_aspp(self, pool_kernel_size, dilation_rates):
    # ASPP must preserve spatial size and produce output_channels channels
    # with and without dilation branches / fixed pooling.
    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
    layer = nn_layers.SpatialPyramidPooling(
        output_channels=256,
        dilation_rates=dilation_rates,
        pool_kernel_size=pool_kernel_size)
    output = layer(inputs)
    self.assertAllEqual([None, 64, 64, 256], output.shape)
# Run the test suite when executed as a script.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/roi_aligner.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI aligner."""
from typing import Mapping, Optional

import tensorflow as tf

from official.vision.ops import spatial_transform_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIAligner(tf.keras.layers.Layer):
  """Performs ROIAlign for the second stage processing."""

  def __init__(self, crop_size: int = 7, sample_offset: float = 0.5,
               **kwargs):
    """Initializes a ROI aligner.

    Args:
      crop_size: An `int` of the output size of the cropped features.
      sample_offset: A `float` in [0, 1] of the subpixel sample offset.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'crop_size': crop_size,
        'sample_offset': sample_offset,
    }
    super(MultilevelROIAligner, self).__init__(**kwargs)

  def call(self,
           features: Mapping[str, tf.Tensor],
           boxes: tf.Tensor,
           training: Optional[bool] = None):
    # Bug fix: `training` defaults to None, so the annotation must be
    # Optional[bool] rather than bool.
    """Generates ROIs.

    Args:
      features: A dictionary with key as pyramid level and value as features.
        The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
      training: A `bool` of whether it is in training mode.

    Returns:
      A 5-D `tf.Tensor` representing feature crop of shape
      [batch_size, num_boxes, crop_size, crop_size, num_filters].
    """
    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        features,
        boxes,
        output_size=self._config_dict['crop_size'],
        sample_offset=self._config_dict['sample_offset'])
    return roi_features

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
official/vision/modeling/layers/roi_aligner_test.py
deleted
100644 → 0
View file @
9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_aligner.py."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
roi_aligner
class MultilevelROIAlignerTest(tf.test.TestCase):
  """Serialization tests for MultilevelROIAligner."""

  def test_serialize_deserialize(self):
    # The layer's config must reflect its constructor arguments, and a layer
    # rebuilt from that config must report the identical config.
    init_kwargs = dict(
        crop_size=7,
        sample_offset=0.5,
    )
    aligner = roi_aligner.MultilevelROIAligner(**init_kwargs)

    self.assertEqual(aligner.get_config(), dict(init_kwargs))

    restored = roi_aligner.MultilevelROIAligner.from_config(
        aligner.get_config())
    self.assertAllEqual(aligner.get_config(), restored.get_config())
# Run the test suite when executed as a script.
if __name__ == '__main__':
  tf.test.main()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment