Internal change

PiperOrigin-RevId: 431756117

Internal change
PiperOrigin-RevId: 431756117
c8e6faf7 · A. Unique TensorFlower · 13a5e4fb · c8e6faf7 · c8e6faf7 · c8e6faf7
Commit c8e6faf7 authored Mar 01, 2022 by A. Unique TensorFlower
20 changed files
--- a/official/vision/modeling/layers/detection_generator_test.py
+++ b/official/vision/modeling/layers/detection_generator_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for detection_generator.py."""
+# Import libraries
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.layers import detection_generator
+from official.vision.ops import anchor
+
+
+class SelectTopKScoresTest(tf.test.TestCase):
+  """Checks `_select_top_k_scores` against a small hand-computed example."""
+
+  def testSelectTopKScores(self):
+    # Input has shape [1, 4, 2]. Judging by the expected values, the top-k is
+    # taken independently for each entry of the last axis, over the middle
+    # axis.
+    raw_scores = tf.constant(
+        [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]], dtype=tf.float32)
+    want_scores = np.array([[[0.5, 0.9], [0.3, 0.5]]], dtype=np.float32)
+    want_indices = [[[2, 1], [3, 3]]]
+
+    got_scores, got_indices = detection_generator._select_top_k_scores(
+        raw_scores, pre_nms_num_detections=2)
+
+    self.assertAllEqual(got_scores.numpy(), want_scores)
+    self.assertAllEqual(got_indices.numpy(), want_indices)
+
+
+class DetectionGeneratorTest(
+    parameterized.TestCase, tf.test.TestCase):
+  """Output-shape and config round-trip tests for `DetectionGenerator`."""
+
+  @parameterized.product(
+      nms_version=['batched', 'v1', 'v2'],
+      use_cpu_nms=[True, False],
+      soft_nms_sigma=[None, 0.1])
+  def testDetectionsOutputShape(self, nms_version, use_cpu_nms, soft_nms_sigma):
+    """Runs the generator on random inputs and checks output shapes only."""
+    max_num_detections = 10
+    num_classes = 4
+    num_boxes = 84
+    pre_nms_top_k = 5000
+    pre_nms_score_threshold = 0.01
+    batch_size = 1
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma,
+    }
+    generator = detection_generator.DetectionGenerator(**kwargs)
+
+    # Random class outputs in [-1.5, 1.5), shape [num_boxes, num_classes].
+    cls_outputs_all = (np.random.rand(num_boxes, num_classes) - 0.5) * 3
+    # Random box outputs, four values per class for every box.
+    box_outputs_all = np.random.rand(num_boxes, 4 * num_classes)
+    # Random anchor boxes, four values each.
+    anchor_boxes_all = np.random.rand(num_boxes, 4)
+
+    # Add the leading batch dimension expected by the generator call below.
+    class_outputs = tf.reshape(
+        tf.convert_to_tensor(cls_outputs_all, dtype=tf.float32),
+        [batch_size, num_boxes, num_classes])
+    box_outputs = tf.reshape(
+        tf.convert_to_tensor(box_outputs_all, dtype=tf.float32),
+        [batch_size, num_boxes, 4 * num_classes])
+    anchor_boxes = tf.reshape(
+        tf.convert_to_tensor(anchor_boxes_all, dtype=tf.float32),
+        [batch_size, num_boxes, 4])
+    image_info = tf.constant(
+        [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
+        dtype=tf.float32)
+    # Only row 1 of `image_info` ([100, 100]) is handed to the generator.
+    results = generator(
+        box_outputs, class_outputs, anchor_boxes, image_info[:, 1, :])
+    boxes = results['detection_boxes']
+    classes = results['detection_classes']
+    scores = results['detection_scores']
+    valid_detections = results['num_detections']
+
+    self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
+    self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,))
+    self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,))
+    self.assertEqual(valid_detections.numpy().shape, (batch_size,))
+
+  def test_serialize_deserialize(self):
+    """Checks `get_config` echoes the kwargs and survives `from_config`."""
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 1000,
+        'pre_nms_score_threshold': 0.1,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': 10,
+        'nms_version': 'v2',
+        'use_cpu_nms': False,
+        'soft_nms_sigma': None,
+    }
+    generator = detection_generator.DetectionGenerator(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(generator.get_config(), expected_config)
+
+    new_generator = (
+        detection_generator.DetectionGenerator.from_config(
+            generator.get_config()))
+
+    self.assertAllEqual(generator.get_config(), new_generator.get_config())
+
+
+class MultilevelDetectionGeneratorTest(
+    parameterized.TestCase, tf.test.TestCase):
+  """Shape and config round-trip tests for `MultilevelDetectionGenerator`."""
+
+  @parameterized.parameters(
+      ('batched', False, True, None, None),
+      ('batched', False, False, None, None),
+      ('v2', False, True, None, None),
+      ('v2', False, False, None, None),
+      ('v1', True, True, 0.0, None),
+      ('v1', True, False, 0.1, None),
+      ('v1', True, False, None, None),
+      ('tflite', False, False, None, True),
+      ('tflite', False, False, None, False),
+  )
+  def testDetectionsOutputShape(self, nms_version, has_att_heads, use_cpu_nms,
+                                soft_nms_sigma, use_regular_nms):
+    """Feeds random multilevel outputs and checks the result shapes."""
+    min_level = 4
+    max_level = 6
+    num_scales = 2
+    max_num_detections = 10
+    aspect_ratios = [1.0, 2.0]
+    anchor_scale = 2.0
+    output_size = [64, 64]
+    num_classes = 4
+    pre_nms_top_k = 5000
+    pre_nms_score_threshold = 0.01
+    batch_size = 1
+    # Side length of the square feature map fed in under each level key;
+    # 8*8 + 4*4 + 2*2 = 84 rows of random data in total.
+    level_sides = {'4': 8, '5': 4, '6': 2}
+    num_locations = sum(side * side for side in level_sides.values())
+
+    def split_by_level(flat_values):
+      """Slices [num_locations, C] rows into per-level [B, H, W, C] tensors."""
+      per_level = {}
+      start = 0
+      for level, side in level_sides.items():
+        end = start + side * side
+        per_level[level] = tf.reshape(
+            tf.convert_to_tensor(flat_values[start:end], dtype=tf.float32),
+            [batch_size, side, side, flat_values.shape[-1]])
+        start = end
+      return per_level
+
+    tflite_post_processing_config = {
+        'max_detections': max_num_detections,
+        'max_classes_per_detection': 1,
+        'use_regular_nms': use_regular_nms,
+        'nms_score_threshold': 0.01,
+        'nms_iou_threshold': 0.5
+    }
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma,
+        'tflite_post_processing_config': tflite_post_processing_config
+    }
+
+    input_anchor = anchor.build_anchor_generator(min_level, max_level,
+                                                 num_scales, aspect_ratios,
+                                                 anchor_scale)
+    anchor_boxes = input_anchor(output_size)
+    # Random class outputs in [-1.5, 1.5), shape [num_locations, num_classes].
+    cls_outputs_all = (np.random.rand(num_locations, num_classes) - 0.5) * 3
+    # Random box outputs, four values per row.
+    box_outputs_all = np.random.rand(num_locations, 4)
+    class_outputs = split_by_level(cls_outputs_all)
+    box_outputs = split_by_level(box_outputs_all)
+    if has_att_heads:
+      # A single random scalar attribute per row, keyed as 'depth'.
+      att_outputs = {
+          'depth': split_by_level(np.random.rand(num_locations, 1))
+      }
+    else:
+      att_outputs = None
+    image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
+                             dtype=tf.float32)
+    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
+    # Only row 1 of `image_info` ([100, 100]) is handed to the generator.
+    results = generator(box_outputs, class_outputs, anchor_boxes,
+                        image_info[:, 1, :], att_outputs)
+    boxes = results['detection_boxes']
+    classes = results['detection_classes']
+    scores = results['detection_scores']
+    valid_detections = results['num_detections']
+
+    if nms_version == 'tflite':
+      # When nms_version is `tflite`, all output tensors are empty as the actual
+      # post-processing happens in the TFLite model.
+      self.assertEqual(boxes.numpy().shape, ())
+      self.assertEqual(scores.numpy().shape, ())
+      self.assertEqual(classes.numpy().shape, ())
+      self.assertEqual(valid_detections.numpy().shape, ())
+    else:
+      self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
+      self.assertEqual(scores.numpy().shape, (
+          batch_size,
+          max_num_detections,
+      ))
+      self.assertEqual(classes.numpy().shape, (
+          batch_size,
+          max_num_detections,
+      ))
+      self.assertEqual(valid_detections.numpy().shape, (batch_size,))
+      if has_att_heads:
+        for att in results['detection_attributes'].values():
+          self.assertEqual(att.numpy().shape,
+                           (batch_size, max_num_detections, 1))
+
+  def test_serialize_deserialize(self):
+    """Checks `get_config` echoes the kwargs and survives `from_config`."""
+    tflite_post_processing_config = {
+        'max_detections': 100,
+        'max_classes_per_detection': 1,
+        'use_regular_nms': True,
+        'nms_score_threshold': 0.01,
+        'nms_iou_threshold': 0.5
+    }
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 1000,
+        'pre_nms_score_threshold': 0.1,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': 10,
+        'nms_version': 'v2',
+        'use_cpu_nms': False,
+        'soft_nms_sigma': None,
+        'tflite_post_processing_config': tflite_post_processing_config
+    }
+    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(generator.get_config(), expected_config)
+
+    new_generator = (
+        detection_generator.MultilevelDetectionGenerator.from_config(
+            generator.get_config()))
+
+    self.assertAllEqual(generator.get_config(), new_generator.get_config())
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/mask_sampler.py
+++ b/official/vision/modeling/layers/mask_sampler.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of mask sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import spatial_transform_ops
+
+
+def _sample_and_crop_foreground_masks(candidate_rois: tf.Tensor,
+                                      candidate_gt_boxes: tf.Tensor,
+                                      candidate_gt_classes: tf.Tensor,
+                                      candidate_gt_indices: tf.Tensor,
+                                      gt_masks: tf.Tensor,
+                                      num_sampled_masks: int = 128,
+                                      mask_target_size: int = 28):
+  """Samples and creates cropped foreground masks for training.
+
+  Args:
+    candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is the
+      number of candidate RoIs to be considered for mask sampling. It includes
+      both positive and negative RoIs. `num_sampled_masks` RoIs are selected,
+      positive ones first, to create mask training targets.
+    candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
+      the corresponding groundtruth boxes to the `candidate_rois`.
+    candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing the
+      corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
+      corresponds to the background class, i.e. negative RoIs.
+    candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
+      corresponding groundtruth instance indices to the `candidate_gt_boxes`,
+      i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
+      where gt_boxes, of shape [batch_size, MAX_INSTANCES, 4], is the superset
+      of candidate_gt_boxes. An index of -1 is replaced by 0 before the masks
+      are gathered.
+    gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
+      mask_width] containing all the groundtruth masks which sample masks are
+      drawn from.
+    num_sampled_masks: An `int` that specifies the number of masks to sample.
+    mask_target_size: An `int` that specifies the final cropped mask size after
+      sampling. The output masks are resized w.r.t the sampled RoIs.
+
+  Returns:
+    foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
+      RoI that corresponds to the sampled foreground masks, where
+      K = num_sampled_masks.
+    foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
+      classes corresponding to the sampled foreground masks.
+    cropped_foreground_masks: A `tf.Tensor` of shape of
+      [batch_size, K, mask_target_size, mask_target_size] storing the cropped
+      foreground masks used for training.
+  """
+  # Top-k over the 0/1 foreground indicator picks foreground RoIs first; when
+  # there are fewer than K of them, background RoIs fill the remaining slots.
+  is_foreground = tf.cast(
+      tf.greater(candidate_gt_classes, 0), dtype=tf.int32)
+  _, fg_instance_indices = tf.nn.top_k(is_foreground, k=num_sampled_masks)
+
+  # [batch_size, K] tensor whose row b is filled with b. It is paired with
+  # per-row indices to build `tf.gather_nd` coordinates, and is reused for both
+  # gathers below because both index tensors have shape [batch_size, K].
+  fg_instance_indices_shape = tf.shape(fg_instance_indices)
+  batch_indices = (
+      tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
+      tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
+
+  gather_nd_instance_indices = tf.stack(
+      [batch_indices, fg_instance_indices], axis=-1)
+  foreground_rois = tf.gather_nd(
+      candidate_rois, gather_nd_instance_indices)
+  foreground_boxes = tf.gather_nd(
+      candidate_gt_boxes, gather_nd_instance_indices)
+  foreground_classes = tf.gather_nd(
+      candidate_gt_classes, gather_nd_instance_indices)
+  foreground_gt_indices = tf.gather_nd(
+      candidate_gt_indices, gather_nd_instance_indices)
+  # Map the -1 index to 0 so the mask gather below stays in range.
+  foreground_gt_indices = tf.where(
+      tf.equal(foreground_gt_indices, -1),
+      tf.zeros_like(foreground_gt_indices),
+      foreground_gt_indices)
+
+  gather_nd_gt_indices = tf.stack(
+      [batch_indices, foreground_gt_indices], axis=-1)
+  foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)
+
+  cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
+      foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
+      sample_offset=0.5)
+
+  return foreground_rois, foreground_classes, cropped_foreground_masks
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskSampler(tf.keras.layers.Layer):
+  """Samples and creates mask training targets."""
+
+  def __init__(self, mask_target_size: int, num_sampled_masks: int, **kwargs):
+    """Initializes the mask sampler.
+
+    Args:
+      mask_target_size: An `int` side length of the cropped output masks.
+      num_sampled_masks: An `int` number of masks to sample per image.
+      **kwargs: Additional keyword arguments passed to the base layer.
+    """
+    self._config_dict = {
+        'mask_target_size': mask_target_size,
+        'num_sampled_masks': num_sampled_masks,
+    }
+    super(MaskSampler, self).__init__(**kwargs)
+
+  def call(self, candidate_rois: tf.Tensor, candidate_gt_boxes: tf.Tensor,
+           candidate_gt_classes: tf.Tensor, candidate_gt_indices: tf.Tensor,
+           gt_masks: tf.Tensor):
+    """Samples and creates mask targets for training.
+
+    Args:
+      candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is
+        the number of candidate RoIs to be considered for mask sampling. It
+        includes both positive and negative RoIs. `num_sampled_masks` RoIs are
+        selected to create mask training targets.
+      candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
+        the corresponding groundtruth boxes to the `candidate_rois`.
+      candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing
+        the corresponding groundtruth classes to the `candidate_rois`. 0 in the
+        tensor corresponds to the background class, i.e. negative RoIs.
+      candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
+        corresponding groundtruth instance indices to the `candidate_gt_boxes`,
+        i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
+        where gt_boxes, of shape [batch_size, MAX_INSTANCES, 4], is the
+        superset of candidate_gt_boxes.
+      gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
+        mask_width] containing all the groundtruth masks which sample masks are
+        drawn from.
+
+    Returns:
+      foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
+        RoI that corresponds to the sampled foreground masks, where
+        K = num_sampled_masks.
+      foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
+        classes corresponding to the sampled foreground masks.
+      cropped_foreground_masks: A `tf.Tensor` of shape of
+        [batch_size, K, mask_target_size, mask_target_size] storing the
+        cropped foreground masks used for training.
+    """
+    foreground_rois, foreground_classes, cropped_foreground_masks = (
+        _sample_and_crop_foreground_masks(
+            candidate_rois,
+            candidate_gt_boxes,
+            candidate_gt_classes,
+            candidate_gt_indices,
+            gt_masks,
+            self._config_dict['num_sampled_masks'],
+            self._config_dict['mask_target_size']))
+    return foreground_rois, foreground_classes, cropped_foreground_masks
+
+  def get_config(self):
+    """Returns the constructor arguments as a new `dict`."""
+    # Hand out a copy so callers cannot mutate the layer's stored config.
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a `MaskSampler` from a dict produced by `get_config`."""
+    return cls(**config)
--- a/official/vision/modeling/layers/nn_blocks.py
+++ b/official/vision/modeling/layers/nn_blocks.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common building blocks for neural networks."""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text
+
+# Import libraries
+from absl import logging
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.modeling.layers import nn_layers
+
+
+def _pad_strides(strides: int, axis: int) -> Tuple[int, int, int, int]:
+  """Expands a scalar stride to the length-4 form `tf.nn.avg_pool` takes.
+
+  Args:
+    strides: An `int` stride, repeated in two of the four positions.
+    axis: An `int` channel axis; 1 selects the layout with channels second.
+
+  Returns:
+    `(1, 1, strides, strides)` when `axis == 1`, otherwise
+    `(1, strides, strides, 1)`.
+  """
+  channels_first = axis == 1
+  return (1, 1, strides, strides) if channels_first else (
+      1, strides, strides, 1)
+
+
+def _maybe_downsample(x: tf.Tensor, out_filter: int, strides: int,
+                      axis: int) -> tf.Tensor:
+  """Downsamples feature map and 0-pads channels if in_filter < out_filter.
+
+  Args:
+    x: A 4-D `tf.Tensor` feature map, laid out as NCHW when `axis == 1` and
+      NHWC otherwise.
+    out_filter: An `int` number of channels the result should have.
+    strides: An `int` used as both the window size and the stride of the
+      average pooling.
+    axis: An `int` channel axis of `x`.
+
+  Returns:
+    The average-pooled tensor. When it has fewer than `out_filter` channels it
+    is zero-padded on the channel axis up to exactly `out_filter` channels.
+  """
+  data_format = 'NCHW' if axis == 1 else 'NHWC'
+  pool_strides = _pad_strides(strides, axis=axis)
+
+  x = tf.nn.avg_pool(
+      x, pool_strides, pool_strides, 'VALID', data_format=data_format)
+
+  in_filter = x.shape[axis]
+  if in_filter < out_filter:
+    # Pad on channel dimension with 0s: half before and half after. When the
+    # difference is odd, the extra channel goes after so that the result has
+    # exactly `out_filter` channels rather than `out_filter - 1`.
+    missing = out_filter - in_filter
+    pad_before = missing // 2
+    pad_size = [pad_before, missing - pad_before]
+    if axis == 1:
+      x = tf.pad(x, [[0, 0], pad_size, [0, 0], [0, 0]])
+    else:
+      x = tf.pad(x, [[0, 0], [0, 0], [0, 0], pad_size])
+
+  # NOTE(review): the `+ 0.` presumably forces a fresh tensor even when no
+  # pooling/padding changed anything - confirm before removing.
+  return x + 0.
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualBlock(tf.keras.layers.Layer):
+  """A residual block.
+
+  The main branch is two 3x3 convolutions, each followed by a normalization
+  layer, with an activation between them. Its output is added to the shortcut
+  (identity or a 1x1 projection) and passed through the activation.
+  """
+
+  def __init__(self,
+               filters,
+               strides,
+               use_projection=False,
+               se_ratio=None,
+               resnetd_shortcut=False,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_explicit_padding: bool = False,
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               bn_trainable=True,
+               **kwargs):
+    """Initializes a residual block with BN after convolutions.
+
+    Args:
+      filters: An `int` number of filters used by both 3x3 convolutions and by
+        the optional projection shortcut.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      use_projection: A `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+        The layer is only created when the ratio is in (0, 1].
+      resnetd_shortcut: A `bool` if True, apply the resnetd style modification
+        to the shortcut connection. Not implemented in residual blocks.
+      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_explicit_padding: Use 'VALID' padding for the first convolution, but
+        prepad inputs so that the output dimensions are the same as if 'SAME'
+        padding were used.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      bn_trainable: A `bool` that indicates whether batch norm layers should be
+        trainable. Default to True.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(ResidualBlock, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._strides = strides
+    self._use_projection = use_projection
+    self._se_ratio = se_ratio
+    # Stored for `get_config` only; `build` and `call` do not read it.
+    self._resnetd_shortcut = resnetd_shortcut
+    self._use_explicit_padding = use_explicit_padding
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    # Pick the normalization layer class; instances are created in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the current Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+    self._bn_trainable = bn_trainable
+
+  def build(self, input_shape):
+    """Creates the sub-layers of the block."""
+    if self._use_projection:
+      # Projection shortcut: 1x1 convolution with the block's filters and
+      # stride, followed by its own normalization layer.
+      self._shortcut = tf.keras.layers.Conv2D(
+          filters=self._filters,
+          kernel_size=1,
+          strides=self._strides,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon,
+          trainable=self._bn_trainable)
+
+    conv1_padding = 'same'
+    # explicit padding here is added for centernet
+    if self._use_explicit_padding:
+      self._pad = tf.keras.layers.ZeroPadding2D(padding=(1, 1))
+      conv1_padding = 'valid'
+
+    # First 3x3 convolution; this is the one that carries the block stride.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        padding=conv1_padding,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+
+    # Second 3x3 convolution, always stride 1 with 'same' padding.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+
+    # Squeeze-and-excitation is only created for a ratio in (0, 1].
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._filters,
+          out_filters=self._filters,
+          se_ratio=self._se_ratio,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    # A drop rate of None or 0 disables stochastic depth.
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+
+    super(ResidualBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config merged with this block's arguments."""
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'use_projection': self._use_projection,
+        'se_ratio': self._se_ratio,
+        'resnetd_shortcut': self._resnetd_shortcut,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_explicit_padding': self._use_explicit_padding,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'bn_trainable': self._bn_trainable
+    }
+    base_config = super(ResidualBlock, self).get_config()
+    # Entries from `config` win over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Applies the block to `inputs`.
+
+    Args:
+      inputs: The input tensor of the block.
+      training: Forwarded to the stochastic depth layer when it is enabled.
+
+    Returns:
+      The activation applied to the sum of the main branch and the shortcut.
+    """
+    # The shortcut is taken from the unpadded inputs.
+    shortcut = inputs
+    if self._use_projection:
+      shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    if self._use_explicit_padding:
+      inputs = self._pad(inputs)
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    # No activation after the second norm; it is applied after the addition.
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    return self._activation_fn(x + shortcut)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock(tf.keras.layers.Layer):
+  """A standard bottleneck block."""
+
+  def __init__(self,
+               filters,
+               strides,
+               dilation_rate=1,
+               use_projection=False,
+               se_ratio=None,
+               resnetd_shortcut=False,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               bn_trainable=True,
+               **kwargs):
+    """Initializes a standard bottleneck block with BN after convolutions.
+
+    Args:
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      dilation_rate: An `int` dilation_rate of convolutions. Default to 1.
+      use_projection: A `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      resnetd_shortcut: A `bool`. If True, apply the resnetd style modification
+        to the shortcut connection.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      bn_trainable: A `bool` that indicates whether batch norm layers should be
+        trainable. Default to True.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(BottleneckBlock, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._strides = strides
+    self._dilation_rate = dilation_rate
+    self._use_projection = use_projection
+    self._se_ratio = se_ratio
+    self._resnetd_shortcut = resnetd_shortcut
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._bn_trainable = bn_trainable
+
+  def build(self, input_shape):
+    if self._use_projection:
+      if self._resnetd_shortcut:
+        self._shortcut0 = tf.keras.layers.AveragePooling2D(
+            pool_size=2, strides=self._strides, padding='same')
+        self._shortcut1 = tf.keras.layers.Conv2D(
+            filters=self._filters * 4,
+            kernel_size=1,
+            strides=1,
+            use_bias=False,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer)
+      else:
+        self._shortcut = tf.keras.layers.Conv2D(
+            filters=self._filters * 4,
+            kernel_size=1,
+            strides=self._strides,
+            use_bias=False,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer)
+
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon,
+          trainable=self._bn_trainable)
+
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+    self._activation1 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        dilation_rate=self._dilation_rate,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+    self._activation2 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    self._conv3 = tf.keras.layers.Conv2D(
+        filters=self._filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm3 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+    self._activation3 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._filters * 4,
+          out_filters=self._filters * 4,
+          se_ratio=self._se_ratio,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+    self._add = tf.keras.layers.Add()
+
+    super(BottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'dilation_rate': self._dilation_rate,
+        'use_projection': self._use_projection,
+        'se_ratio': self._se_ratio,
+        'resnetd_shortcut': self._resnetd_shortcut,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'bn_trainable': self._bn_trainable
+    }
+    base_config = super(BottleneckBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    shortcut = inputs
+    if self._use_projection:
+      if self._resnetd_shortcut:
+        shortcut = self._shortcut0(shortcut)
+        shortcut = self._shortcut1(shortcut)
+      else:
+        shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation1(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+    x = self._activation2(x)
+
+    x = self._conv3(x)
+    x = self._norm3(x)
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    x = self._add([x, shortcut])
+    return self._activation3(x)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class InvertedBottleneckBlock(tf.keras.layers.Layer):
+  """An inverted bottleneck block."""
+
+  def __init__(self,
+               in_filters,
+               out_filters,
+               expand_ratio,
+               strides,
+               kernel_size=3,
+               se_ratio=None,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               se_inner_activation='relu',
+               se_gating_activation='sigmoid',
+               se_round_down_protect=True,
+               expand_se_in_filters=False,
+               depthwise_activation=None,
+               use_sync_bn=False,
+               dilation_rate=1,
+               divisible_by=1,
+               regularize_depthwise=False,
+               use_depthwise=True,
+               use_residual=True,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               output_intermediate_endpoints=False,
+               **kwargs):
+    """Initializes an inverted bottleneck block with BN after convolutions.
+
+    Args:
+      in_filters: An `int` number of filters of the input tensor.
+      out_filters: An `int` number of filters of the output tensor.
+      expand_ratio: An `int` of expand_ratio for an inverted bottleneck block.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: An `int` kernel_size of the depthwise conv layer.
+      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
+        excitation layer.
+      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      se_inner_activation: A `str` name of squeeze-excitation inner activation.
+      se_gating_activation: A `str` name of squeeze-excitation gating
+        activation.
+      se_round_down_protect: A `bool` of whether round down more than 10%
+        will be allowed in SE layer.
+      expand_se_in_filters: A `bool` of whether or not to expand in_filter in
+        squeeze and excitation layer.
+      depthwise_activation: A `str` name of the activation function for
+        depthwise only.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      dilation_rate: An `int` that specifies the dilation rate to use for.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
+        this number.
+      dilated convolution: An `int` to specify the same value for all spatial
+        dimensions.
+      regularize_depthwise: A `bool` of whether or not apply regularization on
+        depthwise.
+      use_depthwise: A `bool` of whether to uses fused convolutions instead of
+        depthwise.
+      use_residual: A `bool` of whether to include residual connection between
+        input and output.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      output_intermediate_endpoints: A `bool` of whether or not output the
+        intermediate endpoints.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(InvertedBottleneckBlock, self).__init__(**kwargs)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._expand_ratio = expand_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._dilation_rate = dilation_rate
+    self._use_sync_bn = use_sync_bn
+    self._regularize_depthwise = regularize_depthwise
+    self._use_depthwise = use_depthwise
+    self._use_residual = use_residual
+    self._activation = activation
+    self._se_inner_activation = se_inner_activation
+    self._se_gating_activation = se_gating_activation
+    self._depthwise_activation = depthwise_activation
+    self._se_round_down_protect = se_round_down_protect
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._expand_se_in_filters = expand_se_in_filters
+    self._output_intermediate_endpoints = output_intermediate_endpoints
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    if not depthwise_activation:
+      self._depthwise_activation = activation
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None
+
+  def build(self, input_shape):
+    expand_filters = self._in_filters
+    if self._expand_ratio > 1:
+      # First 1x1 conv for channel expansion.
+      expand_filters = nn_layers.make_divisible(
+          self._in_filters * self._expand_ratio, self._divisible_by)
+
+      expand_kernel = 1 if self._use_depthwise else self._kernel_size
+      expand_stride = 1 if self._use_depthwise else self._strides
+
+      self._conv0 = tf.keras.layers.Conv2D(
+          filters=expand_filters,
+          kernel_size=expand_kernel,
+          strides=expand_stride,
+          padding='same',
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+      self._activation_layer = tf_utils.get_activation(
+          self._activation, use_keras_layer=True)
+
+    if self._use_depthwise:
+      # Depthwise conv.
+      self._conv1 = tf.keras.layers.DepthwiseConv2D(
+          kernel_size=(self._kernel_size, self._kernel_size),
+          strides=self._strides,
+          padding='same',
+          depth_multiplier=1,
+          dilation_rate=self._dilation_rate,
+          use_bias=False,
+          depthwise_initializer=self._kernel_initializer,
+          depthwise_regularizer=self._depthsize_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm1 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+      self._depthwise_activation_layer = tf_utils.get_activation(
+          self._depthwise_activation, use_keras_layer=True)
+
+    # Squeeze and excitation.
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      logging.info('Use Squeeze and excitation.')
+      in_filters = self._in_filters
+      if self._expand_se_in_filters:
+        in_filters = expand_filters
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=in_filters,
+          out_filters=expand_filters,
+          se_ratio=self._se_ratio,
+          divisible_by=self._divisible_by,
+          round_down_protect=self._se_round_down_protect,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          activation=self._se_inner_activation,
+          gating_activation=self._se_gating_activation)
+    else:
+      self._squeeze_excitation = None
+
+    # Last 1x1 conv.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+    self._add = tf.keras.layers.Add()
+
+    super(InvertedBottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
+        'expand_ratio': self._expand_ratio,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'se_ratio': self._se_ratio,
+        'divisible_by': self._divisible_by,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'se_inner_activation': self._se_inner_activation,
+        'se_gating_activation': self._se_gating_activation,
+        'se_round_down_protect': self._se_round_down_protect,
+        'expand_se_in_filters': self._expand_se_in_filters,
+        'depthwise_activation': self._depthwise_activation,
+        'dilation_rate': self._dilation_rate,
+        'use_sync_bn': self._use_sync_bn,
+        'regularize_depthwise': self._regularize_depthwise,
+        'use_depthwise': self._use_depthwise,
+        'use_residual': self._use_residual,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(InvertedBottleneckBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    endpoints = {}
+    shortcut = inputs
+    if self._expand_ratio > 1:
+      x = self._conv0(inputs)
+      x = self._norm0(x)
+      x = self._activation_layer(x)
+    else:
+      x = inputs
+
+    if self._use_depthwise:
+      x = self._conv1(x)
+      x = self._norm1(x)
+      x = self._depthwise_activation_layer(x)
+      if self._output_intermediate_endpoints:
+        endpoints['depthwise'] = x
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    if (self._use_residual and self._in_filters == self._out_filters and
+        self._strides == 1):
+      if self._stochastic_depth:
+        x = self._stochastic_depth(x, training=training)
+      x = self._add([x, shortcut])
+
+    if self._output_intermediate_endpoints:
+      return x, endpoints
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualInner(tf.keras.layers.Layer):
+  """Creates a single inner block of a residual.
+
+  This corresponds to `F`/`G` functions in the RevNet paper:
+  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+  The Reversible Residual Network: Backpropagation Without Storing Activations.
+  (https://arxiv.org/pdf/1707.04585.pdf)
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      strides: int,
+      kernel_initializer: Union[str, Callable[
+          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      batch_norm_first: bool = True,
+      **kwargs):
+    """Initializes a ResidualInner.
+
+    Args:
+      filters: An `int` of output filter size.
+      strides: An `int` of stride size for convolution for the residual block.
+      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
+        instance for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: A `str` or `callable` instance of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      batch_norm_first: A `bool` of whether to apply activation and batch norm
+        before conv.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(ResidualInner, self).__init__(**kwargs)
+
+    self.strides = strides
+    self.filters = filters
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self._kernel_regularizer = kernel_regularizer
+    self._activation = tf.keras.activations.get(activation)
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._batch_norm_first = batch_norm_first
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: tf.TensorShape):
+    if self._batch_norm_first:
+      self._batch_norm_0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    self._conv2d_1 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=self.strides,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    self._batch_norm_1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._conv2d_2 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    super(ResidualInner, self).build(input_shape)
+
+  def get_config(self) -> Dict[str, Any]:
+    config = {
+        'filters': self.filters,
+        'strides': self.strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'batch_norm_first': self._batch_norm_first,
+    }
+    base_config = super(ResidualInner, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+    x = inputs
+    if self._batch_norm_first:
+      x = self._batch_norm_0(x, training=training)
+      x = self._activation_fn(x)
+    x = self._conv2d_1(x)
+
+    x = self._batch_norm_1(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_2(x)
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckResidualInner(tf.keras.layers.Layer):
+  """Creates a single inner block of a bottleneck.
+
+  This corresponds to `F`/`G` functions in the RevNet paper:
+  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+  The Reversible Residual Network: Backpropagation Without Storing Activations.
+  (https://arxiv.org/pdf/1707.04585.pdf)
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      strides: int,
+      kernel_initializer: Union[str, Callable[
+          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      batch_norm_first: bool = True,
+      **kwargs):
+    """Initializes a BottleneckResidualInner.
+
+    Args:
+      filters: An `int` number of filters for first 2 convolutions. Last Last,
+        and thus the number of output channels from the bottlneck block is
+        `4*filters`
+      strides: An `int` of stride size for convolution for the residual block.
+      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
+        instance for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: A `str` or `callable` instance of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      batch_norm_first: A `bool` of whether to apply activation and batch norm
+        before conv.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(BottleneckResidualInner, self).__init__(**kwargs)
+
+    self.strides = strides
+    self.filters = filters
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self._kernel_regularizer = kernel_regularizer
+    self._activation = tf.keras.activations.get(activation)
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._batch_norm_first = batch_norm_first
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: tf.TensorShape):
+    if self._batch_norm_first:
+      self._batch_norm_0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+    self._conv2d_1 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=1,
+        strides=self.strides,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._batch_norm_1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    self._conv2d_2 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._batch_norm_2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    self._conv2d_3 = tf.keras.layers.Conv2D(
+        filters=self.filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    super(BottleneckResidualInner, self).build(input_shape)
+
+  def get_config(self) -> Dict[str, Any]:
+    config = {
+        'filters': self.filters,
+        'strides': self.strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'batch_norm_first': self._batch_norm_first,
+    }
+    base_config = super(BottleneckResidualInner, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+    x = inputs
+    if self._batch_norm_first:
+      x = self._batch_norm_0(x, training=training)
+      x = self._activation_fn(x)
+    x = self._conv2d_1(x)
+
+    x = self._batch_norm_1(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_2(x)
+
+    x = self._batch_norm_2(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_3(x)
+
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ReversibleLayer(tf.keras.layers.Layer):
+  """Creates a reversible layer.
+
+  Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
+  layers that are stateless, which in this case are `ResidualInner` layers.
+  """
+
+  def __init__(self,
+               f: tf.keras.layers.Layer,
+               g: tf.keras.layers.Layer,
+               manual_grads: bool = True,
+               **kwargs):
+    """Initializes a ReversibleLayer.
+
+    Args:
+      f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in
+        paper. Each reversible layer consists of two inner functions. For
+        example, in RevNet the reversible residual consists of two f/g inner
+        (bottleneck) residual functions. Where the input to the reversible layer
+        is x, the input gets partitioned in the channel dimension and the
+        forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2), y2 = x2 +
+          g(z1), y1 = stop_gradient(z1).
+      g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in
+        paper. Detailed explanation same as above as `f` arg.
+      manual_grads: A `bool` [Testing Only] of whether to manually take
+        gradients as in Algorithm 1 or defer to autograd.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(ReversibleLayer, self).__init__(**kwargs)
+
+    self._f = f
+    self._g = g
+    self._manual_grads = manual_grads
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._axis = -1
+    else:
+      self._axis = 1
+
+  def get_config(self) -> Dict[str, Any]:
+    config = {
+        'f': self._f,
+        'g': self._g,
+        'manual_grads': self._manual_grads,
+    }
+    base_config = super(ReversibleLayer, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def _ckpt_non_trainable_vars(self):
+    self._f_non_trainable_vars = [
+        v.read_value() for v in self._f.non_trainable_variables
+    ]
+    self._g_non_trainable_vars = [
+        v.read_value() for v in self._g.non_trainable_variables
+    ]
+
+  def _load_ckpt_non_trainable_vars(self):
+    for v, v_chkpt in zip(self._f.non_trainable_variables,
+                          self._f_non_trainable_vars):
+      v.assign(v_chkpt)
+    for v, v_chkpt in zip(self._g.non_trainable_variables,
+                          self._g_non_trainable_vars):
+      v.assign(v_chkpt)
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+
+    @tf.custom_gradient
+    def reversible(
+        x: tf.Tensor
+    ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor],
+                                                List[tf.Tensor]]]]:
+      """Implements Algorithm 1 in the RevNet paper.
+
+         Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+         The Reversible Residual Network: Backpropagation Without Storing
+         Activations.
+         (https://arxiv.org/pdf/1707.04585.pdf)
+
+      Args:
+        x: An input `tf.Tensor.
+
+      Returns:
+        y: The output [y1; y2] in Algorithm 1.
+        grad_fn: A callable function that computes the gradients.
+      """
+      with tf.GradientTape() as fwdtape:
+        fwdtape.watch(x)
+        x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis)
+        f_x2 = self._f(x2, training=training)
+        x1_down = _maybe_downsample(x1, f_x2.shape[self._axis], self._f.strides,
+                                    self._axis)
+        z1 = f_x2 + x1_down
+        g_z1 = self._g(z1, training=training)
+        x2_down = _maybe_downsample(x2, g_z1.shape[self._axis], self._f.strides,
+                                    self._axis)
+        y2 = x2_down + g_z1
+
+        # Equation 8: https://arxiv.org/pdf/1707.04585.pdf
+        # Decouple y1 and z1 so that their derivatives are different.
+        y1 = tf.identity(z1)
+        y = tf.concat([y1, y2], axis=self._axis)
+
+        irreversible = ((self._f.strides != 1 or self._g.strides != 1) or
+                        (y.shape[self._axis] != inputs.shape[self._axis]))
+
+        # Checkpointing moving mean/variance for batch normalization layers
+        # as they shouldn't be updated during the custom gradient pass of f/g.
+        self._ckpt_non_trainable_vars()
+
+      def grad_fn(
+          dy: tf.Tensor,
+          variables: Optional[List[tf.Variable]] = None,
+      ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
+        """Given dy calculate (dy/dx)|_{x_{input}} using f/g.
+
+        When the block is irreversible or manual gradients are disabled, the
+        gradients are taken directly from the forward tape. Otherwise the
+        inputs are recomputed from the outputs following Algorithm 1 and the
+        gradients are obtained from freshly recorded passes through g and f.
+
+        Args:
+          dy: Upstream gradient with respect to the block output `y`.
+          variables: Variables whose gradients are requested; the returned
+            variable gradients follow this order.
+            NOTE(review): the default is None, but both branches below use it
+            as a list, so None would raise -- confirm callers always pass it.
+
+        Returns:
+          A `(dx, grad_vars)` pair: the gradient with respect to `x` and the
+          list of gradients for `variables`.
+        """
+        # Fallback path: differentiate the forward pass recorded on `fwdtape`.
+        if irreversible or not self._manual_grads:
+          grads_combined = fwdtape.gradient(
+              y, [x] + variables, output_gradients=dy)
+          dx = grads_combined[0]
+          grad_vars = grads_combined[1:]
+        else:
+          # Manual path: detach the outputs, then re-run g and f under their
+          # own tapes so each sub-network can be differentiated separately.
+          y1_nograd = tf.stop_gradient(y1)
+          y2_nograd = tf.stop_gradient(y2)
+          dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis)
+
+          # Index mapping from self.f/g.trainable_variables to grad_fn
+          # input `variables` kwarg so that we can reorder dwf + dwg
+          # variable gradient list to match `variables` order.
+          # NOTE(review): `list.index` raises ValueError for a variable that is
+          # in neither f nor g's trainable variables -- confirm that cannot
+          # happen for the `variables` passed in.
+          f_var_refs = [v.ref() for v in self._f.trainable_variables]
+          g_var_refs = [v.ref() for v in self._g.trainable_variables]
+          fg_var_refs = f_var_refs + g_var_refs
+          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]
+
+          # Algorithm 1 in paper (line # documented in-line)
+          z1 = y1_nograd  # line 2
+          with tf.GradientTape() as gtape:
+            gtape.watch(z1)
+            g_z1 = self._g(z1, training=training)
+          x2 = y2_nograd - g_z1  # line 3
+
+          with tf.GradientTape() as ftape:
+            ftape.watch(x2)
+            f_x2 = self._f(x2, training=training)
+          # x1 is reconstructed to mirror the algorithm; it is not read below.
+          x1 = z1 - f_x2  # pylint: disable=unused-variable      # line 4
+
+          # Compute gradients
+          g_grads_combined = gtape.gradient(
+              g_z1, [z1] + self._g.trainable_variables, output_gradients=dy2)
+          dz1 = dy1 + g_grads_combined[0]  # line 5
+          dwg = g_grads_combined[1:]  # line 9
+
+          f_grads_combined = ftape.gradient(
+              f_x2, [x2] + self._f.trainable_variables, output_gradients=dz1)
+          dx2 = dy2 + f_grads_combined[0]  # line 6
+          dwf = f_grads_combined[1:]  # line 8
+          dx1 = dz1  # line 7
+
+          # Pack the input and variable gradients.
+          dx = tf.concat([dx1, dx2], axis=self._axis)
+          # Same f-then-g order as `fg_var_refs`, so the indices line up.
+          grad_vars = dwf + dwg
+          # Reorder gradients (trainable_variables to variables kwarg order)
+          grad_vars = [grad_vars[i] for i in self_to_var_index]
+
+          # Restore batch normalization moving mean/variance for correctness.
+          self._load_ckpt_non_trainable_vars()
+
+        return dx, grad_vars  # grad_fn end
+
+      return y, grad_fn  # reversible end
+
+    activations = reversible(inputs)
+    return activations
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
+  """Creates a depthwise separable convolution block with batch normalization.
+
+  A depthwise convolution and a 1x1 (pointwise) convolution are applied in
+  sequence; each one is followed by batch normalization and the activation.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      kernel_size: int = 3,
+      strides: int = 1,
+      regularize_depthwise=False,
+      activation: Text = 'relu6',
+      kernel_initializer: Text = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      dilation_rate: int = 1,
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      **kwargs):
+    """Initializes a depthwise separable convolution block.
+
+    Args:
+      filters: An `int` number of filters of the pointwise (1x1) convolution,
+        i.e. the number of output channels of the block.
+      kernel_size: An `int` that specifies the height and width of the
+        depthwise convolution window.
+      strides: An `int` stride of the depthwise convolution. If greater than 1,
+        this block will ultimately downsample the input.
+      regularize_depthwise: A `bool`. If True, `kernel_regularizer` is also
+        applied to the depthwise convolution kernel.
+      activation: A `str` name of the activation function.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation
+        rate to use for the depthwise convolution. Can be a single integer to
+        specify the same value for all spatial dimensions.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
+    self._filters = filters
+    self._kernel_size = kernel_size
+    self._strides = strides
+    self._activation = activation
+    self._regularize_depthwise = regularize_depthwise
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._dilation_rate = dilation_rate
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the active image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+    # The depthwise kernel is only regularized when explicitly requested.
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None
+
+  def get_config(self):
+    """Returns the constructor arguments merged into the base layer config."""
+    # `kernel_size` and `dilation_rate` are constructor arguments as well; they
+    # must be reported so that `from_config(get_config())` rebuilds the same
+    # block instead of silently falling back to the defaults.
+    config = {
+        'filters': self._filters,
+        'kernel_size': self._kernel_size,
+        'strides': self._strides,
+        'regularize_depthwise': self._regularize_depthwise,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'dilation_rate': self._dilation_rate,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(DepthwiseSeparableConvBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the depthwise and pointwise convolutions and their norms."""
+
+    # Depthwise stage: carries the kernel size, stride and dilation.
+    self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        depth_multiplier=1,
+        dilation_rate=self._dilation_rate,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._depthsize_regularizer,
+        use_bias=False)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Pointwise stage: 1x1 convolution that sets the output channel count.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    super(DepthwiseSeparableConvBlock, self).build(input_shape)
+
+  def call(self, inputs, training=None):
+    """Applies depthwise conv -> BN -> act -> pointwise conv -> BN -> act."""
+    x = self._dwconv0(inputs)
+    x = self._norm0(x)
+    x = self._activation_fn(x)
+
+    x = self._conv1(x)
+    x = self._norm1(x)
+    return self._activation_fn(x)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class TuckerConvBlock(tf.keras.layers.Layer):
+  """A Tucker block (generalized bottleneck).
+
+  Stacks a 1x1 convolution, a `kernel_size` convolution and a final 1x1
+  convolution, each followed by batch normalization, with an activation after
+  the first two. An optional residual connection joins input and output.
+  """
+
+  def __init__(self,
+               in_filters,
+               out_filters,
+               input_compression_ratio,
+               output_compression_ratio,
+               strides,
+               kernel_size=3,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               divisible_by=1,
+               use_residual=True,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """Initializes a Tucker convolution block with BN after convolutions.
+
+    Args:
+      in_filters: An `int` number of filters of the input tensor.
+      out_filters: An `int` number of filters of the output tensor.
+      input_compression_ratio: A `float` of compression ratio for
+        input filters.
+      output_compression_ratio: A `float` of compression ratio for
+        output filters.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: An `int` kernel_size of the middle convolution.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
+        this number.
+      use_residual: A `bool` of whether to include residual connection between
+        input and output.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super().__init__(**kwargs)
+
+    # Keep every constructor argument so `get_config` can report it.
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._input_compression_ratio = input_compression_ratio
+    self._output_compression_ratio = output_compression_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._activation = activation
+    self._use_sync_bn = use_sync_bn
+    self._divisible_by = divisible_by
+    self._use_residual = use_residual
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    # Normalization class and channel axis depend on sync-BN / data format.
+    self._norm = (
+        tf.keras.layers.experimental.SyncBatchNormalization
+        if use_sync_bn else tf.keras.layers.BatchNormalization)
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    self._bn_axis = -1 if channels_last else 1
+
+  def build(self, input_shape):
+    """Creates the three conv/BN stages and the residual helpers."""
+    # Settings shared by every convolution / normalization in the block.
+    conv_kwargs = dict(
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    norm_kwargs = dict(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Stage 0: 1x1 convolution to the compressed input width.
+    input_compressed_filters = nn_layers.make_divisible(
+        value=self._in_filters * self._input_compression_ratio,
+        divisor=self._divisible_by,
+        round_down_protect=False)
+    self._conv0 = tf.keras.layers.Conv2D(
+        filters=input_compressed_filters,
+        kernel_size=1,
+        strides=1,
+        **conv_kwargs)
+    self._norm0 = self._norm(**norm_kwargs)
+    self._activation_layer0 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # Stage 1: strided `kernel_size` convolution to the compressed output
+    # width.
+    output_compressed_filters = nn_layers.make_divisible(
+        value=self._out_filters * self._output_compression_ratio,
+        divisor=self._divisible_by,
+        round_down_protect=False)
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=output_compressed_filters,
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        **conv_kwargs)
+    self._norm1 = self._norm(**norm_kwargs)
+    self._activation_layer1 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # Stage 2: last 1x1 conv, producing `out_filters` channels.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        **conv_kwargs)
+    self._norm2 = self._norm(**norm_kwargs)
+
+    drop_rate = self._stochastic_depth_drop_rate
+    self._stochastic_depth = (
+        nn_layers.StochasticDepth(drop_rate) if drop_rate else None)
+    self._add = tf.keras.layers.Add()
+
+    super().build(input_shape)
+
+  def get_config(self):
+    """Returns the constructor arguments merged into the base layer config."""
+    block_config = {
+        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
+        'input_compression_ratio': self._input_compression_ratio,
+        'output_compression_ratio': self._output_compression_ratio,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'divisible_by': self._divisible_by,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'use_residual': self._use_residual,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    return {**super().get_config(), **block_config}
+
+  def call(self, inputs, training=None):
+    """Runs the three stages and adds the residual when shapes allow it."""
+    shortcut = inputs
+
+    x = self._activation_layer0(self._norm0(self._conv0(inputs)))
+    x = self._activation_layer1(self._norm1(self._conv1(x)))
+    x = self._norm2(self._conv2(x))
+
+    # The shortcut is only valid when channels and resolution are unchanged.
+    residual_applies = (
+        self._use_residual and
+        self._in_filters == self._out_filters and
+        self._strides == 1)
+    if not residual_applies:
+      return x
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+    return self._add([x, shortcut])
--- a/official/vision/modeling/layers/nn_blocks_3d.py
+++ b/official/vision/modeling/layers/nn_blocks_3d.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common building blocks for 3D networks."""
+# Import libraries
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.modeling.layers import nn_layers
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SelfGating(tf.keras.layers.Layer):
+  """Feature gating as used in S3D-G.
+
+  This implements the S3D-G network from:
+  Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy.
+  Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video
+  Classification.
+  (https://arxiv.org/pdf/1712.04851.pdf)
+  """
+
+  def __init__(self, filters, **kwargs):
+    """Initializes a self-gating layer.
+
+    Args:
+      filters: An `int` number of filters for the convolutional layer.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(SelfGating, self).__init__(**kwargs)
+    self._filters = filters
+
+  def get_config(self):
+    """Returns the constructor arguments merged into the base layer config."""
+    # `filters` is a required constructor argument; it has to be part of the
+    # config so that this registered layer can be rebuilt via `from_config`.
+    config = {'filters': self._filters}
+    base_config = super(SelfGating, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the pooling layer and the 1x1x1 gating convolution."""
+    self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D()
+
+    # No BN and activation after conv.
+    self._transformer_w = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[1, 1, 1],
+        use_bias=True,
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(
+            mean=0.0, stddev=0.01))
+
+    super(SelfGating, self).build(input_shape)
+
+  def call(self, inputs):
+    """Scales `inputs` by a sigmoid gate computed from their global average."""
+    x = self._spatial_temporal_average(inputs)
+
+    # Re-insert three singleton axes so the gate can be multiplied with
+    # `inputs`.
+    # NOTE(review): inserting at axes 1-3 presumes a channels-last layout --
+    # confirm.
+    x = tf.expand_dims(x, 1)
+    x = tf.expand_dims(x, 2)
+    x = tf.expand_dims(x, 3)
+
+    x = self._transformer_w(x)
+    x = tf.nn.sigmoid(x)
+
+    return tf.math.multiply(x, inputs)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock3D(tf.keras.layers.Layer):
+  """Creates a 3D bottleneck block.
+
+  The main branch is a temporal convolution, a 1x3x3 spatial convolution and a
+  1x1x1 expansion convolution, each followed by batch normalization. Its output
+  is added to a shortcut of the input before the final activation.
+  """
+
+  def __init__(self,
+               filters,
+               temporal_kernel_size,
+               temporal_strides,
+               spatial_strides,
+               stochastic_depth_drop_rate=0.0,
+               se_ratio=None,
+               use_self_gating=False,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """Initializes a 3D bottleneck block with BN after convolutions.
+
+    Args:
+      filters: An `int` number of filters for the temporal and spatial
+        convolutions. Note that the expansion and shortcut convolutions use 4
+        times as many filters.
+      temporal_kernel_size: An `int` of kernel size for the temporal
+        convolutional layer.
+      temporal_strides: An `int` of temporal stride for the temporal
+        convolutional layer.
+      spatial_strides: An `int` of spatial stride for the spatial convolutional
+        layer.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
+        the stochastic depth layer.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      use_self_gating: A `bool` of whether to apply self-gating module or not.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv3D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv3D.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super().__init__(**kwargs)
+
+    # Keep every constructor argument so `get_config` can report it.
+    self._filters = filters
+    self._temporal_kernel_size = temporal_kernel_size
+    self._temporal_strides = temporal_strides
+    self._spatial_strides = spatial_strides
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._se_ratio = se_ratio
+    self._use_self_gating = use_self_gating
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._activation = activation
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    # Normalization class and channel axis depend on sync-BN / data format.
+    self._norm = (
+        tf.keras.layers.experimental.SyncBatchNormalization
+        if use_sync_bn else tf.keras.layers.BatchNormalization)
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    self._bn_axis = -1 if channels_last else 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the shortcut layers, the main branch and optional modules."""
+    expanded_filters = 4 * self._filters
+    block_strides = [
+        self._temporal_strides, self._spatial_strides, self._spatial_strides
+    ]
+    # Settings shared by every convolution / normalization in the block.
+    conv_kwargs = dict(
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    norm_kwargs = dict(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Shortcut options: a 1x1x1 strided max-pool or a strided projection.
+    self._shortcut_maxpool = tf.keras.layers.MaxPool3D(
+        pool_size=[1, 1, 1], strides=block_strides)
+
+    self._shortcut_conv = tf.keras.layers.Conv3D(
+        filters=expanded_filters,
+        kernel_size=1,
+        strides=block_strides,
+        **conv_kwargs)
+    self._norm0 = self._norm(**norm_kwargs)
+
+    # Main branch: temporal conv -> spatial conv -> channel expansion.
+    self._temporal_conv = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[self._temporal_kernel_size, 1, 1],
+        strides=[self._temporal_strides, 1, 1],
+        padding='same',
+        **conv_kwargs)
+    self._norm1 = self._norm(**norm_kwargs)
+
+    self._spatial_conv = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[1, 3, 3],
+        strides=[1, self._spatial_strides, self._spatial_strides],
+        padding='same',
+        **conv_kwargs)
+    self._norm2 = self._norm(**norm_kwargs)
+
+    self._expand_conv = tf.keras.layers.Conv3D(
+        filters=expanded_filters,
+        kernel_size=[1, 1, 1],
+        strides=[1, 1, 1],
+        padding='same',
+        **conv_kwargs)
+    self._norm3 = self._norm(**norm_kwargs)
+
+    # Optional modules; each stays None when it is not configured.
+    if self._se_ratio and 0 < self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._filters * 4,
+          out_filters=self._filters * 4,
+          se_ratio=self._se_ratio,
+          use_3d_input=True,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    drop_rate = self._stochastic_depth_drop_rate
+    self._stochastic_depth = (
+        nn_layers.StochasticDepth(drop_rate) if drop_rate else None)
+
+    self._self_gating = (
+        SelfGating(filters=expanded_filters)
+        if self._use_self_gating else None)
+
+    super().build(input_shape)
+
+  def get_config(self):
+    """Returns the constructor arguments merged into the base layer config."""
+    block_config = {
+        'filters': self._filters,
+        'temporal_kernel_size': self._temporal_kernel_size,
+        'temporal_strides': self._temporal_strides,
+        'spatial_strides': self._spatial_strides,
+        'use_self_gating': self._use_self_gating,
+        'se_ratio': self._se_ratio,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    return {**super().get_config(), **block_config}
+
+  def call(self, inputs, training=None):
+    """Runs the bottleneck branch and adds it to the shortcut."""
+    in_filters = inputs.shape.as_list()[-1]
+    if in_filters != 4 * self._filters:
+      # Channel counts differ: project the input with the strided conv + BN.
+      shortcut = self._norm0(self._shortcut_conv(inputs))
+    elif self._temporal_strides == 1 and self._spatial_strides == 1:
+      # Same channels, no striding: plain identity shortcut.
+      shortcut = inputs
+    else:
+      # Same channels but strided: subsample with the 1x1x1 max-pool.
+      shortcut = self._shortcut_maxpool(inputs)
+
+    x = self._activation_fn(self._norm1(self._temporal_conv(inputs)))
+    x = self._activation_fn(self._norm2(self._spatial_conv(x)))
+    x = self._norm3(self._expand_conv(x))
+
+    # Optional modules run on the branch output: self-gating, then SE, then
+    # stochastic depth.
+    for module in (self._self_gating, self._squeeze_excitation):
+      if module:
+        x = module(x)
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    # The final activation is applied after merging with the shortcut.
+    return self._activation_fn(x + shortcut)
--- a/official/vision/modeling/layers/nn_blocks_3d_test.py
+++ b/official/vision/modeling/layers/nn_blocks_3d_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for nn_blocks_3d."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.modeling.layers import nn_blocks_3d
+
+
+class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape checks for the 3D building blocks in `nn_blocks_3d`."""
+
+  @parameterized.parameters(
+      (nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True, 0.2, 0.1),
+      (nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False, 0.0, 0.0),
+  )
+  def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size,
+                                     temporal_strides, spatial_strides,
+                                     use_self_gating, se_ratio,
+                                     stochastic_depth):
+    # The block keeps 4 * filters channels and divides the temporal / spatial
+    # extents by the respective strides.
+    num_frames, side, base_filters = 16, 128, 256
+    channels = base_filters * 4
+    symbolic_input = tf.keras.Input(
+        shape=(num_frames, side, side, channels), batch_size=1)
+    layer = block_fn(
+        filters=base_filters,
+        temporal_kernel_size=temporal_kernel_size,
+        temporal_strides=temporal_strides,
+        spatial_strides=spatial_strides,
+        use_self_gating=use_self_gating,
+        se_ratio=se_ratio,
+        stochastic_depth_drop_rate=stochastic_depth)
+
+    output_shape = layer(symbolic_input).shape.as_list()
+
+    expected_shape = [
+        1,
+        num_frames // temporal_strides,
+        side // spatial_strides,
+        side // spatial_strides,
+        channels,
+    ]
+    self.assertAllEqual(expected_shape, output_shape)
+
+
+# Run the test cases defined in this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/nn_blocks_test.py
+++ b/official/vision/modeling/layers/nn_blocks_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for nn_blocks."""
+
+from typing import Any, Iterable, Tuple
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.vision.modeling.layers import nn_blocks
+
+
+def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
+  """Returns the distribution-strategy combinations the tests run under."""
+  strategies = [
+      strategy_combinations.default_strategy,
+      strategy_combinations.cloud_tpu_strategy,
+      strategy_combinations.one_device_strategy_gpu,
+  ]
+  return combinations.combine(distribution=strategies)
+
+
+class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape checks for the 2D building blocks in `nn_blocks`."""
+
+  @parameterized.parameters(
+      (nn_blocks.ResidualBlock, 1, False, 0.0, None),
+      (nn_blocks.ResidualBlock, 2, True, 0.2, 0.25),
+  )
+  def test_residual_block_creation(self, block_fn, strides, use_projection,
+                                   stochastic_depth_drop_rate, se_ratio):
+    # Expect unchanged channels and height/width divided by the stride.
+    side, channels = 128, 256
+    symbolic_input = tf.keras.Input(
+        shape=(side, side, channels), batch_size=1)
+    layer = block_fn(
+        channels,
+        strides,
+        use_projection=use_projection,
+        se_ratio=se_ratio,
+        stochastic_depth_drop_rate=stochastic_depth_drop_rate,
+    )
+
+    output_shape = layer(symbolic_input).shape.as_list()
+
+    expected_shape = [1, side // strides, side // strides, channels]
+    self.assertAllEqual(expected_shape, output_shape)
+
+  @parameterized.parameters(
+      (nn_blocks.BottleneckBlock, 1, False, 0.0, None),
+      (nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25),
+  )
+  def test_bottleneck_block_creation(self, block_fn, strides, use_projection,
+                                     stochastic_depth_drop_rate, se_ratio):
+    # Input and expected output both carry 4x the block's base filter count.
+    side, base_filters = 128, 256
+    channels = base_filters * 4
+    symbolic_input = tf.keras.Input(
+        shape=(side, side, channels), batch_size=1)
+    layer = block_fn(
+        base_filters,
+        strides,
+        use_projection=use_projection,
+        se_ratio=se_ratio,
+        stochastic_depth_drop_rate=stochastic_depth_drop_rate)
+
+    output_shape = layer(symbolic_input).shape.as_list()
+
+    expected_shape = [1, side // strides, side // strides, channels]
+    self.assertAllEqual(expected_shape, output_shape)
+
+  @parameterized.parameters(
+      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
+      (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
+      (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
+      (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
+      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
+  )
+  def test_invertedbottleneck_block_creation(self, block_fn, expand_ratio,
+                                             strides, se_ratio,
+                                             stochastic_depth_drop_rate):
+    # The block maps 24 input channels to 40 output channels.
+    side, channels_in, channels_out = 128, 24, 40
+    symbolic_input = tf.keras.Input(
+        shape=(side, side, channels_in), batch_size=1)
+    layer = block_fn(
+        in_filters=channels_in,
+        out_filters=channels_out,
+        expand_ratio=expand_ratio,
+        strides=strides,
+        se_ratio=se_ratio,
+        stochastic_depth_drop_rate=stochastic_depth_drop_rate)
+
+    output_shape = layer(symbolic_input).shape.as_list()
+
+    expected_shape = [1, side // strides, side // strides, channels_out]
+    self.assertAllEqual(expected_shape, output_shape)
+
+  @parameterized.parameters(
+      (nn_blocks.TuckerConvBlock, 1, 0.25, 0.25),
+      (nn_blocks.TuckerConvBlock, 2, 0.25, 0.25),
+  )
+  def test_tucker_conv_block(
+      self, block_fn, strides,
+      input_compression_ratio, output_compression_ratio):
+    # Input and output widths are equal here (24 channels each).
+    side, channels_in, channels_out = 128, 24, 24
+    symbolic_input = tf.keras.Input(
+        shape=(side, side, channels_in), batch_size=1)
+    layer = block_fn(
+        in_filters=channels_in,
+        out_filters=channels_out,
+        input_compression_ratio=input_compression_ratio,
+        output_compression_ratio=output_compression_ratio,
+        strides=strides)
+
+    output_shape = layer(symbolic_input).shape.as_list()
+
+    expected_shape = [1, side // strides, side // strides, channels_out]
+    self.assertAllEqual(expected_shape, output_shape)
+
+
+class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
+  """Shape test for `nn_blocks.ResidualInner`."""
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_shape(self, distribution):
+    """Checks the output shape of a strided `ResidualInner` layer."""
+    batch, height, width, channels = 8, 32, 32, 32
+    num_filters, stride = 64, 2
+
+    features = tf.random.uniform(shape=[batch, height, width, channels])
+    with distribution.scope():
+      layer = nn_blocks.ResidualInner(num_filters, stride)
+
+    result = layer(features)
+    self.assertEqual(
+        [batch, height // stride, width // stride, num_filters],
+        result.shape.as_list())
+
+
+class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
+  """Shape test for `nn_blocks.BottleneckResidualInner`."""
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_shape(self, distribution):
+    """Checks that the bottleneck layer outputs `filters * 4` channels."""
+    batch, height, width, channels = 8, 32, 32, 32
+    num_filters, stride = 64, 2
+
+    features = tf.random.uniform(shape=[batch, height, width, channels])
+    with distribution.scope():
+      layer = nn_blocks.BottleneckResidualInner(num_filters, stride)
+
+    result = layer(features)
+    self.assertEqual(
+        [batch, height // stride, width // stride, num_filters * 4],
+        result.shape.as_list())
+
+
+class DepthwiseSeparableConvBlockTest(parameterized.TestCase, tf.test.TestCase):
+  """Shape and config round-trip test for `DepthwiseSeparableConvBlock`."""
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_shape(self, distribution):
+    """Checks output shape for a block and for a copy rebuilt from its config."""
+    batch, rows, cols, channels = 8, 32, 32, 32
+    filters, stride = 64, 2
+
+    features = tf.random.normal(shape=[batch, rows, cols, channels])
+    with distribution.scope():
+      original = nn_blocks.DepthwiseSeparableConvBlock(filters, strides=stride)
+      rebuilt = nn_blocks.DepthwiseSeparableConvBlock(**original.get_config())
+
+    expected_shape = [batch, rows // stride, cols // stride, filters]
+    # The original block is exercised first, then the one rebuilt from config.
+    for layer in (original, rebuilt):
+      self.assertEqual(layer(features).shape.as_list(), expected_shape)
+
+
+class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for `nn_blocks.ReversibleLayer` under distribution strategies."""
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_downsampling_non_reversible_step(self, distribution):
+    """Runs one training step with strides=2 and checks the output shape."""
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = 64
+    strides = 2
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
+    with distribution.scope():
+      # f and g each produce half of the output channels.
+      f = nn_blocks.ResidualInner(
+          filters=filters // 2, strides=strides, batch_norm_first=True)
+      g = nn_blocks.ResidualInner(
+          filters=filters // 2, strides=1, batch_norm_first=True)
+      test_layer = nn_blocks.ReversibleLayer(f, g)
+      test_layer.build(input_tensor.shape)
+      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
+
+    @tf.function
+    def step_fn():
+      with tf.GradientTape() as tape:
+        output = test_layer(input_tensor, training=True)
+      grads = tape.gradient(output, test_layer.trainable_variables)
+      # Test applying gradients with optimizer works
+      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
+
+      return output
+
+    replica_output = distribution.run(step_fn)
+    outputs = distribution.experimental_local_results(replica_output)
+
+    # Assert forward pass shape
+    expected_output_shape = [bsz, h // strides, w // strides, filters]
+    for output in outputs:
+      self.assertEqual(expected_output_shape, output.shape.as_list())
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_reversible_step(self, distribution):
+    """Runs one training step with strides=1 and checks variables are updated."""
+    # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = c
+    strides = 1
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
+    with distribution.scope():
+      f = nn_blocks.ResidualInner(
+          filters=filters // 2, strides=strides, batch_norm_first=False)
+      g = nn_blocks.ResidualInner(
+          filters=filters // 2, strides=1, batch_norm_first=False)
+      test_layer = nn_blocks.ReversibleLayer(f, g)
+      test_layer(input_tensor, training=False)  # init weights
+      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
+
+    @tf.function
+    def step_fn():
+      with tf.GradientTape() as tape:
+        output = test_layer(input_tensor, training=True)
+      grads = tape.gradient(output, test_layer.trainable_variables)
+      # Test applying gradients with optimizer works
+      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
+
+      return output
+
+    @tf.function
+    def fwd():
+      test_layer(input_tensor)
+
+    distribution.run(fwd)  # Initialize variables
+    # Snapshot the variable values before the training step so they can be
+    # compared with the values after the optimizer update.
+    prev_variables = tf.identity_n(test_layer.trainable_variables)
+    replica_output = distribution.run(step_fn)
+    outputs = distribution.experimental_local_results(replica_output)
+
+    # Assert that the variable values have changed
+    for v0, v1 in zip(prev_variables, test_layer.trainable_variables):
+      self.assertNotAllEqual(v0, v1)
+
+    # Assert forward pass shape
+    expected_output_shape = [bsz, h // strides, w // strides, filters]
+    for output in outputs:
+      self.assertEqual(expected_output_shape, output.shape.as_list())
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_manual_gradients_correctness(self, distribution):
+    """Compares manually computed gradients against autograd gradients.
+
+    Two `ReversibleLayer`s with identical weights are built, one with the
+    default gradient mode and one with `manual_grads=False`; their gradients
+    and non-trainable variables must agree.
+    """
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = c
+    strides = 1
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4])  # bottleneck
+    with distribution.scope():
+      f_manual = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2, strides=strides, batch_norm_first=False)
+      g_manual = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2, strides=1, batch_norm_first=False)
+      manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
+      manual_grad_layer(input_tensor, training=False)  # init weights
+
+      f_auto = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2, strides=strides, batch_norm_first=False)
+      g_auto = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2, strides=1, batch_norm_first=False)
+      auto_grad_layer = nn_blocks.ReversibleLayer(
+          f_auto, g_auto, manual_grads=False)
+      auto_grad_layer(input_tensor)  # init weights
+      # Clone all weights (tf.keras.layers.Layer has no .clone())
+      auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
+      auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())
+
+    @tf.function
+    def manual_fn():
+      with tf.GradientTape() as tape:
+        output = manual_grad_layer(input_tensor, training=True)
+      grads = tape.gradient(output, manual_grad_layer.trainable_variables)
+      return grads
+
+    @tf.function
+    def auto_fn():
+      with tf.GradientTape() as tape:
+        output = auto_grad_layer(input_tensor, training=True)
+      grads = tape.gradient(output, auto_grad_layer.trainable_variables)
+      return grads
+
+    manual_grads = distribution.run(manual_fn)
+    auto_grads = distribution.run(auto_fn)
+
+    # Assert gradients calculated manually are close to that from autograd
+    for manual_grad, auto_grad in zip(manual_grads, auto_grads):
+      self.assertAllClose(
+          distribution.experimental_local_results(manual_grad),
+          distribution.experimental_local_results(auto_grad),
+          atol=5e-3,
+          rtol=5e-3)
+
+    # Verify that BN moving mean and variance is correct.
+    for manual_var, auto_var in zip(manual_grad_layer.non_trainable_variables,
+                                    auto_grad_layer.non_trainable_variables):
+      self.assertAllClose(manual_var, auto_var)
+
+
+# Run the test suite when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/nn_layers.py
+++ b/official/vision/modeling/layers/nn_layers.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common building blocks for neural networks."""
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
+
+from absl import logging
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+from official.modeling import tf_utils
+from official.vision.ops import spatial_transform_ops
+
+
+# Type annotations.
+# Maps a state name to the tensor holding that state.
+States = Dict[str, tf.Tensor]
+# An activation given either by name or as a callable.
+Activation = Union[str, Callable]
+
+
+def make_divisible(value: float,
+                   divisor: int,
+                   min_value: Optional[float] = None,
+                   round_down_protect: bool = True,
+                   ) -> int:
+  """Rounds `value` to the nearest multiple of `divisor`.
+
+  Typically used so that layer channel counts are a multiple of some number.
+
+  Args:
+    value: A `float` of original value.
+    divisor: An `int` that the result is rounded to a multiple of.
+    min_value: A `float` lower bound for the result; defaults to `divisor`.
+    round_down_protect: A `bool`. If True, a result that falls below 90% of
+      `value` is bumped up by one `divisor`.
+
+  Returns:
+    The adjusted value as an `int`.
+  """
+  lower_bound = divisor if min_value is None else min_value
+  # Adding half a divisor before flooring rounds to the nearest multiple.
+  nearest_multiple = (int(value + divisor / 2) // divisor) * divisor
+  result = max(lower_bound, nearest_multiple)
+  # Make sure that round down does not go down by more than 10%.
+  if round_down_protect and result < 0.9 * value:
+    result += divisor
+  return int(result)
+
+
+def round_filters(filters: int,
+                  multiplier: float,
+                  divisor: int = 8,
+                  min_depth: Optional[int] = None,
+                  round_down_protect: bool = True,
+                  skip: bool = False) -> int:
+  """Scales `filters` by a width multiplier and rounds via `make_divisible`.
+
+  Args:
+    filters: An `int` of the original number of filters.
+    multiplier: A `float` width multiplier; a falsy value leaves `filters`
+      unchanged.
+    divisor: An `int` the scaled filter count is rounded to a multiple of.
+    min_depth: An optional `int` lower bound for the result.
+    round_down_protect: A `bool` forwarded to `make_divisible`.
+    skip: A `bool`. If True, `filters` is returned unchanged.
+
+  Returns:
+    The rounded number of filters as an `int`.
+  """
+  scaling_disabled = skip or not multiplier
+  if scaling_disabled:
+    return filters
+
+  rounded = make_divisible(
+      value=filters * multiplier,
+      divisor=divisor,
+      min_value=min_depth,
+      round_down_protect=round_down_protect)
+  logging.info('round_filter input=%s output=%s', filters, rounded)
+  return int(rounded)
+
+
+def get_padding_for_kernel_size(kernel_size):
+  """Returns the `(pad, pad)` tuple for a supported kernel size.
+
+  Args:
+    kernel_size: The kernel size; only 3 and 7 are supported.
+
+  Returns:
+    `(1, 1)` for kernel size 3 and `(3, 3)` for kernel size 7.
+
+  Raises:
+    ValueError: If `kernel_size` is neither 3 nor 7.
+  """
+  if kernel_size == 3:
+    return (1, 1)
+  if kernel_size == 7:
+    return (3, 3)
+  message = 'Padding for kernel size {} not known.'.format(kernel_size)
+  raise ValueError(message)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SqueezeExcitation(tf.keras.layers.Layer):
+  """Creates a squeeze and excitation layer.
+
+  The input is averaged over its spatial axes, passed through a reducing and
+  an expanding 1x1 convolution, and the gated result rescales the input.
+  """
+
+  def __init__(self,
+               in_filters,
+               out_filters,
+               se_ratio,
+               divisible_by=1,
+               use_3d_input=False,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               gating_activation='sigmoid',
+               round_down_protect=True,
+               **kwargs):
+    """Initializes a squeeze and excitation layer.
+
+    Args:
+      in_filters: An `int` number of filters of the input tensor.
+      out_filters: An `int` number of filters of the output tensor.
+      se_ratio: A `float` ratio applied to `in_filters` to size the reduced
+        (squeezed) convolution.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
+        this number.
+      use_3d_input: A `bool` of whether input is 2D or 3D image.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      gating_activation: A `str` name of the activation function for final
+        gating function.
+      round_down_protect: A `bool` of whether round down more than 10% will be
+        allowed.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super().__init__(**kwargs)
+
+    # Constructor arguments, kept verbatim for `get_config`.
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._round_down_protect = round_down_protect
+    self._use_3d_input = use_3d_input
+    self._activation = activation
+    self._gating_activation = gating_activation
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    # Spatial axes start right after the batch axis for channels_last and
+    # after the channel axis otherwise; 3D inputs have one extra spatial axis.
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    first_spatial_axis = 1 if channels_last else 2
+    num_spatial_axes = 3 if use_3d_input else 2
+    self._spatial_axis = list(
+        range(first_spatial_axis, first_spatial_axis + num_spatial_axes))
+
+    self._activation_fn = tf_utils.get_activation(activation)
+    self._gating_activation_fn = tf_utils.get_activation(gating_activation)
+
+  def build(self, input_shape):
+    """Creates the reducing and expanding 1x1 convolutions."""
+    squeezed_filters = max(1, int(self._in_filters * self._se_ratio))
+    num_reduced_filters = make_divisible(
+        squeezed_filters,
+        divisor=self._divisible_by,
+        round_down_protect=self._round_down_protect)
+
+    # Both convolutions share every setting except the filter count.
+    shared_conv_kwargs = dict(
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+
+    self._se_reduce = tf.keras.layers.Conv2D(
+        filters=num_reduced_filters, **shared_conv_kwargs)
+    self._se_expand = tf.keras.layers.Conv2D(
+        filters=self._out_filters, **shared_conv_kwargs)
+
+    super().build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config extended with this layer's arguments."""
+    merged = dict(super().get_config())
+    merged.update({
+        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
+        'se_ratio': self._se_ratio,
+        'divisible_by': self._divisible_by,
+        'use_3d_input': self._use_3d_input,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'gating_activation': self._gating_activation,
+        'round_down_protect': self._round_down_protect,
+    })
+    return merged
+
+  def call(self, inputs):
+    """Rescales `inputs` by the gate computed from its spatial average."""
+    pooled = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
+    squeezed = self._activation_fn(self._se_reduce(pooled))
+    gate = self._gating_activation_fn(self._se_expand(squeezed))
+    return gate * inputs
+
+
+def get_stochastic_depth_rate(init_rate, i, n):
+  """Computes the drop rate for the ith block, scaled linearly by `i / n`.
+
+  Args:
+    init_rate: A `float` of initial drop rate, or None.
+    i: An `int` of order of the current block.
+    n: An `int` total number of blocks.
+
+  Returns:
+    `init_rate * i / n`, or None when `init_rate` is None.
+
+  Raises:
+    ValueError: If `init_rate` is outside the range [0, 1].
+  """
+  if init_rate is None:
+    return None
+  if init_rate < 0 or init_rate > 1:
+    raise ValueError('Initial drop rate must be within 0 and 1.')
+  return init_rate * float(i) / n
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class StochasticDepth(tf.keras.layers.Layer):
+  """Creates a stochastic depth layer.
+
+  In training, each example of the batch is zeroed with probability
+  `stochastic_depth_drop_rate` and the kept examples are divided by the keep
+  probability. Outside of training the layer returns its input unchanged.
+  """
+
+  def __init__(self, stochastic_depth_drop_rate, **kwargs):
+    """Initializes a stochastic depth layer.
+
+    Args:
+      stochastic_depth_drop_rate: A `float` of drop rate. None or 0 disables
+        dropping.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(StochasticDepth, self).__init__(**kwargs)
+    self._drop_rate = stochastic_depth_drop_rate
+
+  def get_config(self):
+    """Returns the layer config, keyed by the `__init__` argument names."""
+    # The key must match the constructor argument so that `cls(**config)`
+    # can rebuild the layer.
+    config = {'stochastic_depth_drop_rate': self._drop_rate}
+    base_config = super(StochasticDepth, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    """Rebuilds the layer from a config.
+
+    Args:
+      config: A config `dict` as produced by `get_config`. Configs that use
+        the legacy `drop_rate` key are accepted as well.
+
+    Returns:
+      A `StochasticDepth` layer.
+    """
+    config = dict(config)
+    if 'drop_rate' in config:
+      legacy_rate = config.pop('drop_rate')
+      config.setdefault('stochastic_depth_drop_rate', legacy_rate)
+    return cls(**config)
+
+  def call(self, inputs, training=None):
+    """Applies stochastic depth to `inputs`.
+
+    Args:
+      inputs: An input `tf.Tensor`.
+      training: A `bool` of whether the layer runs in training mode. If None,
+        the Keras learning phase is used.
+
+    Returns:
+      A `tf.Tensor` with the same shape as `inputs`.
+    """
+    if training is None:
+      training = tf.keras.backend.learning_phase()
+    if not training or self._drop_rate is None or self._drop_rate == 0:
+      return inputs
+
+    keep_prob = 1.0 - self._drop_rate
+    batch_size = tf.shape(inputs)[0]
+    # One random value per example, broadcast over all remaining axes.
+    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
+    random_tensor = keep_prob
+    random_tensor += tf.random.uniform(
+        [batch_size] + [1] * (inputs.shape.rank - 1), dtype=inputs.dtype)
+    binary_tensor = tf.floor(random_tensor)
+    # Rescale kept examples so the expected value matches the input.
+    output = tf.math.divide(inputs, keep_prob) * binary_tensor
+    return output
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+def pyramid_feature_fusion(inputs, target_level):
+  """Fuses all feature maps in the feature pyramid at the target level.
+
+  Every level between the minimum and maximum key of `inputs` is bilinearly
+  resized to the resolution of `target_level` and the results are summed.
+
+  Args:
+    inputs: A dictionary containing the feature pyramid, keyed by level. The
+      size of the input tensor needs to be fixed.
+    target_level: An `int` of the target feature level for feature fusion.
+
+  Returns:
+    A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
+      feature_channel].
+  """
+  # Convert keys to int.
+  pyramid_feats = {int(k): v for k, v in inputs.items()}
+  min_level = min(pyramid_feats)
+  max_level = max(pyramid_feats)
+  resampled_feats = []
+
+  for l in range(min_level, max_level + 1):
+    feat = pyramid_feats[l]
+    if l == target_level:
+      resampled_feats.append(feat)
+      continue
+
+    level_diff = l - target_level
+    target_size = list(feat.shape[1:3])
+    if level_diff > 0:
+      # Coarser level: upsample by 2 per level of difference.
+      scale = 2**level_diff
+      target_size = [dim * scale for dim in target_size]
+    else:
+      # Finer level: downsample. Integer division keeps the size an int;
+      # multiplying by 2**negative would produce a float size.
+      scale = 2**(-level_diff)
+      target_size = [dim // scale for dim in target_size]
+    # Casts feat to float32 so the resize op can be run on TPU.
+    resized = tf.cast(feat, tf.float32)
+    resized = tf.image.resize(
+        resized, size=target_size, method=tf.image.ResizeMethod.BILINEAR)
+    # Casts it back to be compatible with the rest of the operations.
+    resized = tf.cast(resized, feat.dtype)
+    resampled_feats.append(resized)
+
+  return tf.math.add_n(resampled_feats)
+
+
+class PanopticFPNFusion(tf.keras.Model):
+  """Creates a Panoptic FPN feature Fusion layer.
+
+  This implements feature fusion for semantic segmentation head from the paper:
+  Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar.
+  Panoptic Feature Pyramid Networks.
+  (https://arxiv.org/pdf/1901.02446.pdf)
+
+  The model is built with the Keras functional API: it takes a dict of
+  per-level feature maps keyed by the level as a string and outputs the sum of
+  the processed features.
+  """
+
+  def __init__(
+      self,
+      min_level: int = 2,
+      max_level: int = 5,
+      target_level: int = 2,
+      num_filters: int = 128,
+      num_fpn_filters: int = 256,
+      activation: str = 'relu',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+
+    """Initializes panoptic FPN feature fusion layer.
+
+    Args:
+      min_level: An `int` of minimum level to use in feature fusion.
+      max_level: An `int` of maximum level to use in feature fusion.
+      target_level: An `int` of the target feature level for feature fusion.
+      num_filters: An `int` number of filters in conv2d layers.
+      num_fpn_filters: An `int` number of filters in the FPN outputs, used as
+        the channel count of the model inputs.
+      activation: A `str` name of the activation function.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+
+    Raises:
+      ValueError: If `target_level` is greater than `max_level`.
+    """
+    # NOTE(review): the check allows target_level == max_level although the
+    # message says "less than" -- confirm which one is intended.
+    if target_level > max_level:
+      raise ValueError('target_level should be less than max_level')
+
+    # Constructor arguments, returned as-is by `get_config`.
+    self._config_dict = {
+        'min_level': min_level,
+        'max_level': max_level,
+        'target_level': target_level,
+        'num_filters': num_filters,
+        'num_fpn_filters': num_fpn_filters,
+        'activation': activation,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+    norm = tfa.layers.GroupNormalization
+    conv2d = tf.keras.layers.Conv2D
+    activation_fn = tf_utils.get_activation(activation)
+    # Normalize over the channel axis, whose position depends on data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      norm_axis = -1
+    else:
+      norm_axis = 1
+    inputs = self._build_inputs(num_fpn_filters, min_level, max_level)
+
+    upscaled_features = []
+    for level in range(min_level, max_level + 1):
+      # Every level gets at least one conv stage; levels above the target get
+      # one stage per level of difference.
+      num_conv_layers = max(1, level - target_level)
+      x = inputs[str(level)]
+      for i in range(num_conv_layers):
+        x = conv2d(
+            filters=num_filters,
+            kernel_size=3,
+            padding='same',
+            kernel_initializer=tf.keras.initializers.VarianceScaling(),
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer)(x)
+        x = norm(groups=32, axis=norm_axis)(x)
+        x = activation_fn(x)
+        # Each stage of a non-target level is followed by an upsampling with
+        # scale=2.
+        if level != target_level:
+          x = spatial_transform_ops.nearest_upsampling(x, scale=2)
+      upscaled_features.append(x)
+
+    fused_features = tf.math.add_n(upscaled_features)
+    self._output_specs = {str(target_level): fused_features.get_shape()}
+
+    # The graph built above is handed to the functional-API constructor.
+    super(PanopticFPNFusion, self).__init__(
+        inputs=inputs, outputs=fused_features, **kwargs)
+
+  def _build_inputs(self, num_filters: int,
+                    min_level: int, max_level: int):
+    """Creates one Keras input per level with unspecified spatial size.
+
+    Args:
+      num_filters: An `int` number of channels of every input.
+      min_level: An `int` of the first level to create an input for.
+      max_level: An `int` of the last level (inclusive).
+
+    Returns:
+      A `dict` mapping the level as a `str` to its `tf.keras.Input`.
+    """
+    inputs = {}
+    for level in range(min_level, max_level + 1):
+      inputs[str(level)] = tf.keras.Input(shape=[None, None, num_filters])
+    return inputs
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the constructor arguments stored at initialization."""
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Rebuilds the model by passing `config` to the constructor."""
+    return cls(**config)
+
+  @property
+  def output_specs(self) -> Mapping[str, tf.TensorShape]:
+    """A dict of {level: TensorShape} pairs for the model output."""
+    return self._output_specs
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class Scale(tf.keras.layers.Layer):
+  """Multiplies the input by a single trainable scalar.
+
+  This is useful for applying ReZero to layers, which improves convergence
+  speed. This implements the paper:
+  ReZero is All You Need: Fast Convergence at Large Depth.
+  (https://arxiv.org/pdf/2003.04887.pdf).
+  """
+
+  def __init__(
+      self,
+      initializer: tf.keras.initializers.Initializer = 'ones',
+      regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a scale layer and creates its scalar weight.
+
+    Args:
+      initializer: A `str` of initializer for the scalar weight.
+      regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight.
+      **kwargs: Additional keyword arguments to be passed to this layer.
+    """
+    super().__init__(**kwargs)
+
+    self._initializer = initializer
+    self._regularizer = regularizer
+
+    # The weight has an empty shape, i.e. it is a scalar.
+    weight_spec = dict(
+        name='scale',
+        shape=[],
+        dtype=self.dtype,
+        initializer=initializer,
+        regularizer=regularizer,
+        trainable=True)
+    self._scale = self.add_weight(**weight_spec)
+
+  def get_config(self):
+    """Returns the base layer config extended with this layer's arguments."""
+    merged = dict(super().get_config())
+    merged.update({
+        'initializer': self._initializer,
+        'regularizer': self._regularizer,
+    })
+    return merged
+
+  def call(self, inputs):
+    """Returns `inputs` multiplied by the scalar, cast to the input dtype."""
+    return tf.cast(self._scale, inputs.dtype) * inputs
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class TemporalSoftmaxPool(tf.keras.layers.Layer):
+  """Creates a network layer corresponding to temporal softmax pooling.
+
+  This is useful for multi-class logits (used in e.g., Charades). Modified from
+  AssembleNet Charades evaluation from:
+
+  Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova.
+  AssembleNet: Searching for Multi-Stream Neural Connectivity in Video
+  Architectures.
+  (https://arxiv.org/pdf/1905.13209.pdf).
+  """
+
+  def call(self, inputs):
+    """Weights `inputs` by a softmax taken over axis 1.
+
+    Args:
+      inputs: A `tf.Tensor` of rank 3, 4 or 5.
+
+    Returns:
+      A `tf.Tensor` with the same shape as `inputs`.
+    """
+    assert inputs.shape.rank in (3, 4, 5)
+    num_frames = tf.cast(tf.shape(inputs)[1], inputs.dtype)
+    # The logits are scaled down by sqrt(size of axis 1) before the softmax.
+    weights = tf.nn.softmax(inputs / tf.sqrt(num_frames), axis=1)
+    return inputs * weights
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class PositionalEncoding(tf.keras.layers.Layer):
+  """Creates a network layer that adds a sinusoidal positional encoding.
+
+  Positional encoding is incremented across frames, and is added to the input.
+  The positional encoding is first weighted at 0 so that the network can choose
+  to ignore it. This implements:
+
+  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
+  Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
+  Attention Is All You Need.
+  (https://arxiv.org/pdf/1706.03762.pdf).
+  """
+
+  def __init__(self,
+               initializer: tf.keras.initializers.Initializer = 'zeros',
+               cache_encoding: bool = False,
+               state_prefix: Optional[str] = None,
+               **kwargs):
+    """Initializes positional encoding.
+
+    Args:
+      initializer: A `str` of initializer for weighting the positional encoding.
+      cache_encoding: A `bool`. If True, cache the positional encoding tensor
+        after calling build. Otherwise, rebuild the tensor for every call.
+        Setting this to False can be useful when we want to input a variable
+        number of frames, so the positional encoding tensor can change shape.
+      state_prefix: a prefix string to identify states.
+      **kwargs: Additional keyword arguments to be passed to this layer.
+    """
+    super(PositionalEncoding, self).__init__(**kwargs)
+    self._initializer = initializer
+    self._cache_encoding = cache_encoding
+    # Filled in by `build` when `cache_encoding` is True.
+    self._pos_encoding = None
+    # Trainable scalar that weights the encoding before it is added.
+    self._rezero = Scale(initializer=initializer, name='rezero')
+    state_prefix = state_prefix if state_prefix is not None else ''
+    self._state_prefix = state_prefix
+    # Key under which the running frame count is stored in the states dict.
+    self._frame_count_name = f'{state_prefix}_pos_enc_frame_count'
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization."""
+    config = {
+        'initializer': self._initializer,
+        'cache_encoding': self._cache_encoding,
+        'state_prefix': self._state_prefix,
+    }
+    base_config = super(PositionalEncoding, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def _positional_encoding(self,
+                           num_positions: Union[int, tf.Tensor],
+                           hidden_size: Union[int, tf.Tensor],
+                           start_position: Union[int, tf.Tensor] = 0,
+                           dtype: str = 'float32') -> tf.Tensor:
+    """Creates a sequence of sinusoidal positional encoding vectors.
+
+    Args:
+      num_positions: the total number of positions (frames).
+      hidden_size: the number of channels used for the hidden vectors.
+      start_position: the start position.
+      dtype: the dtype of the output tensor.
+
+    Returns:
+      The positional encoding tensor with shape [num_positions, hidden_size].
+    """
+    # The frame count produced in `call` is a rank-1 tensor; reduce it to a
+    # scalar so it can be used as a range bound.
+    if isinstance(start_position, tf.Tensor) and start_position.shape.rank == 1:
+      start_position = start_position[0]
+
+    # Calling `tf.range` with `dtype=tf.bfloat16` results in an error,
+    # so we cast afterward.
+    positions = tf.range(start_position, start_position + num_positions)
+    positions = tf.cast(positions, dtype)[:, tf.newaxis]
+    idx = tf.range(hidden_size)[tf.newaxis, :]
+
+    # Channels 2k and 2k + 1 share the exponent 2k / hidden_size.
+    power = tf.cast(2 * (idx // 2), dtype)
+    power /= tf.cast(hidden_size, dtype)
+    angles = 1. / tf.math.pow(10_000., power)
+    radians = positions * angles
+
+    # Sine of the even channels and cosine of the odd channels are
+    # concatenated (all sines first, then all cosines), not interleaved.
+    sin = tf.math.sin(radians[:, 0::2])
+    cos = tf.math.cos(radians[:, 1::2])
+    pos_encoding = tf.concat([sin, cos], axis=-1)
+
+    return pos_encoding
+
+  def _get_pos_encoding(self,
+                        input_shape: tf.Tensor,
+                        frame_count: int = 0) -> tf.Tensor:
+    """Calculates the positional encoding from the input shape.
+
+    Args:
+      input_shape: the shape of the input.
+      frame_count: a count of frames that indicates the index of the first
+        frame.
+
+    Returns:
+      The positional encoding tensor with shape [1, frames, 1, 1, channels],
+      where frames is `input_shape[1]` and channels is `input_shape[-1]`.
+
+    """
+    frames = input_shape[1]
+    channels = input_shape[-1]
+    pos_encoding = self._positional_encoding(
+        frames, channels, start_position=frame_count, dtype=self.dtype)
+    # Singleton axes let the encoding broadcast against a 5-D input.
+    pos_encoding = tf.reshape(pos_encoding, [1, frames, 1, 1, channels])
+    return pos_encoding
+
+  def build(self, input_shape):
+    """Builds the layer with the given input shape.
+
+    Args:
+      input_shape: The input shape.
+
+    Raises:
+      ValueError: If using 'channels_first' data format.
+    """
+    if tf.keras.backend.image_data_format() == 'channels_first':
+      raise ValueError('"channels_first" mode is unsupported.')
+
+    # The cached encoding is computed with the default frame_count of 0.
+    if self._cache_encoding:
+      self._pos_encoding = self._get_pos_encoding(input_shape)
+
+    super(PositionalEncoding, self).build(input_shape)
+
+  def call(
+      self,
+      inputs: tf.Tensor,
+      states: Optional[States] = None,
+      output_states: bool = True,
+  ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
+    """Calls the layer with the given inputs.
+
+    Args:
+      inputs: An input `tf.Tensor`.
+      states: A `dict` of states such that, if any of the keys match for this
+        layer, will overwrite the contents of the buffer(s). Expected keys
+        include `state_prefix + '_pos_enc_frame_count'`.
+      output_states: A `bool`. If True, returns the output tensor and output
+        states. Returns just the output tensor otherwise.
+
+    Returns:
+      An output `tf.Tensor` (and optionally the states if `output_states=True`).
+
+    Raises:
+      ValueError: If using 'channels_first' data format (raised from `build`).
+    """
+    # Copy so the caller's dict is not mutated.
+    states = dict(states) if states is not None else {}
+
+    # Keep a count of frames encountered across input iterations in
+    # num_frames to be able to accurately update the positional encoding.
+    num_frames = tf.shape(inputs)[1]
+    frame_count = tf.cast(states.get(self._frame_count_name, [0]), tf.int32)
+    states[self._frame_count_name] = frame_count + num_frames
+
+    # With caching enabled the encoding from `build` is reused, so the
+    # incoming frame count only updates the state and does not shift the
+    # encoding.
+    if self._cache_encoding:
+      pos_encoding = self._pos_encoding
+    else:
+      pos_encoding = self._get_pos_encoding(
+          tf.shape(inputs), frame_count=frame_count)
+    pos_encoding = tf.cast(pos_encoding, inputs.dtype)
+    pos_encoding = self._rezero(pos_encoding)
+    outputs = inputs + pos_encoding
+
+    return (outputs, states) if output_states else outputs
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class GlobalAveragePool3D(tf.keras.layers.Layer):
+  """Global average pooling layer with an optional causal mode.
+
+  In causal mode a cumulative sum (`tf.cumsum`) is taken across the frame
+  axis, which allows the use of a stream buffer. Any buffer passed in through
+  the states is added to the current input so that the running sum can
+  accumulate over several calls.
+  """
+
+  def __init__(self,
+               keepdims: bool = False,
+               causal: bool = False,
+               state_prefix: Optional[str] = None,
+               **kwargs):
+    """Initializes a global average pool layer.
+
+    Args:
+      keepdims: A `bool`. If True, keep the averaged dimensions.
+      causal: A `bool` of whether to run in causal mode with a cumulative sum
+        across frames.
+      state_prefix: An optional prefix string used to build the state keys.
+        `None` is treated as the empty string.
+      **kwargs: Additional keyword arguments to be passed to this layer.
+    """
+    super(GlobalAveragePool3D, self).__init__(**kwargs)
+
+    prefix = '' if state_prefix is None else state_prefix
+
+    self._keepdims = keepdims
+    self._causal = causal
+    self._state_prefix = prefix
+
+    # Keys under which this layer reads and writes its streaming states.
+    self._state_name = f'{prefix}_pool_buffer'
+    self._frame_count_name = f'{prefix}_pool_frame_count'
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization."""
+    base_config = super(GlobalAveragePool3D, self).get_config()
+    layer_config = {
+        'keepdims': self._keepdims,
+        'causal': self._causal,
+        'state_prefix': self._state_prefix,
+    }
+    return {**base_config, **layer_config}
+
+  def call(self,
+           inputs: tf.Tensor,
+           states: Optional[States] = None,
+           output_states: bool = True
+           ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
+    """Calls the layer with the given inputs.
+
+    Args:
+      inputs: An input `tf.Tensor`. Axis 1 is read as the frame axis and axes
+        2 and 3 as the spatial axes.
+      states: An optional `dict` of states. Entries whose keys match this
+        layer are used as the incoming buffer and frame count. Expected keys
+        are `state_prefix + '_pool_buffer'` and
+        `state_prefix + '_pool_frame_count'`. The dict passed in is copied,
+        not modified.
+      output_states: A `bool`. If True, returns the output tensor and output
+        states. Returns just the output tensor otherwise.
+
+    Returns:
+      An output `tf.Tensor` (and optionally the states if `output_states=True`).
+      If `causal=True`, the output tensor will have shape
+      `[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. The frame
+      dimension is kept in this case to simulate a cumulative global average
+      as if one frame were input at a time. If `causal=False`, the output has
+      shape `[batch_size, 1, 1, 1, channels]` if `keepdims=True`. If
+      `keepdims=False`, axes 1, 2 and 3 are squeezed from the result.
+
+    Raises:
+      ValueError: If using 'channels_first' data format.
+    """
+    states = {} if states is None else dict(states)
+
+    if tf.keras.backend.image_data_format() == 'channels_first':
+      raise ValueError('"channels_first" mode is unsupported.')
+
+    # Running sum carried over from earlier calls; zeros when none was given.
+    # Shape: [batch_size, 1, 1, 1, channels]
+    buffer = states.get(self._state_name)
+    if buffer is None:
+      buffer = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype)
+      states[self._state_name] = buffer
+
+    # Number of frames seen before this call, needed to average over all
+    # frames when the input arrives in several pieces.
+    num_frames = tf.shape(inputs)[1]
+    frame_count = tf.cast(
+        states.get(self._frame_count_name, tf.constant([0])), tf.int32)
+    total_frames = frame_count + num_frames
+    states[self._frame_count_name] = total_frames
+
+    if self._causal:
+      # Average the spatial axes first, then accumulate along the frame axis.
+      spatial_mean = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True)
+      running_sum = tf.cumsum(spatial_mean, axis=1) + buffer
+
+      # The sum at the last frame becomes the buffer for the next call.
+      # Shape: [batch_size, 1, 1, 1, channels]
+      states[self._state_name] = running_sum[:, -1:]
+
+      # Frame i is divided by the number of frames seen up to and including
+      # it, which yields cumulative averages rather than one global average.
+      divisors = tf.range(num_frames) + frame_count + 1
+      divisors = tf.reshape(divisors, [1, num_frames, 1, 1, 1])
+
+      # Shape: [batch_size, num_frames, 1, 1, channels]
+      x = running_sum / tf.cast(divisors, running_sum.dtype)
+    else:
+      # Sum over frames and spatial axes, normalize by the spatial size only,
+      # and add the incoming buffer. Without an incoming buffer this reduces
+      # to a regular global average after the division below.
+      spatial_size = tf.shape(inputs)[2] * tf.shape(inputs)[3]
+      x = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True)
+      x = x / tf.cast(spatial_size, x.dtype) + buffer
+
+      # Shape: [batch_size, 1, 1, 1, channels]
+      states[self._state_name] = x
+
+      x = x / tf.cast(total_frames, x.dtype)
+
+    if not self._keepdims:
+      x = tf.squeeze(x, axis=(1, 2, 3))
+
+    if output_states:
+      return x, states
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SpatialAveragePool3D(tf.keras.layers.Layer):
+  """Average pooling layer that pools across axes 2 and 3 of a rank-5 input."""
+
+  def __init__(self, keepdims: bool = False, **kwargs):
+    """Initializes the spatial average pool layer.
+
+    Args:
+      keepdims: A `bool`. If True, keep the averaged dimensions.
+      **kwargs: Additional keyword arguments to be passed to this layer.
+    """
+    super(SpatialAveragePool3D, self).__init__(**kwargs)
+    self._keepdims = keepdims
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization."""
+    base_config = super(SpatialAveragePool3D, self).get_config()
+    layer_config = {'keepdims': self._keepdims}
+    return {**base_config, **layer_config}
+
+  def build(self, input_shape):
+    """Builds the layer, rejecting the 'channels_first' data format.
+
+    Args:
+      input_shape: The shape of the layer input.
+
+    Raises:
+      ValueError: If using 'channels_first' data format.
+    """
+    data_format = tf.keras.backend.image_data_format()
+    if data_format == 'channels_first':
+      raise ValueError('"channels_first" mode is unsupported.')
+
+    super(SpatialAveragePool3D, self).build(input_shape)
+
+  def call(self, inputs):
+    """Averages `inputs` over axes 2 and 3.
+
+    Args:
+      inputs: A rank-5 input `tf.Tensor`.
+
+    Returns:
+      The mean over axes 2 and 3, with those axes kept if `keepdims=True`.
+
+    Raises:
+      ValueError: If the static rank of `inputs` is not 5.
+    """
+    rank = inputs.shape.rank
+    if rank != 5:
+      raise ValueError(
+          'Input should have rank {}, got {}'.format(5, rank))
+
+    return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
+
+
+class CausalConvMixin:
+  """Mixin class to implement CausalConv for `tf.keras.layers.Conv` layers."""
+
+  @property
+  def use_buffered_input(self) -> bool:
+    """Whether the time axis is left unpadded (input is padded beforehand)."""
+    return self._use_buffered_input
+
+  @use_buffered_input.setter
+  def use_buffered_input(self, variable: bool):
+    self._use_buffered_input = variable
+
+  def _compute_buffered_causal_padding(
+      self,
+      inputs: tf.Tensor,
+      use_buffered_input: bool = False,
+      time_axis: int = 1,
+  ) -> List[List[int]]:
+    """Calculates padding for 'causal' option for conv layers.
+
+    Args:
+      inputs: The input `tf.Tensor` to be padded. Its shape without the first
+        and last axes is used as the convolved shape.
+      use_buffered_input: A `bool`. If True, no padding is applied along the
+        time dimension. This should be set when applying the stream buffer.
+      time_axis: An `int` of the axis of the time dimension.
+
+    Returns:
+      A list of `[before, after]` paddings for `tf.pad`, one per input axis.
+      Entries for axes other than the first convolved one may be tensors,
+      since they depend on the dynamic input shape.
+
+    Raises:
+      ValueError: If using 'channels_first' data format.
+    """
+    input_shape = tf.shape(inputs)[1:-1]
+
+    if tf.keras.backend.image_data_format() == 'channels_first':
+      raise ValueError('"channels_first" mode is unsupported.')
+
+    # Kernel extent once dilation is taken into account.
+    kernel_size_effective = []
+    for i in range(self.rank):
+      kernel = self.kernel_size[i]
+      dilation = self.dilation_rate[i]
+      kernel_size_effective.append(kernel + (kernel - 1) * (dilation - 1))
+
+    # The first convolved axis is padded by the effective kernel size minus
+    # one, independent of input size and stride. The remaining axes account
+    # for how the stride divides the input size.
+    pad_total = [kernel_size_effective[0] - 1]
+    pad_total.extend(
+        tf.maximum(
+            kernel_size_effective[i] -
+            ((input_shape[i] - 1) % self.strides[i] + 1), 0)
+        for i in range(1, self.rank))
+
+    # Split each total into [before, after]; batch and channel axes get none.
+    padding = [[0, 0]]
+    for total in pad_total:
+      before = total // 2
+      padding.append([before, total - before])
+    padding.append([0, 0])
+
+    if use_buffered_input:
+      padding[time_axis] = [0, 0]
+    else:
+      # Move all of the time padding in front of the first frame.
+      before, after = padding[time_axis]
+      padding[time_axis] = [before + after, 0]
+    return padding
+
+  def _causal_validate_init(self):
+    """Validates the Conv layer initial configuration.
+
+    Overriding the base validation is meant to circumvent unnecessary errors
+    when using causal padding.
+
+    Raises:
+      ValueError: If `filters` is set and not divisible by `groups`, or if
+        `kernel_size` contains a 0.
+    """
+    has_filters = self.filters is not None
+    if has_filters and self.filters % self.groups != 0:
+      raise ValueError(
+          'The number of filters must be evenly divisible by the number of '
+          'groups. Received: groups={}, filters={}'.format(
+              self.groups, self.filters))
+
+    if any(not size for size in self.kernel_size):
+      raise ValueError('The argument `kernel_size` cannot contain 0(s). '
+                       'Received: %s' % (self.kernel_size,))
+
+  def _buffered_spatial_output_shape(self, spatial_output_shape: List[int]):
+    """Adjusts the time entry of the output shape for buffered input.
+
+    Args:
+      spatial_output_shape: The output shape of the convolved axes. It is
+        modified in place when an adjustment is made.
+
+    Returns:
+      The same list, with entry 0 reduced by the causal time padding when
+      buffered input is enabled and entry 0 is known.
+    """
+    if not self._use_buffered_input or spatial_output_shape[0] is None:
+      return spatial_output_shape
+
+    # When buffer padding, use 'valid' padding across time. The output shape
+    # across time should be the input shape minus any padding, assuming
+    # the stride across time is 1.
+    placeholder = tf.zeros([1] + spatial_output_shape + [1])
+    padding = self._compute_buffered_causal_padding(
+        placeholder, use_buffered_input=False)
+    spatial_output_shape[0] -= sum(padding[1])
+    return spatial_output_shape
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin):
+  """Conv2D layer supporting CausalConv.
+
+  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
+  which applies causal padding to the temporal dimension, and same padding in
+  the spatial dimensions.
+  """
+
+  def __init__(self, *args, use_buffered_input=False, **kwargs):
+    """Initializes conv2d.
+
+    Args:
+      *args: Arguments to be passed to `tf.keras.layers.Conv2D`.
+      use_buffered_input: A `bool`. If True, the input is expected to be padded
+        beforehand. In effect, calling this layer will use 'valid' padding on
+        the temporal dimension to simulate 'causal' padding.
+      **kwargs: Additional keyword arguments to be passed to
+        `tf.keras.layers.Conv2D`.
+    """
+    super(Conv2D, self).__init__(*args, **kwargs)
+    self._use_buffered_input = use_buffered_input
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization."""
+    base_config = super(Conv2D, self).get_config()
+    layer_config = {'use_buffered_input': self._use_buffered_input}
+    return {**base_config, **layer_config}
+
+  def _compute_causal_padding(self, inputs):
+    """Computes causal padding dimensions for the given inputs."""
+    buffered = self._use_buffered_input
+    return self._compute_buffered_causal_padding(
+        inputs, use_buffered_input=buffered)
+
+  def _validate_init(self):
+    """Validates the Conv layer initial configuration."""
+    self._causal_validate_init()
+
+  def _spatial_output_shape(self, spatial_input_shape: List[int]):
+    """Computes the spatial output shape from the input shape."""
+    base_shape = super(Conv2D, self)._spatial_output_shape(spatial_input_shape)
+    return self._buffered_spatial_output_shape(base_shape)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin):
+  """DepthwiseConv2D layer supporting CausalConv.
+
+  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
+  which applies causal padding to the temporal dimension, and same padding in
+  the spatial dimensions.
+  """
+
+  def __init__(self, *args, use_buffered_input=False, **kwargs):
+    """Initializes depthwise conv2d.
+
+    Args:
+      *args: Arguments to be passed to `tf.keras.layers.DepthwiseConv2D`.
+      use_buffered_input: A `bool`. If True, the input is expected to be padded
+        beforehand. In effect, calling this layer will use 'valid' padding on
+        the temporal dimension to simulate 'causal' padding.
+      **kwargs: Additional keyword arguments to be passed to
+        `tf.keras.layers.DepthwiseConv2D`.
+    """
+    super(DepthwiseConv2D, self).__init__(*args, **kwargs)
+    self._use_buffered_input = use_buffered_input
+
+    # Causal padding is unsupported by default for DepthwiseConv2D,
+    # so we resort to valid padding internally. However, we handle
+    # causal padding as a special case with `self._is_causal`, which is
+    # defined by the super class.
+    if self.padding == 'causal':
+      self.padding = 'valid'
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization.
+
+    For a causal layer the 'padding' entry is reported as 'causal' rather
+    than the internal 'valid' value, so that a layer rebuilt from this config
+    is causal as well.
+    """
+    config = {
+        'use_buffered_input': self._use_buffered_input,
+    }
+    base_config = super(DepthwiseConv2D, self).get_config()
+    merged = dict(list(base_config.items()) + list(config.items()))
+    # `__init__` rewrites `self.padding` from 'causal' to 'valid', so the base
+    # config alone would describe a non-causal layer.
+    if self._is_causal:
+      merged['padding'] = 'causal'
+    return merged
+
+  def call(self, inputs):
+    """Calls the layer, padding the inputs first when in causal mode."""
+    if self._is_causal:
+      inputs = tf.pad(inputs, self._compute_causal_padding(inputs))
+    return super(DepthwiseConv2D, self).call(inputs)
+
+  def _compute_causal_padding(self, inputs):
+    """Computes causal padding dimensions for the given inputs."""
+    return self._compute_buffered_causal_padding(
+        inputs, use_buffered_input=self._use_buffered_input)
+
+  def _validate_init(self):
+    """Validates the Conv layer initial configuration."""
+    self._causal_validate_init()
+
+  def _spatial_output_shape(self, spatial_input_shape: List[int]):
+    """Computes the spatial output shape from the input shape."""
+    shape = super(DepthwiseConv2D, self)._spatial_output_shape(
+        spatial_input_shape)
+    return self._buffered_spatial_output_shape(shape)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin):
+  """Conv3D layer supporting CausalConv.
+
+  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
+  which applies causal padding to the temporal dimension, and same padding in
+  the spatial dimensions.
+  """
+
+  def __init__(self, *args, use_buffered_input=False, **kwargs):
+    """Initializes conv3d.
+
+    Args:
+      *args: Arguments to be passed to `tf.keras.layers.Conv3D`.
+      use_buffered_input: A `bool`. If True, the input is expected to be padded
+        beforehand. In effect, calling this layer will use 'valid' padding on
+        the temporal dimension to simulate 'causal' padding.
+      **kwargs: Additional keyword arguments to be passed to
+        `tf.keras.layers.Conv3D`.
+    """
+    super(Conv3D, self).__init__(*args, **kwargs)
+    self._use_buffered_input = use_buffered_input
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization."""
+    base_config = super(Conv3D, self).get_config()
+    layer_config = {'use_buffered_input': self._use_buffered_input}
+    return {**base_config, **layer_config}
+
+  def call(self, inputs):
+    """Call the layer with the given inputs."""
+    # Note: tf.nn.conv3d with depthwise kernels on CPU is currently only
+    # supported when compiling with TF graph (XLA) using tf.function, so it
+    # is compiled by default here (b/186463870).
+    # NOTE(review): a new `tf.function` wrapper is built on every call; this
+    # may cause repeated tracing in eager mode -- confirm before caching it.
+    parent_call = super(Conv3D, self).call
+    compiled_call = tf.function(parent_call, jit_compile=True)
+    return compiled_call(inputs)
+
+  def _compute_causal_padding(self, inputs):
+    """Computes causal padding dimensions for the given inputs."""
+    buffered = self._use_buffered_input
+    return self._compute_buffered_causal_padding(
+        inputs, use_buffered_input=buffered)
+
+  def _validate_init(self):
+    """Validates the Conv layer initial configuration."""
+    self._causal_validate_init()
+
+  def _spatial_output_shape(self, spatial_input_shape: List[int]):
+    """Computes the spatial output shape from the input shape."""
+    base_shape = super(Conv3D, self)._spatial_output_shape(spatial_input_shape)
+    return self._buffered_spatial_output_shape(base_shape)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SpatialPyramidPooling(tf.keras.layers.Layer):
+  """Implements the Atrous Spatial Pyramid Pooling.
+
+  References:
+    [Rethinking Atrous Convolution for Semantic Image Segmentation](
+      https://arxiv.org/pdf/1706.05587.pdf)
+    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
+    Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      dilation_rates: List[int],
+      pool_kernel_size: Optional[List[int]] = None,
+      use_sync_bn: bool = False,
+      batchnorm_momentum: float = 0.99,
+      batchnorm_epsilon: float = 0.001,
+      activation: str = 'relu',
+      dropout: float = 0.5,
+      kernel_initializer: str = 'GlorotUniform',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      interpolation: str = 'bilinear',
+      use_depthwise_convolution: bool = False,
+      **kwargs):
+    """Initializes `SpatialPyramidPooling`.
+
+    Args:
+      output_channels: Number of channels produced by SpatialPyramidPooling.
+      dilation_rates: A list of integers for parallel dilated conv.
+      pool_kernel_size: A list of integers or None. If None, global average
+        pooling is applied, otherwise an average pooling of pool_kernel_size is
+        applied.
+      use_sync_bn: A bool, whether or not to use sync batch normalization.
+      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
+        0.99.
+      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
+        0.001.
+      activation: A `str` for type of activation to be used. Defaults to 'relu'.
+      dropout: A float for the dropout rate before output. Defaults to 0.5.
+      kernel_initializer: Kernel initializer for conv layers. Defaults to
+        `'GlorotUniform'`.
+      kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
+      interpolation: The interpolation method for upsampling. Defaults to
+        `bilinear`.
+      use_depthwise_convolution: If True, each dilated branch uses a depthwise
+        3x3 convolution followed by a 1x1 convolution instead of a single 3x3
+        convolution. [Encoder-Decoder with Atrous Separable
+        Convolution for Semantic Image Segmentation](
+         https://arxiv.org/pdf/1802.02611.pdf)
+      **kwargs: Other keyword arguments for the layer.
+    """
+    super().__init__(**kwargs)
+
+    self._output_channels = output_channels
+    self._dilation_rates = dilation_rates
+    self._use_sync_bn = use_sync_bn
+    self._batchnorm_momentum = batchnorm_momentum
+    self._batchnorm_epsilon = batchnorm_epsilon
+    self._activation = activation
+    self._dropout = dropout
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._interpolation = interpolation
+    self._pool_kernel_size = pool_kernel_size
+    self._use_depthwise_convolution = use_depthwise_convolution
+    self._activation_fn = tf_utils.get_activation(activation)
+    if self._use_sync_bn:
+      self._bn_op = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._bn_op = tf.keras.layers.BatchNormalization
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+
+  def _make_pointwise_conv(self):
+    """Creates a bias-free 1x1 convolution with `output_channels` filters."""
+    return tf.keras.layers.Conv2D(
+        filters=self._output_channels,
+        kernel_size=(1, 1),
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        use_bias=False)
+
+  def _make_batch_norm(self):
+    """Creates a batch normalization layer with the configured settings."""
+    return self._bn_op(
+        axis=self._bn_axis,
+        momentum=self._batchnorm_momentum,
+        epsilon=self._batchnorm_epsilon)
+
+  def build(self, input_shape):
+    """Creates the parallel branches, projection, dropout and concat layers.
+
+    Args:
+      input_shape: The shape of the layer input. Entries 1, 2 and 3 are read
+        as height, width and channels.
+    """
+    height = input_shape[1]
+    width = input_shape[2]
+    channels = input_shape[3]
+
+    # Each entry is the list of layers applied in sequence for one branch.
+    # Layers are created in a fixed order: 1x1 branch, dilated branches,
+    # pooling branch.
+    self.aspp_layers = []
+
+    # 1x1 convolution branch.
+    self.aspp_layers.append(
+        [self._make_pointwise_conv(), self._make_batch_norm()])
+
+    # One dilated convolution branch per dilation rate.
+    for dilation_rate in self._dilation_rates:
+      branch = []
+      kernel_size = (3, 3)
+      if self._use_depthwise_convolution:
+        branch.append(
+            tf.keras.layers.DepthwiseConv2D(
+                depth_multiplier=1,
+                kernel_size=kernel_size,
+                padding='same',
+                depthwise_regularizer=self._kernel_regularizer,
+                depthwise_initializer=self._kernel_initializer,
+                dilation_rate=dilation_rate,
+                use_bias=False))
+        # The depthwise layer already covers the 3x3 window, so the
+        # convolution that follows is 1x1.
+        kernel_size = (1, 1)
+      branch.append(
+          tf.keras.layers.Conv2D(
+              filters=self._output_channels,
+              kernel_size=kernel_size,
+              padding='same',
+              kernel_regularizer=self._kernel_regularizer,
+              kernel_initializer=self._kernel_initializer,
+              dilation_rate=dilation_rate,
+              use_bias=False))
+      branch.append(self._make_batch_norm())
+      self.aspp_layers.append(branch)
+
+    # Pooling branch: global average pooling, or a fixed-size average pooling
+    # when a pool kernel size is given.
+    if self._pool_kernel_size is None:
+      pooling = [
+          tf.keras.layers.GlobalAveragePooling2D(),
+          tf.keras.layers.Reshape((1, 1, channels))
+      ]
+    else:
+      pooling = [tf.keras.layers.AveragePooling2D(self._pool_kernel_size)]
+    self.aspp_layers.append(
+        pooling + [self._make_pointwise_conv(), self._make_batch_norm()])
+
+    # Resizes the pooling branch output back to the input height and width.
+    self._resizing_layer = tf.keras.layers.Resizing(
+        height, width, interpolation=self._interpolation, dtype=tf.float32)
+
+    self._projection = [
+        self._make_pointwise_conv(),
+        self._make_batch_norm(),
+    ]
+    self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
+    self._concat_layer = tf.keras.layers.Concatenate(axis=-1)
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+    """Runs every branch on `inputs`, concatenates and projects the results.
+
+    Args:
+      inputs: An input `tf.Tensor`.
+      training: An optional `bool` passed to every sub-layer. If None, the
+        Keras learning phase is used.
+
+    Returns:
+      The output `tf.Tensor` of the dropout layer applied to the projected,
+      activated concatenation of all branch outputs.
+    """
+    if training is None:
+      training = tf.keras.backend.learning_phase()
+    result = []
+    last_branch = len(self.aspp_layers) - 1
+    for i, layers in enumerate(self.aspp_layers):
+      x = inputs
+      for layer in layers:
+        # Apply layers sequentially.
+        x = layer(x, training=training)
+      x = self._activation_fn(x)
+
+      # Apply resize layer to the end of the last set of layers.
+      if i == last_branch:
+        x = self._resizing_layer(x)
+
+      # The resizing layer is created with float32 dtype, so cast back to the
+      # input dtype before concatenating.
+      result.append(tf.cast(x, inputs.dtype))
+    x = self._concat_layer(result)
+    for layer in self._projection:
+      x = layer(x, training=training)
+    x = self._activation_fn(x)
+    return self._dropout_layer(x)
+
+  def get_config(self):
+    """Returns a dictionary containing the config used for initialization."""
+    config = {
+        'output_channels': self._output_channels,
+        'dilation_rates': self._dilation_rates,
+        'pool_kernel_size': self._pool_kernel_size,
+        'use_sync_bn': self._use_sync_bn,
+        'batchnorm_momentum': self._batchnorm_momentum,
+        'batchnorm_epsilon': self._batchnorm_epsilon,
+        'activation': self._activation,
+        'dropout': self._dropout,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'interpolation': self._interpolation,
+        # Included so that a layer rebuilt from the config keeps this option.
+        'use_depthwise_convolution': self._use_depthwise_convolution,
+    }
+    base_config = super().get_config()
+    return dict(list(base_config.items()) + list(config.items()))
--- a/official/vision/modeling/layers/nn_layers_test.py
+++ b/official/vision/modeling/layers/nn_layers_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for nn_layers."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.modeling.layers import nn_layers
+
+
+class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
+
+  def test_scale(self):
+    """Checks that `Scale` initialized with the constant 10 maps 3 to 30."""
+    initializer = tf.keras.initializers.constant(10.)
+    layer = nn_layers.Scale(initializer=initializer)
+    self.assertAllEqual(layer(3.), 30.)
+
+  def test_temporal_softmax_pool(self):
+    """Checks `TemporalSoftmaxPool` on four frames holding 1, 2, 3 and 4."""
+    frame_values = tf.range(4, dtype=tf.float32) + 1.
+    frames = tf.reshape(frame_values, [1, 4, 1, 1, 1])
+    pooled = nn_layers.TemporalSoftmaxPool()(frames)
+
+    expected = [[[[[0.10153633]]],
+                 [[[0.33481020]]],
+                 [[[0.82801306]]],
+                 [[[1.82021690]]]]]
+    self.assertAllClose(pooled, expected)
+
+  def test_positional_encoding(self):
+    """Checks uncached and cached `PositionalEncoding` against fixed values."""
+    uncached_layer = nn_layers.PositionalEncoding(
+        initializer='ones', cache_encoding=False)
+    cached_layer = nn_layers.PositionalEncoding(
+        initializer='ones', cache_encoding=True)
+
+    four_frames = tf.ones([1, 4, 1, 1, 3])
+    outputs, _ = uncached_layer(four_frames)
+    outputs_cached, _ = cached_layer(four_frames)
+
+    expected = tf.constant(
+        [[[[[1.0000000, 1.0000000, 2.0000000]]],
+          [[[1.8414710, 1.0021545, 1.5403023]]],
+          [[[1.9092975, 1.0043088, 0.5838531]]],
+          [[[1.1411200, 1.0064633, 0.0100075]]]]])
+
+    self.assertEqual(outputs.shape, expected.shape)
+    self.assertAllClose(outputs, expected)
+
+    # Cached and uncached layers must agree.
+    self.assertEqual(outputs.shape, outputs_cached.shape)
+    self.assertAllClose(outputs, outputs_cached)
+
+    # The uncached layer is also called with a different number of frames.
+    five_frames = tf.ones([1, 5, 1, 1, 3])
+    _ = uncached_layer(five_frames)
+
+  def test_positional_encoding_bfloat16(self):
+    """Checks `PositionalEncoding` on bfloat16 inputs against fixed values."""
+    layer = nn_layers.PositionalEncoding(initializer='ones')
+
+    bfloat16_inputs = tf.ones([1, 4, 1, 1, 3], dtype=tf.bfloat16)
+    outputs, _ = layer(bfloat16_inputs)
+
+    expected = tf.constant(
+        [[[[[1.0000000, 1.0000000, 2.0000000]]],
+          [[[1.8414710, 1.0021545, 1.5403023]]],
+          [[[1.9092975, 1.0043088, 0.5838531]]],
+          [[[1.1411200, 1.0064633, 0.0100075]]]]])
+
+    self.assertEqual(outputs.shape, expected.shape)
+    self.assertAllClose(outputs, expected)
+
+  def test_global_average_pool_basic(self):
+    """Checks that pooling an all-ones input gives a single one."""
+    layer = nn_layers.GlobalAveragePool3D(keepdims=True)
+
+    all_ones = tf.ones([1, 2, 3, 4, 1])
+    outputs = layer(all_ones, output_states=False)
+
+    expected = tf.ones([1, 1, 1, 1, 1])
+
+    self.assertEqual(outputs.shape, expected.shape)
+    self.assertAllEqual(outputs, expected)
+
+  def test_positional_encoding_stream(self):
+    """Checks that streaming in chunks matches encoding all frames at once."""
+    pos_encoding = nn_layers.PositionalEncoding(
+        initializer='ones', cache_encoding=False)
+
+    frame_values = tf.range(4, dtype=tf.float32) + 1.
+    inputs = tf.tile(
+        tf.reshape(frame_values, [1, 4, 1, 1, 1]), [1, 1, 1, 1, 3])
+    expected, _ = pos_encoding(inputs)
+
+    expected_values = [[[[[1.0000000, 1.0000000, 2.0000000]]],
+                        [[[2.8414710, 2.0021544, 2.5403023]]],
+                        [[[3.9092975, 3.0043090, 2.5838532]]],
+                        [[[4.1411200, 4.0064630, 3.0100074]]]]]
+
+    for num_splits in (1, 2, 4):
+      # Feed the chunks one after another, threading the states through.
+      states = {}
+      chunk_outputs = []
+      for chunk in tf.split(inputs, num_splits, axis=1):
+        output, states = pos_encoding(chunk, states=states)
+        chunk_outputs.append(output)
+      predicted = tf.concat(chunk_outputs, axis=1)
+
+      self.assertEqual(predicted.shape, expected.shape)
+      self.assertAllClose(predicted, expected)
+      self.assertAllClose(predicted, expected_values)
+
+  def test_global_average_pool_keras(self):
+    """Checks agreement with `tf.keras.layers.GlobalAveragePooling3D`."""
+    layer = nn_layers.GlobalAveragePool3D(keepdims=False)
+    reference_layer = tf.keras.layers.GlobalAveragePooling3D()
+
+    random_inputs = 10 * tf.random.normal([1, 2, 3, 4, 1])
+
+    outputs = layer(random_inputs, output_states=False)
+    reference_outputs = reference_layer(random_inputs)
+
+    self.assertAllEqual(outputs.shape, reference_outputs.shape)
+    self.assertAllClose(outputs, reference_outputs)
+
+  def test_stream_global_average_pool(self):
+    """Checks that non-causal streaming matches pooling all frames at once."""
+    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=False)
+
+    frame_values = tf.range(4, dtype=tf.float32) + 1.
+    inputs = tf.tile(
+        tf.reshape(frame_values, [1, 4, 1, 1, 1]), [1, 1, 2, 2, 3])
+    expected, _ = gap(inputs)
+
+    for num_splits in (1, 2, 4):
+      # Only the output of the last chunk is compared.
+      states = {}
+      predicted = None
+      for chunk in tf.split(inputs, num_splits, axis=1):
+        predicted, states = gap(chunk, states=states)
+
+      self.assertEqual(predicted.shape, expected.shape)
+      self.assertAllClose(predicted, expected)
+      self.assertAllClose(
+          predicted,
+          [[[[[2.5, 2.5, 2.5]]]]])
+
+  def test_causal_stream_global_average_pool(self):
+    """Checks that causal streaming matches pooling all frames at once."""
+    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=True)
+
+    frame_values = tf.range(4, dtype=tf.float32) + 1.
+    inputs = tf.tile(
+        tf.reshape(frame_values, [1, 4, 1, 1, 1]), [1, 1, 2, 2, 3])
+    expected, _ = gap(inputs)
+
+    expected_values = [[[[[1.0, 1.0, 1.0]]],
+                        [[[1.5, 1.5, 1.5]]],
+                        [[[2.0, 2.0, 2.0]]],
+                        [[[2.5, 2.5, 2.5]]]]]
+
+    for num_splits in (1, 2, 4):
+      # Collect the per-chunk outputs and stitch them along the frame axis.
+      states = {}
+      chunk_outputs = []
+      for chunk in tf.split(inputs, num_splits, axis=1):
+        x, states = gap(chunk, states=states)
+        chunk_outputs.append(x)
+      predicted = tf.concat(chunk_outputs, axis=1)
+
+      self.assertEqual(predicted.shape, expected.shape)
+      self.assertAllClose(predicted, expected)
+      self.assertAllClose(predicted, expected_values)
+
+  def test_spatial_average_pool(self):
+    """Checks `SpatialAveragePool3D` on a 4x4x4 ramp of values 1 to 64."""
+    layer = nn_layers.SpatialAveragePool3D(keepdims=True)
+
+    ramp = tf.range(64, dtype=tf.float32) + 1.
+    pooled = layer(tf.reshape(ramp, [1, 4, 4, 4, 1]))
+
+    expected = [[[[[8.50]]],
+                 [[[24.5]]],
+                 [[[40.5]]],
+                 [[[56.5]]]]]
+
+    self.assertEqual(pooled.shape, [1, 4, 1, 1, 1])
+    self.assertAllClose(pooled, expected)
+
+  def test_conv2d_causal(self):
+    """Checks causal Conv2D gives the same output for padded and raw input."""
+    layer = nn_layers.Conv2D(
+        filters=3,
+        kernel_size=(3, 3),
+        strides=(1, 2),
+        padding='causal',
+        use_buffered_input=True,
+        kernel_initializer='ones',
+        use_bias=False,
+    )
+
+    ones = tf.ones([1, 4, 2, 3])
+
+    # Prepend two zero rows along axis 1 by hand for the buffered-input call.
+    buffered = tf.pad(ones, [[0, 0], [2, 0], [0, 0], [0, 0]])
+    actual = layer(buffered)
+
+    target = tf.constant(
+        [[[[6.0, 6.0, 6.0]],
+          [[12., 12., 12.]],
+          [[18., 18., 18.]],
+          [[18., 18., 18.]]]])
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+    # The same layer, now fed the unpadded input, must give the same result.
+    layer.use_buffered_input = False
+    actual = layer(ones)
+
+    self.assertFalse(layer.use_buffered_input)
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+  def test_depthwise_conv2d_causal(self):
+    """Checks causal DepthwiseConv2D agrees for padded and raw input."""
+    layer = nn_layers.DepthwiseConv2D(
+        kernel_size=(3, 3),
+        strides=(1, 1),
+        padding='causal',
+        use_buffered_input=True,
+        depthwise_initializer='ones',
+        use_bias=False,
+    )
+
+    ones = tf.ones([1, 2, 2, 3])
+
+    # Prepend two zero rows along axis 1 by hand for the buffered-input call.
+    buffered = tf.pad(ones, [[0, 0], [2, 0], [0, 0], [0, 0]])
+    actual = layer(buffered)
+
+    target = tf.constant(
+        [[[[2., 2., 2.],
+           [2., 2., 2.]],
+          [[4., 4., 4.],
+           [4., 4., 4.]]]])
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+    # The same layer, now fed the unpadded input, must give the same result.
+    layer.use_buffered_input = False
+    actual = layer(ones)
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+  def test_conv3d_causal(self):
+    """Checks causal Conv3D gives the same output for padded and raw input."""
+    layer = nn_layers.Conv3D(
+        filters=3,
+        kernel_size=(3, 3, 3),
+        strides=(1, 2, 2),
+        padding='causal',
+        use_buffered_input=True,
+        kernel_initializer='ones',
+        use_bias=False,
+    )
+
+    ones = tf.ones([1, 2, 4, 4, 3])
+
+    # Prepend two zero frames along axis 1 by hand for the buffered-input call.
+    buffered = tf.pad(ones, [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]])
+    actual = layer(buffered)
+
+    target = tf.constant(
+        [[[[[27., 27., 27.],
+            [18., 18., 18.]],
+           [[18., 18., 18.],
+            [12., 12., 12.]]],
+          [[[54., 54., 54.],
+            [36., 36., 36.]],
+           [[36., 36., 36.],
+            [24., 24., 24.]]]]])
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+    # The same layer, now fed the unpadded input, must give the same result.
+    layer.use_buffered_input = False
+    actual = layer(ones)
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+  def test_depthwise_conv3d_causal(self):
+    """Checks causal Conv3D with groups=3 for padded and raw input."""
+    layer = nn_layers.Conv3D(
+        filters=3,
+        kernel_size=(3, 3, 3),
+        strides=(1, 2, 2),
+        padding='causal',
+        use_buffered_input=True,
+        kernel_initializer='ones',
+        use_bias=False,
+        groups=3,
+    )
+
+    ones = tf.ones([1, 2, 4, 4, 3])
+
+    # Prepend two zero frames along axis 1 by hand for the buffered-input call.
+    buffered = tf.pad(ones, [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]])
+    actual = layer(buffered)
+
+    target = tf.constant(
+        [[[[[9.0, 9.0, 9.0],
+            [6.0, 6.0, 6.0]],
+           [[6.0, 6.0, 6.0],
+            [4.0, 4.0, 4.0]]],
+          [[[18.0, 18.0, 18.0],
+            [12., 12., 12.]],
+           [[12., 12., 12.],
+            [8., 8., 8.]]]]])
+
+    # The private shape helper is checked as well, after the layer was called.
+    spatial_shape = layer._spatial_output_shape([4, 4, 4])
+    self.assertAllClose(spatial_shape, [2, 2, 2])
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+    # The same layer, now fed the unpadded input, must give the same result.
+    layer.use_buffered_input = False
+    actual = layer(ones)
+
+    self.assertEqual(actual.shape, target.shape)
+    self.assertAllClose(actual, target)
+
+  def test_conv3d_causal_padding_2d(self):
+    """Checks causal Conv3D with a 1x3x3 kernel matches Keras 'same' Conv3D."""
+    causal_layer = nn_layers.Conv3D(
+        filters=1,
+        kernel_size=(1, 3, 3),
+        strides=(1, 2, 2),
+        padding='causal',
+        use_buffered_input=False,
+        kernel_initializer='ones',
+        use_bias=False,
+    )
+
+    # Reference layer: identical settings except for 'same' padding.
+    reference_layer = tf.keras.layers.Conv3D(
+        filters=1,
+        kernel_size=(1, 3, 3),
+        strides=(1, 2, 2),
+        padding='same',
+        kernel_initializer='ones',
+        use_bias=False,
+    )
+
+    ones = tf.ones([1, 1, 4, 4, 1])
+
+    actual = causal_layer(ones)
+    reference = reference_layer(ones)
+
+    self.assertEqual(actual.shape, reference.shape)
+    self.assertAllClose(actual, reference)
+
+    # Also pin the absolute values, not just agreement with the reference.
+    self.assertAllClose(actual,
+                        [[[[[9.],
+                            [6.]],
+                           [[6.],
+                            [4.]]]]])
+
+  def test_conv3d_causal_padding_1d(self):
+    """Checks causal Conv3D with a 3x1x1 kernel matches Keras causal Conv1D."""
+    causal_layer = nn_layers.Conv3D(
+        filters=1,
+        kernel_size=(3, 1, 1),
+        strides=(2, 1, 1),
+        padding='causal',
+        use_buffered_input=False,
+        kernel_initializer='ones',
+        use_bias=False,
+    )
+
+    # Reference layer: a 1-D causal convolution over axis 1 only.
+    reference_layer = tf.keras.layers.Conv1D(
+        filters=1,
+        kernel_size=3,
+        strides=2,
+        padding='causal',
+        kernel_initializer='ones',
+        use_bias=False,
+    )
+
+    ones = tf.ones([1, 4, 1, 1, 1])
+
+    actual = causal_layer(ones)
+    # Drop the two singleton axes for Conv1D, then restore the 5-D layout.
+    reference = reference_layer(tf.squeeze(ones, axis=[2, 3]))
+    reference = tf.reshape(reference, [1, 2, 1, 1, 1])
+
+    self.assertEqual(actual.shape, reference.shape)
+    self.assertAllClose(actual, reference)
+
+    # Also pin the absolute values, not just agreement with the reference.
+    self.assertAllClose(actual,
+                        [[[[[1.]]],
+                          [[[3.]]]]])
+
+  @parameterized.parameters(
+      (None, []),
+      (None, [6, 12, 18]),
+      ([32, 32], [6, 12, 18]),
+  )
+  def test_aspp(self, pool_kernel_size, dilation_rates):
+    """Checks SpatialPyramidPooling keeps spatial size and sets channels."""
+    aspp = nn_layers.SpatialPyramidPooling(
+        output_channels=256,
+        dilation_rates=dilation_rates,
+        pool_kernel_size=pool_kernel_size)
+    # Symbolic input: only the static output shape is checked.
+    features = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
+    pooled = aspp(features)
+    self.assertAllEqual([None, 64, 64, 256], pooled.shape)
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/roi_aligner.py
+++ b/official/vision/modeling/layers/roi_aligner.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of ROI aligner."""
+
+from typing import Mapping
+import tensorflow as tf
+
+from official.vision.ops import spatial_transform_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelROIAligner(tf.keras.layers.Layer):
+  """Performs ROIAlign for the second stage processing.
+
+  The layer is a thin wrapper around
+  `spatial_transform_ops.multilevel_crop_and_resize`; it holds no weights and
+  only stores the crop configuration.
+  """
+
+  def __init__(self, crop_size: int = 7, sample_offset: float = 0.5, **kwargs):
+    """Initializes a ROI aligner.
+
+    Args:
+      crop_size: An `int` of the output size of the cropped features.
+      sample_offset: A `float` in [0, 1] of the subpixel sample offset.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    self._config_dict = {
+        'crop_size': crop_size,
+        'sample_offset': sample_offset,
+    }
+    super(MultilevelROIAligner, self).__init__(**kwargs)
+
+  def call(self,
+           features: Mapping[str, tf.Tensor],
+           boxes: tf.Tensor,
+           training: bool = None):
+    """Generates ROIs.
+
+    Args:
+      features: A dictionary with key as pyramid level and value as features.
+        The features are in shape of
+        [batch_size, height_l, width_l, num_filters].
+      boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row
+        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
+      training: A `bool` of whether it is in training mode. Not used by this
+        layer; the same cropping is applied in both modes.
+
+    Returns:
+      A 5-D `tf.Tensor` representing feature crop of shape
+      [batch_size, num_boxes, crop_size, crop_size, num_filters].
+    """
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        features,
+        boxes,
+        output_size=self._config_dict['crop_size'],
+        sample_offset=self._config_dict['sample_offset'])
+    return roi_features
+
+  def get_config(self):
+    """Returns the constructor arguments of this layer as a new `dict`.
+
+    A copy is returned so that callers mutating the result cannot change the
+    parameters used by `call`. Only the arguments stored in `__init__` are
+    included; the base `Layer` config is not merged in.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds a layer from a config `dict` as produced by `get_config`.
+
+    Args:
+      config: A `dict` of keyword arguments for the constructor.
+      custom_objects: Unused; accepted for compatibility with Keras.
+
+    Returns:
+      A new instance of this class.
+    """
+    return cls(**config)
--- a/official/vision/modeling/layers/roi_aligner_test.py
+++ b/official/vision/modeling/layers/roi_aligner_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for roi_aligner.py."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.modeling.layers import roi_aligner
+
+
+class MultilevelROIAlignerTest(tf.test.TestCase):
+  """Tests for `roi_aligner.MultilevelROIAligner`."""
+
+  def test_serialize_deserialize(self):
+    """Checks the config round-trips through get_config/from_config."""
+    init_args = {
+        'crop_size': 7,
+        'sample_offset': 0.5,
+    }
+    original = roi_aligner.MultilevelROIAligner(**init_args)
+
+    # The config is expected to contain exactly the constructor arguments.
+    self.assertEqual(original.get_config(), dict(init_args))
+
+    restored = roi_aligner.MultilevelROIAligner.from_config(
+        original.get_config())
+
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/roi_generator.py
+++ b/official/vision/modeling/layers/roi_generator.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of ROI generator."""
+from typing import Optional, Mapping
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import box_ops
+from official.vision.ops import nms
+
+
+def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
+                             raw_scores: Mapping[str, tf.Tensor],
+                             anchor_boxes: Mapping[str, tf.Tensor],
+                             image_shape: tf.Tensor,
+                             pre_nms_top_k: int = 2000,
+                             pre_nms_score_threshold: float = 0.0,
+                             pre_nms_min_size_threshold: float = 0.0,
+                             nms_iou_threshold: float = 0.7,
+                             num_proposals: int = 1000,
+                             use_batched_nms: bool = False,
+                             decode_boxes: bool = True,
+                             clip_boxes: bool = True,
+                             apply_sigmoid_to_score: bool = True):
+  """Proposes RoIs given a group of candidates from different FPN levels.
+
+  The following describes the steps:
+    1. For each individual level:
+      a. Apply sigmoid transform if specified.
+      b. Decode boxes if specified.
+      c. Clip boxes if specified.
+      d. Filter small boxes and those fall outside image if specified.
+      e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
+      f. Apply NMS.
+    2. Aggregate post-NMS boxes from each level.
+    3. Apply an overall top k to generate the final selected RoIs.
+
+  Args:
+    raw_boxes: A `dict` with keys representing FPN levels and values
+      representing box tensors of shape
+      [batch_size, feature_h, feature_w, num_anchors * 4].
+    raw_scores: A `dict` with keys representing FPN levels and values
+      representing logit tensors of shape
+      [batch_size, feature_h, feature_w, num_anchors]. The height, width and
+      anchor dimensions must be statically known.
+    anchor_boxes: A `dict` with keys representing FPN levels and values
+      representing anchor box tensors of shape
+      [batch_size, feature_h * feature_w * num_anchors, 4].
+    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
+      are [height, width] of the scaled image.
+    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
+      before applying NMS. Default: 2000.
+    pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
+      box score to keep before applying NMS. This is often used as a
+      pre-filtering step for better performance. Default: 0, no filtering is
+      applied.
+    pre_nms_min_size_threshold: A `float` representing the minimal box size in
+      each side (w.r.t. the scaled image) to keep before applying NMS. This is
+      often used as a pre-filtering step for better performance. Default: 0, no
+      filtering is applied.
+    nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
+      used for NMS. If 0.0, no NMS is applied. Default: 0.7.
+    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
+      after applying NMS. Default: 1000.
+    use_batched_nms: A `bool` indicating whether NMS is applied in batch using
+      `tf.image.combined_non_max_suppression`. Currently only available in
+      CPU/GPU. Default is False.
+    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
+      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
+      `anchor_boxes`. Default is True.
+    clip_boxes: A `bool` indicating whether boxes are first clipped to the
+      scaled image size before applying NMS. If False, no clipping is applied
+      and `image_shape` is ignored. Default is True.
+    apply_sigmoid_to_score: A `bool` indicating whether apply sigmoid to
+      `raw_scores` before applying NMS. Default is True.
+
+  Returns:
+    selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
+      representing the box coordinates of the selected proposals w.r.t. the
+      scaled image.
+    selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals],
+      representing the scores of the selected proposals.
+    In both outputs the second dimension is smaller than `num_proposals` when
+    fewer candidates are available across all levels.
+  """
+  with tf.name_scope('multilevel_propose_rois'):
+    rois = []
+    roi_scores = []
+    # [batch_size, 2] -> [batch_size, 1, 2]; presumably so it broadcasts against
+    # the per-box tensors inside the box_ops helpers.
+    image_shape = tf.expand_dims(image_shape, axis=1)
+    # Levels are visited in sorted key order, which fixes the concat order.
+    for level in sorted(raw_scores.keys()):
+      with tf.name_scope('level_%s' % level):
+        # Static shapes are required: the product below fails if any of these
+        # dimensions is unknown (None).
+        _, feature_h, feature_w, num_anchors_per_location = (
+            raw_scores[level].get_shape().as_list())
+
+        # Flatten the spatial and anchor dimensions into one box dimension.
+        num_boxes = feature_h * feature_w * num_anchors_per_location
+        this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
+        this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
+        # Anchors are cast to the score dtype before being used for decoding.
+        this_level_anchors = tf.cast(
+            tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
+            dtype=this_level_scores.dtype)
+
+        if apply_sigmoid_to_score:
+          this_level_scores = tf.sigmoid(this_level_scores)
+
+        if decode_boxes:
+          this_level_boxes = box_ops.decode_boxes(
+              this_level_boxes, this_level_anchors)
+        if clip_boxes:
+          this_level_boxes = box_ops.clip_boxes(
+              this_level_boxes, image_shape)
+
+        if pre_nms_min_size_threshold > 0.0:
+          this_level_boxes, this_level_scores = box_ops.filter_boxes(
+              this_level_boxes,
+              this_level_scores,
+              image_shape,
+              pre_nms_min_size_threshold)
+
+        # Neither top-k may exceed the number of boxes at this level.
+        this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
+        this_level_post_nms_top_k = min(num_boxes, num_proposals)
+        if nms_iou_threshold > 0.0:
+          if use_batched_nms:
+            # A singleton axis is added to boxes and to scores (presumably the
+            # class axis the op expects). The score threshold and the pre-NMS
+            # top-k are passed to the op instead of being applied beforehand.
+            this_level_rois, this_level_roi_scores, _, _ = (
+                tf.image.combined_non_max_suppression(
+                    tf.expand_dims(this_level_boxes, axis=2),
+                    tf.expand_dims(this_level_scores, axis=-1),
+                    max_output_size_per_class=this_level_pre_nms_top_k,
+                    max_total_size=this_level_post_nms_top_k,
+                    iou_threshold=nms_iou_threshold,
+                    score_threshold=pre_nms_score_threshold,
+                    pad_per_class=False,
+                    clip_boxes=False))
+          else:
+            if pre_nms_score_threshold > 0.0:
+              this_level_boxes, this_level_scores = (
+                  box_ops.filter_boxes_by_scores(
+                      this_level_boxes,
+                      this_level_scores,
+                      pre_nms_score_threshold))
+            this_level_boxes, this_level_scores = box_ops.top_k_boxes(
+                this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
+            # Note the (scores, boxes) order of both arguments and results here,
+            # the reverse of the (boxes, scores) order used by top_k_boxes.
+            this_level_roi_scores, this_level_rois = (
+                nms.sorted_non_max_suppression_padded(
+                    this_level_scores,
+                    this_level_boxes,
+                    max_output_size=this_level_post_nms_top_k,
+                    iou_threshold=nms_iou_threshold))
+        else:
+          # NMS disabled: only the post-NMS top-k is taken. In this branch
+          # `pre_nms_score_threshold` and `pre_nms_top_k` are not applied.
+          this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
+              this_level_boxes,
+              this_level_scores,
+              k=this_level_post_nms_top_k)
+
+        rois.append(this_level_rois)
+        roi_scores.append(this_level_roi_scores)
+
+    # Merge the per-level candidates along the box dimension.
+    all_rois = tf.concat(rois, axis=1)
+    all_roi_scores = tf.concat(roi_scores, axis=1)
+
+    with tf.name_scope('top_k_rois'):
+      # The scores are rank 2 here; the second dimension must be static.
+      _, num_valid_rois = all_roi_scores.get_shape().as_list()
+      overall_top_k = min(num_valid_rois, num_proposals)
+
+      selected_rois, selected_roi_scores = box_ops.top_k_boxes(
+          all_rois, all_roi_scores, k=overall_top_k)
+
+    return selected_rois, selected_roi_scores
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelROIGenerator(tf.keras.layers.Layer):
+  """Proposes RoIs for the second stage processing.
+
+  The layer holds two sets of proposal settings, one used when `call` is given
+  a truthy `training` and a `test_*` set used otherwise, and forwards the
+  selected set to `_multilevel_propose_rois`.
+  """
+
+  def __init__(self,
+               pre_nms_top_k: int = 2000,
+               pre_nms_score_threshold: float = 0.0,
+               pre_nms_min_size_threshold: float = 0.0,
+               nms_iou_threshold: float = 0.7,
+               num_proposals: int = 1000,
+               test_pre_nms_top_k: int = 1000,
+               test_pre_nms_score_threshold: float = 0.0,
+               test_pre_nms_min_size_threshold: float = 0.0,
+               test_nms_iou_threshold: float = 0.7,
+               test_num_proposals: int = 1000,
+               use_batched_nms: bool = False,
+               **kwargs):
+    """Initializes a ROI generator.
+
+    The ROI generator transforms the raw predictions from RPN to ROIs.
+
+    Args:
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying NMS. Proposals whose scores are below this threshold are
+        thrown away.
+      pre_nms_min_size_threshold: A `float` of the threshold of each side of the
+        box (w.r.t. the scaled image). Proposals whose sides are below this
+        threshold are thrown away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      num_proposals: An `int` of the final number of proposals to generate.
+      test_pre_nms_top_k: An `int` of the number of top scores proposals to be
+        kept before applying NMS in testing.
+      test_pre_nms_score_threshold: A `float` of the score threshold to apply
+        before applying NMS in testing. Proposals whose scores are below this
+        threshold are thrown away.
+      test_pre_nms_min_size_threshold: A `float` of the threshold of each side
+        of the box (w.r.t. the scaled image) in testing. Proposals whose sides
+        are below this threshold are thrown away.
+      test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
+        testing.
+      test_num_proposals: An `int` of the final number of proposals to generate
+        in testing.
+      use_batched_nms: A `bool` of whether or not use
+        `tf.image.combined_non_max_suppression`.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    self._config_dict = {
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'num_proposals': num_proposals,
+        'test_pre_nms_top_k': test_pre_nms_top_k,
+        'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
+        'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
+        'test_nms_iou_threshold': test_nms_iou_threshold,
+        'test_num_proposals': test_num_proposals,
+        'use_batched_nms': use_batched_nms,
+    }
+    super(MultilevelROIGenerator, self).__init__(**kwargs)
+
+  def call(self,
+           raw_boxes: Mapping[str, tf.Tensor],
+           raw_scores: Mapping[str, tf.Tensor],
+           anchor_boxes: Mapping[str, tf.Tensor],
+           image_shape: tf.Tensor,
+           training: Optional[bool] = None):
+    """Proposes RoIs given a group of candidates from different FPN levels.
+
+    The following describes the steps:
+      1. For each individual level:
+        a. Apply sigmoid transform if specified.
+        b. Decode boxes if specified.
+        c. Clip boxes if specified.
+        d. Filter small boxes and those fall outside image if specified.
+        e. Apply pre-NMS filtering including pre-NMS top k and score
+           thresholding.
+        f. Apply NMS.
+      2. Aggregate post-NMS boxes from each level.
+      3. Apply an overall top k to generate the final selected RoIs.
+
+    Args:
+      raw_boxes: A `dict` with keys representing FPN levels and values
+        representing box tensors of shape
+        [batch, feature_h, feature_w, num_anchors * 4].
+      raw_scores: A `dict` with keys representing FPN levels and values
+        representing logit tensors of shape
+        [batch, feature_h, feature_w, num_anchors].
+      anchor_boxes: A `dict` with keys representing FPN levels and values
+        representing anchor box tensors of shape
+        [batch, feature_h * feature_w * num_anchors, 4].
+      image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
+        are [height, width] of the scaled image.
+      training: A `bool` that indicates whether it is in training mode. A
+        falsy value, including the default `None`, selects the `test_*`
+        settings.
+
+    Returns:
+      roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
+        ROIs in the scaled image coordinate.
+      roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
+        proposed ROIs.
+    """
+    # The test-time settings share the training names with a 'test_' prefix,
+    # so a single prefix selects the whole set.
+    prefix = '' if training else 'test_'
+    config = self._config_dict
+    roi_boxes, roi_scores = _multilevel_propose_rois(
+        raw_boxes,
+        raw_scores,
+        anchor_boxes,
+        image_shape,
+        pre_nms_top_k=config[prefix + 'pre_nms_top_k'],
+        pre_nms_score_threshold=config[prefix + 'pre_nms_score_threshold'],
+        pre_nms_min_size_threshold=config[
+            prefix + 'pre_nms_min_size_threshold'],
+        nms_iou_threshold=config[prefix + 'nms_iou_threshold'],
+        num_proposals=config[prefix + 'num_proposals'],
+        use_batched_nms=config['use_batched_nms'],
+        decode_boxes=True,
+        clip_boxes=True,
+        apply_sigmoid_to_score=True)
+    return roi_boxes, roi_scores
+
+  def get_config(self):
+    """Returns the constructor arguments of this layer as a new `dict`.
+
+    A copy is returned so that callers mutating the result cannot change the
+    settings used by `call`. Only the arguments stored in `__init__` are
+    included; the base `Layer` config is not merged in.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds a layer from a config `dict` as produced by `get_config`.
+
+    Args:
+      config: A `dict` of keyword arguments for the constructor.
+      custom_objects: Unused; accepted for compatibility with Keras.
+
+    Returns:
+      A new instance of this class.
+    """
+    return cls(**config)
--- a/official/vision/modeling/layers/roi_sampler.py
+++ b/official/vision/modeling/layers/roi_sampler.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of ROI sampler."""
+# Import libraries
+
+import tensorflow as tf
+
+from official.vision.modeling.layers import box_sampler
+from official.vision.ops import box_matcher
+from official.vision.ops import iou_similarity
+from official.vision.ops import target_gather
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ROISampler(tf.keras.layers.Layer):
+  """Samples ROIs and assigns targets to the sampled ROIs."""
+
+  def __init__(self,
+               mix_gt_boxes: bool = True,
+               num_sampled_rois: int = 512,
+               foreground_fraction: float = 0.25,
+               foreground_iou_threshold: float = 0.5,
+               background_iou_high_threshold: float = 0.5,
+               background_iou_low_threshold: float = 0,
+               skip_subsampling: bool = False,
+               **kwargs):
+    """Initializes a ROI sampler.
+
+    Args:
+      mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
+        proposed ROIs.
+      num_sampled_rois: An `int` of the number of sampled ROIs per image.
+      foreground_fraction: A `float` in [0, 1], what percentage of proposed ROIs
+        should be sampled from the foreground boxes.
+      foreground_iou_threshold: A `float` that represents the IoU threshold for
+        a box to be considered as positive (if >= `foreground_iou_threshold`).
+      background_iou_high_threshold: A `float` that represents the IoU threshold
+        for a box to be considered as negative (if overlap in
+        [`background_iou_low_threshold`, `background_iou_high_threshold`]).
+      background_iou_low_threshold: A `float` that represents the IoU threshold
+        for a box to be considered as negative (if overlap in
+        [`background_iou_low_threshold`, `background_iou_high_threshold`])
+      skip_subsampling: A `bool` that determines if we want to skip the sampling
+        procedure that balances the fg/bg classes. Used for upper frcnn layers
+        in cascade RCNN.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    # Constructor arguments, read back by `call` via their string keys.
+    self._config_dict = {
+        'mix_gt_boxes': mix_gt_boxes,
+        'num_sampled_rois': num_sampled_rois,
+        'foreground_fraction': foreground_fraction,
+        'foreground_iou_threshold': foreground_iou_threshold,
+        'background_iou_high_threshold': background_iou_high_threshold,
+        'background_iou_low_threshold': background_iou_low_threshold,
+        'skip_subsampling': skip_subsampling,
+    }
+
+    self._sim_calc = iou_similarity.IouSimilarity()
+    # Three ascending thresholds are paired with four indicators. `call`
+    # treats -3 as invalid, -1 as negative, -2 as ignored and values >= 0 as
+    # positive.
+    # NOTE(review): presumably each indicator labels one IoU interval delimited
+    # by the thresholds, in order - confirm against box_matcher.BoxMatcher.
+    self._box_matcher = box_matcher.BoxMatcher(
+        thresholds=[
+            background_iou_low_threshold, background_iou_high_threshold,
+            foreground_iou_threshold
+        ],
+        indicators=[-3, -1, -2, 1])
+    self._target_gather = target_gather.TargetGather()
+
+    self._sampler = box_sampler.BoxSampler(
+        num_sampled_rois, foreground_fraction)
+    # The base Layer constructor is invoked last, after the helpers are set.
+    super(ROISampler, self).__init__(**kwargs)
+
+  def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor, gt_classes: tf.Tensor):
+    """Assigns the proposals with groundtruth classes and performs subsmpling.
+
+    Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
+    following algorithm to generate the final `num_samples_per_image` RoIs.
+      1. Calculates the IoU between each proposal box and each gt_boxes.
+      2. Assigns each proposed box with a groundtruth class and box by choosing
+         the largest IoU overlap.
+      3. Samples `num_samples_per_image` boxes from all proposed boxes, and
+         returns box_targets, class_targets, and RoIs.
+
+    Args:
+      boxes: A `tf.Tensor` of shape of [batch_size, N, 4]. N is the number of
+        proposals before groundtruth assignment. The last dimension is the
+        box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
+        format.
+      gt_boxes: A `tf.Tensor` of shape of [batch_size, MAX_NUM_INSTANCES, 4].
+        The coordinates of gt_boxes are in the pixel coordinates of the scaled
+        image. This tensor might have padding of values -1 indicating the
+        invalid box coordinates.
+      gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
+        This tensor might have paddings with values of -1 indicating the invalid
+        classes.
+
+    Returns:
+      sampled_rois: A `tf.Tensor` of shape of [batch_size, K, 4], representing
+        the coordinates of the sampled RoIs, where K is the number of the
+        sampled RoIs, i.e. K = num_samples_per_image.
+      sampled_gt_boxes: A `tf.Tensor` of shape of [batch_size, K, 4], storing
+        the box coordinates of the matched groundtruth boxes of the samples
+        RoIs.
+      sampled_gt_classes: A `tf.Tensor` of shape of [batch_size, K], storing the
+        classes of the matched groundtruth boxes of the sampled RoIs.
+      sampled_gt_indices: A `tf.Tensor` of shape of [batch_size, K], storing the
+        indices of the sampled groudntruth boxes in the original `gt_boxes`
+        tensor, i.e.,
+        gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
+    """
+    gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
+    if self._config_dict['mix_gt_boxes']:
+      boxes = tf.concat([boxes, gt_boxes], axis=1)
+
+    boxes_invalid_mask = tf.less(
+        tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
+    gt_invalid_mask = tf.less(
+        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
+    similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask,
+                                       gt_invalid_mask)
+    matched_gt_indices, match_indicators = self._box_matcher(similarity_matrix)
+    positive_matches = tf.greater_equal(match_indicators, 0)
+    negative_matches = tf.equal(match_indicators, -1)
+    ignored_matches = tf.equal(match_indicators, -2)
+    invalid_matches = tf.equal(match_indicators, -3)
+
+    background_mask = tf.expand_dims(
+        tf.logical_or(negative_matches, invalid_matches), -1)
+    gt_classes = tf.expand_dims(gt_classes, axis=-1)
+    matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices,
+                                             background_mask)
+    matched_gt_classes = tf.where(background_mask,
+                                  tf.zeros_like(matched_gt_classes),
+                                  matched_gt_classes)
+    matched_gt_boxes = self._target_gather(gt_boxes, matched_gt_indices,
+                                           tf.tile(background_mask, [1, 1, 4]))
+    matched_gt_boxes = tf.where(background_mask,
+                                tf.zeros_like(matched_gt_boxes),
+                                matched_gt_boxes)
+    matched_gt_indices = tf.where(
+        tf.squeeze(background_mask, -1), -tf.ones_like(matched_gt_indices),
+        matched_gt_indices)
+
+    if self._config_dict['skip_subsampling']:
+      return (boxes, matched_gt_boxes, tf.squeeze(matched_gt_classes,
+                                                  axis=-1), matched_gt_indices)
+
+    sampled_indices = self._sampler(
+        positive_matches, negative_matches, ignored_matches)
+
+    sampled_rois = self._target_gather(boxes, sampled_indices)
+    sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices)
+    sampled_gt_classes = tf.squeeze(self._target_gather(
+        matched_gt_classes, sampled_indices), axis=-1)
+    sampled_gt_indices = tf.squeeze(self._target_gather(
+        tf.expand_dims(matched_gt_indices, -1), sampled_indices), axis=-1)
+    return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
+            sampled_gt_indices)
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/modeling/maskrcnn_model.py
+++ b/official/vision/modeling/maskrcnn_model.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""R-CNN(-RS) models."""
+
+from typing import Any, List, Mapping, Optional, Tuple, Union
+
+import tensorflow as tf
+
+from official.vision.ops import anchor
+from official.vision.ops import box_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskRCNNModel(tf.keras.Model):
+  """The Mask R-CNN(-RS) and Cascade RCNN-RS models."""
+
+  def __init__(self,
+               backbone: tf.keras.Model,
+               decoder: tf.keras.Model,
+               rpn_head: tf.keras.layers.Layer,
+               detection_head: Union[tf.keras.layers.Layer,
+                                     List[tf.keras.layers.Layer]],
+               roi_generator: tf.keras.layers.Layer,
+               roi_sampler: Union[tf.keras.layers.Layer,
+                                  List[tf.keras.layers.Layer]],
+               roi_aligner: tf.keras.layers.Layer,
+               detection_generator: tf.keras.layers.Layer,
+               mask_head: Optional[tf.keras.layers.Layer] = None,
+               mask_sampler: Optional[tf.keras.layers.Layer] = None,
+               mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
+               class_agnostic_bbox_pred: bool = False,
+               cascade_class_ensemble: bool = False,
+               min_level: Optional[int] = None,
+               max_level: Optional[int] = None,
+               num_scales: Optional[int] = None,
+               aspect_ratios: Optional[List[float]] = None,
+               anchor_size: Optional[float] = None,
+               **kwargs):
+    """Initializes the R-CNN(-RS) model.
+
+    Args:
+      backbone: `tf.keras.Model`, the backbone network.
+      decoder: `tf.keras.Model`, the decoder network.
+      rpn_head: the RPN head.
+      detection_head: the detection head or a list of heads.
+      roi_generator: the ROI generator.
+      roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
+        detection heads.
+      roi_aligner: the ROI aligner.
+      detection_generator: the detection generator.
+      mask_head: the mask head.
+      mask_sampler: the mask sampler.
+      mask_roi_aligner: the ROI alginer for mask prediction.
+      class_agnostic_bbox_pred: if True, perform class agnostic bounding box
+        prediction. Needs to be `True` for Cascade RCNN models.
+      cascade_class_ensemble: if True, ensemble classification scores over all
+        detection heads.
+      min_level: Minimum level in output feature maps.
+      max_level: Maximum level in output feature maps.
+      num_scales: A number representing intermediate scales added on each level.
+        For instances, num_scales=2 adds one additional intermediate anchor
+        scales [2^0, 2^0.5] on each level.
+      aspect_ratios: A list representing the aspect raito anchors added on each
+        level. The number indicates the ratio of width to height. For instances,
+        aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
+      anchor_size: A number representing the scale of size of the base anchor to
+        the feature stride 2^level.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(MaskRCNNModel, self).__init__(**kwargs)
+    self._config_dict = {
+        'backbone': backbone,
+        'decoder': decoder,
+        'rpn_head': rpn_head,
+        'detection_head': detection_head,
+        'roi_generator': roi_generator,
+        'roi_sampler': roi_sampler,
+        'roi_aligner': roi_aligner,
+        'detection_generator': detection_generator,
+        'mask_head': mask_head,
+        'mask_sampler': mask_sampler,
+        'mask_roi_aligner': mask_roi_aligner,
+        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
+        'cascade_class_ensemble': cascade_class_ensemble,
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_scales': num_scales,
+        'aspect_ratios': aspect_ratios,
+        'anchor_size': anchor_size,
+    }
+    self.backbone = backbone
+    self.decoder = decoder
+    self.rpn_head = rpn_head
+    if not isinstance(detection_head, (list, tuple)):
+      self.detection_head = [detection_head]
+    else:
+      self.detection_head = detection_head
+    self.roi_generator = roi_generator
+    if not isinstance(roi_sampler, (list, tuple)):
+      self.roi_sampler = [roi_sampler]
+    else:
+      self.roi_sampler = roi_sampler
+    if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
+      raise ValueError(
+          '`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.'
+      )
+    self.roi_aligner = roi_aligner
+    self.detection_generator = detection_generator
+    self._include_mask = mask_head is not None
+    self.mask_head = mask_head
+    if self._include_mask and mask_sampler is None:
+      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
+    self.mask_sampler = mask_sampler
+    if self._include_mask and mask_roi_aligner is None:
+      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
+    self.mask_roi_aligner = mask_roi_aligner
+    # Weights for the regression losses for each FRCNN layer.
+    # TODO(xianzhi): Make the weights configurable.
+    self._cascade_layer_to_weights = [
+        [10.0, 10.0, 5.0, 5.0],
+        [20.0, 20.0, 10.0, 10.0],
+        [30.0, 30.0, 15.0, 15.0],
+    ]
+
+  def call(self,
+           images: tf.Tensor,
+           image_shape: tf.Tensor,
+           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
+           gt_boxes: Optional[tf.Tensor] = None,
+           gt_classes: Optional[tf.Tensor] = None,
+           gt_masks: Optional[tf.Tensor] = None,
+           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
+
+    model_outputs, intermediate_outputs = self._call_box_outputs(
+        images=images, image_shape=image_shape, anchor_boxes=anchor_boxes,
+        gt_boxes=gt_boxes, gt_classes=gt_classes, training=training)
+    if not self._include_mask:
+      return model_outputs
+
+    model_mask_outputs = self._call_mask_outputs(
+        model_box_outputs=model_outputs,
+        features=model_outputs['decoder_features'],
+        current_rois=intermediate_outputs['current_rois'],
+        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
+        matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
+        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
+        gt_masks=gt_masks,
+        training=training)
+    model_outputs.update(model_mask_outputs)
+    return model_outputs
+
+  def _get_backbone_and_decoder_features(self, images):
+
+    backbone_features = self.backbone(images)
+    if self.decoder:
+      features = self.decoder(backbone_features)
+    else:
+      features = backbone_features
+    return backbone_features, features
+
+  def _call_box_outputs(
+      self, images: tf.Tensor,
+      image_shape: tf.Tensor,
+      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
+      gt_boxes: Optional[tf.Tensor] = None,
+      gt_classes: Optional[tf.Tensor] = None,
+      training: Optional[bool] = None) -> Tuple[
+          Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
+    """Implementation of the Faster-RCNN logic for boxes."""
+    model_outputs = {}
+
+    # Feature extraction.
+    (backbone_features,
+     decoder_features) = self._get_backbone_and_decoder_features(images)
+
+    # Region proposal network.
+    rpn_scores, rpn_boxes = self.rpn_head(decoder_features)
+
+    model_outputs.update({
+        'backbone_features': backbone_features,
+        'decoder_features': decoder_features,
+        'rpn_boxes': rpn_boxes,
+        'rpn_scores': rpn_scores
+    })
+
+    # Generate anchor boxes for this batch if not provided.
+    if anchor_boxes is None:
+      _, image_height, image_width, _ = images.get_shape().as_list()
+      anchor_boxes = anchor.Anchor(
+          min_level=self._config_dict['min_level'],
+          max_level=self._config_dict['max_level'],
+          num_scales=self._config_dict['num_scales'],
+          aspect_ratios=self._config_dict['aspect_ratios'],
+          anchor_size=self._config_dict['anchor_size'],
+          image_size=(image_height, image_width)).multilevel_boxes
+      for l in anchor_boxes:
+        anchor_boxes[l] = tf.tile(
+            tf.expand_dims(anchor_boxes[l], axis=0),
+            [tf.shape(images)[0], 1, 1, 1])
+
+    # Generate RoIs.
+    current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
+                                         image_shape, training)
+
+    next_rois = current_rois
+    all_class_outputs = []
+    for cascade_num in range(len(self.roi_sampler)):
+      # In cascade RCNN we want the higher layers to have different regression
+      # weights as the predicted deltas become smaller and smaller.
+      regression_weights = self._cascade_layer_to_weights[cascade_num]
+      current_rois = next_rois
+
+      (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
+       matched_gt_classes, matched_gt_indices,
+       current_rois) = self._run_frcnn_head(
+           features=decoder_features,
+           rois=current_rois,
+           gt_boxes=gt_boxes,
+           gt_classes=gt_classes,
+           training=training,
+           model_outputs=model_outputs,
+           cascade_num=cascade_num,
+           regression_weights=regression_weights)
+      all_class_outputs.append(class_outputs)
+
+      # Generate ROIs for the next cascade head if there is any.
+      if cascade_num < len(self.roi_sampler) - 1:
+        next_rois = box_ops.decode_boxes(
+            tf.cast(box_outputs, tf.float32),
+            current_rois,
+            weights=regression_weights)
+        next_rois = box_ops.clip_boxes(next_rois,
+                                       tf.expand_dims(image_shape, axis=1))
+
+    if not training:
+      if self._config_dict['cascade_class_ensemble']:
+        class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)
+
+      detections = self.detection_generator(
+          box_outputs,
+          class_outputs,
+          current_rois,
+          image_shape,
+          regression_weights,
+          bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
+      model_outputs.update({
+          'cls_outputs': class_outputs,
+          'box_outputs': box_outputs,
+      })
+      if self.detection_generator.get_config()['apply_nms']:
+        model_outputs.update({
+            'detection_boxes': detections['detection_boxes'],
+            'detection_scores': detections['detection_scores'],
+            'detection_classes': detections['detection_classes'],
+            'num_detections': detections['num_detections']
+        })
+      else:
+        model_outputs.update({
+            'decoded_boxes': detections['decoded_boxes'],
+            'decoded_box_scores': detections['decoded_box_scores']
+        })
+
+    intermediate_outputs = {
+        'matched_gt_boxes': matched_gt_boxes,
+        'matched_gt_indices': matched_gt_indices,
+        'matched_gt_classes': matched_gt_classes,
+        'current_rois': current_rois,
+    }
+    return (model_outputs, intermediate_outputs)
+
+  def _call_mask_outputs(
+      self,
+      model_box_outputs: Mapping[str, tf.Tensor],
+      features: tf.Tensor,
+      current_rois: tf.Tensor,
+      matched_gt_indices: tf.Tensor,
+      matched_gt_boxes: tf.Tensor,
+      matched_gt_classes: tf.Tensor,
+      gt_masks: tf.Tensor,
+      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
+    """Implementation of Mask-RCNN mask prediction logic."""
+
+    model_outputs = dict(model_box_outputs)
+    if training:
+      current_rois, roi_classes, roi_masks = self.mask_sampler(
+          current_rois, matched_gt_boxes, matched_gt_classes,
+          matched_gt_indices, gt_masks)
+      roi_masks = tf.stop_gradient(roi_masks)
+
+      model_outputs.update({
+          'mask_class_targets': roi_classes,
+          'mask_targets': roi_masks,
+      })
+    else:
+      current_rois = model_outputs['detection_boxes']
+      roi_classes = model_outputs['detection_classes']
+
+    mask_logits, mask_probs = self._features_to_mask_outputs(
+        features, current_rois, roi_classes)
+
+    if training:
+      model_outputs.update({
+          'mask_outputs': mask_logits,
+      })
+    else:
+      model_outputs.update({
+          'detection_masks': mask_probs,
+      })
+    return model_outputs
+
+  def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
+                      model_outputs, cascade_num, regression_weights):
+    """Runs the frcnn head that does both class and box prediction.
+
+    Args:
+      features: `list` of features from the feature extractor.
+      rois: `list` of current rois that will be used to predict bbox refinement
+        and classes from.
+      gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
+        This tensor might have paddings with a negative value.
+      gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
+        classes. It is padded with -1s to indicate the invalid classes.
+      training: `bool`, if model is training or being evaluated.
+      model_outputs: `dict`, used for storing outputs used for eval and losses.
+      cascade_num: `int`, the current frcnn layer in the cascade.
+      regression_weights: `list`, weights used for l1 loss in bounding box
+        regression.
+
+    Returns:
+      class_outputs: Class predictions for rois.
+      box_outputs: Box predictions for rois. These are formatted for the
+        regression loss and need to be converted before being used as rois
+        in the next stage.
+      model_outputs: Updated dict with predictions used for losses and eval.
+      matched_gt_boxes: If `is_training` is true, then these give the gt box
+        location of its positive match.
+      matched_gt_classes: If `is_training` is true, then these give the gt class
+         of the predicted box.
+      matched_gt_boxes: If `is_training` is true, then these give the box
+        location of its positive match.
+      matched_gt_indices: If `is_training` is true, then gives the index of
+        the positive box match. Used for mask prediction.
+      rois: The sampled rois used for this layer.
+    """
+    # Only used during training.
+    matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
+                                                                None)
+    if training and gt_boxes is not None:
+      rois = tf.stop_gradient(rois)
+
+      current_roi_sampler = self.roi_sampler[cascade_num]
+      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
+          current_roi_sampler(rois, gt_boxes, gt_classes))
+      # Create bounding box training targets.
+      box_targets = box_ops.encode_boxes(
+          matched_gt_boxes, rois, weights=regression_weights)
+      # If the target is background, the box target is set to all 0s.
+      box_targets = tf.where(
+          tf.tile(
+              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
+              [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
+      model_outputs.update({
+          'class_targets_{}'.format(cascade_num)
+          if cascade_num else 'class_targets':
+              matched_gt_classes,
+          'box_targets_{}'.format(cascade_num)
+          if cascade_num else 'box_targets':
+              box_targets,
+      })
+
+    # Get roi features.
+    roi_features = self.roi_aligner(features, rois)
+
+    # Run frcnn head to get class and bbox predictions.
+    current_detection_head = self.detection_head[cascade_num]
+    class_outputs, box_outputs = current_detection_head(roi_features)
+
+    model_outputs.update({
+        'class_outputs_{}'.format(cascade_num)
+        if cascade_num else 'class_outputs':
+            class_outputs,
+        'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs':
+            box_outputs,
+    })
+    return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
+            matched_gt_classes, matched_gt_indices, rois)
+
+  def _features_to_mask_outputs(self, features, rois, roi_classes):
+    # Mask RoI align.
+    mask_roi_features = self.mask_roi_aligner(features, rois)
+
+    # Mask head.
+    raw_masks = self.mask_head([mask_roi_features, roi_classes])
+
+    return raw_masks, tf.nn.sigmoid(raw_masks)
+
+  @property
+  def checkpoint_items(
+      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
+    """Returns a dictionary of items to be additionally checkpointed."""
+    items = dict(
+        backbone=self.backbone,
+        rpn_head=self.rpn_head,
+        detection_head=self.detection_head)
+    if self.decoder is not None:
+      items.update(decoder=self.decoder)
+    if self._include_mask:
+      items.update(mask_head=self.mask_head)
+
+    return items
+
+  def get_config(self) -> Mapping[str, Any]:
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/modeling/maskrcnn_model_test.py
+++ b/official/vision/modeling/maskrcnn_model_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for maskrcnn_model.py."""
+
+import os
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.vision.modeling import maskrcnn_model
+from official.vision.modeling.backbones import resnet
+from official.vision.modeling.decoders import fpn
+from official.vision.modeling.heads import dense_prediction_heads
+from official.vision.modeling.heads import instance_heads
+from official.vision.modeling.layers import detection_generator
+from official.vision.modeling.layers import mask_sampler
+from official.vision.modeling.layers import roi_aligner
+from official.vision.modeling.layers import roi_generator
+from official.vision.modeling.layers import roi_sampler
+from official.vision.ops import anchor
+
+
+class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
+
+  @combinations.generate(
+      combinations.combine(
+          include_mask=[True, False],
+          use_separable_conv=[True, False],
+          build_anchor_boxes=[True, False],
+          is_training=[True, False]))
+  def test_build_model(self, include_mask, use_separable_conv,
+                       build_anchor_boxes, is_training):
+    num_classes = 3
+    min_level = 3
+    max_level = 7
+    num_scales = 3
+    aspect_ratios = [1.0]
+    anchor_size = 3
+    resnet_model_id = 50
+    num_anchors_per_location = num_scales * len(aspect_ratios)
+    image_size = 384
+    images = np.random.rand(2, image_size, image_size, 3)
+    image_shape = np.array([[image_size, image_size], [image_size, image_size]])
+
+    if build_anchor_boxes:
+      anchor_boxes = anchor.Anchor(
+          min_level=min_level,
+          max_level=max_level,
+          num_scales=num_scales,
+          aspect_ratios=aspect_ratios,
+          anchor_size=3,
+          image_size=(image_size, image_size)).multilevel_boxes
+      for l in anchor_boxes:
+        anchor_boxes[l] = tf.tile(
+            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
+    else:
+      anchor_boxes = None
+
+    backbone = resnet.ResNet(model_id=resnet_model_id)
+    decoder = fpn.FPN(
+        input_specs=backbone.output_specs,
+        min_level=min_level,
+        max_level=max_level,
+        use_separable_conv=use_separable_conv)
+    rpn_head = dense_prediction_heads.RPNHead(
+        min_level=min_level,
+        max_level=max_level,
+        num_anchors_per_location=num_anchors_per_location,
+        num_convs=1)
+    detection_head = instance_heads.DetectionHead(num_classes=num_classes)
+    roi_generator_obj = roi_generator.MultilevelROIGenerator()
+    roi_sampler_obj = roi_sampler.ROISampler()
+    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
+    detection_generator_obj = detection_generator.DetectionGenerator()
+    if include_mask:
+      mask_head = instance_heads.MaskHead(
+          num_classes=num_classes, upsample_factor=2)
+      mask_sampler_obj = mask_sampler.MaskSampler(
+          mask_target_size=28, num_sampled_masks=1)
+      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
+    else:
+      mask_head = None
+      mask_sampler_obj = None
+      mask_roi_aligner_obj = None
+    model = maskrcnn_model.MaskRCNNModel(
+        backbone,
+        decoder,
+        rpn_head,
+        detection_head,
+        roi_generator_obj,
+        roi_sampler_obj,
+        roi_aligner_obj,
+        detection_generator_obj,
+        mask_head,
+        mask_sampler_obj,
+        mask_roi_aligner_obj,
+        min_level=min_level,
+        max_level=max_level,
+        num_scales=num_scales,
+        aspect_ratios=aspect_ratios,
+        anchor_size=anchor_size)
+
+    gt_boxes = np.array(
+        [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
+         [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
+        dtype=np.float32)
+    gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
+    if include_mask:
+      gt_masks = np.ones((2, 3, 100, 100))
+    else:
+      gt_masks = None
+
+    # Results will be checked in test_forward.
+    _ = model(
+        images,
+        image_shape,
+        anchor_boxes,
+        gt_boxes,
+        gt_classes,
+        gt_masks,
+        training=is_training)
+
+  @combinations.generate(
+      combinations.combine(
+          strategy=[
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],
+          include_mask=[True, False],
+          build_anchor_boxes=[True, False],
+          use_cascade_heads=[True, False],
+          training=[True, False],
+      ))
+  def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
+                   use_cascade_heads):
+    num_classes = 3
+    min_level = 3
+    max_level = 4
+    num_scales = 3
+    aspect_ratios = [1.0]
+    anchor_size = 3
+    if use_cascade_heads:
+      cascade_iou_thresholds = [0.6]
+      class_agnostic_bbox_pred = True
+      cascade_class_ensemble = True
+    else:
+      cascade_iou_thresholds = None
+      class_agnostic_bbox_pred = False
+      cascade_class_ensemble = False
+
+    image_size = (256, 256)
+    images = np.random.rand(2, image_size[0], image_size[1], 3)
+    image_shape = np.array([[224, 100], [100, 224]])
+    with strategy.scope():
+      if build_anchor_boxes:
+        anchor_boxes = anchor.Anchor(
+            min_level=min_level,
+            max_level=max_level,
+            num_scales=num_scales,
+            aspect_ratios=aspect_ratios,
+            anchor_size=anchor_size,
+            image_size=image_size).multilevel_boxes
+      else:
+        anchor_boxes = None
+      num_anchors_per_location = len(aspect_ratios) * num_scales
+
+      input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
+      backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
+      decoder = fpn.FPN(
+          min_level=min_level,
+          max_level=max_level,
+          input_specs=backbone.output_specs)
+      rpn_head = dense_prediction_heads.RPNHead(
+          min_level=min_level,
+          max_level=max_level,
+          num_anchors_per_location=num_anchors_per_location)
+      detection_head = instance_heads.DetectionHead(
+          num_classes=num_classes,
+          class_agnostic_bbox_pred=class_agnostic_bbox_pred)
+      roi_generator_obj = roi_generator.MultilevelROIGenerator()
+
+      roi_sampler_cascade = []
+      roi_sampler_obj = roi_sampler.ROISampler()
+      roi_sampler_cascade.append(roi_sampler_obj)
+      if cascade_iou_thresholds:
+        for iou in cascade_iou_thresholds:
+          roi_sampler_obj = roi_sampler.ROISampler(
+              mix_gt_boxes=False,
+              foreground_iou_threshold=iou,
+              background_iou_high_threshold=iou,
+              background_iou_low_threshold=0.0,
+              skip_subsampling=True)
+          roi_sampler_cascade.append(roi_sampler_obj)
+      roi_aligner_obj = roi_aligner.MultilevelROIAligner()
+      detection_generator_obj = detection_generator.DetectionGenerator()
+      if include_mask:
+        mask_head = instance_heads.MaskHead(
+            num_classes=num_classes, upsample_factor=2)
+        mask_sampler_obj = mask_sampler.MaskSampler(
+            mask_target_size=28, num_sampled_masks=1)
+        mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
+      else:
+        mask_head = None
+        mask_sampler_obj = None
+        mask_roi_aligner_obj = None
+      model = maskrcnn_model.MaskRCNNModel(
+          backbone,
+          decoder,
+          rpn_head,
+          detection_head,
+          roi_generator_obj,
+          roi_sampler_obj,
+          roi_aligner_obj,
+          detection_generator_obj,
+          mask_head,
+          mask_sampler_obj,
+          mask_roi_aligner_obj,
+          class_agnostic_bbox_pred=class_agnostic_bbox_pred,
+          cascade_class_ensemble=cascade_class_ensemble,
+          min_level=min_level,
+          max_level=max_level,
+          num_scales=num_scales,
+          aspect_ratios=aspect_ratios,
+          anchor_size=anchor_size)
+
+      gt_boxes = np.array(
+          [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
+           [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
+          dtype=np.float32)
+      gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
+      if include_mask:
+        gt_masks = np.ones((2, 3, 100, 100))
+      else:
+        gt_masks = None
+
+      results = model(
+          images,
+          image_shape,
+          anchor_boxes,
+          gt_boxes,
+          gt_classes,
+          gt_masks,
+          training=training)
+
+    self.assertIn('rpn_boxes', results)
+    self.assertIn('rpn_scores', results)
+    if training:
+      self.assertIn('class_targets', results)
+      self.assertIn('box_targets', results)
+      self.assertIn('class_outputs', results)
+      self.assertIn('box_outputs', results)
+      if include_mask:
+        self.assertIn('mask_outputs', results)
+    else:
+      self.assertIn('detection_boxes', results)
+      self.assertIn('detection_scores', results)
+      self.assertIn('detection_classes', results)
+      self.assertIn('num_detections', results)
+      if include_mask:
+        self.assertIn('detection_masks', results)
+
+  @parameterized.parameters((False,), (True,))
+  def test_serialize_deserialize(self, include_mask):
+    """Round-trips a Mask R-CNN model through `get_config`/`from_config`.
+
+    Args:
+      include_mask: if True, a mask head, mask sampler and mask ROI aligner are
+        built and passed to the model; otherwise `None` is passed for all three.
+    """
+    min_level, max_level = 3, 7
+    image_spec = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
+    backbone = resnet.ResNet(model_id=50, input_specs=image_spec)
+    decoder = fpn.FPN(
+        min_level=min_level,
+        max_level=max_level,
+        input_specs=backbone.output_specs)
+    rpn_head = dense_prediction_heads.RPNHead(
+        min_level=min_level, max_level=max_level, num_anchors_per_location=3)
+    detection_head = instance_heads.DetectionHead(num_classes=2)
+    # Box-branch helpers, in the positional order the model constructor takes.
+    box_components = (
+        roi_generator.MultilevelROIGenerator(),
+        roi_sampler.ROISampler(),
+        roi_aligner.MultilevelROIAligner(),
+        detection_generator.DetectionGenerator(),
+    )
+    # The mask branch is optional; its three slots stay `None` when disabled.
+    mask_components = (None, None, None)
+    if include_mask:
+      mask_components = (
+          instance_heads.MaskHead(num_classes=2, upsample_factor=2),
+          mask_sampler.MaskSampler(mask_target_size=28, num_sampled_masks=1),
+          roi_aligner.MultilevelROIAligner(crop_size=14),
+      )
+    model = maskrcnn_model.MaskRCNNModel(
+        backbone,
+        decoder,
+        rpn_head,
+        detection_head,
+        *box_components,
+        *mask_components,
+        min_level=min_level,
+        max_level=max_level,
+        num_scales=3,
+        aspect_ratios=[1.0],
+        anchor_size=3)
+
+    restored = maskrcnn_model.MaskRCNNModel.from_config(model.get_config())
+
+    # Serializing the rebuilt model to JSON must not raise.
+    restored.to_json()
+
+    # A faithful round trip yields identical configs.
+    original_config = model.get_config()
+    restored_config = restored.get_config()
+    self.assertAllEqual(original_config, restored_config)
+
+  @parameterized.parameters((False,), (True,))
+  def test_checkpoint(self, include_mask):
+    """Checks `checkpoint_items` and partial loading of a saved checkpoint.
+
+    Args:
+      include_mask: if True, the mask branch is built and 'mask_head' is
+        expected among the checkpoint items.
+    """
+    min_level, max_level = 3, 7
+    image_spec = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
+    backbone = resnet.ResNet(model_id=50, input_specs=image_spec)
+    decoder = fpn.FPN(
+        min_level=min_level,
+        max_level=max_level,
+        input_specs=backbone.output_specs)
+    rpn_head = dense_prediction_heads.RPNHead(
+        min_level=min_level, max_level=max_level, num_anchors_per_location=3)
+    detection_head = instance_heads.DetectionHead(num_classes=2)
+    proposal_generator = roi_generator.MultilevelROIGenerator()
+    proposal_sampler = roi_sampler.ROISampler()
+    box_aligner = roi_aligner.MultilevelROIAligner()
+    box_postprocessor = detection_generator.DetectionGenerator()
+
+    # The mask branch is optional; its three slots stay `None` when disabled.
+    mask_head = mask_target_sampler = mask_aligner = None
+    if include_mask:
+      mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
+      mask_target_sampler = mask_sampler.MaskSampler(
+          mask_target_size=28, num_sampled_masks=1)
+      mask_aligner = roi_aligner.MultilevelROIAligner(crop_size=14)
+
+    model = maskrcnn_model.MaskRCNNModel(
+        backbone,
+        decoder,
+        rpn_head,
+        detection_head,
+        proposal_generator,
+        proposal_sampler,
+        box_aligner,
+        box_postprocessor,
+        mask_head,
+        mask_target_sampler,
+        mask_aligner,
+        min_level=min_level,
+        max_level=max_level,
+        num_scales=3,
+        aspect_ratios=[1.0],
+        anchor_size=3)
+
+    expected_items = {
+        'backbone': backbone,
+        'decoder': decoder,
+        'rpn_head': rpn_head,
+        'detection_head': [detection_head],
+    }
+    if include_mask:
+      expected_items['mask_head'] = mask_head
+    self.assertAllEqual(expected_items, model.checkpoint_items)
+
+    # Save a full checkpoint, then load subsets of it into fresh Checkpoints.
+    full_ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
+    ckpt_dir = self.create_tempdir().full_path
+    full_ckpt.save(os.path.join(ckpt_dir, 'ckpt'))
+
+    backbone_only = tf.train.Checkpoint(backbone=backbone)
+    read_status = backbone_only.read(tf.train.latest_checkpoint(ckpt_dir))
+    read_status.expect_partial().assert_existing_objects_matched()
+
+    if include_mask:
+      backbone_and_mask = tf.train.Checkpoint(
+          backbone=backbone, mask_head=mask_head)
+      restore_status = backbone_and_mask.restore(
+          tf.train.latest_checkpoint(ckpt_dir))
+      restore_status.expect_partial().assert_existing_objects_matched()
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/retinanet_model.py
+++ b/official/vision/modeling/retinanet_model.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""RetinaNet."""
+from typing import Any, Mapping, List, Optional, Union
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import anchor
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class RetinaNetModel(tf.keras.Model):
+  """The RetinaNet model class.
+
+  Chains a backbone, an optional decoder and a dense prediction head. Outside
+  of training, the raw head outputs are additionally passed through the
+  detection generator for post-processing.
+  """
+
+  def __init__(self,
+               backbone: tf.keras.Model,
+               decoder: tf.keras.Model,
+               head: tf.keras.layers.Layer,
+               detection_generator: tf.keras.layers.Layer,
+               min_level: Optional[int] = None,
+               max_level: Optional[int] = None,
+               num_scales: Optional[int] = None,
+               aspect_ratios: Optional[List[float]] = None,
+               anchor_size: Optional[float] = None,
+               **kwargs):
+    """RetinaNet initialization function.
+
+    The anchor-related arguments (`min_level`, `max_level`, `num_scales`,
+    `aspect_ratios`, `anchor_size`) are only stored in the config here; `call`
+    reads them when it has to generate anchor boxes itself.
+
+    Args:
+      backbone: `tf.keras.Model` a backbone network.
+      decoder: `tf.keras.Model` a decoder network. `call` skips it when it is
+        falsy (e.g. `None`).
+      head: `RetinaNetHead`, the RetinaNet head.
+      detection_generator: the detection generator.
+      min_level: Minimum level in output feature maps.
+      max_level: Maximum level in output feature maps.
+      num_scales: A number representing intermediate scales added
+        on each level. For instance, num_scales=2 adds one additional
+        intermediate anchor scales [2^0, 2^0.5] on each level.
+      aspect_ratios: A list representing the aspect ratio
+        anchors added on each level. The number indicates the ratio of width to
+        height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
+        on each scale level.
+      anchor_size: A number representing the scale of size of the base
+        anchor to the feature stride 2^level.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(RetinaNetModel, self).__init__(**kwargs)
+    # Constructor arguments, kept verbatim so `get_config` can return them.
+    self._config_dict = {
+        'backbone': backbone,
+        'decoder': decoder,
+        'head': head,
+        'detection_generator': detection_generator,
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_scales': num_scales,
+        'aspect_ratios': aspect_ratios,
+        'anchor_size': anchor_size,
+    }
+    self._backbone = backbone
+    self._decoder = decoder
+    self._head = head
+    self._detection_generator = detection_generator
+
+  def call(self,
+           images: tf.Tensor,
+           image_shape: Optional[tf.Tensor] = None,
+           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
+           output_intermediate_features: bool = False,
+           training: bool = None) -> Mapping[str, tf.Tensor]:
+    """Forward pass of the RetinaNet model.
+
+    Args:
+      images: `Tensor`, the input batched images, whose shape is
+        [batch, height, width, 3].
+      image_shape: `Tensor`, the actual shape of the input images, whose shape
+        is [batch, 2] where the last dimension is [height, width]. Note that
+        this is the actual image shape excluding paddings. For example, images
+        in the batch may be resized into different shapes before padding to the
+        fixed size.
+      anchor_boxes: a dict of tensors which includes multilevel anchors. Only
+        used outside of training. If `None`, anchors are generated from the
+        static height/width of `images` and the anchor settings given at
+        construction, then tiled along a new leading batch dimension.
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the anchor coordinates of a particular feature
+            level. Internally generated anchors are rank 4 with the batch
+            dimension first; provided anchors presumably need the same
+            layout (TODO confirm against the detection generator).
+      output_intermediate_features: `bool` indicating whether to return the
+        intermediate feature maps generated by backbone and decoder.
+      training: `bool`, indicating whether it is in training mode. `None` is
+        treated like `False`. It is not passed explicitly to the sub-networks
+        in this method.
+
+    Returns:
+      A dict of outputs. The following keys are always present:
+      'cls_outputs': a dict of tensors which includes scores of the predictions.
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the box scores predicted from a particular feature
+            level, whose shape is
+            [batch, height_l, width_l, num_classes * num_anchors_per_location].
+      'box_outputs': a dict of tensors which includes coordinates of the
+        predictions.
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the box coordinates predicted from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, 4 * num_anchors_per_location].
+      If the head returns non-empty attributes:
+      'attribute_outputs': a dict of (attribute_name, attribute_predictions).
+        Each attribute prediction is a dict that includes:
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the attribute predictions from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, att_size * num_anchors_per_location].
+      If `output_intermediate_features` is set:
+      'backbone_{key}' and 'decoder_{key}': one entry per feature map of the
+        backbone and of the decoder, respectively.
+      If not training, depending on the detection generator's 'apply_nms'
+      config entry, either 'detection_boxes', 'detection_scores',
+      'detection_classes' and 'num_detections', or 'decoded_boxes' and
+      'decoded_box_scores'; plus 'detection_attributes' if the head returns
+      non-empty attributes.
+    """
+    outputs = {}
+    # Feature extraction.
+    features = self.backbone(images)
+    if output_intermediate_features:
+      outputs.update(
+          {'backbone_{}'.format(k): v for k, v in features.items()})
+    if self.decoder:
+      features = self.decoder(features)
+    # Without a decoder, `features` still holds the backbone outputs here, so
+    # the 'decoder_*' entries then repeat the backbone feature maps.
+    if output_intermediate_features:
+      outputs.update(
+          {'decoder_{}'.format(k): v for k, v in features.items()})
+
+    # Dense prediction. `raw_attributes` can be empty.
+    raw_scores, raw_boxes, raw_attributes = self.head(features)
+
+    # `training=None` is falsy and therefore takes the inference branch.
+    if training:
+      outputs.update({
+          'cls_outputs': raw_scores,
+          'box_outputs': raw_boxes,
+      })
+      if raw_attributes:
+        outputs.update({'attribute_outputs': raw_attributes})
+      return outputs
+    else:
+      # Generate anchor boxes for this batch if not provided.
+      if anchor_boxes is None:
+        # Height and width are read from the static shape of `images`.
+        _, image_height, image_width, _ = images.get_shape().as_list()
+        anchor_boxes = anchor.Anchor(
+            min_level=self._config_dict['min_level'],
+            max_level=self._config_dict['max_level'],
+            num_scales=self._config_dict['num_scales'],
+            aspect_ratios=self._config_dict['aspect_ratios'],
+            anchor_size=self._config_dict['anchor_size'],
+            image_size=(image_height, image_width)).multilevel_boxes
+        # Repeat each level's anchors along a new leading batch dimension.
+        for l in anchor_boxes:
+          anchor_boxes[l] = tf.tile(
+              tf.expand_dims(anchor_boxes[l], axis=0),
+              [tf.shape(images)[0], 1, 1, 1])
+
+      # Post-processing.
+      final_results = self.detection_generator(raw_boxes, raw_scores,
+                                               anchor_boxes, image_shape,
+                                               raw_attributes)
+      outputs.update({
+          'cls_outputs': raw_scores,
+          'box_outputs': raw_boxes,
+      })
+      # The keys copied from `final_results` depend on whether the detection
+      # generator is configured to apply NMS.
+      if self.detection_generator.get_config()['apply_nms']:
+        outputs.update({
+            'detection_boxes': final_results['detection_boxes'],
+            'detection_scores': final_results['detection_scores'],
+            'detection_classes': final_results['detection_classes'],
+            'num_detections': final_results['num_detections']
+        })
+      else:
+        outputs.update({
+            'decoded_boxes': final_results['decoded_boxes'],
+            'decoded_box_scores': final_results['decoded_box_scores']
+        })
+
+      if raw_attributes:
+        outputs.update({
+            'attribute_outputs': raw_attributes,
+            'detection_attributes': final_results['detection_attributes'],
+        })
+      return outputs
+
+  @property
+  def checkpoint_items(
+      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
+    """Returns a dictionary of items to be additionally checkpointed."""
+    items = dict(backbone=self.backbone, head=self.head)
+    # The decoder is only listed when one was provided.
+    if self.decoder is not None:
+      items.update(decoder=self.decoder)
+
+    return items
+
+  @property
+  def backbone(self) -> tf.keras.Model:
+    """The backbone network passed to the constructor."""
+    return self._backbone
+
+  @property
+  def decoder(self) -> tf.keras.Model:
+    """The decoder network passed to the constructor."""
+    return self._decoder
+
+  @property
+  def head(self) -> tf.keras.layers.Layer:
+    """The prediction head passed to the constructor."""
+    return self._head
+
+  @property
+  def detection_generator(self) -> tf.keras.layers.Layer:
+    """The detection generator passed to the constructor."""
+    return self._detection_generator
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the stored constructor arguments (the dict itself, no copy)."""
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a model by passing the `config` entries as keyword arguments."""
+    return cls(**config)
--- a/official/vision/modeling/retinanet_model_test.py
+++ b/official/vision/modeling/retinanet_model_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for RetinaNet models."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.vision.modeling import retinanet_model
+from official.vision.modeling.backbones import resnet
+from official.vision.modeling.decoders import fpn
+from official.vision.modeling.heads import dense_prediction_heads
+from official.vision.modeling.layers import detection_generator
+from official.vision.ops import anchor
+
+
+class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for building, running and serializing `RetinaNetModel`."""
+
+  @parameterized.parameters(
+      {
+          'use_separable_conv': True,
+          'build_anchor_boxes': True,
+          'is_training': False,
+          'has_att_heads': False
+      },
+      {
+          'use_separable_conv': False,
+          'build_anchor_boxes': True,
+          'is_training': False,
+          'has_att_heads': False
+      },
+      {
+          'use_separable_conv': False,
+          'build_anchor_boxes': False,
+          'is_training': False,
+          'has_att_heads': False
+      },
+      {
+          'use_separable_conv': False,
+          'build_anchor_boxes': False,
+          'is_training': True,
+          'has_att_heads': False
+      },
+      {
+          'use_separable_conv': False,
+          'build_anchor_boxes': True,
+          'is_training': True,
+          'has_att_heads': True
+      },
+      {
+          'use_separable_conv': False,
+          'build_anchor_boxes': True,
+          'is_training': False,
+          'has_att_heads': True
+      },
+  )
+  def test_build_model(self, use_separable_conv, build_anchor_boxes,
+                       is_training, has_att_heads):
+    """Builds a RetinaNet and runs one forward pass without checking outputs.
+
+    Args:
+      use_separable_conv: forwarded to both the FPN decoder and the head.
+      build_anchor_boxes: if True, anchors are built here and passed to the
+        model; otherwise `None` is passed and the model generates them itself
+        when not training.
+      is_training: value of the `training` argument of the model call.
+      has_att_heads: if True, the head gets one 'depth' regression attribute
+        head.
+    """
+    num_classes = 3
+    min_level = 3
+    max_level = 7
+    num_scales = 3
+    aspect_ratios = [1.0]
+    anchor_size = 3
+    fpn_num_filters = 256
+    head_num_convs = 4
+    head_num_filters = 256
+    num_anchors_per_location = num_scales * len(aspect_ratios)
+    image_size = 384
+    images = np.random.rand(2, image_size, image_size, 3)
+    image_shape = np.array([[image_size, image_size], [image_size, image_size]])
+
+    if build_anchor_boxes:
+      anchor_boxes = anchor.Anchor(
+          min_level=min_level,
+          max_level=max_level,
+          num_scales=num_scales,
+          aspect_ratios=aspect_ratios,
+          anchor_size=anchor_size,
+          image_size=(image_size, image_size)).multilevel_boxes
+      # Repeat the anchors for each of the two images in the batch.
+      for l in anchor_boxes:
+        anchor_boxes[l] = tf.tile(
+            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
+    else:
+      anchor_boxes = None
+
+    if has_att_heads:
+      attribute_heads = [dict(name='depth', type='regression', size=1)]
+    else:
+      attribute_heads = None
+
+    backbone = resnet.ResNet(model_id=50)
+    decoder = fpn.FPN(
+        input_specs=backbone.output_specs,
+        min_level=min_level,
+        max_level=max_level,
+        num_filters=fpn_num_filters,
+        use_separable_conv=use_separable_conv)
+    head = dense_prediction_heads.RetinaNetHead(
+        min_level=min_level,
+        max_level=max_level,
+        num_classes=num_classes,
+        attribute_heads=attribute_heads,
+        num_anchors_per_location=num_anchors_per_location,
+        use_separable_conv=use_separable_conv,
+        num_convs=head_num_convs,
+        num_filters=head_num_filters)
+    generator = detection_generator.MultilevelDetectionGenerator(
+        max_num_detections=10)
+    model = retinanet_model.RetinaNetModel(
+        backbone=backbone,
+        decoder=decoder,
+        head=head,
+        detection_generator=generator,
+        min_level=min_level,
+        max_level=max_level,
+        num_scales=num_scales,
+        aspect_ratios=aspect_ratios,
+        anchor_size=anchor_size)
+
+    # Only checks that the call completes; the result is discarded.
+    _ = model(images, image_shape, anchor_boxes, training=is_training)
+
+  @combinations.generate(
+      combinations.combine(
+          strategy=[
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],
+          image_size=[
+              (128, 128),
+          ],
+          training=[True, False],
+          has_att_heads=[True, False],
+          output_intermediate_features=[True, False],
+          soft_nms_sigma=[None, 0.0, 0.1],
+      ))
+  def test_forward(self, strategy, image_size, training, has_att_heads,
+                   output_intermediate_features, soft_nms_sigma):
+    """Runs an R50-FPN RetinaNet forward pass and checks the output shapes.
+
+    Args:
+      strategy: distribution strategy whose scope the model is built and
+        called in.
+      image_size: (height, width) of the random input images.
+      training: value of the `training` argument of the model call.
+      has_att_heads: if True, the head gets one 'depth' regression attribute
+        head.
+      output_intermediate_features: whether backbone and decoder feature maps
+        are requested from the model and checked.
+      soft_nms_sigma: forwarded to the detection generator.
+    """
+    tf.keras.backend.set_image_data_format('channels_last')
+    num_classes = 3
+    min_level = 3
+    max_level = 7
+    num_scales = 3
+    aspect_ratios = [1.0]
+    num_anchors_per_location = num_scales * len(aspect_ratios)
+
+    images = np.random.rand(2, image_size[0], image_size[1], 3)
+    image_shape = np.array(
+        [[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
+
+    with strategy.scope():
+      anchor_gen = anchor.build_anchor_generator(
+          min_level=min_level,
+          max_level=max_level,
+          num_scales=num_scales,
+          aspect_ratios=aspect_ratios,
+          anchor_size=3)
+      anchor_boxes = anchor_gen(image_size)
+      # Repeat the anchors for each of the two images in the batch.
+      for l in anchor_boxes:
+        anchor_boxes[l] = tf.tile(
+            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
+
+      backbone = resnet.ResNet(model_id=50)
+      decoder = fpn.FPN(
+          input_specs=backbone.output_specs,
+          min_level=min_level,
+          max_level=max_level)
+
+      if has_att_heads:
+        attribute_heads = [dict(name='depth', type='regression', size=1)]
+      else:
+        attribute_heads = None
+      head = dense_prediction_heads.RetinaNetHead(
+          min_level=min_level,
+          max_level=max_level,
+          num_classes=num_classes,
+          attribute_heads=attribute_heads,
+          num_anchors_per_location=num_anchors_per_location)
+      # CPU NMS is requested whenever a soft-NMS sigma is given, including 0.0.
+      generator = detection_generator.MultilevelDetectionGenerator(
+          max_num_detections=10,
+          nms_version='v1',
+          use_cpu_nms=soft_nms_sigma is not None,
+          soft_nms_sigma=soft_nms_sigma)
+      # No anchor settings are passed to the model because anchor boxes are
+      # always supplied to the call below.
+      model = retinanet_model.RetinaNetModel(
+          backbone=backbone,
+          decoder=decoder,
+          head=head,
+          detection_generator=generator)
+
+      model_outputs = model(
+          images,
+          image_shape,
+          anchor_boxes,
+          output_intermediate_features=output_intermediate_features,
+          training=training)
+
+    if training:
+      # Training returns the raw per-level head outputs.
+      cls_outputs = model_outputs['cls_outputs']
+      box_outputs = model_outputs['box_outputs']
+      for level in range(min_level, max_level + 1):
+        self.assertIn(str(level), cls_outputs)
+        self.assertIn(str(level), box_outputs)
+        self.assertAllEqual([
+            2,
+            image_size[0] // 2**level,
+            image_size[1] // 2**level,
+            num_classes * num_anchors_per_location
+        ], cls_outputs[str(level)].numpy().shape)
+        self.assertAllEqual([
+            2,
+            image_size[0] // 2**level,
+            image_size[1] // 2**level,
+            4 * num_anchors_per_location
+        ], box_outputs[str(level)].numpy().shape)
+        if has_att_heads:
+          att_outputs = model_outputs['attribute_outputs']
+          for att in att_outputs.values():
+            self.assertAllEqual([
+                2, image_size[0] // 2**level, image_size[1] // 2**level,
+                1 * num_anchors_per_location
+            ], att[str(level)].numpy().shape)
+    else:
+      # Inference returns post-processed detections, capped at 10 per image.
+      self.assertIn('detection_boxes', model_outputs)
+      self.assertIn('detection_scores', model_outputs)
+      self.assertIn('detection_classes', model_outputs)
+      self.assertIn('num_detections', model_outputs)
+      self.assertAllEqual(
+          [2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
+      self.assertAllEqual(
+          [2, 10], model_outputs['detection_scores'].numpy().shape)
+      self.assertAllEqual(
+          [2, 10], model_outputs['detection_classes'].numpy().shape)
+      self.assertAllEqual(
+          [2,], model_outputs['num_detections'].numpy().shape)
+      if has_att_heads:
+        self.assertIn('detection_attributes', model_outputs)
+        self.assertAllEqual(
+            [2, 10, 1],
+            model_outputs['detection_attributes']['depth'].numpy().shape)
+    if output_intermediate_features:
+      # Backbone feature maps are expected for levels 2 through 5.
+      for l in range(2, 6):
+        self.assertIn('backbone_{}'.format(l), model_outputs)
+        self.assertAllEqual([
+            2, image_size[0] // 2**l, image_size[1] // 2**l,
+            backbone.output_specs[str(l)].as_list()[-1]
+        ], model_outputs['backbone_{}'.format(l)].numpy().shape)
+      # Decoder feature maps are expected for every level of the pyramid.
+      for l in range(min_level, max_level + 1):
+        self.assertIn('decoder_{}'.format(l), model_outputs)
+        self.assertAllEqual([
+            2, image_size[0] // 2**l, image_size[1] // 2**l,
+            decoder.output_specs[str(l)].as_list()[-1]
+        ], model_outputs['decoder_{}'.format(l)].numpy().shape)
+
+  def test_serialize_deserialize(self):
+    """Validate the network can be serialized and deserialized."""
+    num_classes = 3
+    min_level = 3
+    max_level = 7
+    num_scales = 3
+    aspect_ratios = [1.0]
+    num_anchors_per_location = num_scales * len(aspect_ratios)
+
+    backbone = resnet.ResNet(model_id=50)
+    decoder = fpn.FPN(
+        input_specs=backbone.output_specs,
+        min_level=min_level,
+        max_level=max_level)
+    head = dense_prediction_heads.RetinaNetHead(
+        min_level=min_level,
+        max_level=max_level,
+        num_classes=num_classes,
+        num_anchors_per_location=num_anchors_per_location)
+    generator = detection_generator.MultilevelDetectionGenerator(
+        max_num_detections=10)
+    model = retinanet_model.RetinaNetModel(
+        backbone=backbone,
+        decoder=decoder,
+        head=head,
+        detection_generator=generator,
+        min_level=min_level,
+        max_level=max_level,
+        num_scales=num_scales,
+        aspect_ratios=aspect_ratios,
+        anchor_size=3)
+
+    # Rebuild a second model from the first model's config.
+    config = model.get_config()
+    new_model = retinanet_model.RetinaNetModel.from_config(config)
+
+    # Validate that the config can be forced to JSON.
+    _ = new_model.to_json()
+
+    # If the serialization was successful, the new config should match the old.
+    self.assertAllEqual(model.get_config(), new_model.get_config())
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/segmentation_model.py
+++ b/official/vision/modeling/segmentation_model.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Build segmentation models."""
+from typing import Any, Mapping, Union, Optional, Dict
+
+# Import libraries
+import tensorflow as tf
+
+# Short module-level alias for the Keras layers namespace.
+layers = tf.keras.layers
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SegmentationModel(tf.keras.Model):
+  """A Segmentation class model.
+
+  Input images are passed through backbone first. Decoder network is then
+  applied, and finally, segmentation head is applied on the output of the
+  decoder network. Layers such as ASPP should be part of decoder. Any feature
+  fusion is done as part of the segmentation head (i.e. deeplabv3+ feature
+  fusion is not part of the decoder, instead it is part of the segmentation
+  head). This way, different feature fusion techniques can be combined with
+  different backbones, and decoders.
+  """
+
+  def __init__(self, backbone: tf.keras.Model, decoder: tf.keras.Model,
+               head: tf.keras.layers.Layer,
+               mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
+               **kwargs):
+    """Segmentation initialization function.
+
+    Args:
+      backbone: a backbone network.
+      decoder: a decoder network. E.g. FPN. `call` skips it when it is falsy
+        (e.g. `None`).
+      head: segmentation head.
+      mask_scoring_head: mask scoring head. Applied to the logits in `call`
+        when set.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(SegmentationModel, self).__init__(**kwargs)
+    # Constructor arguments, kept verbatim so `get_config` can return them.
+    self._config_dict = {
+        'backbone': backbone,
+        'decoder': decoder,
+        'head': head,
+        'mask_scoring_head': mask_scoring_head,
+    }
+    self.backbone = backbone
+    self.decoder = decoder
+    self.head = head
+    self.mask_scoring_head = mask_scoring_head
+
+  def call(self, inputs: tf.Tensor, training: bool = None
+           ) -> Dict[str, tf.Tensor]:
+    """Runs the backbone, the optional decoder and the head on `inputs`.
+
+    Args:
+      inputs: input tensor handed to the backbone.
+      training: not referenced in this method body.
+
+    Returns:
+      A dict with key 'logits' holding the head output and, when a mask
+      scoring head is set, key 'mask_scores' holding that head's output
+      computed from the logits.
+    """
+    backbone_features = self.backbone(inputs)
+
+    # Without a decoder, the head receives the backbone features twice.
+    if self.decoder:
+      decoder_features = self.decoder(backbone_features)
+    else:
+      decoder_features = backbone_features
+
+    # The head gets both feature sets as one (backbone, decoder) tuple.
+    logits = self.head((backbone_features, decoder_features))
+    outputs = {'logits': logits}
+    if self.mask_scoring_head:
+      mask_scores = self.mask_scoring_head(logits)
+      outputs.update({'mask_scores': mask_scores})
+    return outputs
+
+  @property
+  def checkpoint_items(
+      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
+    """Returns a dictionary of items to be additionally checkpointed."""
+    items = dict(backbone=self.backbone, head=self.head)
+    # Optional components are only listed when they were provided.
+    if self.decoder is not None:
+      items.update(decoder=self.decoder)
+    if self.mask_scoring_head is not None:
+      items.update(mask_scoring_head=self.mask_scoring_head)
+    return items
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the stored constructor arguments (the dict itself, no copy)."""
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds a model from `config`; `custom_objects` is not used here."""
+    return cls(**config)
--- a/official/vision/modeling/segmentation_model_test.py
+++ b/official/vision/modeling/segmentation_model_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for segmentation network."""
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling import backbones
+from official.vision.modeling import segmentation_model
+from official.vision.modeling.decoders import fpn
+from official.vision.modeling.heads import segmentation_heads
+
+
+class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):
+  """Covers the forward pass and config round trip of `SegmentationModel`."""
+
+  @parameterized.parameters(
+      *[(size, lvl) for size in (128, 256) for lvl in (2, 3, 4)])
+  def test_segmentation_network_creation(self, input_size, level):
+    """Checks the logits shape of a ResNet-50/FPN segmentation model."""
+    batch_size = 2
+    num_classes = 10
+    tf.keras.backend.set_image_data_format('channels_last')
+    images = np.random.rand(batch_size, input_size, input_size, 3)
+
+    backbone = backbones.ResNet(model_id=50)
+    decoder = fpn.FPN(
+        min_level=2, max_level=7, input_specs=backbone.output_specs)
+    head = segmentation_heads.SegmentationHead(num_classes, level=level)
+    model = segmentation_model.SegmentationModel(
+        backbone=backbone, decoder=decoder, head=head, mask_scoring_head=None)
+
+    logits = model(images)['logits']
+
+    # Logits are expected at a stride of 2**level relative to the input.
+    feature_size = input_size // 2**level
+    expected_shape = [batch_size, feature_size, feature_size, num_classes]
+    self.assertAllEqual(expected_shape, logits.numpy().shape)
+
+  def test_serialize_deserialize(self):
+    """Checks that the model config survives a `from_config` round trip."""
+    backbone = backbones.ResNet(model_id=50)
+    decoder = fpn.FPN(
+        min_level=3, max_level=7, input_specs=backbone.output_specs)
+    head = segmentation_heads.SegmentationHead(3, level=3)
+    model = segmentation_model.SegmentationModel(
+        backbone=backbone, decoder=decoder, head=head)
+
+    restored = segmentation_model.SegmentationModel.from_config(
+        model.get_config())
+
+    # Serializing the rebuilt model to JSON must not raise.
+    restored.to_json()
+
+    # A faithful round trip yields identical configs.
+    original_config = model.get_config()
+    restored_config = restored.get_config()
+    self.assertAllEqual(original_config, restored_config)
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/video_classification_model.py
+++ b/official/vision/modeling/video_classification_model.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Build video classification models."""
+from typing import Any, Mapping, Optional, Union, List, Text
+
+import tensorflow as tf
+
+# Short module-level alias for the Keras layers namespace.
+layers = tf.keras.layers
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class VideoClassificationModel(tf.keras.Model):
+  """A video classification class builder."""
+
+  def __init__(
+      self,
+      backbone: tf.keras.Model,
+      num_classes: int,
+      input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
+      dropout_rate: float = 0.0,
+      aggregate_endpoints: bool = False,
+      kernel_initializer: str = 'random_uniform',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      require_endpoints: Optional[List[Text]] = None,
+      **kwargs):
+    """Video Classification initialization function.
+
+    Args:
+      backbone: a 3d backbone network.
+      num_classes: `int` number of classes in classification task.
+      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
+      dropout_rate: `float` rate for dropout regularization.
+      aggregate_endpoints: `bool` aggregate all end points or only use the
+        final end point.
+      kernel_initializer: kernel initializer for the dense layer.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to
+        None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object. Default to
+        None.
+      require_endpoints: the required endpoints for prediction. If None or
+        empty, then only uses the final endpoint.
+      **kwargs: keyword arguments to be passed.
+    """
+    if not input_specs:
+      input_specs = {
+          'image': layers.InputSpec(shape=[None, None, None, None, 3])
+      }
+    self._self_setattr_tracking = False
+    self._config_dict = {
+        'backbone': backbone,
+        'num_classes': num_classes,
+        'input_specs': input_specs,
+        'dropout_rate': dropout_rate,
+        'aggregate_endpoints': aggregate_endpoints,
+        'kernel_initializer': kernel_initializer,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+        'require_endpoints': require_endpoints,
+    }
+    self._input_specs = input_specs
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._backbone = backbone
+
+    inputs = {
+        k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items()
+    }
+    endpoints = backbone(inputs['image'])
+
+    if aggregate_endpoints:
+      pooled_feats = []
+      for endpoint in endpoints.values():
+        x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint)
+        pooled_feats.append(x_pool)
+      x = tf.concat(pooled_feats, axis=1)
+    else:
+      if not require_endpoints:
+        # Uses the last endpoint for prediction.
+        x = endpoints[max(endpoints.keys())]
+        x = tf.keras.layers.GlobalAveragePooling3D()(x)
+      else:
+        # Concats all the required endpoints for prediction.
+        outputs = []
+        for name in require_endpoints:
+          x = endpoints[name]
+          x = tf.keras.layers.GlobalAveragePooling3D()(x)
+          outputs.append(x)
+        x = tf.concat(outputs, axis=1)
+
+    x = tf.keras.layers.Dropout(dropout_rate)(x)
+    x = tf.keras.layers.Dense(
+        num_classes, kernel_initializer=kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)(
+            x)
+
+    super(VideoClassificationModel, self).__init__(
+        inputs=inputs, outputs=x, **kwargs)
+
+  @property
+  def checkpoint_items(
+      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
+    """Returns a dictionary of items to be additionally checkpointed."""
+    return dict(backbone=self.backbone)
+
+  @property
+  def backbone(self) -> tf.keras.Model:
+    return self._backbone
+
+  def get_config(self) -> Mapping[str, Any]:
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    return cls(**config)
--- a/official/vision/modeling/video_classification_model_test.py
+++ b/official/vision/modeling/video_classification_model_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for video classification network."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling import backbones
+from official.vision.modeling import video_classification_model
+
+
+class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (50, 8, 112, 'relu', False),
+      (50, 8, 112, 'swish', True),
+  )
+  def test_resnet3d_network_creation(self, model_id, temporal_size,
+                                     spatial_size, activation,
+                                     aggregate_endpoints):
+    """Test for creation of a ResNet3D-50 classifier."""
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, temporal_size, spatial_size, spatial_size, 3])
+    temporal_strides = [1, 1, 1, 1]
+    temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
+                             (1, 3, 1)]
+
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    backbone = backbones.ResNet3D(
+        model_id=model_id,
+        temporal_strides=temporal_strides,
+        temporal_kernel_sizes=temporal_kernel_sizes,
+        input_specs=input_specs,
+        activation=activation)
+
+    num_classes = 1000
+    model = video_classification_model.VideoClassificationModel(
+        backbone=backbone,
+        num_classes=num_classes,
+        input_specs={'image': input_specs},
+        dropout_rate=0.2,
+        aggregate_endpoints=aggregate_endpoints,
+    )
+
+    inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
+    logits = model(inputs)
+    self.assertAllEqual([2, num_classes], logits.numpy().shape)
+
+  def test_serialize_deserialize(self):
+    """Validate the classification network can be serialized and deserialized."""
+    model_id = 50
+    temporal_strides = [1, 1, 1, 1]
+    temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
+                             (1, 3, 1)]
+
+    backbone = backbones.ResNet3D(
+        model_id=model_id,
+        temporal_strides=temporal_strides,
+        temporal_kernel_sizes=temporal_kernel_sizes)
+
+    model = video_classification_model.VideoClassificationModel(
+        backbone=backbone, num_classes=1000)
+
+    config = model.get_config()
+    new_model = video_classification_model.VideoClassificationModel.from_config(
+        config)
+
+    # Validate that the config can be forced to JSON.
+    _ = new_model.to_json()
+
+    # If the serialization was successful, the new config should match the old.
+    self.assertAllEqual(model.get_config(), new_model.get_config())
+
+
+# Run the tests in this file when it is executed directly as a script.
+if __name__ == '__main__':
+  tf.test.main()