Unverified Commit 0225b135 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_blocks."""
from typing import Any, Iterable, Tuple
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling.layers import nn_blocks
def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
"""Returns the combinations of end-to-end tests to run."""
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],)
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(nn_blocks.ResidualBlock, 1, False, 0.0, None),
(nn_blocks.ResidualBlock, 2, True, 0.2, 0.25),
)
def test_residual_block_creation(self, block_fn, strides, use_projection,
stochastic_depth_drop_rate, se_ratio):
input_size = 128
filter_size = 256
inputs = tf.keras.Input(
shape=(input_size, input_size, filter_size), batch_size=1)
block = block_fn(
filter_size,
strides,
use_projection=use_projection,
se_ratio=se_ratio,
stochastic_depth_drop_rate=stochastic_depth_drop_rate,
)
features = block(inputs)
self.assertAllEqual(
[1, input_size // strides, input_size // strides, filter_size],
features.shape.as_list())
@parameterized.parameters(
(nn_blocks.BottleneckBlock, 1, False, 0.0, None),
(nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25),
)
def test_bottleneck_block_creation(self, block_fn, strides, use_projection,
stochastic_depth_drop_rate, se_ratio):
input_size = 128
filter_size = 256
inputs = tf.keras.Input(
shape=(input_size, input_size, filter_size * 4), batch_size=1)
block = block_fn(
filter_size,
strides,
use_projection=use_projection,
se_ratio=se_ratio,
stochastic_depth_drop_rate=stochastic_depth_drop_rate)
features = block(inputs)
self.assertAllEqual(
[1, input_size // strides, input_size // strides, filter_size * 4],
features.shape.as_list())
@parameterized.parameters(
(nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
(nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
(nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
(nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
(nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
)
def test_invertedbottleneck_block_creation(self, block_fn, expand_ratio,
strides, se_ratio,
stochastic_depth_drop_rate):
input_size = 128
in_filters = 24
out_filters = 40
inputs = tf.keras.Input(
shape=(input_size, input_size, in_filters), batch_size=1)
block = block_fn(
in_filters=in_filters,
out_filters=out_filters,
expand_ratio=expand_ratio,
strides=strides,
se_ratio=se_ratio,
stochastic_depth_drop_rate=stochastic_depth_drop_rate)
features = block(inputs)
self.assertAllEqual(
[1, input_size // strides, input_size // strides, out_filters],
features.shape.as_list())
@parameterized.parameters(
(nn_blocks.TuckerConvBlock, 1, 0.25, 0.25),
(nn_blocks.TuckerConvBlock, 2, 0.25, 0.25),
)
def test_tucker_conv_block(
self, block_fn, strides,
input_compression_ratio, output_compression_ratio):
input_size = 128
in_filters = 24
out_filters = 24
inputs = tf.keras.Input(
shape=(input_size, input_size, in_filters), batch_size=1)
block = block_fn(
in_filters=in_filters,
out_filters=out_filters,
input_compression_ratio=input_compression_ratio,
output_compression_ratio=output_compression_ratio,
strides=strides)
features = block(inputs)
self.assertAllEqual(
[1, input_size // strides, input_size // strides, out_filters],
features.shape.as_list())
class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(distribution_strategy_combinations())
def test_shape(self, distribution):
bsz, h, w, c = 8, 32, 32, 32
filters = 64
strides = 2
input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
with distribution.scope():
test_layer = nn_blocks.ResidualInner(filters, strides)
output = test_layer(input_tensor)
expected_output_shape = [bsz, h // strides, w // strides, filters]
self.assertEqual(expected_output_shape, output.shape.as_list())
class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(distribution_strategy_combinations())
def test_shape(self, distribution):
bsz, h, w, c = 8, 32, 32, 32
filters = 64
strides = 2
input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
with distribution.scope():
test_layer = nn_blocks.BottleneckResidualInner(filters, strides)
output = test_layer(input_tensor)
expected_output_shape = [bsz, h // strides, w // strides, filters * 4]
self.assertEqual(expected_output_shape, output.shape.as_list())
class DepthwiseSeparableConvBlockTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(distribution_strategy_combinations())
def test_shape(self, distribution):
batch_size, height, width, num_channels = 8, 32, 32, 32
num_filters = 64
strides = 2
input_tensor = tf.random.normal(
shape=[batch_size, height, width, num_channels])
with distribution.scope():
block = nn_blocks.DepthwiseSeparableConvBlock(
num_filters, strides=strides)
config_dict = block.get_config()
recreate_block = nn_blocks.DepthwiseSeparableConvBlock(**config_dict)
output_tensor = block(input_tensor)
expected_output_shape = [
batch_size, height // strides, width // strides, num_filters
]
self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)
output_tensor = recreate_block(input_tensor)
self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)
class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(distribution_strategy_combinations())
def test_downsampling_non_reversible_step(self, distribution):
bsz, h, w, c = 8, 32, 32, 32
filters = 64
strides = 2
input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
with distribution.scope():
f = nn_blocks.ResidualInner(
filters=filters // 2, strides=strides, batch_norm_first=True)
g = nn_blocks.ResidualInner(
filters=filters // 2, strides=1, batch_norm_first=True)
test_layer = nn_blocks.ReversibleLayer(f, g)
test_layer.build(input_tensor.shape)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
@tf.function
def step_fn():
with tf.GradientTape() as tape:
output = test_layer(input_tensor, training=True)
grads = tape.gradient(output, test_layer.trainable_variables)
# Test that applying gradients with the optimizer works
optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
return output
replica_output = distribution.run(step_fn)
outputs = distribution.experimental_local_results(replica_output)
# Assert forward pass shape
expected_output_shape = [bsz, h // strides, w // strides, filters]
for output in outputs:
self.assertEqual(expected_output_shape, output.shape.as_list())
@combinations.generate(distribution_strategy_combinations())
def test_reversible_step(self, distribution):
# Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
bsz, h, w, c = 8, 32, 32, 32
filters = c
strides = 1
input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
with distribution.scope():
f = nn_blocks.ResidualInner(
filters=filters // 2, strides=strides, batch_norm_first=False)
g = nn_blocks.ResidualInner(
filters=filters // 2, strides=1, batch_norm_first=False)
test_layer = nn_blocks.ReversibleLayer(f, g)
test_layer(input_tensor, training=False) # init weights
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
@tf.function
def step_fn():
with tf.GradientTape() as tape:
output = test_layer(input_tensor, training=True)
grads = tape.gradient(output, test_layer.trainable_variables)
# Test that applying gradients with the optimizer works
optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
return output
@tf.function
def fwd():
test_layer(input_tensor)
distribution.run(fwd) # Initialize variables
prev_variables = tf.identity_n(test_layer.trainable_variables)
replica_output = distribution.run(step_fn)
outputs = distribution.experimental_local_results(replica_output)
# Assert variable values have changed
for v0, v1 in zip(prev_variables, test_layer.trainable_variables):
self.assertNotAllEqual(v0, v1)
# Assert forward pass shape
expected_output_shape = [bsz, h // strides, w // strides, filters]
for output in outputs:
self.assertEqual(expected_output_shape, output.shape.as_list())
@combinations.generate(distribution_strategy_combinations())
def test_manual_gradients_correctness(self, distribution):
bsz, h, w, c = 8, 32, 32, 32
filters = c
strides = 1
input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4]) # bottleneck
with distribution.scope():
f_manual = nn_blocks.BottleneckResidualInner(
filters=filters // 2, strides=strides, batch_norm_first=False)
g_manual = nn_blocks.BottleneckResidualInner(
filters=filters // 2, strides=1, batch_norm_first=False)
manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
manual_grad_layer(input_tensor, training=False) # init weights
f_auto = nn_blocks.BottleneckResidualInner(
filters=filters // 2, strides=strides, batch_norm_first=False)
g_auto = nn_blocks.BottleneckResidualInner(
filters=filters // 2, strides=1, batch_norm_first=False)
auto_grad_layer = nn_blocks.ReversibleLayer(
f_auto, g_auto, manual_grads=False)
auto_grad_layer(input_tensor) # init weights
# Clone all weights (tf.keras.layers.Layer has no .clone())
auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())
@tf.function
def manual_fn():
with tf.GradientTape() as tape:
output = manual_grad_layer(input_tensor, training=True)
grads = tape.gradient(output, manual_grad_layer.trainable_variables)
return grads
@tf.function
def auto_fn():
with tf.GradientTape() as tape:
output = auto_grad_layer(input_tensor, training=True)
grads = tape.gradient(output, auto_grad_layer.trainable_variables)
return grads
manual_grads = distribution.run(manual_fn)
auto_grads = distribution.run(auto_fn)
# Assert gradients calculated manually are close to those from autograd
for manual_grad, auto_grad in zip(manual_grads, auto_grads):
self.assertAllClose(
distribution.experimental_local_results(manual_grad),
distribution.experimental_local_results(auto_grad),
atol=5e-3,
rtol=5e-3)
# Verify that the BN moving mean and variance are correct.
for manual_var, auto_var in zip(manual_grad_layer.non_trainable_variables,
auto_grad_layer.non_trainable_variables):
self.assertAllClose(manual_var, auto_var)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa
from official.modeling import tf_utils
from official.vision.ops import spatial_transform_ops
# Type annotations.
States = Dict[str, tf.Tensor]
Activation = Union[str, Callable]
def make_divisible(value: float,
divisor: int,
min_value: Optional[float] = None,
round_down_protect: bool = True,
) -> int:
"""This is to ensure that all layers have channels that are divisible by 8.
Args:
value: A `float` of original value.
divisor: An `int` of the divisor that need to be checked upon.
min_value: A `float` of minimum value threshold.
round_down_protect: A `bool` indicating whether round down more than 10%
will be allowed.
Returns:
The adjusted value in `int` that is divisible against divisor.
"""
if min_value is None:
min_value = divisor
new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if round_down_protect and new_value < 0.9 * value:
new_value += divisor
return int(new_value)
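# Illustrative values: the helper rounds to the nearest multiple of `divisor`
# and, when `round_down_protect` is set, refuses to shrink the result by more
# than 10% of the original value:
#   make_divisible(30, 8)                             # -> 32
#   make_divisible(20, 16)                            # -> 32 (16 would shrink >10%)
#   make_divisible(20, 16, round_down_protect=False)  # -> 16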
def round_filters(filters: int,
multiplier: float,
divisor: int = 8,
min_depth: Optional[int] = None,
round_down_protect: bool = True,
skip: bool = False) -> int:
"""Rounds number of filters based on width multiplier."""
orig_f = filters
if skip or not multiplier:
return filters
new_filters = make_divisible(value=filters * multiplier,
divisor=divisor,
min_value=min_depth,
round_down_protect=round_down_protect)
logging.info('round_filter input=%s output=%s', orig_f, new_filters)
return int(new_filters)
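# For example, scaling 32 filters by a width multiplier of 1.5 yields
# make_divisible(48.0, 8) == 48, while a falsy multiplier or skip=True leaves
# the filter count unchanged:
#   round_filters(32, 1.5)  # -> 48
#   round_filters(32, 0)    # -> 32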
def get_padding_for_kernel_size(kernel_size):
"""Compute padding size given kernel size."""
if kernel_size == 7:
return (3, 3)
elif kernel_size == 3:
return (1, 1)
else:
raise ValueError('Padding for kernel size {} not known.'.format(
kernel_size))
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
"""Creates a squeeze and excitation layer."""
def __init__(self,
in_filters,
out_filters,
se_ratio,
divisible_by=1,
use_3d_input=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
gating_activation='sigmoid',
round_down_protect=True,
**kwargs):
"""Initializes a squeeze and excitation layer.
Args:
in_filters: An `int` number of filters of the input tensor.
out_filters: An `int` number of filters of the output tensor.
se_ratio: A `float` or None. If not None, se ratio for the squeeze and
excitation layer.
divisible_by: An `int` that ensures all inner dimensions are divisible by
this number.
use_3d_input: A `bool` of whether input is 2D or 3D image.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
Default to None.
activation: A `str` name of the activation function.
gating_activation: A `str` name of the activation function for final
gating function.
round_down_protect: A `bool` indicating whether to prevent rounding down by
more than 10%.
**kwargs: Additional keyword arguments to be passed.
"""
super(SqueezeExcitation, self).__init__(**kwargs)
self._in_filters = in_filters
self._out_filters = out_filters
self._se_ratio = se_ratio
self._divisible_by = divisible_by
self._round_down_protect = round_down_protect
self._use_3d_input = use_3d_input
self._activation = activation
self._gating_activation = gating_activation
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if tf.keras.backend.image_data_format() == 'channels_last':
if not use_3d_input:
self._spatial_axis = [1, 2]
else:
self._spatial_axis = [1, 2, 3]
else:
if not use_3d_input:
self._spatial_axis = [2, 3]
else:
self._spatial_axis = [2, 3, 4]
self._activation_fn = tf_utils.get_activation(activation)
self._gating_activation_fn = tf_utils.get_activation(gating_activation)
def build(self, input_shape):
num_reduced_filters = make_divisible(
max(1, int(self._in_filters * self._se_ratio)),
divisor=self._divisible_by,
round_down_protect=self._round_down_protect)
self._se_reduce = tf.keras.layers.Conv2D(
filters=num_reduced_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=True,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._se_expand = tf.keras.layers.Conv2D(
filters=self._out_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=True,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
super(SqueezeExcitation, self).build(input_shape)
def get_config(self):
config = {
'in_filters': self._in_filters,
'out_filters': self._out_filters,
'se_ratio': self._se_ratio,
'divisible_by': self._divisible_by,
'use_3d_input': self._use_3d_input,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'gating_activation': self._gating_activation,
'round_down_protect': self._round_down_protect,
}
base_config = super(SqueezeExcitation, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
x = self._activation_fn(self._se_reduce(x))
x = self._gating_activation_fn(self._se_expand(x))
return x * inputs
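# Usage sketch (illustrative shapes, channels_last): the layer averages over
# the spatial dimensions, squeezes to `in_filters * se_ratio` channels,
# expands back, and gates the input channel-wise:
#   se = SqueezeExcitation(in_filters=32, out_filters=32, se_ratio=0.25)
#   x = tf.ones([1, 8, 8, 32])
#   y = se(x)  # shape [1, 8, 8, 32]; inner bottleneck has 8 channels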
def get_stochastic_depth_rate(init_rate, i, n):
"""Get drop connect rate for the ith block.
Args:
init_rate: A `float` of initial drop rate.
i: An `int` of order of the current block.
n: An `int` total number of blocks.
Returns:
Drop rate of the ith block.
"""
if init_rate is not None:
if init_rate < 0 or init_rate > 1:
raise ValueError('Initial drop rate must be between 0 and 1.')
rate = init_rate * float(i) / n
else:
rate = None
return rate
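# For example, with init_rate=0.2 the drop rate grows linearly with depth:
#   get_stochastic_depth_rate(0.2, 5, 10)   # -> 0.1
#   get_stochastic_depth_rate(0.2, 10, 10)  # -> 0.2
#   get_stochastic_depth_rate(None, 5, 10)  # -> None (stochastic depth off)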
@tf.keras.utils.register_keras_serializable(package='Vision')
class StochasticDepth(tf.keras.layers.Layer):
"""Creates a stochastic depth layer."""
def __init__(self, stochastic_depth_drop_rate, **kwargs):
"""Initializes a stochastic depth layer.
Args:
stochastic_depth_drop_rate: A `float` of drop rate.
**kwargs: Additional keyword arguments to be passed.
Returns:
An output `tf.Tensor` which should have the same shape as the input.
"""
super(StochasticDepth, self).__init__(**kwargs)
self._drop_rate = stochastic_depth_drop_rate
def get_config(self):
config = {'stochastic_depth_drop_rate': self._drop_rate}
base_config = super(StochasticDepth, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
if training is None:
training = tf.keras.backend.learning_phase()
if not training or self._drop_rate is None or self._drop_rate == 0:
return inputs
keep_prob = 1.0 - self._drop_rate
batch_size = tf.shape(inputs)[0]
random_tensor = keep_prob
random_tensor += tf.random.uniform(
[batch_size] + [1] * (inputs.shape.rank - 1), dtype=inputs.dtype)
binary_tensor = tf.floor(random_tensor)
output = tf.math.divide(inputs, keep_prob) * binary_tensor
return output
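# Behavior sketch: at inference (or with a zero/None drop rate) the layer is
# the identity. During training, each example's residual branch is zeroed
# with probability `stochastic_depth_drop_rate`, and survivors are scaled by
# 1 / keep_prob so the expected output matches the input:
#   sd = StochasticDepth(stochastic_depth_drop_rate=0.2)
#   y = sd(x, training=False)  # y == x
#   y = sd(x, training=True)   # per example: x / 0.8, or 0 if dropped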
@tf.keras.utils.register_keras_serializable(package='Vision')
def pyramid_feature_fusion(inputs, target_level):
"""Fuses all feature maps in the feature pyramid at the target level.
Args:
inputs: A dictionary containing the feature pyramid. The size of the input
tensor needs to be fixed.
target_level: An `int` of the target feature level for feature fusion.
Returns:
A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
feature_channel].
"""
# Convert keys to int.
pyramid_feats = {int(k): v for k, v in inputs.items()}
min_level = min(pyramid_feats.keys())
max_level = max(pyramid_feats.keys())
resampled_feats = []
for l in range(min_level, max_level + 1):
if l == target_level:
resampled_feats.append(pyramid_feats[l])
else:
feat = pyramid_feats[l]
target_size = list(feat.shape[1:3])
target_size[0] *= 2**(l - target_level)
target_size[1] *= 2**(l - target_level)
# Casts feat to float32 so the resize op can be run on TPU.
feat = tf.cast(feat, tf.float32)
feat = tf.image.resize(
feat, size=target_size, method=tf.image.ResizeMethod.BILINEAR)
# Casts it back to be compatible with the rest of the operations.
feat = tf.cast(feat, pyramid_feats[l].dtype)
resampled_feats.append(feat)
return tf.math.add_n(resampled_feats)
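# Example with a small, hypothetical pyramid: fusing levels 3-5 at
# target_level=3 bilinearly resizes levels 4 and 5 up to the level-3
# resolution before summing:
#   feats = {'3': tf.ones([1, 32, 32, 8]),
#            '4': tf.ones([1, 16, 16, 8]),
#            '5': tf.ones([1, 8, 8, 8])}
#   fused = pyramid_feature_fusion(feats, target_level=3)  # [1, 32, 32, 8]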
class PanopticFPNFusion(tf.keras.Model):
"""Creates a Panoptic FPN feature Fusion layer.
This implements feature fusion for semantic segmentation head from the paper:
Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar.
Panoptic Feature Pyramid Networks.
(https://arxiv.org/pdf/1901.02446.pdf)
"""
def __init__(
self,
min_level: int = 2,
max_level: int = 5,
target_level: int = 2,
num_filters: int = 128,
num_fpn_filters: int = 256,
activation: str = 'relu',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs):
"""Initializes panoptic FPN feature fusion layer.
Args:
min_level: An `int` of minimum level to use in feature fusion.
max_level: An `int` of maximum level to use in feature fusion.
target_level: An `int` of the target feature level for feature fusion.
num_filters: An `int` number of filters in conv2d layers.
num_fpn_filters: An `int` number of filters in the FPN outputs.
activation: A `str` name of the activation function.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
**kwargs: Additional keyword arguments to be passed.
Returns:
A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
feature_channel].
"""
if target_level > max_level:
raise ValueError('target_level should be less than or equal to max_level')
self._config_dict = {
'min_level': min_level,
'max_level': max_level,
'target_level': target_level,
'num_filters': num_filters,
'num_fpn_filters': num_fpn_filters,
'activation': activation,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
}
norm = tfa.layers.GroupNormalization
conv2d = tf.keras.layers.Conv2D
activation_fn = tf_utils.get_activation(activation)
if tf.keras.backend.image_data_format() == 'channels_last':
norm_axis = -1
else:
norm_axis = 1
inputs = self._build_inputs(num_fpn_filters, min_level, max_level)
upscaled_features = []
for level in range(min_level, max_level + 1):
num_conv_layers = max(1, level - target_level)
x = inputs[str(level)]
for i in range(num_conv_layers):
x = conv2d(
filters=num_filters,
kernel_size=3,
padding='same',
kernel_initializer=tf.keras.initializers.VarianceScaling(),
kernel_regularizer=kernel_regularizer,
bias_regularizer=bias_regularizer)(x)
x = norm(groups=32, axis=norm_axis)(x)
x = activation_fn(x)
if level != target_level:
x = spatial_transform_ops.nearest_upsampling(x, scale=2)
upscaled_features.append(x)
fused_features = tf.math.add_n(upscaled_features)
self._output_specs = {str(target_level): fused_features.get_shape()}
super(PanopticFPNFusion, self).__init__(
inputs=inputs, outputs=fused_features, **kwargs)
def _build_inputs(self, num_filters: int,
min_level: int, max_level: int):
inputs = {}
for level in range(min_level, max_level + 1):
inputs[str(level)] = tf.keras.Input(shape=[None, None, num_filters])
return inputs
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@property
def output_specs(self) -> Mapping[str, tf.TensorShape]:
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
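# Usage sketch (hypothetical FPN features keyed by level as strings): each
# level is convolved and 2x nearest-upsampled per conv until it reaches the
# target resolution, then all levels are summed:
#   fusion = PanopticFPNFusion(min_level=2, max_level=5, target_level=2,
#                              num_filters=128, num_fpn_filters=256)
#   feats = {str(l): tf.ones([1, 256 // 2**l, 256 // 2**l, 256])
#            for l in range(2, 6)}
#   out = fusion(feats)  # shape [1, 64, 64, 128]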
@tf.keras.utils.register_keras_serializable(package='Vision')
class Scale(tf.keras.layers.Layer):
"""Scales the input by a trainable scalar weight.
This is useful for applying ReZero to layers, which improves convergence
speed. This implements the paper:
ReZero is All You Need: Fast Convergence at Large Depth.
(https://arxiv.org/pdf/2003.04887.pdf).
"""
def __init__(
self,
initializer: tf.keras.initializers.Initializer = 'ones',
regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs):
"""Initializes a scale layer.
Args:
initializer: A `str` of initializer for the scalar weight.
regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight.
**kwargs: Additional keyword arguments to be passed to this layer.
Returns:
A `tf.Tensor` which should have the same shape as the input.
"""
super(Scale, self).__init__(**kwargs)
self._initializer = initializer
self._regularizer = regularizer
self._scale = self.add_weight(
name='scale',
shape=[],
dtype=self.dtype,
initializer=self._initializer,
regularizer=self._regularizer,
trainable=True)
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'initializer': self._initializer,
'regularizer': self._regularizer,
}
base_config = super(Scale, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Calls the layer with the given inputs."""
scale = tf.cast(self._scale, inputs.dtype)
return scale * inputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class TemporalSoftmaxPool(tf.keras.layers.Layer):
"""Creates a network layer corresponding to temporal softmax pooling.
This is useful for multi-class logits (used in e.g., Charades). Modified from
AssembleNet Charades evaluation from:
Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova.
AssembleNet: Searching for Multi-Stream Neural Connectivity in Video
Architectures.
(https://arxiv.org/pdf/1905.13209.pdf).
"""
def call(self, inputs):
"""Calls the layer with the given inputs."""
assert inputs.shape.rank in (3, 4, 5)
frames = tf.shape(inputs)[1]
pre_logits = inputs / tf.sqrt(tf.cast(frames, inputs.dtype))
activations = tf.nn.softmax(pre_logits, axis=1)
outputs = inputs * activations
return outputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class PositionalEncoding(tf.keras.layers.Layer):
"""Creates a network layer that adds a sinusoidal positional encoding.
Positional encoding is incremented across frames, and is added to the input.
The positional encoding is first weighted at 0 so that the network can choose
to ignore it. This implements:
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
Attention Is All You Need.
(https://arxiv.org/pdf/1706.03762.pdf).
"""
def __init__(self,
initializer: tf.keras.initializers.Initializer = 'zeros',
cache_encoding: bool = False,
state_prefix: Optional[str] = None,
**kwargs):
"""Initializes positional encoding.
Args:
initializer: A `str` of initializer for weighting the positional encoding.
cache_encoding: A `bool`. If True, cache the positional encoding tensor
after calling build. Otherwise, rebuild the tensor for every call.
Setting this to False can be useful when we want to input a variable
number of frames, so the positional encoding tensor can change shape.
state_prefix: a prefix string to identify states.
**kwargs: Additional keyword arguments to be passed to this layer.
Returns:
A `tf.Tensor` which should have the same shape as the input.
"""
super(PositionalEncoding, self).__init__(**kwargs)
self._initializer = initializer
self._cache_encoding = cache_encoding
self._pos_encoding = None
self._rezero = Scale(initializer=initializer, name='rezero')
state_prefix = state_prefix if state_prefix is not None else ''
self._state_prefix = state_prefix
self._frame_count_name = f'{state_prefix}_pos_enc_frame_count'
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'initializer': self._initializer,
'cache_encoding': self._cache_encoding,
'state_prefix': self._state_prefix,
}
base_config = super(PositionalEncoding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def _positional_encoding(self,
num_positions: Union[int, tf.Tensor],
hidden_size: Union[int, tf.Tensor],
start_position: Union[int, tf.Tensor] = 0,
dtype: str = 'float32') -> tf.Tensor:
"""Creates a sequence of sinusoidal positional encoding vectors.
Args:
num_positions: the total number of positions (frames).
hidden_size: the number of channels used for the hidden vectors.
start_position: the start position.
dtype: the dtype of the output tensor.
Returns:
The positional encoding tensor with shape [num_positions, hidden_size].
"""
if isinstance(start_position, tf.Tensor) and start_position.shape.rank == 1:
start_position = start_position[0]
# Calling `tf.range` with `dtype=tf.bfloat16` results in an error,
# so we cast afterward.
positions = tf.range(start_position, start_position + num_positions)
positions = tf.cast(positions, dtype)[:, tf.newaxis]
idx = tf.range(hidden_size)[tf.newaxis, :]
power = tf.cast(2 * (idx // 2), dtype)
power /= tf.cast(hidden_size, dtype)
angles = 1. / tf.math.pow(10_000., power)
radians = positions * angles
sin = tf.math.sin(radians[:, 0::2])
cos = tf.math.cos(radians[:, 1::2])
pos_encoding = tf.concat([sin, cos], axis=-1)
return pos_encoding
def _get_pos_encoding(self,
input_shape: tf.Tensor,
frame_count: int = 0) -> tf.Tensor:
"""Calculates the positional encoding from the input shape.
Args:
input_shape: the shape of the input.
frame_count: a count of frames that indicates the index of the first
frame.
Returns:
The positional encoding tensor with shape [num_positions, hidden_size].
"""
frames = input_shape[1]
channels = input_shape[-1]
pos_encoding = self._positional_encoding(
frames, channels, start_position=frame_count, dtype=self.dtype)
pos_encoding = tf.reshape(pos_encoding, [1, frames, 1, 1, channels])
return pos_encoding
def build(self, input_shape):
"""Builds the layer with the given input shape.
Args:
input_shape: The input shape.
Raises:
ValueError: If using 'channels_first' data format.
"""
if tf.keras.backend.image_data_format() == 'channels_first':
raise ValueError('"channels_first" mode is unsupported.')
if self._cache_encoding:
self._pos_encoding = self._get_pos_encoding(input_shape)
super(PositionalEncoding, self).build(input_shape)
def call(
self,
inputs: tf.Tensor,
states: Optional[States] = None,
output_states: bool = True,
) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
"""Calls the layer with the given inputs.
Args:
inputs: An input `tf.Tensor`.
states: A `dict` of states such that, if any of the keys match for this
layer, will overwrite the contents of the buffer(s). Expected keys
include `state_prefix + '_pos_enc_frame_count'`.
output_states: A `bool`. If True, returns the output tensor and output
states. Returns just the output tensor otherwise.
Returns:
An output `tf.Tensor` (and optionally the states if `output_states=True`).
Raises:
ValueError: If using 'channels_first' data format.
"""
states = dict(states) if states is not None else {}
# Keep a running count of frames encountered across input iterations in
# the states (`frame_count`) to accurately update the positional encoding.
num_frames = tf.shape(inputs)[1]
frame_count = tf.cast(states.get(self._frame_count_name, [0]), tf.int32)
states[self._frame_count_name] = frame_count + num_frames
if self._cache_encoding:
pos_encoding = self._pos_encoding
else:
pos_encoding = self._get_pos_encoding(
tf.shape(inputs), frame_count=frame_count)
pos_encoding = tf.cast(pos_encoding, inputs.dtype)
pos_encoding = self._rezero(pos_encoding)
outputs = inputs + pos_encoding
return (outputs, states) if output_states else outputs
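# Usage sketch: with the default 'zeros' initializer, the ReZero scale starts
# at 0, so the layer is initially the identity; the returned `states` carry
# the running frame count so streamed chunks continue the encoding:
#   pe = PositionalEncoding()
#   x = tf.ones([1, 4, 1, 1, 8])
#   y, states = pe(x)                  # y == x at initialization
#   y2, states = pe(x, states=states)  # next chunk starts at frame 4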
@tf.keras.utils.register_keras_serializable(package='Vision')
class GlobalAveragePool3D(tf.keras.layers.Layer):
"""Creates a global average pooling layer with causal mode.
Implements causal mode, which runs a cumulative sum (with `tf.cumsum`) across
frames in the time dimension, allowing the use of a stream buffer. Sums any
valid input state with the current input to allow state to accumulate over
several iterations.
"""
def __init__(self,
keepdims: bool = False,
causal: bool = False,
state_prefix: Optional[str] = None,
**kwargs):
"""Initializes a global average pool layer.
Args:
keepdims: A `bool`. If True, keep the averaged dimensions.
causal: A `bool` of whether to run in causal mode with a cumulative sum
across frames.
state_prefix: a prefix string to identify states.
**kwargs: Additional keyword arguments to be passed to this layer.
Returns:
An output `tf.Tensor`.
"""
super(GlobalAveragePool3D, self).__init__(**kwargs)
self._keepdims = keepdims
self._causal = causal
state_prefix = state_prefix if state_prefix is not None else ''
self._state_prefix = state_prefix
self._state_name = f'{state_prefix}_pool_buffer'
self._frame_count_name = f'{state_prefix}_pool_frame_count'
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'keepdims': self._keepdims,
'causal': self._causal,
'state_prefix': self._state_prefix,
}
base_config = super(GlobalAveragePool3D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self,
inputs: tf.Tensor,
states: Optional[States] = None,
output_states: bool = True
) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
"""Calls the layer with the given inputs.
Args:
inputs: An input `tf.Tensor`.
states: A `dict` of states such that, if any of the keys match for this
layer, will overwrite the contents of the buffer(s).
Expected keys include `state_prefix + '_pool_buffer'` and
`state_prefix + '_pool_frame_count'`.
output_states: A `bool`. If True, returns the output tensor and output
states. Returns just the output tensor otherwise.
Returns:
An output `tf.Tensor` (and optionally the states if `output_states=True`).
If `causal=True`, the output tensor will have shape
`[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. We keep
the frame dimension in this case to simulate a cumulative global average
as if we are inputting one frame at a time. If `causal=False`, the output
is equivalent to `tf.keras.layers.GlobalAveragePooling3D` with shape
`[batch_size, 1, 1, 1, channels]` if `keepdims=True` (plus the optional
buffer stored in `states`).
Raises:
ValueError: If using 'channels_first' data format.
"""
states = dict(states) if states is not None else {}
if tf.keras.backend.image_data_format() == 'channels_first':
raise ValueError('"channels_first" mode is unsupported.')
# Shape: [batch_size, 1, 1, 1, channels]
buffer = states.get(self._state_name, None)
if buffer is None:
buffer = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype)
states[self._state_name] = buffer
# Keep a running count of frames encountered across input iterations in
# the states (`frame_count`) to accurately take a cumulative average
# across all frames when running in streaming mode.
num_frames = tf.shape(inputs)[1]
frame_count = states.get(self._frame_count_name, tf.constant([0]))
frame_count = tf.cast(frame_count, tf.int32)
states[self._frame_count_name] = frame_count + num_frames
if self._causal:
# Take a mean of spatial dimensions to make computation more efficient.
x = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True)
x = tf.cumsum(x, axis=1)
x = x + buffer
# The last frame will be the value of the next state
# Shape: [batch_size, 1, 1, 1, channels]
states[self._state_name] = x[:, -1:]
# In causal mode, the divisor increments by 1 for every frame to
# calculate cumulative averages instead of one global average
mean_divisors = tf.range(num_frames) + frame_count + 1
mean_divisors = tf.reshape(mean_divisors, [1, num_frames, 1, 1, 1])
mean_divisors = tf.cast(mean_divisors, x.dtype)
# Shape: [batch_size, num_frames, 1, 1, channels]
x = x / mean_divisors
else:
# In non-causal mode, we (optionally) sum across frames to take a
# cumulative average across input iterations rather than individual
# frames. If no buffer state is passed, this essentially becomes
# regular global average pooling.
# Shape: [batch_size, 1, 1, 1, channels]
x = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True)
x = x / tf.cast(tf.shape(inputs)[2] * tf.shape(inputs)[3], x.dtype)
x = x + buffer
# Shape: [batch_size, 1, 1, 1, channels]
states[self._state_name] = x
x = x / tf.cast(frame_count + num_frames, x.dtype)
if not self._keepdims:
x = tf.squeeze(x, axis=(1, 2, 3))
return (x, states) if output_states else x
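# Streaming sketch (mirrors the unit tests further below): in causal mode,
# feeding frames chunk by chunk while threading `states` reproduces the
# cumulative per-frame average of a single full-sequence call:
#   gap = GlobalAveragePool3D(keepdims=True, causal=True)
#   x = tf.reshape(tf.range(4, dtype=tf.float32) + 1., [1, 4, 1, 1, 1])
#   y, states = gap(x)  # frame-wise running means: [1., 1.5, 2., 2.5]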
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialAveragePool3D(tf.keras.layers.Layer):
"""Creates a global average pooling layer pooling across spatial dimentions."""
def __init__(self, keepdims: bool = False, **kwargs):
"""Initializes a global average pool layer.
Args:
keepdims: A `bool`. If True, keep the averaged dimensions.
**kwargs: Additional keyword arguments to be passed to this layer.
Returns:
An output `tf.Tensor`.
"""
super(SpatialAveragePool3D, self).__init__(**kwargs)
self._keepdims = keepdims
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'keepdims': self._keepdims,
}
base_config = super(SpatialAveragePool3D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
"""Builds the layer with the given input shape."""
if tf.keras.backend.image_data_format() == 'channels_first':
raise ValueError('"channels_first" mode is unsupported.')
super(SpatialAveragePool3D, self).build(input_shape)
def call(self, inputs):
"""Calls the layer with the given inputs."""
if inputs.shape.rank != 5:
raise ValueError(
'Input should have rank {}, got {}'.format(5, inputs.shape.rank))
return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
class CausalConvMixin:
"""Mixin class to implement CausalConv for `tf.keras.layers.Conv` layers."""
@property
def use_buffered_input(self) -> bool:
return self._use_buffered_input
@use_buffered_input.setter
def use_buffered_input(self, variable: bool):
self._use_buffered_input = variable
def _compute_buffered_causal_padding(self,
inputs: tf.Tensor,
use_buffered_input: bool = False,
time_axis: int = 1,
) -> List[List[int]]:
"""Calculates padding for 'causal' option for conv layers.
Args:
inputs: An input `tf.Tensor` to be padded.
use_buffered_input: A `bool`. If True, use 'valid' padding along the time
dimension. This should be set when applying the stream buffer.
time_axis: An `int` of the axis of the time dimension.
Returns:
A list of paddings for `tf.pad`.
"""
input_shape = tf.shape(inputs)[1:-1]
if tf.keras.backend.image_data_format() == 'channels_first':
raise ValueError('"channels_first" mode is unsupported.')
kernel_size_effective = [
(self.kernel_size[i] +
(self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1))
for i in range(self.rank)
]
pad_total = [kernel_size_effective[0] - 1]
for i in range(1, self.rank):
overlap = (input_shape[i] - 1) % self.strides[i] + 1
pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
pad_beg = [pad_total[i] // 2 for i in range(self.rank)]
pad_end = [pad_total[i] - pad_beg[i] for i in range(self.rank)]
padding = [[pad_beg[i], pad_end[i]] for i in range(self.rank)]
padding = [[0, 0]] + padding + [[0, 0]]
if use_buffered_input:
padding[time_axis] = [0, 0]
else:
padding[time_axis] = [padding[time_axis][0] + padding[time_axis][1], 0]
return padding
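# For example, a rank-2 causal conv (time first) with kernel_size=(3, 3),
# strides=(1, 1) and dilation 1 on an input [N, T, W, C] yields
# [[0, 0], [2, 0], [1, 1], [0, 0]]: all temporal padding on the left.
# With use_buffered_input=True, the time entry becomes [0, 0] because the
# caller is expected to have pre-padded the input.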
def _causal_validate_init(self):
"""Validates the Conv layer initial configuration."""
# Overriding this method is meant to circumvent unnecessary errors when
# using causal padding.
if (self.filters is not None
and self.filters % self.groups != 0):
raise ValueError(
'The number of filters must be evenly divisible by the number of '
'groups. Received: groups={}, filters={}'.format(
self.groups, self.filters))
if not all(self.kernel_size):
raise ValueError('The argument `kernel_size` cannot contain 0(s). '
'Received: %s' % (self.kernel_size,))
def _buffered_spatial_output_shape(self, spatial_output_shape: List[int]):
"""Computes the spatial output shape from the input shape."""
# When buffer padding, use 'valid' padding across time. The output shape
# across time should be the input shape minus any padding, assuming
# the stride across time is 1.
if self._use_buffered_input and spatial_output_shape[0] is not None:
padding = self._compute_buffered_causal_padding(
tf.zeros([1] + spatial_output_shape + [1]), use_buffered_input=False)
spatial_output_shape[0] -= sum(padding[1])
return spatial_output_shape
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin):
"""Conv2D layer supporting CausalConv.
Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
which applies causal padding to the temporal dimension, and same padding in
the spatial dimensions.
"""
def __init__(self, *args, use_buffered_input=False, **kwargs):
"""Initializes conv2d.
Args:
*args: Arguments to be passed.
use_buffered_input: A `bool`. If True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs: Additional keyword arguments to be passed.
Returns:
An output `tf.Tensor` of the Conv2D operation.
"""
super(Conv2D, self).__init__(*args, **kwargs)
self._use_buffered_input = use_buffered_input
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'use_buffered_input': self._use_buffered_input,
}
base_config = super(Conv2D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def _compute_causal_padding(self, inputs):
"""Computes causal padding dimensions for the given inputs."""
return self._compute_buffered_causal_padding(
inputs, use_buffered_input=self._use_buffered_input)
def _validate_init(self):
"""Validates the Conv layer initial configuration."""
self._causal_validate_init()
def _spatial_output_shape(self, spatial_input_shape: List[int]):
"""Computes the spatial output shape from the input shape."""
shape = super(Conv2D, self)._spatial_output_shape(spatial_input_shape)
return self._buffered_spatial_output_shape(shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin):
"""DepthwiseConv2D layer supporting CausalConv.
Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
which applies causal padding to the temporal dimension, and same padding in
the spatial dimensions.
"""
def __init__(self, *args, use_buffered_input=False, **kwargs):
"""Initializes depthwise conv2d.
Args:
*args: Arguments to be passed.
use_buffered_input: A `bool`. If True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs: Additional keyword arguments to be passed.
Returns:
An output `tf.Tensor` of the DepthwiseConv2D operation.
"""
super(DepthwiseConv2D, self).__init__(*args, **kwargs)
self._use_buffered_input = use_buffered_input
# Causal padding is unsupported by default for DepthwiseConv2D,
# so we resort to valid padding internally. However, we handle
# causal padding as a special case with `self._is_causal`, which is
# defined by the super class.
if self.padding == 'causal':
self.padding = 'valid'
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'use_buffered_input': self._use_buffered_input,
}
base_config = super(DepthwiseConv2D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Calls the layer with the given inputs."""
if self._is_causal:
inputs = tf.pad(inputs, self._compute_causal_padding(inputs))
return super(DepthwiseConv2D, self).call(inputs)
def _compute_causal_padding(self, inputs):
"""Computes causal padding dimensions for the given inputs."""
return self._compute_buffered_causal_padding(
inputs, use_buffered_input=self._use_buffered_input)
def _validate_init(self):
"""Validates the Conv layer initial configuration."""
self._causal_validate_init()
def _spatial_output_shape(self, spatial_input_shape: List[int]):
"""Computes the spatial output shape from the input shape."""
shape = super(DepthwiseConv2D, self)._spatial_output_shape(
spatial_input_shape)
return self._buffered_spatial_output_shape(shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin):
"""Conv3D layer supporting CausalConv.
Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
which applies causal padding to the temporal dimension, and same padding in
the spatial dimensions.
"""
def __init__(self, *args, use_buffered_input=False, **kwargs):
"""Initializes conv3d.
Args:
*args: Arguments to be passed.
use_buffered_input: A `bool`. If True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs: Additional keyword arguments to be passed.
Returns:
An output `tf.Tensor` of the Conv3D operation.
"""
super(Conv3D, self).__init__(*args, **kwargs)
self._use_buffered_input = use_buffered_input
def get_config(self):
"""Returns a dictionary containing the config used for initialization."""
config = {
'use_buffered_input': self._use_buffered_input,
}
base_config = super(Conv3D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Call the layer with the given inputs."""
# Note: tf.nn.conv3d with depthwise kernels on CPU is currently only
# supported when compiling with TF graph (XLA) using tf.function, so it
# is compiled by default here (b/186463870).
conv_fn = tf.function(super(Conv3D, self).call, jit_compile=True)
return conv_fn(inputs)
def _compute_causal_padding(self, inputs):
"""Computes causal padding dimensions for the given inputs."""
return self._compute_buffered_causal_padding(
inputs, use_buffered_input=self._use_buffered_input)
def _validate_init(self):
"""Validates the Conv layer initial configuration."""
self._causal_validate_init()
def _spatial_output_shape(self, spatial_input_shape: List[int]):
"""Computes the spatial output shape from the input shape."""
shape = super(Conv3D, self)._spatial_output_shape(spatial_input_shape)
return self._buffered_spatial_output_shape(shape)
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialPyramidPooling(tf.keras.layers.Layer):
"""Implements the Atrous Spatial Pyramid Pooling.
References:
[Rethinking Atrous Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1706.05587.pdf)
[Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
"""
def __init__(
self,
output_channels: int,
dilation_rates: List[int],
pool_kernel_size: Optional[List[int]] = None,
use_sync_bn: bool = False,
batchnorm_momentum: float = 0.99,
batchnorm_epsilon: float = 0.001,
activation: str = 'relu',
dropout: float = 0.5,
kernel_initializer: str = 'GlorotUniform',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
interpolation: str = 'bilinear',
use_depthwise_convolution: bool = False,
**kwargs):
"""Initializes `SpatialPyramidPooling`.
Args:
output_channels: Number of channels produced by SpatialPyramidPooling.
dilation_rates: A list of integers for parallel dilated conv.
pool_kernel_size: A list of integers or None. If None, global average
pooling is applied, otherwise an average pooling of pool_kernel_size is
applied.
use_sync_bn: A bool, whether or not to use sync batch normalization.
batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
0.99.
batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
0.001.
activation: A `str` for type of activation to be used. Defaults to 'relu'.
dropout: A float for the dropout rate before output. Defaults to 0.5.
kernel_initializer: Kernel initializer for conv layers. Defaults to
`glorot_uniform`.
kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
interpolation: The interpolation method for upsampling. Defaults to
`bilinear`.
use_depthwise_convolution: If True, uses separable depthwise convolutions
for the spatial pooling branches. [Encoder-Decoder with Atrous Separable
Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1802.02611.pdf)
**kwargs: Other keyword arguments for the layer.
"""
super().__init__(**kwargs)
self._output_channels = output_channels
self._dilation_rates = dilation_rates
self._use_sync_bn = use_sync_bn
self._batchnorm_momentum = batchnorm_momentum
self._batchnorm_epsilon = batchnorm_epsilon
self._activation = activation
self._dropout = dropout
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._interpolation = interpolation
self._pool_kernel_size = pool_kernel_size
self._use_depthwise_convolution = use_depthwise_convolution
self._activation_fn = tf_utils.get_activation(activation)
if self._use_sync_bn:
self._bn_op = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._bn_op = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
def build(self, input_shape):
height = input_shape[1]
width = input_shape[2]
channels = input_shape[3]
self.aspp_layers = []
conv1 = tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=(1, 1),
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
use_bias=False)
norm1 = self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
self.aspp_layers.append([conv1, norm1])
for dilation_rate in self._dilation_rates:
leading_layers = []
kernel_size = (3, 3)
if self._use_depthwise_convolution:
leading_layers += [
tf.keras.layers.DepthwiseConv2D(
depth_multiplier=1,
kernel_size=kernel_size,
padding='same',
depthwise_regularizer=self._kernel_regularizer,
depthwise_initializer=self._kernel_initializer,
dilation_rate=dilation_rate,
use_bias=False)
]
kernel_size = (1, 1)
conv_dilation = leading_layers + [
tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=kernel_size,
padding='same',
kernel_regularizer=self._kernel_regularizer,
kernel_initializer=self._kernel_initializer,
dilation_rate=dilation_rate,
use_bias=False)
]
norm_dilation = self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
self.aspp_layers.append(conv_dilation + [norm_dilation])
if self._pool_kernel_size is None:
pooling = [
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Reshape((1, 1, channels))
]
else:
pooling = [tf.keras.layers.AveragePooling2D(self._pool_kernel_size)]
conv2 = tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=(1, 1),
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
use_bias=False)
norm2 = self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
self.aspp_layers.append(pooling + [conv2, norm2])
self._resizing_layer = tf.keras.layers.Resizing(
height, width, interpolation=self._interpolation, dtype=tf.float32)
self._projection = [
tf.keras.layers.Conv2D(
filters=self._output_channels,
kernel_size=(1, 1),
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
use_bias=False),
self._bn_op(
axis=self._bn_axis,
momentum=self._batchnorm_momentum,
epsilon=self._batchnorm_epsilon)
]
self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
self._concat_layer = tf.keras.layers.Concatenate(axis=-1)
def call(self,
inputs: tf.Tensor,
training: Optional[bool] = None) -> tf.Tensor:
if training is None:
training = tf.keras.backend.learning_phase()
result = []
for i, layers in enumerate(self.aspp_layers):
x = inputs
for layer in layers:
# Apply layers sequentially.
x = layer(x, training=training)
x = self._activation_fn(x)
# Apply resize layer to the end of the last set of layers.
if i == len(self.aspp_layers) - 1:
x = self._resizing_layer(x)
result.append(tf.cast(x, inputs.dtype))
x = self._concat_layer(result)
for layer in self._projection:
x = layer(x, training=training)
x = self._activation_fn(x)
return self._dropout_layer(x)
def get_config(self):
config = {
'output_channels': self._output_channels,
'dilation_rates': self._dilation_rates,
'pool_kernel_size': self._pool_kernel_size,
'use_sync_bn': self._use_sync_bn,
'batchnorm_momentum': self._batchnorm_momentum,
'batchnorm_epsilon': self._batchnorm_epsilon,
'activation': self._activation,
'dropout': self._dropout,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'interpolation': self._interpolation,
'use_depthwise_convolution': self._use_depthwise_convolution,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
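# Usage sketch (illustrative shapes; the spatial size must be static, since
# `build` configures the resizing layer from it):
#   aspp = SpatialPyramidPooling(output_channels=256,
#                                dilation_rates=[6, 12, 18])
#   x = tf.ones([2, 32, 32, 64])
#   y = aspp(x, training=False)  # shape [2, 32, 32, 256]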
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_layers."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.modeling.layers import nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
def test_scale(self):
scale = nn_layers.Scale(initializer=tf.keras.initializers.constant(10.))
output = scale(3.)
self.assertAllEqual(output, 30.)
def test_temporal_softmax_pool(self):
inputs = tf.range(4, dtype=tf.float32) + 1.
inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
layer = nn_layers.TemporalSoftmaxPool()
output = layer(inputs)
self.assertAllClose(
output,
[[[[[0.10153633]]],
[[[0.33481020]]],
[[[0.82801306]]],
[[[1.82021690]]]]])
def test_positional_encoding(self):
pos_encoding = nn_layers.PositionalEncoding(
initializer='ones', cache_encoding=False)
pos_encoding_cached = nn_layers.PositionalEncoding(
initializer='ones', cache_encoding=True)
inputs = tf.ones([1, 4, 1, 1, 3])
outputs, _ = pos_encoding(inputs)
outputs_cached, _ = pos_encoding_cached(inputs)
expected = tf.constant(
[[[[[1.0000000, 1.0000000, 2.0000000]]],
[[[1.8414710, 1.0021545, 1.5403023]]],
[[[1.9092975, 1.0043088, 0.5838531]]],
[[[1.1411200, 1.0064633, 0.0100075]]]]])
self.assertEqual(outputs.shape, expected.shape)
self.assertAllClose(outputs, expected)
self.assertEqual(outputs.shape, outputs_cached.shape)
self.assertAllClose(outputs, outputs_cached)
inputs = tf.ones([1, 5, 1, 1, 3])
_ = pos_encoding(inputs)
def test_positional_encoding_bfloat16(self):
pos_encoding = nn_layers.PositionalEncoding(initializer='ones')
inputs = tf.ones([1, 4, 1, 1, 3], dtype=tf.bfloat16)
outputs, _ = pos_encoding(inputs)
expected = tf.constant(
[[[[[1.0000000, 1.0000000, 2.0000000]]],
[[[1.8414710, 1.0021545, 1.5403023]]],
[[[1.9092975, 1.0043088, 0.5838531]]],
[[[1.1411200, 1.0064633, 0.0100075]]]]])
self.assertEqual(outputs.shape, expected.shape)
self.assertAllClose(outputs, expected)
def test_global_average_pool_basic(self):
pool = nn_layers.GlobalAveragePool3D(keepdims=True)
inputs = tf.ones([1, 2, 3, 4, 1])
outputs = pool(inputs, output_states=False)
expected = tf.ones([1, 1, 1, 1, 1])
self.assertEqual(outputs.shape, expected.shape)
self.assertAllEqual(outputs, expected)
def test_positional_encoding_stream(self):
pos_encoding = nn_layers.PositionalEncoding(
initializer='ones', cache_encoding=False)
inputs = tf.range(4, dtype=tf.float32) + 1.
inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
inputs = tf.tile(inputs, [1, 1, 1, 1, 3])
expected, _ = pos_encoding(inputs)
for num_splits in [1, 2, 4]:
frames = tf.split(inputs, num_splits, axis=1)
states = {}
predicted = []
for frame in frames:
output, states = pos_encoding(frame, states=states)
predicted.append(output)
predicted = tf.concat(predicted, axis=1)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
self.assertAllClose(predicted, [[[[[1.0000000, 1.0000000, 2.0000000]]],
[[[2.8414710, 2.0021544, 2.5403023]]],
[[[3.9092975, 3.0043090, 2.5838532]]],
[[[4.1411200, 4.0064630, 3.0100074]]]]])
def test_global_average_pool_keras(self):
pool = nn_layers.GlobalAveragePool3D(keepdims=False)
keras_pool = tf.keras.layers.GlobalAveragePooling3D()
inputs = 10 * tf.random.normal([1, 2, 3, 4, 1])
outputs = pool(inputs, output_states=False)
keras_output = keras_pool(inputs)
self.assertAllEqual(outputs.shape, keras_output.shape)
self.assertAllClose(outputs, keras_output)
def test_stream_global_average_pool(self):
gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=False)
inputs = tf.range(4, dtype=tf.float32) + 1.
inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
inputs = tf.tile(inputs, [1, 1, 2, 2, 3])
expected, _ = gap(inputs)
for num_splits in [1, 2, 4]:
frames = tf.split(inputs, num_splits, axis=1)
states = {}
predicted = None
for frame in frames:
predicted, states = gap(frame, states=states)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
self.assertAllClose(
predicted,
[[[[[2.5, 2.5, 2.5]]]]])
def test_causal_stream_global_average_pool(self):
gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=True)
inputs = tf.range(4, dtype=tf.float32) + 1.
inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
inputs = tf.tile(inputs, [1, 1, 2, 2, 3])
expected, _ = gap(inputs)
for num_splits in [1, 2, 4]:
frames = tf.split(inputs, num_splits, axis=1)
states = {}
predicted = []
for frame in frames:
x, states = gap(frame, states=states)
predicted.append(x)
predicted = tf.concat(predicted, axis=1)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
self.assertAllClose(
predicted,
[[[[[1.0, 1.0, 1.0]]],
[[[1.5, 1.5, 1.5]]],
[[[2.0, 2.0, 2.0]]],
[[[2.5, 2.5, 2.5]]]]])
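# Sanity note (derivation, not from the original file): the causal pool at
# frame t is the running mean of frames 1..t, so frames valued 1, 2, 3, 4
# produce 1.0, 1.5, 2.0, 2.5, matching the assertion above.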
def test_spatial_average_pool(self):
pool = nn_layers.SpatialAveragePool3D(keepdims=True)
inputs = tf.range(64, dtype=tf.float32) + 1.
inputs = tf.reshape(inputs, [1, 4, 4, 4, 1])
output = pool(inputs)
self.assertEqual(output.shape, [1, 4, 1, 1, 1])
self.assertAllClose(
output,
[[[[[8.50]]],
[[[24.5]]],
[[[40.5]]],
[[[56.5]]]]])
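# Sanity note (derivation, not from the original file): each temporal frame
# averages 16 consecutive values, e.g. mean(1..16) = 8.5 and
# mean(17..32) = 24.5, matching the assertion above.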
def test_conv2d_causal(self):
conv2d = nn_layers.Conv2D(
filters=3,
kernel_size=(3, 3),
strides=(1, 2),
padding='causal',
use_buffered_input=True,
kernel_initializer='ones',
use_bias=False,
)
inputs = tf.ones([1, 4, 2, 3])
paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
padded_inputs = tf.pad(inputs, paddings)
predicted = conv2d(padded_inputs)
expected = tf.constant(
[[[[6.0, 6.0, 6.0]],
[[12., 12., 12.]],
[[18., 18., 18.]],
[[18., 18., 18.]]]])
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
conv2d.use_buffered_input = False
predicted = conv2d(inputs)
self.assertFalse(conv2d.use_buffered_input)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
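# Sanity note (derivation, not from the original file): with an all-ones
# 3x3 kernel and a causal pad of 2 along time, output frame t sums
# min(t + 1, 3) valid rows x 2 valid columns x 3 input channels, giving
# 6, 12, 18, 18 for frames 0..3, as asserted above.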
def test_depthwise_conv2d_causal(self):
conv2d = nn_layers.DepthwiseConv2D(
kernel_size=(3, 3),
strides=(1, 1),
padding='causal',
use_buffered_input=True,
depthwise_initializer='ones',
use_bias=False,
)
inputs = tf.ones([1, 2, 2, 3])
paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
padded_inputs = tf.pad(inputs, paddings)
predicted = conv2d(padded_inputs)
expected = tf.constant(
[[[[2., 2., 2.],
[2., 2., 2.]],
[[4., 4., 4.],
[4., 4., 4.]]]])
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
conv2d.use_buffered_input = False
predicted = conv2d(inputs)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
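# Sanity note (derivation, not from the original file): the depthwise
# variant does not sum across channels, so frame t sums min(t + 1, 3)
# valid rows x 2 valid columns per channel, giving 2 then 4 above.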
def test_conv3d_causal(self):
conv3d = nn_layers.Conv3D(
filters=3,
kernel_size=(3, 3, 3),
strides=(1, 2, 2),
padding='causal',
use_buffered_input=True,
kernel_initializer='ones',
use_bias=False,
)
inputs = tf.ones([1, 2, 4, 4, 3])
paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
padded_inputs = tf.pad(inputs, paddings)
predicted = conv3d(padded_inputs)
expected = tf.constant(
[[[[[27., 27., 27.],
[18., 18., 18.]],
[[18., 18., 18.],
[12., 12., 12.]]],
[[[54., 54., 54.],
[36., 36., 36.]],
[[36., 36., 36.],
[24., 24., 24.]]]]])
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
conv3d.use_buffered_input = False
predicted = conv3d(inputs)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
def test_depthwise_conv3d_causal(self):
conv3d = nn_layers.Conv3D(
filters=3,
kernel_size=(3, 3, 3),
strides=(1, 2, 2),
padding='causal',
use_buffered_input=True,
kernel_initializer='ones',
use_bias=False,
groups=3,
)
inputs = tf.ones([1, 2, 4, 4, 3])
paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
padded_inputs = tf.pad(inputs, paddings)
predicted = conv3d(padded_inputs)
expected = tf.constant(
[[[[[9.0, 9.0, 9.0],
[6.0, 6.0, 6.0]],
[[6.0, 6.0, 6.0],
[4.0, 4.0, 4.0]]],
[[[18.0, 18.0, 18.0],
[12., 12., 12.]],
[[12., 12., 12.],
[8., 8., 8.]]]]])
output_shape = conv3d._spatial_output_shape([4, 4, 4])
self.assertAllClose(output_shape, [2, 2, 2])
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
conv3d.use_buffered_input = False
predicted = conv3d(inputs)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
def test_conv3d_causal_padding_2d(self):
"""Test to ensure causal padding works like standard padding."""
conv3d = nn_layers.Conv3D(
filters=1,
kernel_size=(1, 3, 3),
strides=(1, 2, 2),
padding='causal',
use_buffered_input=False,
kernel_initializer='ones',
use_bias=False,
)
keras_conv3d = tf.keras.layers.Conv3D(
filters=1,
kernel_size=(1, 3, 3),
strides=(1, 2, 2),
padding='same',
kernel_initializer='ones',
use_bias=False,
)
inputs = tf.ones([1, 1, 4, 4, 1])
predicted = conv3d(inputs)
expected = keras_conv3d(inputs)
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
self.assertAllClose(predicted,
[[[[[9.],
[6.]],
[[6.],
[4.]]]]])
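# Sanity note (derivation, not from the original file): with stride 2 and a
# 3x3 all-ones kernel over a 4x4 all-ones input, each output entry counts
# the in-bounds kernel taps: 3x3 = 9, 3x2 = 6, 2x3 = 6, and 2x2 = 4.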
def test_conv3d_causal_padding_1d(self):
"""Test to ensure causal padding works like standard padding."""
conv3d = nn_layers.Conv3D(
filters=1,
kernel_size=(3, 1, 1),
strides=(2, 1, 1),
padding='causal',
use_buffered_input=False,
kernel_initializer='ones',
use_bias=False,
)
keras_conv1d = tf.keras.layers.Conv1D(
filters=1,
kernel_size=3,
strides=2,
padding='causal',
kernel_initializer='ones',
use_bias=False,
)
inputs = tf.ones([1, 4, 1, 1, 1])
predicted = conv3d(inputs)
expected = keras_conv1d(tf.squeeze(inputs, axis=[2, 3]))
expected = tf.reshape(expected, [1, 2, 1, 1, 1])
self.assertEqual(predicted.shape, expected.shape)
self.assertAllClose(predicted, expected)
self.assertAllClose(predicted,
[[[[[1.]]],
[[[3.]]]]])
@parameterized.parameters(
(None, []),
(None, [6, 12, 18]),
([32, 32], [6, 12, 18]),
)
def test_aspp(self, pool_kernel_size, dilation_rates):
inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
layer = nn_layers.SpatialPyramidPooling(
output_channels=256,
dilation_rates=dilation_rates,
pool_kernel_size=pool_kernel_size)
output = layer(inputs)
self.assertAllEqual([None, 64, 64, 256], output.shape)
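# Background note (general ASPP behavior, not stated in this file): the
# layer runs parallel branches (a 1x1 conv, one 3x3 atrous conv per
# dilation rate, and a pooled image-level branch), concatenates them, and
# projects to `output_channels`, so spatial size is preserved.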
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI aligner."""
from typing import Mapping, Optional
import tensorflow as tf
from official.vision.ops import spatial_transform_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIAligner(tf.keras.layers.Layer):
"""Performs ROIAlign for the second stage processing."""
def __init__(self, crop_size: int = 7, sample_offset: float = 0.5, **kwargs):
"""Initializes a ROI aligner.
Args:
crop_size: An `int` of the output size of the cropped features.
sample_offset: A `float` in [0, 1] of the subpixel sample offset.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'crop_size': crop_size,
'sample_offset': sample_offset,
}
super(MultilevelROIAligner, self).__init__(**kwargs)
def call(self,
features: Mapping[str, tf.Tensor],
boxes: tf.Tensor,
training: Optional[bool] = None):
"""Generates ROIs.
Args:
features: A dictionary with pyramid levels as keys and features as values.
The features are of shape
[batch_size, height_l, width_l, num_filters].
boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row
represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
training: A `bool` of whether it is in training mode.
Returns:
A 5-D `tf.Tensor` representing feature crop of shape
[batch_size, num_boxes, crop_size, crop_size, num_filters].
"""
roi_features = spatial_transform_ops.multilevel_crop_and_resize(
features,
boxes,
output_size=self._config_dict['crop_size'],
sample_offset=self._config_dict['sample_offset'])
return roi_features
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
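# Usage sketch (illustrative; the shapes below are assumptions, not part of
# this file): given multilevel features such as {'4': [B, 64, 64, F], ...}
# and boxes of shape [B, num_boxes, 4] in scaled-image coordinates,
# MultilevelROIAligner(crop_size=7)(features, boxes) returns crops of shape
# [B, num_boxes, 7, 7, F].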
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_aligner.py."""
# Import libraries
import tensorflow as tf
from official.vision.modeling.layers import roi_aligner
class MultilevelROIAlignerTest(tf.test.TestCase):
def test_serialize_deserialize(self):
kwargs = dict(
crop_size=7,
sample_offset=0.5,
)
aligner = roi_aligner.MultilevelROIAligner(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(aligner.get_config(), expected_config)
new_aligner = roi_aligner.MultilevelROIAligner.from_config(
aligner.get_config())
self.assertAllEqual(aligner.get_config(), new_aligner.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI generator."""
from typing import Optional, Mapping
# Import libraries
import tensorflow as tf
from official.vision.ops import box_ops
from official.vision.ops import nms
def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
pre_nms_top_k: int = 2000,
pre_nms_score_threshold: float = 0.0,
pre_nms_min_size_threshold: float = 0.0,
nms_iou_threshold: float = 0.7,
num_proposals: int = 1000,
use_batched_nms: bool = False,
decode_boxes: bool = True,
clip_boxes: bool = True,
apply_sigmoid_to_score: bool = True):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter out small boxes and boxes that fall outside the image, if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tensors of shape
[batch_size, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch_size, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
is [height, width] of the scaled image.
pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
before applying NMS. Default: 2000.
pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
box score to keep before applying NMS. This is often used as a
pre-filtering step for better performance. Default: 0, no filtering is
applied.
pre_nms_min_size_threshold: A `float` representing the minimal box size in
each side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. Default: 0, no
filtering is applied.
nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
num_proposals: An `int` of top scoring RPN proposals *in total* to keep
after applying NMS. Default: 1000.
use_batched_nms: A `bool` indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available on
CPU/GPU. Default is False.
decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
`anchor_boxes`. Default is True.
clip_boxes: A `bool` indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default is True.
apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
`raw_scores` before applying NMS. Default is True.
Returns:
selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1],
representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
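# Expand image_shape to [batch_size, 1, 2] so it broadcasts against the
# per-box dimension when clipping and filtering boxes below.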
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(raw_scores.keys()):
with tf.name_scope('level_%s' % level):
_, feature_h, feature_w, num_anchors_per_location = (
raw_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
this_level_boxes = box_ops.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_ops.clip_boxes(
this_level_boxes, image_shape)
if pre_nms_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_ops.filter_boxes(
this_level_boxes,
this_level_scores,
image_shape,
pre_nms_min_size_threshold)
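# Cap the per-level top-k sizes by the number of boxes actually
# available at this level.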
this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, num_proposals)
if nms_iou_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=nms_iou_threshold,
score_threshold=pre_nms_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if pre_nms_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_ops.filter_boxes_by_scores(
this_level_boxes,
this_level_scores,
pre_nms_score_threshold))
this_level_boxes, this_level_scores = box_ops.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=nms_iou_threshold))
else:
this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
this_level_boxes,
this_level_scores,
k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, num_proposals)
selected_rois, selected_roi_scores = box_ops.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIGenerator(tf.keras.layers.Layer):
"""Proposes RoIs for the second stage processing."""
def __init__(self,
pre_nms_top_k: int = 2000,
pre_nms_score_threshold: float = 0.0,
pre_nms_min_size_threshold: float = 0.0,
nms_iou_threshold: float = 0.7,
num_proposals: int = 1000,
test_pre_nms_top_k: int = 1000,
test_pre_nms_score_threshold: float = 0.0,
test_pre_nms_min_size_threshold: float = 0.0,
test_nms_iou_threshold: float = 0.7,
test_num_proposals: int = 1000,
use_batched_nms: bool = False,
**kwargs):
"""Initializes a ROI generator.
The ROI generator transforms the raw predictions from RPN to ROIs.
Args:
pre_nms_top_k: An `int` of the number of top scores proposals to be kept
before applying NMS.
pre_nms_score_threshold: A `float` of the score threshold to apply before
applying NMS. Proposals whose scores are below this threshold are
thrown away.
pre_nms_min_size_threshold: A `float` of the threshold of each side of the
box (w.r.t. the scaled image). Proposals whose sides are below this
threshold are thrown away.
nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
num_proposals: An `int` of the final number of proposals to generate.
test_pre_nms_top_k: An `int` of the number of top scores proposals to be
kept before applying NMS in testing.
test_pre_nms_score_threshold: A `float` of the score threshold to apply
before applying NMS in testing. Proposals whose scores are below this
threshold are thrown away.
test_pre_nms_min_size_threshold: A `float` of the threshold of each side
of the box (w.r.t. the scaled image) in testing. Proposals whose sides
are below this threshold are thrown away.
test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
testing.
test_num_proposals: An `int` of the final number of proposals to generate
in testing.
use_batched_nms: A `bool` of whether or not use
`tf.image.combined_non_max_suppression`.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
'nms_iou_threshold': nms_iou_threshold,
'num_proposals': num_proposals,
'test_pre_nms_top_k': test_pre_nms_top_k,
'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
'test_nms_iou_threshold': test_nms_iou_threshold,
'test_num_proposals': test_num_proposals,
'use_batched_nms': use_batched_nms,
}
super(MultilevelROIGenerator, self).__init__(**kwargs)
def call(self,
raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
training: Optional[bool] = None):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter out small boxes and boxes that fall outside the image, if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tensors of shape
[batch, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
is [height, width] of the scaled image.
training: A `bool` that indicates whether it is in training mode.
Returns:
roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
ROIs in the scaled image coordinate.
roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
proposed ROIs.
"""
roi_boxes, roi_scores = _multilevel_propose_rois(
raw_boxes,
raw_scores,
anchor_boxes,
image_shape,
pre_nms_top_k=(
self._config_dict['pre_nms_top_k'] if training
else self._config_dict['test_pre_nms_top_k']),
pre_nms_score_threshold=(
self._config_dict['pre_nms_score_threshold'] if training
else self._config_dict['test_pre_nms_score_threshold']),
pre_nms_min_size_threshold=(
self._config_dict['pre_nms_min_size_threshold'] if training
else self._config_dict['test_pre_nms_min_size_threshold']),
nms_iou_threshold=(
self._config_dict['nms_iou_threshold'] if training
else self._config_dict['test_nms_iou_threshold']),
num_proposals=(
self._config_dict['num_proposals'] if training
else self._config_dict['test_num_proposals']),
use_batched_nms=self._config_dict['use_batched_nms'],
decode_boxes=True,
clip_boxes=True,
apply_sigmoid_to_score=True)
return roi_boxes, roi_scores
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
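# Usage note (illustrative, not part of this file): `call` selects between
# the train-time and test-time settings via the `training` flag, e.g.
# MultilevelROIGenerator(pre_nms_top_k=2000, test_pre_nms_top_k=1000)
# keeps 2000 proposals per level during training and 1000 during eval.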
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI sampler."""
# Import libraries
import tensorflow as tf
from official.vision.modeling.layers import box_sampler
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
@tf.keras.utils.register_keras_serializable(package='Vision')
class ROISampler(tf.keras.layers.Layer):
"""Samples ROIs and assigns targets to the sampled ROIs."""
def __init__(self,
mix_gt_boxes: bool = True,
num_sampled_rois: int = 512,
foreground_fraction: float = 0.25,
foreground_iou_threshold: float = 0.5,
background_iou_high_threshold: float = 0.5,
background_iou_low_threshold: float = 0,
skip_subsampling: bool = False,
**kwargs):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
proposed ROIs.
num_sampled_rois: An `int` of the number of sampled ROIs per image.
foreground_fraction: A `float` in [0, 1] representing the fraction of
sampled ROIs that should be drawn from the foreground boxes.
foreground_iou_threshold: A `float` that represents the IoU threshold for
a box to be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
skip_subsampling: A `bool` that determines whether to skip the sampling
procedure that balances the foreground/background classes. Used for the
upper FRCNN layers in cascade RCNN.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'mix_gt_boxes': mix_gt_boxes,
'num_sampled_rois': num_sampled_rois,
'foreground_fraction': foreground_fraction,
'foreground_iou_threshold': foreground_iou_threshold,
'background_iou_high_threshold': background_iou_high_threshold,
'background_iou_low_threshold': background_iou_low_threshold,
'skip_subsampling': skip_subsampling,
}
self._sim_calc = iou_similarity.IouSimilarity()
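# The matcher buckets each proposal by its best IoU: below
# `background_iou_low_threshold` -> -3 (invalid), within the background
# band -> -1 (negative), between the background high and foreground
# thresholds -> -2 (ignored), and at or above
# `foreground_iou_threshold` -> 1 (positive). `call` relies on this
# encoding when building the masks below.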
self._box_matcher = box_matcher.BoxMatcher(
thresholds=[
background_iou_low_threshold, background_iou_high_threshold,
foreground_iou_threshold
],
indicators=[-3, -1, -2, 1])
self._target_gather = target_gather.TargetGather()
self._sampler = box_sampler.BoxSampler(
num_sampled_rois, foreground_fraction)
super(ROISampler, self).__init__(**kwargs)
def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor, gt_classes: tf.Tensor):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
boxes: A `tf.Tensor` of shape [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
gt_boxes: A `tf.Tensor` of shape [batch_size, MAX_NUM_INSTANCES, 4].
The coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: A `tf.Tensor` of shape [batch_size, MAX_NUM_INSTANCES].
This tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: A `tf.Tensor` of shape [batch_size, K, 4], representing
the coordinates of the sampled RoIs, where K is the number of
sampled RoIs, i.e. K = num_sampled_rois.
sampled_gt_boxes: A `tf.Tensor` of shape [batch_size, K, 4], storing
the box coordinates of the matched groundtruth boxes of the sampled
RoIs.
sampled_gt_classes: A `tf.Tensor` of shape [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: A `tf.Tensor` of shape [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e.,
gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
if self._config_dict['mix_gt_boxes']:
boxes = tf.concat([boxes, gt_boxes], axis=1)
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask,
gt_invalid_mask)
matched_gt_indices, match_indicators = self._box_matcher(similarity_matrix)
positive_matches = tf.greater_equal(match_indicators, 0)
negative_matches = tf.equal(match_indicators, -1)
ignored_matches = tf.equal(match_indicators, -2)
invalid_matches = tf.equal(match_indicators, -3)
background_mask = tf.expand_dims(
tf.logical_or(negative_matches, invalid_matches), -1)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices,
background_mask)
matched_gt_classes = tf.where(background_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_boxes = self._target_gather(gt_boxes, matched_gt_indices,
tf.tile(background_mask, [1, 1, 4]))
matched_gt_boxes = tf.where(background_mask,
tf.zeros_like(matched_gt_boxes),
matched_gt_boxes)
matched_gt_indices = tf.where(
tf.squeeze(background_mask, -1), -tf.ones_like(matched_gt_indices),
matched_gt_indices)
if self._config_dict['skip_subsampling']:
return (boxes, matched_gt_boxes, tf.squeeze(matched_gt_classes,
axis=-1), matched_gt_indices)
sampled_indices = self._sampler(
positive_matches, negative_matches, ignored_matches)
sampled_rois = self._target_gather(boxes, sampled_indices)
sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices)
sampled_gt_classes = tf.squeeze(self._target_gather(
matched_gt_classes, sampled_indices), axis=-1)
sampled_gt_indices = tf.squeeze(self._target_gather(
tf.expand_dims(matched_gt_indices, -1), sampled_indices), axis=-1)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from typing import Any, List, Mapping, Optional, Tuple, Union
import tensorflow as tf
from official.vision.ops import anchor
from official.vision.ops import box_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskRCNNModel(tf.keras.Model):
"""The Mask R-CNN(-RS) and Cascade RCNN-RS models."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
rpn_head: tf.keras.layers.Layer,
detection_head: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_generator: tf.keras.layers.Layer,
roi_sampler: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_aligner: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
mask_head: Optional[tf.keras.layers.Layer] = None,
mask_sampler: Optional[tf.keras.layers.Layer] = None,
mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
class_agnostic_bbox_pred: bool = False,
cascade_class_ensemble: bool = False,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Initializes the R-CNN(-RS) model.
Args:
backbone: `tf.keras.Model`, the backbone network.
decoder: `tf.keras.Model`, the decoder network.
rpn_head: the RPN head.
detection_head: the detection head or a list of heads.
roi_generator: the ROI generator.
roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
detection heads.
roi_aligner: the ROI aligner.
detection_generator: the detection generator.
mask_head: the mask head.
mask_sampler: the mask sampler.
mask_roi_aligner: the ROI aligner for mask prediction.
class_agnostic_bbox_pred: if True, perform class agnostic bounding box
prediction. Needs to be `True` for Cascade RCNN models.
cascade_class_ensemble: if True, ensemble classification scores over all
detection heads.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added on each level.
For instance, num_scales=2 adds one additional intermediate anchor scale,
yielding [2^0, 2^0.5], on each level.
aspect_ratios: A list representing the aspect ratio anchors added on each
level. The number indicates the ratio of width to height. For instance,
aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
anchor_size: A number representing the scale of size of the base anchor to
the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(MaskRCNNModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'rpn_head': rpn_head,
'detection_head': detection_head,
'roi_generator': roi_generator,
'roi_sampler': roi_sampler,
'roi_aligner': roi_aligner,
'detection_generator': detection_generator,
'mask_head': mask_head,
'mask_sampler': mask_sampler,
'mask_roi_aligner': mask_roi_aligner,
'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
'cascade_class_ensemble': cascade_class_ensemble,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self.backbone = backbone
self.decoder = decoder
self.rpn_head = rpn_head
if not isinstance(detection_head, (list, tuple)):
self.detection_head = [detection_head]
else:
self.detection_head = detection_head
self.roi_generator = roi_generator
if not isinstance(roi_sampler, (list, tuple)):
self.roi_sampler = [roi_sampler]
else:
self.roi_sampler = roi_sampler
if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
raise ValueError(
'`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.'
)
self.roi_aligner = roi_aligner
self.detection_generator = detection_generator
self._include_mask = mask_head is not None
self.mask_head = mask_head
if self._include_mask and mask_sampler is None:
raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
self.mask_sampler = mask_sampler
if self._include_mask and mask_roi_aligner is None:
raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
self.mask_roi_aligner = mask_roi_aligner
# Weights for the regression losses for each FRCNN layer.
# TODO(xianzhi): Make the weights configurable.
self._cascade_layer_to_weights = [
[10.0, 10.0, 5.0, 5.0],
[20.0, 20.0, 10.0, 10.0],
[30.0, 30.0, 15.0, 15.0],
]
def call(self,
images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
gt_masks: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
model_outputs, intermediate_outputs = self._call_box_outputs(
images=images, image_shape=image_shape, anchor_boxes=anchor_boxes,
gt_boxes=gt_boxes, gt_classes=gt_classes, training=training)
if not self._include_mask:
return model_outputs
model_mask_outputs = self._call_mask_outputs(
model_box_outputs=model_outputs,
features=model_outputs['decoder_features'],
current_rois=intermediate_outputs['current_rois'],
matched_gt_indices=intermediate_outputs['matched_gt_indices'],
matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
matched_gt_classes=intermediate_outputs['matched_gt_classes'],
gt_masks=gt_masks,
training=training)
model_outputs.update(model_mask_outputs)
return model_outputs
def _get_backbone_and_decoder_features(self, images):
backbone_features = self.backbone(images)
if self.decoder:
features = self.decoder(backbone_features)
else:
features = backbone_features
return backbone_features, features
def _call_box_outputs(
self, images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Tuple[
Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
"""Implementation of the Faster-RCNN logic for boxes."""
model_outputs = {}
# Feature extraction.
(backbone_features,
decoder_features) = self._get_backbone_and_decoder_features(images)
# Region proposal network.
rpn_scores, rpn_boxes = self.rpn_head(decoder_features)
model_outputs.update({
'backbone_features': backbone_features,
'decoder_features': decoder_features,
'rpn_boxes': rpn_boxes,
'rpn_scores': rpn_scores
})
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
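# Add a batch dimension and tile so every image in the batch shares the
# same per-level anchor grid.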
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Generate RoIs.
current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
image_shape, training)
next_rois = current_rois
all_class_outputs = []
for cascade_num in range(len(self.roi_sampler)):
# In cascade RCNN we want the higher layers to have different regression
# weights as the predicted deltas become smaller and smaller.
regression_weights = self._cascade_layer_to_weights[cascade_num]
current_rois = next_rois
(class_outputs, box_outputs, model_outputs, matched_gt_boxes,
matched_gt_classes, matched_gt_indices,
current_rois) = self._run_frcnn_head(
features=decoder_features,
rois=current_rois,
gt_boxes=gt_boxes,
gt_classes=gt_classes,
training=training,
model_outputs=model_outputs,
cascade_num=cascade_num,
regression_weights=regression_weights)
all_class_outputs.append(class_outputs)
# Generate ROIs for the next cascade head if there is any.
if cascade_num < len(self.roi_sampler) - 1:
next_rois = box_ops.decode_boxes(
tf.cast(box_outputs, tf.float32),
current_rois,
weights=regression_weights)
next_rois = box_ops.clip_boxes(next_rois,
tf.expand_dims(image_shape, axis=1))
if not training:
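# Optionally ensemble by averaging the class predictions from all
# cascade stages before generating detections.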
if self._config_dict['cascade_class_ensemble']:
class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)
detections = self.detection_generator(
box_outputs,
class_outputs,
current_rois,
image_shape,
regression_weights,
bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
model_outputs.update({
'cls_outputs': class_outputs,
'box_outputs': box_outputs,
})
if self.detection_generator.get_config()['apply_nms']:
model_outputs.update({
'detection_boxes': detections['detection_boxes'],
'detection_scores': detections['detection_scores'],
'detection_classes': detections['detection_classes'],
'num_detections': detections['num_detections']
})
else:
model_outputs.update({
'decoded_boxes': detections['decoded_boxes'],
'decoded_box_scores': detections['decoded_box_scores']
})
intermediate_outputs = {
'matched_gt_boxes': matched_gt_boxes,
'matched_gt_indices': matched_gt_indices,
'matched_gt_classes': matched_gt_classes,
'current_rois': current_rois,
}
return (model_outputs, intermediate_outputs)
def _call_mask_outputs(
self,
model_box_outputs: Mapping[str, tf.Tensor],
features: tf.Tensor,
current_rois: tf.Tensor,
matched_gt_indices: tf.Tensor,
matched_gt_boxes: tf.Tensor,
matched_gt_classes: tf.Tensor,
gt_masks: tf.Tensor,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
"""Implementation of Mask-RCNN mask prediction logic."""
model_outputs = dict(model_box_outputs)
if training:
current_rois, roi_classes, roi_masks = self.mask_sampler(
current_rois, matched_gt_boxes, matched_gt_classes,
matched_gt_indices, gt_masks)
roi_masks = tf.stop_gradient(roi_masks)
model_outputs.update({
'mask_class_targets': roi_classes,
'mask_targets': roi_masks,
})
else:
current_rois = model_outputs['detection_boxes']
roi_classes = model_outputs['detection_classes']
mask_logits, mask_probs = self._features_to_mask_outputs(
features, current_rois, roi_classes)
if training:
model_outputs.update({
'mask_outputs': mask_logits,
})
else:
model_outputs.update({
'detection_masks': mask_probs,
})
return model_outputs
def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
model_outputs, cascade_num, regression_weights):
"""Runs the frcnn head that does both class and box prediction.
Args:
features: `list` of features from the feature extractor.
rois: `list` of current rois from which bbox refinements and classes
are predicted.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
This tensor might have paddings with a negative value.
gt_classes: [batch_size, MAX_NUM_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
training: `bool`, if model is training or being evaluated.
model_outputs: `dict`, used for storing outputs used for eval and losses.
cascade_num: `int`, the current frcnn layer in the cascade.
regression_weights: `list`, weights used for l1 loss in bounding box
regression.
Returns:
class_outputs: Class predictions for rois.
box_outputs: Box predictions for rois. These are formatted for the
regression loss and need to be converted before being used as rois
in the next stage.
model_outputs: Updated dict with predictions used for losses and eval.
matched_gt_boxes: If `training` is true, the groundtruth box locations
of the positive matches.
matched_gt_classes: If `training` is true, the groundtruth classes of
the positive matches.
matched_gt_indices: If `training` is true, the indices of the positive
box matches. Used for mask prediction.
rois: The sampled rois used for this layer.
"""
# Only used during training.
matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
None)
if training and gt_boxes is not None:
rois = tf.stop_gradient(rois)
current_roi_sampler = self.roi_sampler[cascade_num]
rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
current_roi_sampler(rois, gt_boxes, gt_classes))
# Create bounding box training targets.
box_targets = box_ops.encode_boxes(
matched_gt_boxes, rois, weights=regression_weights)
# If the target is background, the box target is set to all 0s.
box_targets = tf.where(
tf.tile(
tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
[1, 1, 4]), tf.zeros_like(box_targets), box_targets)
model_outputs.update({
'class_targets_{}'.format(cascade_num)
if cascade_num else 'class_targets':
matched_gt_classes,
'box_targets_{}'.format(cascade_num)
if cascade_num else 'box_targets':
box_targets,
})
# Get roi features.
roi_features = self.roi_aligner(features, rois)
# Run frcnn head to get class and bbox predictions.
current_detection_head = self.detection_head[cascade_num]
class_outputs, box_outputs = current_detection_head(roi_features)
model_outputs.update({
'class_outputs_{}'.format(cascade_num)
if cascade_num else 'class_outputs':
class_outputs,
'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs':
box_outputs,
})
return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
matched_gt_classes, matched_gt_indices, rois)
def _features_to_mask_outputs(self, features, rois, roi_classes):
# Mask RoI align.
mask_roi_features = self.mask_roi_aligner(features, rois)
# Mask head.
raw_masks = self.mask_head([mask_roi_features, roi_classes])
return raw_masks, tf.nn.sigmoid(raw_masks)
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(
backbone=self.backbone,
rpn_head=self.rpn_head,
detection_head=self.detection_head)
if self.decoder is not None:
items.update(decoder=self.decoder)
if self._include_mask:
items.update(mask_head=self.mask_head)
return items
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for maskrcnn_model.py."""
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import maskrcnn_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.heads import instance_heads
from official.vision.modeling.layers import detection_generator
from official.vision.modeling.layers import mask_sampler
from official.vision.modeling.layers import roi_aligner
from official.vision.modeling.layers import roi_generator
from official.vision.modeling.layers import roi_sampler
from official.vision.ops import anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(
combinations.combine(
include_mask=[True, False],
use_separable_conv=[True, False],
build_anchor_boxes=[True, False],
is_training=[True, False]))
def test_build_model(self, include_mask, use_separable_conv,
build_anchor_boxes, is_training):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
resnet_model_id = 50
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=1)
detection_head = instance_heads.DetectionHead(num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
# Results will be checked in test_forward.
_ = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
include_mask=[True, False],
build_anchor_boxes=[True, False],
use_cascade_heads=[True, False],
training=[True, False],
))
def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
use_cascade_heads):
num_classes = 3
min_level = 3
max_level = 4
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
if use_cascade_heads:
cascade_iou_thresholds = [0.6]
class_agnostic_bbox_pred = True
cascade_class_ensemble = True
else:
cascade_iou_thresholds = None
class_agnostic_bbox_pred = False
cascade_class_ensemble = False
image_size = (256, 256)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array([[224, 100], [100, 224]])
with strategy.scope():
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=image_size).multilevel_boxes
else:
anchor_boxes = None
num_anchors_per_location = len(aspect_ratios) * num_scales
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=min_level,
max_level=max_level,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes,
class_agnostic_bbox_pred=class_agnostic_bbox_pred)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_cascade = []
roi_sampler_obj = roi_sampler.ROISampler()
roi_sampler_cascade.append(roi_sampler_obj)
if cascade_iou_thresholds:
for iou in cascade_iou_thresholds:
roi_sampler_obj = roi_sampler.ROISampler(
mix_gt_boxes=False,
foreground_iou_threshold=iou,
background_iou_high_threshold=iou,
background_iou_low_threshold=0.0,
skip_subsampling=True)
roi_sampler_cascade.append(roi_sampler_obj)
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
class_agnostic_bbox_pred=class_agnostic_bbox_pred,
cascade_class_ensemble=cascade_class_ensemble,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
results = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=training)
self.assertIn('rpn_boxes', results)
self.assertIn('rpn_scores', results)
if training:
self.assertIn('class_targets', results)
self.assertIn('box_targets', results)
self.assertIn('class_outputs', results)
self.assertIn('box_outputs', results)
if include_mask:
self.assertIn('mask_outputs', results)
else:
self.assertIn('detection_boxes', results)
self.assertIn('detection_scores', results)
self.assertIn('detection_classes', results)
self.assertIn('num_detections', results)
if include_mask:
self.assertIn('detection_masks', results)
@parameterized.parameters(
(False,),
(True,),
)
def test_serialize_deserialize(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
config = model.get_config()
new_model = maskrcnn_model.MaskRCNNModel.from_config(config)
# Validate that the config can be serialized to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
@parameterized.parameters(
(False,),
(True,),
)
def test_checkpoint(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
expect_checkpoint_items = dict(
backbone=backbone,
decoder=decoder,
rpn_head=rpn_head,
detection_head=[detection_head])
if include_mask:
expect_checkpoint_items['mask_head'] = mask_head
self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)
# Test save and load checkpoints.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
save_dir = self.create_tempdir().full_path
ckpt.save(os.path.join(save_dir, 'ckpt'))
partial_ckpt = tf.train.Checkpoint(backbone=backbone)
partial_ckpt.read(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if include_mask:
partial_ckpt_mask = tf.train.Checkpoint(
backbone=backbone, mask_head=mask_head)
partial_ckpt_mask.restore(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RetinaNet."""
from typing import Any, Mapping, List, Optional, Union
# Import libraries
import tensorflow as tf
from official.vision.ops import anchor
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
"""The RetinaNet model class."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Classification initialization function.
Args:
backbone: `tf.keras.Model` a backbone network.
decoder: `tf.keras.Model` a decoder network.
head: `RetinaNetHead`, the RetinaNet head.
detection_generator: the detection generator.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added
        on each level. For instance, num_scales=2 adds one additional
        intermediate anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratios of
        anchors added on each level. Each number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
anchor_size: A number representing the scale of size of the base
anchor to the feature stride 2^level.
**kwargs: keyword arguments to be passed.
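    Example (a minimal construction sketch; the components mirror this
    repository's tests and the parameter values are illustrative):
    ```python
    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=3, max_level=7)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=3, max_level=7, num_classes=3, num_anchors_per_location=3)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = RetinaNetModel(backbone, decoder, head, generator,
                           min_level=3, max_level=7, num_scales=3,
                           aspect_ratios=[1.0], anchor_size=3)
    ```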
"""
super(RetinaNetModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self._backbone = backbone
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
def call(self,
images: tf.Tensor,
image_shape: Optional[tf.Tensor] = None,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
output_intermediate_features: bool = False,
           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
"""Forward pass of the RetinaNet model.
Args:
images: `Tensor`, the input batched images, whose shape is
[batch, height, width, 3].
image_shape: `Tensor`, the actual shape of the input images, whose shape
is [batch, 2] where the last dimension is [height, width]. Note that
this is the actual image shape excluding paddings. For example, images
in the batch may be resized into different shapes before padding to the
fixed size.
anchor_boxes: a dict of tensors which includes multilevel anchors.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the anchor coordinates of a particular feature
level, whose shape is [height_l, width_l, num_anchors_per_location].
output_intermediate_features: `bool` indicating whether to return the
intermediate feature maps generated by backbone and decoder.
training: `bool`, indicating whether it is in training mode.
Returns:
scores: a dict of tensors which includes scores of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
boxes: a dict of tensors which includes coordinates of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box coordinates predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
attributes: a dict of (attribute_name, attribute_predictions). Each
attribute prediction is a dict that includes:
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the attribute predictions from a particular
feature level, whose shape is
[batch, height_l, width_l, att_size * num_anchors_per_location].
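    Example (a minimal inference sketch; `model`, `images` and `image_shape`
    are assumed to exist as described above, and the detection generator is
    assumed to apply NMS, its default in this repository's tests):
    ```python
    outputs = model(images, image_shape=image_shape, training=False)
    boxes = outputs['detection_boxes']    # [batch, max_num_detections, 4]
    scores = outputs['detection_scores']  # [batch, max_num_detections]
    ```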
"""
outputs = {}
# Feature extraction.
features = self.backbone(images)
if output_intermediate_features:
outputs.update(
{'backbone_{}'.format(k): v for k, v in features.items()})
if self.decoder:
features = self.decoder(features)
if output_intermediate_features:
outputs.update(
{'decoder_{}'.format(k): v for k, v in features.items()})
# Dense prediction. `raw_attributes` can be empty.
raw_scores, raw_boxes, raw_attributes = self.head(features)
if training:
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if raw_attributes:
outputs.update({'attribute_outputs': raw_attributes})
return outputs
else:
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Post-processing.
final_results = self.detection_generator(raw_boxes, raw_scores,
anchor_boxes, image_shape,
raw_attributes)
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if self.detection_generator.get_config()['apply_nms']:
outputs.update({
'detection_boxes': final_results['detection_boxes'],
'detection_scores': final_results['detection_scores'],
'detection_classes': final_results['detection_classes'],
'num_detections': final_results['num_detections']
})
else:
outputs.update({
'decoded_boxes': final_results['decoded_boxes'],
'decoded_box_scores': final_results['decoded_box_scores']
})
if raw_attributes:
outputs.update({
'attribute_outputs': raw_attributes,
'detection_attributes': final_results['detection_attributes'],
})
return outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
return items
@property
def backbone(self) -> tf.keras.Model:
return self._backbone
@property
def decoder(self) -> tf.keras.Model:
return self._decoder
@property
def head(self) -> tf.keras.layers.Layer:
return self._head
@property
def detection_generator(self) -> tf.keras.layers.Layer:
return self._detection_generator
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import retinanet_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
{
'use_separable_conv': True,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': False,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': False,
'is_training': True,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': True,
'has_att_heads': True
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': True
},
)
def test_build_model(self, use_separable_conv, build_anchor_boxes,
is_training, has_att_heads):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
fpn_num_filters = 256
head_num_convs = 4
head_num_filters = 256
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
num_filters=fpn_num_filters,
use_separable_conv=use_separable_conv)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
attribute_heads=attribute_heads,
num_anchors_per_location=num_anchors_per_location,
use_separable_conv=use_separable_conv,
num_convs=head_num_convs,
num_filters=head_num_filters)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
_ = model(images, image_shape, anchor_boxes, training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
image_size=[
(128, 128),
],
training=[True, False],
has_att_heads=[True, False],
output_intermediate_features=[True, False],
soft_nms_sigma=[None, 0.0, 0.1],
))
def test_forward(self, strategy, image_size, training, has_att_heads,
output_intermediate_features, soft_nms_sigma):
"""Test for creation of a R50-FPN RetinaNet."""
tf.keras.backend.set_image_data_format('channels_last')
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array(
[[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
with strategy.scope():
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
anchor_boxes = anchor_gen(image_size)
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
attribute_heads=attribute_heads,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10,
nms_version='v1',
use_cpu_nms=soft_nms_sigma is not None,
soft_nms_sigma=soft_nms_sigma)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
model_outputs = model(
images,
image_shape,
anchor_boxes,
output_intermediate_features=output_intermediate_features,
training=training)
if training:
cls_outputs = model_outputs['cls_outputs']
box_outputs = model_outputs['box_outputs']
for level in range(min_level, max_level + 1):
self.assertIn(str(level), cls_outputs)
self.assertIn(str(level), box_outputs)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
num_classes * num_anchors_per_location
], cls_outputs[str(level)].numpy().shape)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
4 * num_anchors_per_location
], box_outputs[str(level)].numpy().shape)
if has_att_heads:
att_outputs = model_outputs['attribute_outputs']
for att in att_outputs.values():
self.assertAllEqual([
2, image_size[0] // 2**level, image_size[1] // 2**level,
1 * num_anchors_per_location
], att[str(level)].numpy().shape)
else:
self.assertIn('detection_boxes', model_outputs)
self.assertIn('detection_scores', model_outputs)
self.assertIn('detection_classes', model_outputs)
self.assertIn('num_detections', model_outputs)
self.assertAllEqual(
[2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_scores'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_classes'].numpy().shape)
self.assertAllEqual(
[2,], model_outputs['num_detections'].numpy().shape)
if has_att_heads:
self.assertIn('detection_attributes', model_outputs)
self.assertAllEqual(
[2, 10, 1],
model_outputs['detection_attributes']['depth'].numpy().shape)
if output_intermediate_features:
for l in range(2, 6):
self.assertIn('backbone_{}'.format(l), model_outputs)
self.assertAllEqual([
2, image_size[0] // 2**l, image_size[1] // 2**l,
backbone.output_specs[str(l)].as_list()[-1]
], model_outputs['backbone_{}'.format(l)].numpy().shape)
for l in range(min_level, max_level + 1):
self.assertIn('decoder_{}'.format(l), model_outputs)
self.assertAllEqual([
2, image_size[0] // 2**l, image_size[1] // 2**l,
decoder.output_specs[str(l)].as_list()[-1]
], model_outputs['decoder_{}'.format(l)].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
config = model.get_config()
new_model = retinanet_model.RetinaNetModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build segmentation models."""
from typing import Any, Mapping, Union, Optional, Dict
# Import libraries
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationModel(tf.keras.Model):
"""A Segmentation class model.
Input images are passed through backbone first. Decoder network is then
applied, and finally, segmentation head is applied on the output of the
decoder network. Layers such as ASPP should be part of decoder. Any feature
fusion is done as part of the segmentation head (i.e. deeplabv3+ feature
fusion is not part of the decoder, instead it is part of the segmentation
head). This way, different feature fusion techniques can be combined with
different backbones, and decoders.
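  Example (a minimal sketch mirroring this repository's tests; the number of
  classes and the pyramid levels are illustrative):
  ```python
  backbone = backbones.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=3, max_level=7)
  head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
  model = SegmentationModel(backbone=backbone, decoder=decoder, head=head)
  outputs = model(images)  # outputs['logits']: [batch, H / 8, W / 8, 10]
  ```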
"""
def __init__(self, backbone: tf.keras.Model, decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
**kwargs):
"""Segmentation initialization function.
Args:
backbone: a backbone network.
decoder: a decoder network. E.g. FPN.
head: segmentation head.
mask_scoring_head: mask scoring head.
**kwargs: keyword arguments to be passed.
"""
super(SegmentationModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'mask_scoring_head': mask_scoring_head,
}
self.backbone = backbone
self.decoder = decoder
self.head = head
self.mask_scoring_head = mask_scoring_head
  def call(self, inputs: tf.Tensor, training: Optional[bool] = None
) -> Dict[str, tf.Tensor]:
backbone_features = self.backbone(inputs)
if self.decoder:
decoder_features = self.decoder(backbone_features)
else:
decoder_features = backbone_features
logits = self.head((backbone_features, decoder_features))
outputs = {'logits': logits}
if self.mask_scoring_head:
mask_scores = self.mask_scoring_head(logits)
outputs.update({'mask_scores': mask_scores})
return outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
if self.mask_scoring_head is not None:
items.update(mask_scoring_head=self.mask_scoring_head)
return items
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import segmentation_model
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import segmentation_heads
class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(128, 2),
(128, 3),
(128, 4),
(256, 2),
(256, 3),
(256, 4),
)
def test_segmentation_network_creation(
self, input_size, level):
"""Test for creation of a segmentation network."""
num_classes = 10
inputs = np.random.rand(2, input_size, input_size, 3)
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs, min_level=2, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=level)
model = segmentation_model.SegmentationModel(
backbone=backbone,
decoder=decoder,
head=head,
mask_scoring_head=None,
)
outputs = model(inputs)
self.assertAllEqual(
[2, input_size // (2**level), input_size // (2**level), num_classes],
outputs['logits'].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs, min_level=3, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=3)
model = segmentation_model.SegmentationModel(
backbone=backbone,
decoder=decoder,
head=head
)
config = model.get_config()
new_model = segmentation_model.SegmentationModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build video classification models."""
from typing import Any, Mapping, Optional, Union, List, Text
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
"""A video classification class builder."""
def __init__(
self,
backbone: tf.keras.Model,
num_classes: int,
input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
dropout_rate: float = 0.0,
aggregate_endpoints: bool = False,
kernel_initializer: str = 'random_uniform',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
require_endpoints: Optional[List[Text]] = None,
**kwargs):
"""Video Classification initialization function.
Args:
backbone: a 3d backbone network.
num_classes: `int` number of classes in classification task.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
dropout_rate: `float` rate for dropout regularization.
      aggregate_endpoints: `bool` whether to aggregate all endpoints or only
        use the final endpoint.
kernel_initializer: kernel initializer for the dense layer.
kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to
None.
bias_regularizer: tf.keras.regularizers.Regularizer object. Default to
None.
require_endpoints: the required endpoints for prediction. If None or
empty, then only uses the final endpoint.
**kwargs: keyword arguments to be passed.
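    Example (a minimal sketch mirroring this repository's tests; the backbone
    configuration and the number of classes are illustrative):
    ```python
    backbone = backbones.ResNet3D(
        model_id=50,
        temporal_strides=[1, 1, 1, 1],
        temporal_kernel_sizes=[(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                               (1, 3, 1)])
    model = VideoClassificationModel(backbone=backbone, num_classes=1000)
    logits = model(clips)  # clips: [batch, T, H, W, 3] -> [batch, 1000]
    ```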
"""
if not input_specs:
input_specs = {
'image': layers.InputSpec(shape=[None, None, None, None, 3])
}
self._self_setattr_tracking = False
self._config_dict = {
'backbone': backbone,
'num_classes': num_classes,
'input_specs': input_specs,
'dropout_rate': dropout_rate,
'aggregate_endpoints': aggregate_endpoints,
'kernel_initializer': kernel_initializer,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'require_endpoints': require_endpoints,
}
self._input_specs = input_specs
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._backbone = backbone
inputs = {
k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items()
}
endpoints = backbone(inputs['image'])
if aggregate_endpoints:
pooled_feats = []
for endpoint in endpoints.values():
x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint)
pooled_feats.append(x_pool)
x = tf.concat(pooled_feats, axis=1)
else:
if not require_endpoints:
# Uses the last endpoint for prediction.
x = endpoints[max(endpoints.keys())]
x = tf.keras.layers.GlobalAveragePooling3D()(x)
else:
# Concats all the required endpoints for prediction.
outputs = []
for name in require_endpoints:
x = endpoints[name]
x = tf.keras.layers.GlobalAveragePooling3D()(x)
outputs.append(x)
x = tf.concat(outputs, axis=1)
x = tf.keras.layers.Dropout(dropout_rate)(x)
x = tf.keras.layers.Dense(
num_classes, kernel_initializer=kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
super(VideoClassificationModel, self).__init__(
inputs=inputs, outputs=x, **kwargs)
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
return dict(backbone=self.backbone)
@property
def backbone(self) -> tf.keras.Model:
return self._backbone
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.VideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
def test_serialize_deserialize(self):
"""Validate the classification network can be serialized and deserialized."""
model_id = 50
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes)
model = video_classification_model.VideoClassificationModel(
backbone=backbone, num_classes=1000)
config = model.get_config()
new_model = video_classification_model.VideoClassificationModel.from_config(
config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Anchor box and labeler definition."""
import collections
# Import libraries
import tensorflow as tf
from official.vision.ops import anchor_generator
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
from official.vision.utils.object_detection import balanced_positive_negative_sampler
from official.vision.utils.object_detection import box_list
from official.vision.utils.object_detection import faster_rcnn_box_coder
class Anchor(object):
"""Anchor class for anchor-based object detectors."""
def __init__(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
image_size):
"""Constructs multiscale anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
      num_scales: integer number representing intermediate scales added
        on each level. For instance, num_scales=2 adds one additional
        intermediate anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: list of float numbers representing the aspect ratios of
        anchors added on each level. Each number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: float number representing the scale of size of the base
        anchor to the feature stride 2^level.
      image_size: a list of integer numbers or Tensors representing
        [height, width] of the input image size. The image_size should be
        divisible by the largest feature stride 2^max_level.
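    Example (a minimal sketch; the parameter values are illustrative):
    ```python
    anchors = Anchor(min_level=3, max_level=7, num_scales=3,
                     aspect_ratios=[1.0], anchor_size=4,
                     image_size=(256, 256))
    boxes_l3 = anchors.multilevel_boxes['3']  # Shape [32, 32, 3 * 4].
    ```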
"""
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
self.anchor_size = anchor_size
self.image_size = image_size
self.boxes = self._generate_boxes()
def _generate_boxes(self):
"""Generates multiscale anchor boxes.
Returns:
a Tensor of shape [N, 4], representing anchor boxes of all levels
concatenated together.
"""
boxes_all = []
for level in range(self.min_level, self.max_level + 1):
boxes_l = []
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2 ** level
          intermediate_scale = 2 ** (scale / float(self.num_scales))
          base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio ** 0.5
aspect_y = aspect_ratio ** -0.5
half_anchor_size_x = base_anchor_size * aspect_x / 2.0
half_anchor_size_y = base_anchor_size * aspect_y / 2.0
x = tf.range(stride / 2, self.image_size[1], stride)
y = tf.range(stride / 2, self.image_size[0], stride)
xv, yv = tf.meshgrid(x, y)
xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
# Tensor shape Nx4.
boxes = tf.stack([yv - half_anchor_size_y, xv - half_anchor_size_x,
yv + half_anchor_size_y, xv + half_anchor_size_x],
axis=1)
boxes_l.append(boxes)
# Concat anchors on the same level to tensor shape NxAx4.
boxes_l = tf.stack(boxes_l, axis=1)
boxes_l = tf.reshape(boxes_l, [-1, 4])
boxes_all.append(boxes_l)
return tf.concat(boxes_all, axis=0)
def unpack_labels(self, labels):
"""Unpacks an array of labels into multiscales labels."""
unpacked_labels = collections.OrderedDict()
count = 0
for level in range(self.min_level, self.max_level + 1):
feat_size_y = tf.cast(self.image_size[0] / 2 ** level, tf.int32)
feat_size_x = tf.cast(self.image_size[1] / 2 ** level, tf.int32)
steps = feat_size_y * feat_size_x * self.anchors_per_location
unpacked_labels[str(level)] = tf.reshape(
labels[count:count + steps], [feat_size_y, feat_size_x, -1])
count += steps
return unpacked_labels
@property
def anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
@property
def multilevel_boxes(self):
return self.unpack_labels(self.boxes)
class AnchorLabeler(object):
"""Labeler for dense object detector."""
def __init__(self,
match_threshold=0.5,
unmatched_threshold=0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
"""
self.similarity_calc = iou_similarity.IouSimilarity()
self.target_gather = target_gather.TargetGather()
self.matcher = box_matcher.BoxMatcher(
thresholds=[unmatched_threshold, match_threshold],
indicators=[-1, -2, 1],
force_match_for_each_col=True)
self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
def label_anchors(self,
anchor_boxes,
gt_boxes,
gt_labels,
gt_attributes=None,
gt_weights=None):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
gt_attributes: If not None, a dict of (name, gt_attribute) pairs.
`gt_attribute` is a float tensor with shape [N, attribute_size]
representing groundtruth attributes.
gt_weights: If not None, a float tensor with shape [N] representing
groundtruth weights.
Returns:
cls_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
and width_l represent the dimension of bounding box regression output at
l-th level.
attribute_targets_dict: a dict with (name, attribute_targets) pairs. Each
`attribute_targets` represents an ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors_per_location * attribute_size].
The height_l and width_l represent the dimension of attribute prediction
output at l-th level.
cls_weights: A flattened Tensor with shape [batch_size, num_anchors], that
serves as masking / sample weight for classification loss. Its value
is 1.0 for positive and negative matched anchors, and 0.0 for ignored
anchors.
box_weights: A flattened Tensor with shape [batch_size, num_anchors], that
serves as masking / sample weight for regression loss. Its value is
1.0 for positive matched anchors, and 0.0 for negative and ignored
anchors.
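    Example (a minimal sketch mirroring this repository's tests; `anchor_boxes`
    is a multilevel dict such as the one produced by `build_anchor_generator`):
    ```python
    labeler = AnchorLabeler(match_threshold=0.5, unmatched_threshold=0.5)
    cls_targets, box_targets, att_targets, cls_weights, box_weights = (
        labeler.label_anchors(anchor_boxes, gt_boxes, gt_classes))
    ```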
"""
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
match_indices, match_indicators = self.matcher(similarity_matrix)
mask = tf.less_equal(match_indicators, 0)
cls_mask = tf.expand_dims(mask, -1)
cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1)
box_mask = tf.tile(cls_mask, [1, 4])
box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
att_targets = {}
if gt_attributes:
for k, v in gt_attributes.items():
att_size = v.get_shape().as_list()[-1]
att_mask = tf.tile(cls_mask, [1, att_size])
att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0)
weights = tf.squeeze(tf.ones_like(gt_labels, dtype=tf.float32), -1)
if gt_weights is not None:
weights = tf.math.multiply(weights, gt_weights)
box_weights = self.target_gather(weights, match_indices, mask)
ignore_mask = tf.equal(match_indicators, -2)
cls_weights = self.target_gather(weights, match_indices, ignore_mask)
box_targets_list = box_list.BoxList(box_targets)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)
# Unpacks labels into multi-level representations.
cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
attribute_targets_dict = {}
for k, v in att_targets.items():
attribute_targets_dict[k] = unpack_targets(v, anchor_boxes)
return cls_targets_dict, box_targets_dict, attribute_targets_dict, cls_weights, box_weights
class RpnAnchorLabeler(AnchorLabeler):
"""Labeler for Region Proposal Network."""
def __init__(self,
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5):
AnchorLabeler.__init__(self, match_threshold=match_threshold,
unmatched_threshold=unmatched_threshold)
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
def _get_rpn_samples(self, match_results):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
      match_results: An integer tensor with shape [N] representing the
matching results of anchors. (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Returns:
      score_targets: an integer tensor with shape [N].
        (1) score_targets[i]=1, the anchor is a positive sample.
        (2) score_targets[i]=0, the anchor is negative.
        (3) score_targets[i]=-1, the anchor is ignored (don't care).
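    Example (a minimal sketch; `labeler` is an `RpnAnchorLabeler` instance, the
    match results are illustrative, and the subsampling is random):
    ```python
    match_results = tf.constant([3, -1, -2], dtype=tf.int32)
    score_targets, _, _ = labeler._get_rpn_samples(match_results)
    # score_targets[i] is 1 (positive), 0 (negative), or -1 (ignored).
    ```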
"""
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=self._rpn_fg_fraction, is_static=False))
# indicator includes both positive and negative labels.
    # labels includes only positive labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator = tf.greater(match_results, -2)
labels = tf.greater(match_results, -1)
samples = sampler.subsample(
indicator, self._rpn_batch_size_per_im, labels)
positive_labels = tf.where(
tf.logical_and(samples, labels),
tf.constant(2, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
negative_labels = tf.where(
tf.logical_and(samples, tf.logical_not(labels)),
tf.constant(1, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
ignore_labels = tf.fill(match_results.shape, -1)
return (ignore_labels + positive_labels + negative_labels,
positive_labels, negative_labels)
def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
"""
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
match_indices, match_indicators = self.matcher(similarity_matrix)
box_mask = tf.tile(tf.expand_dims(tf.less_equal(match_indicators, 0), -1),
[1, 4])
box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
box_targets_list = box_list.BoxList(box_targets)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)
# Zero out the unmatched and ignored regression targets.
num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0]
unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32)
matched_anchors_mask = tf.greater_equal(match_indicators, 0)
# To broadcast matched_anchors_mask to the same shape as
# matched_reg_targets.
matched_anchors_mask = tf.tile(
tf.expand_dims(matched_anchors_mask, 1),
[1, tf.shape(box_targets)[1]])
box_targets = tf.where(matched_anchors_mask, box_targets,
unmatched_ignored_box_targets)
# score_targets contains the subsampled positive and negative anchors.
score_targets, _, _ = self._get_rpn_samples(match_indicators)
# Unpacks labels.
score_targets_dict = unpack_targets(score_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
return score_targets_dict, box_targets_dict
def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
anchor_size):
"""Build anchor generator from levels."""
anchor_sizes = collections.OrderedDict()
strides = collections.OrderedDict()
scales = []
for scale in range(num_scales):
scales.append(2**(scale / float(num_scales)))
for level in range(min_level, max_level + 1):
stride = 2**level
strides[str(level)] = stride
anchor_sizes[str(level)] = anchor_size * stride
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=scales,
aspect_ratios=aspect_ratios,
strides=strides)
return anchor_gen
def unpack_targets(targets, anchor_boxes_dict):
"""Unpacks an array of labels into multiscales labels."""
unpacked_targets = collections.OrderedDict()
count = 0
for level, anchor_boxes in anchor_boxes_dict.items():
feat_size_shape = anchor_boxes.shape.as_list()
feat_size_y = feat_size_shape[0]
feat_size_x = feat_size_shape[1]
anchors_per_location = int(feat_size_shape[2] / 4)
steps = feat_size_y * feat_size_x * anchors_per_location
unpacked_targets[level] = tf.reshape(targets[count:count + steps],
[feat_size_y, feat_size_x, -1])
count += steps
return unpacked_targets
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi scale anchor generator definition."""
import tensorflow as tf
# (TODO/tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
"""Utility to generate anchors for a single feature map.
Example:
```python
  anchor_gen = _SingleAnchorGenerator(32, [1.], [.5, 1., 2.], stride=16)
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_size,
scales,
aspect_ratios,
stride,
clip_boxes=False):
"""Constructs single scale anchor.
Args:
      anchor_size: A single int representing the base anchor size. The anchor
        height will be `anchor_size / sqrt(aspect_ratio)`, and the anchor width
        will be `anchor_size * sqrt(aspect_ratio)`.
      scales: A list/tuple, or a list/tuple of a list/tuple, of positive
        floats representing the actual anchor size relative to the base
        `anchor_size`.
      aspect_ratios: a list/tuple of positive floats representing the ratio of
        anchor width to anchor height.
      stride: A single int representing the anchor stride size between the
        centers of adjacent anchors.
      clip_boxes: Boolean representing whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.
    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors, `[H / stride, W / stride, K * 4]`
"""
self.anchor_size = anchor_size
self.scales = scales
self.aspect_ratios = aspect_ratios
self.stride = stride
self.clip_boxes = clip_boxes
def __call__(self, image_size):
image_height = tf.cast(image_size[0], tf.float32)
image_width = tf.cast(image_size[1], tf.float32)
k = len(self.scales) * len(self.aspect_ratios)
aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
anchor_size = tf.cast(self.anchor_size, tf.float32)
# [K]
anchor_heights = []
anchor_widths = []
for scale in self.scales:
anchor_size_t = anchor_size * scale
anchor_height = anchor_size_t / aspect_ratios_sqrt
anchor_width = anchor_size_t * aspect_ratios_sqrt
anchor_heights.append(anchor_height)
anchor_widths.append(anchor_width)
anchor_heights = tf.concat(anchor_heights, axis=0)
anchor_widths = tf.concat(anchor_widths, axis=0)
half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])
stride = tf.cast(self.stride, tf.float32)
# [W]
cx = tf.range(0.5 * stride, image_width, stride)
# [H]
cy = tf.range(0.5 * stride, image_height, stride)
# [H, W]
cx_grid, cy_grid = tf.meshgrid(cx, cy)
# [H, W, 1]
cx_grid = tf.expand_dims(cx_grid, axis=-1)
cy_grid = tf.expand_dims(cy_grid, axis=-1)
# [H, W, K, 1]
y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)
if self.clip_boxes:
y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)
# [H, W, K, 4]
result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
shape = result.shape.as_list()
# [H, W, K * 4]
return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])
class AnchorGenerator():
"""Utility to generate anchors for a multiple feature maps.
Example:
```python
  anchor_gen = AnchorGenerator([32, 64], scales=[1.],
                               aspect_ratios=[.5, 1., 2.], strides=[16, 32])
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_sizes,
scales,
aspect_ratios,
strides,
clip_boxes=False):
"""Constructs multiscale anchors.
Args:
      anchor_sizes: A list of ints representing the anchor size for each scale.
        The anchor height will be `anchor_size / sqrt(aspect_ratio)`, and the
        anchor width will be `anchor_size * sqrt(aspect_ratio)` for each scale.
      scales: A list/tuple, or a list/tuple of a list/tuple, of positive
        floats representing the actual anchor size relative to the base
        `anchor_size`.
      aspect_ratios: A list/tuple, or a list/tuple of a list/tuple, of positive
        floats representing the ratio of anchor width to anchor height.
      strides: A list/tuple of ints representing the anchor stride size between
        the centers of anchors at each scale.
      clip_boxes: Boolean representing whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.
    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors for each level, `[H / stride, W / stride,
      K * 4]`
"""
# aspect_ratio is a single list that is the same across all levels.
aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
if isinstance(anchor_sizes, dict):
self.anchor_generators = {}
for k in anchor_sizes.keys():
self.anchor_generators[k] = _SingleAnchorGenerator(
anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
clip_boxes)
elif isinstance(anchor_sizes, (list, tuple)):
self.anchor_generators = []
for anchor_size, scale_list, ar_list, stride in zip(
anchor_sizes, scales, aspect_ratios, strides):
self.anchor_generators.append(
_SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
clip_boxes))
def __call__(self, image_size):
anchor_generators = tf.nest.flatten(self.anchor_generators)
results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
return tf.nest.pack_sequence_as(self.anchor_generators, results)
def maybe_map_structure_for_anchor(params, anchor_sizes):
"""broadcast the params to match anchor_sizes."""
if all(isinstance(param, (int, float)) for param in params):
if isinstance(anchor_sizes, (tuple, list)):
return [params] * len(anchor_sizes)
elif isinstance(anchor_sizes, dict):
return tf.nest.map_structure(lambda _: params, anchor_sizes)
else:
raise ValueError("the structure of `anchor_sizes` must be a tuple, "
"list, or dict, given {}".format(anchor_sizes))
else:
return params
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for anchor_generator.py."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.ops import anchor_generator
class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
# # Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25],
[[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
)
def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=False)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
[[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
# # Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25
], [[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
)
def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=True)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [1.0], {
'5': [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]],
'6': [[[-32, -32, 96, 96]]]
}),)
def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels)
strides = dict((str(level), 2**level) for level in levels)
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
for k in expected_boxes.keys():
self.assertAllClose(expected_boxes[k], anchors[k].numpy())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for anchor.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.ops import anchor
class AnchorTest(parameterized.TestCase, tf.test.TestCase):
# The set of parameters are tailored for the MLPerf configuration, where
# the number of anchors is 495132, rpn_batch_size_per_im=256, and
# rpn_fg_fraction=0.5.
@parameterized.parameters(
(512, 25, 25, 25, 25, (512, 512)),
(512, 25, 25, 25, 25, (512, 640)),
(512, 25, 25, 25, 25, (640, 512)),
(495132, 100, 100, 100, 100, (512, 512)),
(495132, 200, 100, 128, 100, (512, 512)),
(495132, 100, 120, 100, 120, (512, 512)),
(495132, 100, 200, 100, 156, (512, 512)),
(495132, 200, 200, 128, 128, (512, 512)),
)
def testAnchorRpnSample(self, num_anchors, num_positives,
num_negatives, expected_positives,
expected_negatives, image_size):
match_results_np = np.empty([num_anchors])
match_results_np.fill(-2)
match_results_np[:num_positives] = 0
match_results_np[num_positives:num_positives + num_negatives] = -1
match_results = tf.convert_to_tensor(value=match_results_np, dtype=tf.int32)
anchor_labeler = anchor.RpnAnchorLabeler(
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5)
rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
labels = [v.numpy() for v in rpn_sample_op]
self.assertLen(labels[0], num_anchors)
positives = np.sum(np.array(labels[0]) == 1)
negatives = np.sum(np.array(labels[0]) == 0)
self.assertEqual(positives, expected_positives)
self.assertEqual(negatives, expected_negatives)
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# # Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGeneration(self, min_level, max_level, num_scales,
aspect_ratios, anchor_size, expected_boxes):
image_size = [64, 64]
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# # Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGenerationWithImageSizeAsTensor(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
expected_boxes):
image_size = tf.constant([64, 64], tf.int32)
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
(3, 6, 2, [1.0], 2.0, False),
(3, 6, 2, [1.0], 2.0, True),
)
def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
anchor_size, has_attribute):
input_size = [512, 512]
ground_truth_class_id = 2
attribute_name = 'depth'
ground_truth_depth = 3.0
# The matched anchors are the anchors used as ground truth and the anchors
# at the next octave scale on the same location.
expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
anchor_gen = anchor.build_anchor_generator(min_level, max_level, num_scales,
aspect_ratios, anchor_size)
anchor_boxes = anchor_gen(input_size)
anchor_labeler = anchor.AnchorLabeler()
# Uses the first anchors as ground truth. The ground truth should map to
# two anchors with two intermediate scales at the same location.
gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]
gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
gt_attributes = {
attribute_name: tf.constant([[ground_truth_depth]], dtype=tf.float32)
} if has_attribute else {}
(cls_targets, box_targets, att_targets, _,
box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes,
gt_classes, gt_attributes)
for k, v in cls_targets.items():
cls_targets[k] = v.numpy()
for k, v in box_targets.items():
box_targets[k] = v.numpy()
box_weights = box_weights.numpy()
anchor_locations = np.vstack(
np.where(cls_targets[str(min_level)] > -1)).transpose()
self.assertAllClose(expected_anchor_locations, anchor_locations)
# Two anchor boxes on min_level got matched to the gt_boxes.
self.assertAllClose(tf.reduce_sum(box_weights), 2)
if has_attribute:
self.assertIn(attribute_name, att_targets)
for k, v in att_targets[attribute_name].items():
att_targets[attribute_name][k] = v.numpy()
anchor_locations = np.vstack(
np.where(
att_targets[attribute_name][str(min_level)] > 0.0)).transpose()
self.assertAllClose(expected_anchor_locations, anchor_locations)
else:
self.assertEmpty(att_targets)
@parameterized.parameters(
(3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
(3, 8, [1.], 3, 32, (512, 512)),
(3, 3, [1.], 2, 4, (32, 32)),
)
def testEquivalentResult(self, min_level, max_level, aspect_ratios,
num_scales, anchor_size, image_size):
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
anchors = anchor_gen(image_size)
expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
aspect_ratios, anchor_size, image_size)
expected_anchors = expected_anchor_gen.multilevel_boxes
for k in expected_anchors.keys():
self.assertAllClose(expected_anchors[k], anchors[k])
if __name__ == '__main__':
tf.test.main()