Internal change

PiperOrigin-RevId: 431756117

Internal change
PiperOrigin-RevId: 431756117
c8e6faf7 · A. Unique TensorFlower · 13a5e4fb · c8e6faf7 · c8e6faf7 · c8e6faf7
Commit c8e6faf7 authored Mar 01, 2022 by A. Unique TensorFlower
20 changed files
--- a/official/vision/modeling/decoders/factory_test.py
+++ b/official/vision/modeling/decoders/factory_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for decoder factory functions."""
+
+from absl.testing import parameterized
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from official.vision import configs
+from official.vision.configs import decoders as decoders_cfg
+from official.vision.modeling import decoders
+from official.vision.modeling.decoders import factory
+
+
+class FactoryTest(tf.test.TestCase, parameterized.TestCase):
+  """Checks that `factory.build_decoder` agrees with directly built decoders."""
+
+  @combinations.generate(
+      combinations.combine(
+          num_filters=[128, 256], use_separable_conv=[True, False]))
+  def test_fpn_decoder_creation(self, num_filters, use_separable_conv):
+    """A factory-built FPN reports the same config as a hand-built one."""
+    min_level, max_level = 3, 7
+    # Specs cover levels min_level .. max_level - 1 (upper bound exclusive).
+    input_specs = {
+        str(level): tf.TensorShape(
+            [1, 128 // (2**level), 128 // (2**level), 3])
+        for level in range(min_level, max_level)
+    }
+
+    reference_decoder = decoders.FPN(
+        input_specs=input_specs,
+        num_filters=num_filters,
+        use_separable_conv=use_separable_conv,
+        use_sync_bn=True)
+
+    model_config = configs.retinanet.RetinaNet()
+    model_config.min_level = min_level
+    model_config.max_level = max_level
+    model_config.num_classes = 10
+    model_config.input_size = [None, None, 3]
+    fpn_config = decoders_cfg.FPN(
+        num_filters=num_filters, use_separable_conv=use_separable_conv)
+    model_config.decoder = decoders_cfg.Decoder(type='fpn', fpn=fpn_config)
+
+    factory_decoder = factory.build_decoder(
+        input_specs=input_specs, model_config=model_config)
+
+    self.assertEqual(reference_decoder.get_config(),
+                     factory_decoder.get_config())
+
+  @combinations.generate(
+      combinations.combine(
+          num_filters=[128, 256],
+          num_repeats=[3, 5],
+          use_separable_conv=[True, False]))
+  def test_nasfpn_decoder_creation(self, num_filters, num_repeats,
+                                   use_separable_conv):
+    """A factory-built NASFPN reports the same config as a hand-built one."""
+    min_level, max_level = 3, 7
+    # Specs cover levels min_level .. max_level - 1 (upper bound exclusive).
+    input_specs = {
+        str(level): tf.TensorShape(
+            [1, 128 // (2**level), 128 // (2**level), 3])
+        for level in range(min_level, max_level)
+    }
+
+    reference_decoder = decoders.NASFPN(
+        input_specs=input_specs,
+        num_filters=num_filters,
+        num_repeats=num_repeats,
+        use_separable_conv=use_separable_conv,
+        use_sync_bn=True)
+
+    model_config = configs.retinanet.RetinaNet()
+    model_config.min_level = min_level
+    model_config.max_level = max_level
+    model_config.num_classes = 10
+    model_config.input_size = [None, None, 3]
+    nasfpn_config = decoders_cfg.NASFPN(
+        num_filters=num_filters,
+        num_repeats=num_repeats,
+        use_separable_conv=use_separable_conv)
+    model_config.decoder = decoders_cfg.Decoder(
+        type='nasfpn', nasfpn=nasfpn_config)
+
+    factory_decoder = factory.build_decoder(
+        input_specs=input_specs, model_config=model_config)
+
+    self.assertEqual(reference_decoder.get_config(),
+                     factory_decoder.get_config())
+
+  @combinations.generate(
+      combinations.combine(
+          level=[3, 4],
+          dilation_rates=[[6, 12, 18], [6, 12]],
+          num_filters=[128, 256]))
+  def test_aspp_decoder_creation(self, level, dilation_rates, num_filters):
+    """A factory-built ASPP matches a hand-built one, ignoring the name."""
+    input_specs = {'1': tf.TensorShape([1, 128, 128, 3])}
+
+    reference_decoder = decoders.ASPP(
+        level=level,
+        dilation_rates=dilation_rates,
+        num_filters=num_filters,
+        use_sync_bn=True)
+
+    model_config = configs.semantic_segmentation.SemanticSegmentationModel()
+    model_config.num_classes = 10
+    model_config.input_size = [None, None, 3]
+    aspp_config = decoders_cfg.ASPP(
+        level=level, dilation_rates=dilation_rates, num_filters=num_filters)
+    model_config.decoder = decoders_cfg.Decoder(type='aspp', aspp=aspp_config)
+
+    factory_decoder = factory.build_decoder(
+        input_specs=input_specs, model_config=model_config)
+
+    expected_config = reference_decoder.get_config()
+    actual_config = factory_decoder.get_config()
+    # The ASPP layer calls `super().get_config()`, so the two instances differ
+    # only in their names. Copy the name across so that difference does not
+    # raise a false alarm.
+    actual_config['name'] = expected_config['name']
+
+    self.assertEqual(expected_config, actual_config)
+
+  def test_identity_decoder_creation(self):
+    """The `identity` decoder type makes the factory return None."""
+    model_config = configs.retinanet.RetinaNet()
+    model_config.num_classes = 2
+    model_config.input_size = [None, None, 3]
+    model_config.decoder = decoders_cfg.Decoder(
+        type='identity', identity=decoders_cfg.Identity())
+
+    built = factory.build_decoder(input_specs=None, model_config=model_config)
+
+    self.assertIsNone(built)
+
+
+# Run this module's test cases when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/decoders/fpn.py
+++ b/official/vision/modeling/decoders/fpn.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains the definitions of Feature Pyramid Networks (FPN)."""
+from typing import Any, Mapping, Optional
+
+# Import libraries
+from absl import logging
+import tensorflow as tf
+
+from official.modeling import hyperparams
+from official.modeling import tf_utils
+from official.vision.modeling.decoders import factory
+from official.vision.ops import spatial_transform_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class FPN(tf.keras.Model):
+  """Creates a Feature Pyramid Network (FPN).
+
+  This implements the paper:
+  Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan, and
+  Serge Belongie.
+  Feature Pyramid Networks for Object Detection.
+  (https://arxiv.org/pdf/1612.03144)
+  """
+
+  def __init__(
+      self,
+      input_specs: Mapping[str, tf.TensorShape],
+      min_level: int = 3,
+      max_level: int = 7,
+      num_filters: int = 256,
+      fusion_type: str = 'sum',
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_initializer: str = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a Feature Pyramid Network (FPN).
+
+    Args:
+      input_specs: A `dict` of input specifications. A dictionary consists of
+        {level: TensorShape} from a backbone. Levels are given as strings
+        holding integers (e.g. '3').
+      min_level: An `int` of minimum level in FPN output feature maps.
+      max_level: An `int` of maximum level in FPN output feature maps.
+      num_filters: An `int` number of filters in FPN layers.
+      fusion_type: A `str` of `sum` or `concat`. Whether performing sum or
+        concat for feature fusion.
+      use_separable_conv: A `bool`.  If True use separable convolution for
+        convolution in FPN layers.
+      activation: A `str` name of the activation function. It is applied to the
+        input of the extra coarse levels beyond the first one.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A `str` name of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+
+    Raises:
+      ValueError: If the lowest backbone level is above `min_level`, or if an
+        unsupported `fusion_type` is met while building the top-down path.
+    """
+    # Kept verbatim so `get_config`/`from_config` can rebuild the model.
+    self._config_dict = {
+        'input_specs': input_specs,
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_filters': num_filters,
+        'fusion_type': fusion_type,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_initializer': kernel_initializer,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+    if use_separable_conv:
+      conv2d = tf.keras.layers.SeparableConv2D
+    else:
+      conv2d = tf.keras.layers.Conv2D
+    if use_sync_bn:
+      norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      norm = tf.keras.layers.BatchNormalization
+    activation_fn = tf.keras.layers.Activation(
+        tf_utils.get_activation(activation))
+
+    # Build input feature pyramid.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      bn_axis = -1
+    else:
+      bn_axis = 1
+
+    # Get input feature pyramid from backbone.
+    logging.info('FPN input_specs: %s', input_specs)
+    inputs = self._build_input_pyramid(input_specs, min_level)
+    # Level keys are strings, so take the maximum numerically: a plain string
+    # `max` would rank '9' above '10'.
+    backbone_max_level = min(max(int(level) for level in inputs), max_level)
+
+    # Build lateral connections.
+    feats_lateral = {}
+    for level in range(min_level, backbone_max_level + 1):
+      feats_lateral[str(level)] = conv2d(
+          filters=num_filters,
+          kernel_size=1,
+          padding='same',
+          kernel_initializer=kernel_initializer,
+          kernel_regularizer=kernel_regularizer,
+          bias_regularizer=bias_regularizer)(
+              inputs[str(level)])
+
+    # Build top-down path.
+    feats = {str(backbone_max_level): feats_lateral[str(backbone_max_level)]}
+    for level in range(backbone_max_level - 1, min_level - 1, -1):
+      feat_a = spatial_transform_ops.nearest_upsampling(
+          feats[str(level + 1)], 2)
+      feat_b = feats_lateral[str(level)]
+
+      if fusion_type == 'sum':
+        feats[str(level)] = feat_a + feat_b
+      elif fusion_type == 'concat':
+        feats[str(level)] = tf.concat([feat_a, feat_b], axis=-1)
+      else:
+        raise ValueError('Fusion type {} not supported.'.format(fusion_type))
+
+    # TODO(xianzhi): consider to remove bias in conv2d.
+    # Build post-hoc 3x3 convolution kernel.
+    for level in range(min_level, backbone_max_level + 1):
+      feats[str(level)] = conv2d(
+          filters=num_filters,
+          strides=1,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=kernel_initializer,
+          kernel_regularizer=kernel_regularizer,
+          bias_regularizer=bias_regularizer)(
+              feats[str(level)])
+
+    # TODO(xianzhi): consider to remove bias in conv2d.
+    # Build coarser FPN levels introduced for RetinaNet.
+    for level in range(backbone_max_level + 1, max_level + 1):
+      feats_in = feats[str(level - 1)]
+      # The first extra level consumes the previous level as is; later extra
+      # levels get the activation applied to their input first.
+      if level > backbone_max_level + 1:
+        feats_in = activation_fn(feats_in)
+      feats[str(level)] = conv2d(
+          filters=num_filters,
+          strides=2,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=kernel_initializer,
+          kernel_regularizer=kernel_regularizer,
+          bias_regularizer=bias_regularizer)(
+              feats_in)
+
+    # Apply batch norm layers.
+    for level in range(min_level, max_level + 1):
+      feats[str(level)] = norm(
+          axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
+              feats[str(level)])
+
+    self._output_specs = {
+        str(level): feats[str(level)].get_shape()
+        for level in range(min_level, max_level + 1)
+    }
+
+    super(FPN, self).__init__(inputs=inputs, outputs=feats, **kwargs)
+
+  def _build_input_pyramid(self, input_specs: Mapping[str, tf.TensorShape],
+                           min_level: int):
+    """Creates one `tf.keras.Input` per backbone level.
+
+    Args:
+      input_specs: A `dict` of {level: TensorShape}; each level key is a string
+        holding an integer.
+      min_level: An `int` of the minimum FPN level.
+
+    Returns:
+      A `dict` of {level: tf.keras.Input} with the same keys as `input_specs`.
+      Each input shape is the spec without its leading (batch) dimension.
+
+    Raises:
+      ValueError: If the lowest level in `input_specs` is above `min_level`.
+    """
+    assert isinstance(input_specs, dict)
+    # Compare levels as integers. Comparing the string keys would order them
+    # lexicographically (e.g. '10' < '3') and mis-validate two-digit levels.
+    if min(int(level) for level in input_specs) > min_level:
+      raise ValueError(
+          'Backbone min level should be less or equal to FPN min level')
+
+    inputs = {}
+    for level, spec in input_specs.items():
+      inputs[level] = tf.keras.Input(shape=spec[1:])
+    return inputs
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the constructor arguments recorded at initialization."""
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds an `FPN` from `config`; `custom_objects` is not used."""
+    return cls(**config)
+
+  @property
+  def output_specs(self) -> Mapping[str, tf.TensorShape]:
+    """A dict of {level: TensorShape} pairs for the model output."""
+    return self._output_specs
+
+
+@factory.register_decoder_builder('fpn')
+def build_fpn_decoder(
+    input_specs: Mapping[str, tf.TensorShape],
+    model_config: hyperparams.Config,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Constructs an `FPN` decoder from a model config.
+
+  Args:
+    input_specs: A `dict` of {level: TensorShape} describing the backbone
+      outputs.
+    model_config: A OneOfConfig. Model config. Its `decoder`, `min_level`,
+      `max_level` and `norm_activation` fields are read.
+    l2_regularizer: A `tf.keras.regularizers.Regularizer` instance used as the
+      kernel regularizer. Defaults to None.
+
+  Returns:
+    A `tf.keras.Model` instance of the FPN decoder.
+
+  Raises:
+    ValueError: If the model_config.decoder.type is not `fpn`.
+  """
+  requested_type = model_config.decoder.type
+  fpn_cfg = model_config.decoder.get()
+  if requested_type != 'fpn':
+    raise ValueError(f'Inconsistent decoder type {requested_type}. '
+                     'Need to be `fpn`.')
+  norm_act = model_config.norm_activation
+  fpn_kwargs = dict(
+      input_specs=input_specs,
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_filters=fpn_cfg.num_filters,
+      fusion_type=fpn_cfg.fusion_type,
+      use_separable_conv=fpn_cfg.use_separable_conv,
+      activation=norm_act.activation,
+      use_sync_bn=norm_act.use_sync_bn,
+      norm_momentum=norm_act.norm_momentum,
+      norm_epsilon=norm_act.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+  return FPN(**fpn_kwargs)
--- a/official/vision/modeling/decoders/fpn_test.py
+++ b/official/vision/modeling/decoders/fpn_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for FPN."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.modeling.backbones import mobilenet
+from official.vision.modeling.backbones import resnet
+from official.vision.modeling.decoders import fpn
+
+
+class FPNTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape and serialization checks for `fpn.FPN`."""
+
+  @parameterized.parameters(
+      (256, 3, 7, False, 'sum'),
+      (256, 3, 7, True, 'concat'),
+  )
+  def test_network_creation(self, input_size, min_level, max_level,
+                            use_separable_conv, fusion_type):
+    """FPN on a ResNet-50 backbone yields every level with 256 channels."""
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    images = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+
+    resnet_backbone = resnet.ResNet(model_id=50)
+    decoder = fpn.FPN(
+        input_specs=resnet_backbone.output_specs,
+        min_level=min_level,
+        max_level=max_level,
+        fusion_type=fusion_type,
+        use_separable_conv=use_separable_conv)
+
+    pyramid = decoder(resnet_backbone(images))
+
+    for level in range(min_level, max_level + 1):
+      key = str(level)
+      side = input_size // 2**level
+      self.assertIn(key, pyramid)
+      self.assertAllEqual([1, side, side, 256], pyramid[key].shape.as_list())
+
+  @parameterized.parameters(
+      (256, 3, 7, False),
+      (256, 3, 7, True),
+  )
+  def test_network_creation_with_mobilenet(self, input_size, min_level,
+                                           max_level, use_separable_conv):
+    """FPN on a MobileNetV2 backbone yields every level with 256 channels."""
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    images = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+
+    mobilenet_backbone = mobilenet.MobileNet(model_id='MobileNetV2')
+    decoder = fpn.FPN(
+        input_specs=mobilenet_backbone.output_specs,
+        min_level=min_level,
+        max_level=max_level,
+        use_separable_conv=use_separable_conv)
+
+    pyramid = decoder(mobilenet_backbone(images))
+
+    for level in range(min_level, max_level + 1):
+      key = str(level)
+      side = input_size // 2**level
+      self.assertIn(key, pyramid)
+      self.assertAllEqual([1, side, side, 256], pyramid[key].shape.as_list())
+
+  def test_serialize_deserialize(self):
+    """`get_config` round-trips through `from_config` and JSON export."""
+    # Spell out every constructor option so the config is fully exercised.
+    init_args = dict(
+        input_specs=resnet.ResNet(model_id=50).output_specs,
+        min_level=3,
+        max_level=7,
+        num_filters=256,
+        fusion_type='sum',
+        use_separable_conv=False,
+        use_sync_bn=False,
+        activation='relu',
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_initializer='VarianceScaling',
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    original = fpn.FPN(**init_args)
+
+    self.assertEqual(original.get_config(), dict(init_args))
+
+    # Rebuild a second network purely from the first one's config.
+    restored = fpn.FPN.from_config(original.get_config())
+
+    # The rebuilt network must be exportable to JSON.
+    restored.to_json()
+
+    # A successful round trip leaves the two configs identical.
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+
+# Run this module's test cases when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/decoders/nasfpn.py
+++ b/official/vision/modeling/decoders/nasfpn.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of NAS-FPN."""
+
+from typing import Any, List, Mapping, Optional, Tuple
+
+# Import libraries
+
+from absl import logging
+import tensorflow as tf
+
+from official.modeling import hyperparams
+from official.modeling import tf_utils
+from official.vision.modeling.decoders import factory
+from official.vision.ops import spatial_transform_ops
+
+
+# The fixed NAS-FPN architecture discovered by NAS.
+# Each element represents a specification of a building block:
+#   (block_level, combine_fn, (input_offset0, input_offset1), is_output).
+# The fields are passed positionally to `BlockSpec` by `build_block_specs`.
+# `combine_fn` is either 'sum' or 'attention'. Each input offset indexes the
+# running feature list used by `NASFPN._build_feature_pyramid`: the input
+# pyramid levels come first (lowest level at offset 0), followed by the output
+# of every earlier block in order.
+NASFPN_BLOCK_SPECS = [
+    (4, 'attention', (1, 3), False),
+    (4, 'sum', (1, 5), False),
+    (3, 'sum', (0, 6), True),
+    (4, 'sum', (6, 7), True),
+    (5, 'attention', (7, 8), True),
+    (7, 'attention', (6, 9), True),
+    (6, 'attention', (9, 10), True),
+]
+
+
+class BlockSpec:
+  """Plain record describing one NAS-FPN building block.
+
+  Attributes:
+    level: An `int` feature level that the block produces.
+    combine_fn: A `str` naming how the two inputs are merged.
+    input_offsets: A pair of `int` indices selecting the block's two inputs.
+    is_output: A `bool` flag marking the block as an output block.
+  """
+
+  def __init__(self, level: int, combine_fn: str,
+               input_offsets: Tuple[int, int], is_output: bool):
+    """Stores the four specification fields unchanged."""
+    self.level, self.combine_fn = level, combine_fn
+    self.input_offsets, self.is_output = input_offsets, is_output
+
+
+def build_block_specs(
+    block_specs: Optional[List[Tuple[Any, ...]]] = None) -> List[BlockSpec]:
+  """Converts raw block tuples into `BlockSpec` objects.
+
+  Args:
+    block_specs: An optional list of tuples, each unpacked positionally into
+      `BlockSpec`. When it is None or empty, `NASFPN_BLOCK_SPECS` is used.
+
+  Returns:
+    A list of `BlockSpec` objects, one per tuple, in the same order.
+  """
+  raw_specs = block_specs or NASFPN_BLOCK_SPECS
+  logging.info('Building NAS-FPN block specs: %s', raw_specs)
+  built = []
+  for fields in raw_specs:
+    built.append(BlockSpec(*fields))
+  return built
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class NASFPN(tf.keras.Model):
+  """Creates a NAS-FPN model.
+
+  This implements the paper:
+  Golnaz Ghiasi, Tsung-Yi Lin, Ruoming Pang, Quoc V. Le.
+  NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection.
+  (https://arxiv.org/abs/1904.07392)
+  """
+
+  def __init__(
+      self,
+      input_specs: Mapping[str, tf.TensorShape],
+      min_level: int = 3,
+      max_level: int = 7,
+      block_specs: List[BlockSpec] = build_block_specs(),
+      num_filters: int = 256,
+      num_repeats: int = 5,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_initializer: str = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a NAS-FPN model.
+
+    Args:
+      input_specs: A `dict` of input specifications. A dictionary consists of
+        {level: TensorShape} from a backbone.
+      min_level: An `int` of minimum level in FPN output feature maps.
+      max_level: An `int` of maximum level in FPN output feature maps.
+      block_specs: a list of BlockSpec objects that specifies the NAS-FPN
+        network topology. By default, the previously discovered architecture is
+        used.
+      num_filters: An `int` number of filters in FPN layers.
+      num_repeats: number of repeats for feature pyramid network.
+      use_separable_conv: A `bool`.  If True use separable convolution for
+        convolution in FPN layers.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A `str` name of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    self._config_dict = {
+        'input_specs': input_specs,
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_filters': num_filters,
+        'num_repeats': num_repeats,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_initializer': kernel_initializer,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+    self._min_level = min_level
+    self._max_level = max_level
+    self._block_specs = block_specs
+    self._num_repeats = num_repeats
+    self._conv_op = (tf.keras.layers.SeparableConv2D
+                     if self._config_dict['use_separable_conv']
+                     else tf.keras.layers.Conv2D)
+    if self._config_dict['use_separable_conv']:
+      self._conv_kwargs = {
+          'depthwise_initializer': tf.keras.initializers.VarianceScaling(
+              scale=2, mode='fan_out', distribution='untruncated_normal'),
+          'pointwise_initializer': tf.keras.initializers.VarianceScaling(
+              scale=2, mode='fan_out', distribution='untruncated_normal'),
+          'bias_initializer': tf.zeros_initializer(),
+          'depthwise_regularizer': self._config_dict['kernel_regularizer'],
+          'pointwise_regularizer': self._config_dict['kernel_regularizer'],
+          'bias_regularizer': self._config_dict['bias_regularizer'],
+      }
+    else:
+      self._conv_kwargs = {
+          'kernel_initializer': tf.keras.initializers.VarianceScaling(
+              scale=2, mode='fan_out', distribution='untruncated_normal'),
+          'bias_initializer': tf.zeros_initializer(),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+          'bias_regularizer': self._config_dict['bias_regularizer'],
+      }
+    self._norm_op = (tf.keras.layers.experimental.SyncBatchNormalization
+                     if self._config_dict['use_sync_bn']
+                     else tf.keras.layers.BatchNormalization)
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._norm_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+    self._activation = tf_utils.get_activation(activation)
+
+    # Gets input feature pyramid from backbone.
+    inputs = self._build_input_pyramid(input_specs, min_level)
+
+    # Projects the input features.
+    feats = []
+    for level in range(self._min_level, self._max_level + 1):
+      if str(level) in inputs.keys():
+        feats.append(self._resample_feature_map(
+            inputs[str(level)], level, level, self._config_dict['num_filters']))
+      else:
+        feats.append(self._resample_feature_map(
+            feats[-1], level - 1, level, self._config_dict['num_filters']))
+
+    # Repeatly builds the NAS-FPN modules.
+    for _ in range(self._num_repeats):
+      output_feats = self._build_feature_pyramid(feats)
+      feats = [output_feats[level]
+               for level in range(self._min_level, self._max_level + 1)]
+
+    self._output_specs = {
+        str(level): output_feats[level].get_shape()
+        for level in range(min_level, max_level + 1)
+    }
+    output_feats = {str(level): output_feats[level]
+                    for level in output_feats.keys()}
+    super(NASFPN, self).__init__(inputs=inputs, outputs=output_feats, **kwargs)
+
+  def _build_input_pyramid(self, input_specs: Mapping[str, tf.TensorShape],
+                           min_level: int):
+    assert isinstance(input_specs, dict)
+    if min(input_specs.keys()) > str(min_level):
+      raise ValueError(
+          'Backbone min level should be less or equal to FPN min level')
+
+    inputs = {}
+    for level, spec in input_specs.items():
+      inputs[level] = tf.keras.Input(shape=spec[1:])
+    return inputs
+
+  def _resample_feature_map(self,
+                            inputs,
+                            input_level,
+                            target_level,
+                            target_num_filters=256):
+    x = inputs
+    _, _, _, input_num_filters = x.get_shape().as_list()
+    if input_num_filters != target_num_filters:
+      x = self._conv_op(
+          filters=target_num_filters,
+          kernel_size=1,
+          padding='same',
+          **self._conv_kwargs)(x)
+      x = self._norm_op(**self._norm_kwargs)(x)
+
+    if input_level < target_level:
+      stride = int(2 ** (target_level - input_level))
+      return tf.keras.layers.MaxPool2D(
+          pool_size=stride, strides=stride, padding='same')(x)
+    if input_level > target_level:
+      scale = int(2 ** (input_level - target_level))
+      return spatial_transform_ops.nearest_upsampling(x, scale=scale)
+
+    # Force output x to be the same dtype as mixed precision policy. This avoids
+    # dtype mismatch when one input (by default float32 dtype) does not meet all
+    # the above conditions and is output unchanged, while other inputs are
+    # processed to have different dtype, e.g., using bfloat16 on TPU.
+    compute_dtype = tf.keras.layers.Layer().dtype_policy.compute_dtype
+    if (compute_dtype is not None) and (x.dtype != compute_dtype):
+      return tf.cast(x, dtype=compute_dtype)
+    else:
+      return x
+
+  def _global_attention(self, feat0, feat1):
+    m = tf.math.reduce_max(feat0, axis=[1, 2], keepdims=True)
+    m = tf.math.sigmoid(m)
+    return feat0 + feat1 * m
+
+  def _build_feature_pyramid(self, feats):
+    """Applies every block in `self._block_specs` and collects the outputs.
+
+    Each block resamples two earlier feature maps to the block's level,
+    combines them, applies activation, a 3x3 convolution and normalization,
+    and appends the result to `feats` so later blocks can consume it by
+    offset.
+
+    Args:
+      feats: A list of feature maps, extended in place. It is assumed to hold
+        one entry per level from `self._min_level` to `self._max_level` in
+        ascending order, since `feat_levels` is initialized that way.
+
+    Returns:
+      A dict mapping level to feature map, built from the last
+      `max_level - min_level + 1` entries of `feats`.
+
+    Raises:
+      ValueError: If a block's input offset is not smaller than the number of
+        features built so far, or if its `combine_fn` is unknown.
+    """
+    # Per-feature count of how many later blocks consumed that feature.
+    num_output_connections = [0] * len(feats)
+    num_output_levels = self._max_level - self._min_level + 1
+    # feat_levels[k] is the level of feats[k]; both lists grow together.
+    feat_levels = list(range(self._min_level, self._max_level + 1))
+
+    # NOTE: the loop index `i` is not used inside this loop body.
+    for i, block_spec in enumerate(self._block_specs):
+      new_level = block_spec.level
+
+      # Checks the range of input_offsets.
+      for input_offset in block_spec.input_offsets:
+        if input_offset >= len(feats):
+          raise ValueError(
+              'input_offset ({}) is larger than num feats({})'.format(
+                  input_offset, len(feats)))
+      input0 = block_spec.input_offsets[0]
+      input1 = block_spec.input_offsets[1]
+
+      # Update graph with inputs.
+      node0 = feats[input0]
+      node0_level = feat_levels[input0]
+      num_output_connections[input0] += 1
+      node0 = self._resample_feature_map(node0, node0_level, new_level)
+      node1 = feats[input1]
+      node1_level = feat_levels[input1]
+      num_output_connections[input1] += 1
+      node1 = self._resample_feature_map(node1, node1_level, new_level)
+
+      # Combine node0 and node1 to create new feat.
+      if block_spec.combine_fn == 'sum':
+        new_node = node0 + node1
+      elif block_spec.combine_fn == 'attention':
+        # The input with the larger (or equal) level number supplies the
+        # attention gate; the other input is the one being gated.
+        if node0_level >= node1_level:
+          new_node = self._global_attention(node0, node1)
+        else:
+          new_node = self._global_attention(node1, node0)
+      else:
+        raise ValueError('unknown combine_fn `{}`.'
+                         .format(block_spec.combine_fn))
+
+      # Add intermediate nodes that do not have any connections to output.
+      if block_spec.is_output:
+        for j, (feat, feat_level, num_output) in enumerate(
+            zip(feats, feat_levels, num_output_connections)):
+          if num_output == 0 and feat_level == new_level:
+            # Mark the feature as consumed so it is only folded in once.
+            num_output_connections[j] += 1
+
+            feat_ = self._resample_feature_map(feat, feat_level, new_level)
+            new_node += feat_
+
+      new_node = self._activation(new_node)
+      new_node = self._conv_op(
+          filters=self._config_dict['num_filters'],
+          kernel_size=(3, 3),
+          padding='same',
+          **self._conv_kwargs)(new_node)
+      new_node = self._norm_op(**self._norm_kwargs)(new_node)
+
+      # Register the new node so subsequent blocks can reference it.
+      feats.append(new_node)
+      feat_levels.append(new_level)
+      num_output_connections.append(0)
+
+    # The trailing `num_output_levels` features are taken as the outputs.
+    output_feats = {}
+    for i in range(len(feats) - num_output_levels, len(feats)):
+      level = feat_levels[i]
+      output_feats[level] = feats[i]
+    logging.info('Output feature pyramid: %s', output_feats)
+    return output_feats
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the stored config dict (the same object, not a copy)."""
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Creates an instance by passing `config` entries as keyword arguments.
+
+    Args:
+      config: A mapping of constructor keyword arguments.
+      custom_objects: Accepted but not used by this implementation.
+
+    Returns:
+      A new instance of `cls`.
+    """
+    return cls(**config)
+
+  @property
+  def output_specs(self) -> Mapping[str, tf.TensorShape]:
+    """A dict of {level: TensorShape} pairs for the model output.
+
+    This is a read-only view of the `_output_specs` attribute.
+    """
+    return self._output_specs
+
+
+@factory.register_decoder_builder('nasfpn')
+def build_nasfpn_decoder(
+    input_specs: Mapping[str, tf.TensorShape],
+    model_config: hyperparams.Config,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Builds NASFPN decoder from a config.
+
+  Args:
+    input_specs: A `dict` of input specifications. A dictionary consists of
+      {level: TensorShape} from a backbone.
+    model_config: A OneOfConfig. Model config.
+    l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
+      None.
+
+  Returns:
+    A `tf.keras.Model` instance of the NASFPN decoder.
+
+  Raises:
+    ValueError: If the model_config.decoder.type is not `nasfpn`.
+  """
+  decoder_type = model_config.decoder.type
+  decoder_cfg = model_config.decoder.get()
+  if decoder_type != 'nasfpn':
+    raise ValueError(f'Inconsistent decoder type {decoder_type}. '
+                     'Need to be `nasfpn`.')
+
+  norm_activation_config = model_config.norm_activation
+  return NASFPN(
+      input_specs=input_specs,
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_filters=decoder_cfg.num_filters,
+      num_repeats=decoder_cfg.num_repeats,
+      use_separable_conv=decoder_cfg.use_separable_conv,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
--- a/official/vision/modeling/decoders/nasfpn_test.py
+++ b/official/vision/modeling/decoders/nasfpn_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for NAS-FPN."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.modeling.backbones import resnet
+from official.vision.modeling.decoders import nasfpn
+
+
+class NASFPNTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (256, 3, 7, False),
+      (256, 3, 7, True),
+  )
+  def test_network_creation(self, input_size, min_level, max_level,
+                            use_separable_conv):
+    """Test creation of NAS-FPN."""
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+
+    num_filters = 256
+    backbone = resnet.ResNet(model_id=50)
+    network = nasfpn.NASFPN(
+        input_specs=backbone.output_specs,
+        min_level=min_level,
+        max_level=max_level,
+        num_filters=num_filters,
+        use_separable_conv=use_separable_conv)
+
+    endpoints = backbone(inputs)
+    feats = network(endpoints)
+
+    for level in range(min_level, max_level + 1):
+      self.assertIn(str(level), feats)
+      self.assertAllEqual(
+          [1, input_size // 2**level, input_size // 2**level, num_filters],
+          feats[str(level)].shape.as_list())
+
+
+# Run the tests in this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/factory.py
+++ b/official/vision/modeling/factory.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Factory methods to build models."""
+
+from typing import Optional
+
+import tensorflow as tf
+
+from official.vision.configs import image_classification as classification_cfg
+from official.vision.configs import maskrcnn as maskrcnn_cfg
+from official.vision.configs import retinanet as retinanet_cfg
+from official.vision.configs import semantic_segmentation as segmentation_cfg
+from official.vision.modeling import backbones
+from official.vision.modeling import classification_model
+from official.vision.modeling import decoders
+from official.vision.modeling import maskrcnn_model
+from official.vision.modeling import retinanet_model
+from official.vision.modeling import segmentation_model
+from official.vision.modeling.heads import dense_prediction_heads
+from official.vision.modeling.heads import instance_heads
+from official.vision.modeling.heads import segmentation_heads
+from official.vision.modeling.layers import detection_generator
+from official.vision.modeling.layers import mask_sampler
+from official.vision.modeling.layers import roi_aligner
+from official.vision.modeling.layers import roi_generator
+from official.vision.modeling.layers import roi_sampler
+
+
+def build_classification_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: classification_cfg.ImageClassificationModel,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+    skip_logits_layer: bool = False,
+    backbone: Optional[tf.keras.Model] = None) -> tf.keras.Model:
+  """Builds the classification model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+
+  model = classification_model.ClassificationModel(
+      backbone=backbone,
+      num_classes=model_config.num_classes,
+      input_specs=input_specs,
+      dropout_rate=model_config.dropout_rate,
+      kernel_initializer=model_config.kernel_initializer,
+      kernel_regularizer=l2_regularizer,
+      add_head_batch_norm=model_config.add_head_batch_norm,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      skip_logits_layer=skip_logits_layer)
+  return model
+
+
+def build_maskrcnn(input_specs: tf.keras.layers.InputSpec,
+                   model_config: maskrcnn_cfg.MaskRCNN,
+                   l2_regularizer: Optional[
+                       tf.keras.regularizers.Regularizer] = None,
+                   backbone: Optional[tf.keras.Model] = None,
+                   decoder: Optional[tf.keras.Model] = None) -> tf.keras.Model:
+  """Builds Mask R-CNN model.
+
+  Args:
+    input_specs: A `tf.keras.layers.InputSpec`; `input_specs.shape[1:]` is
+      used to create the symbolic input the backbone is called on.
+    model_config: A `maskrcnn_cfg.MaskRCNN` config.
+    l2_regularizer: An optional regularizer forwarded to the backbone and
+      decoder builders and used as the heads' kernel regularizer.
+    backbone: An optional pre-built backbone. Built from `model_config` when
+      falsy.
+    decoder: An optional pre-built decoder. Built from `model_config` when
+      falsy.
+
+  Returns:
+    A `maskrcnn_model.MaskRCNNModel` instance.
+  """
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+  # Call the backbone on a symbolic input to obtain symbolic features.
+  backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
+
+  if not decoder:
+    decoder = decoders.factory.build_decoder(
+        input_specs=backbone.output_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+  rpn_head_config = model_config.rpn_head
+  roi_generator_config = model_config.roi_generator
+  roi_sampler_config = model_config.roi_sampler
+  roi_aligner_config = model_config.roi_aligner
+  detection_head_config = model_config.detection_head
+  generator_config = model_config.detection_generator
+  num_anchors_per_location = (
+      len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
+
+  rpn_head = dense_prediction_heads.RPNHead(
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_anchors_per_location=num_anchors_per_location,
+      num_convs=rpn_head_config.num_convs,
+      num_filters=rpn_head_config.num_filters,
+      use_separable_conv=rpn_head_config.use_separable_conv,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+
+  detection_head = instance_heads.DetectionHead(
+      num_classes=model_config.num_classes,
+      num_convs=detection_head_config.num_convs,
+      num_filters=detection_head_config.num_filters,
+      use_separable_conv=detection_head_config.use_separable_conv,
+      num_fcs=detection_head_config.num_fcs,
+      fc_dims=detection_head_config.fc_dims,
+      class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer,
+      name='detection_head')
+
+  # Call the decoder and the RPN head once on the symbolic features. The
+  # decoder is re-checked because the factory result may itself be falsy.
+  # NOTE(review): presumably this call creates the layers' weights up front,
+  # as the comment in `build_retinanet` states for its head -- confirm.
+  if decoder:
+    decoder_features = decoder(backbone_features)
+    rpn_head(decoder_features)
+
+  # With cascade thresholds configured, `detection_head` becomes a list: the
+  # base head followed by one extra head per threshold, named
+  # `detection_head_1`, `detection_head_2`, ...
+  if roi_sampler_config.cascade_iou_thresholds:
+    detection_head_cascade = [detection_head]
+    for cascade_num in range(len(roi_sampler_config.cascade_iou_thresholds)):
+      detection_head = instance_heads.DetectionHead(
+          num_classes=model_config.num_classes,
+          num_convs=detection_head_config.num_convs,
+          num_filters=detection_head_config.num_filters,
+          use_separable_conv=detection_head_config.use_separable_conv,
+          num_fcs=detection_head_config.num_fcs,
+          fc_dims=detection_head_config.fc_dims,
+          class_agnostic_bbox_pred=detection_head_config
+          .class_agnostic_bbox_pred,
+          activation=norm_activation_config.activation,
+          use_sync_bn=norm_activation_config.use_sync_bn,
+          norm_momentum=norm_activation_config.norm_momentum,
+          norm_epsilon=norm_activation_config.norm_epsilon,
+          kernel_regularizer=l2_regularizer,
+          name='detection_head_{}'.format(cascade_num + 1))
+
+      detection_head_cascade.append(detection_head)
+    detection_head = detection_head_cascade
+
+  roi_generator_obj = roi_generator.MultilevelROIGenerator(
+      pre_nms_top_k=roi_generator_config.pre_nms_top_k,
+      pre_nms_score_threshold=roi_generator_config.pre_nms_score_threshold,
+      pre_nms_min_size_threshold=(
+          roi_generator_config.pre_nms_min_size_threshold),
+      nms_iou_threshold=roi_generator_config.nms_iou_threshold,
+      num_proposals=roi_generator_config.num_proposals,
+      test_pre_nms_top_k=roi_generator_config.test_pre_nms_top_k,
+      test_pre_nms_score_threshold=(
+          roi_generator_config.test_pre_nms_score_threshold),
+      test_pre_nms_min_size_threshold=(
+          roi_generator_config.test_pre_nms_min_size_threshold),
+      test_nms_iou_threshold=roi_generator_config.test_nms_iou_threshold,
+      test_num_proposals=roi_generator_config.test_num_proposals,
+      use_batched_nms=roi_generator_config.use_batched_nms)
+
+  # The sampler list always starts with the base sampler; the model receives
+  # the whole list, even when no cascade is configured.
+  roi_sampler_cascade = []
+  roi_sampler_obj = roi_sampler.ROISampler(
+      mix_gt_boxes=roi_sampler_config.mix_gt_boxes,
+      num_sampled_rois=roi_sampler_config.num_sampled_rois,
+      foreground_fraction=roi_sampler_config.foreground_fraction,
+      foreground_iou_threshold=roi_sampler_config.foreground_iou_threshold,
+      background_iou_high_threshold=(
+          roi_sampler_config.background_iou_high_threshold),
+      background_iou_low_threshold=(
+          roi_sampler_config.background_iou_low_threshold))
+  roi_sampler_cascade.append(roi_sampler_obj)
+  # Initialize additional roi samplers for cascade heads.
+  if roi_sampler_config.cascade_iou_thresholds:
+    for iou in roi_sampler_config.cascade_iou_thresholds:
+      # The cascade threshold serves as both the foreground threshold and
+      # the upper background threshold for this stage.
+      roi_sampler_obj = roi_sampler.ROISampler(
+          mix_gt_boxes=False,
+          num_sampled_rois=roi_sampler_config.num_sampled_rois,
+          foreground_iou_threshold=iou,
+          background_iou_high_threshold=iou,
+          background_iou_low_threshold=0.0,
+          skip_subsampling=True)
+      roi_sampler_cascade.append(roi_sampler_obj)
+
+  roi_aligner_obj = roi_aligner.MultilevelROIAligner(
+      crop_size=roi_aligner_config.crop_size,
+      sample_offset=roi_aligner_config.sample_offset)
+
+  detection_generator_obj = detection_generator.DetectionGenerator(
+      apply_nms=generator_config.apply_nms,
+      pre_nms_top_k=generator_config.pre_nms_top_k,
+      pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
+      nms_iou_threshold=generator_config.nms_iou_threshold,
+      max_num_detections=generator_config.max_num_detections,
+      nms_version=generator_config.nms_version,
+      use_cpu_nms=generator_config.use_cpu_nms,
+      soft_nms_sigma=generator_config.soft_nms_sigma)
+
+  # The mask branch is optional; its three components are either all built
+  # or all None.
+  if model_config.include_mask:
+    # NOTE(review): `use_sync_bn` is not forwarded to MaskHead, unlike
+    # RPNHead and DetectionHead above -- confirm this is intentional.
+    mask_head = instance_heads.MaskHead(
+        num_classes=model_config.num_classes,
+        upsample_factor=model_config.mask_head.upsample_factor,
+        num_convs=model_config.mask_head.num_convs,
+        num_filters=model_config.mask_head.num_filters,
+        use_separable_conv=model_config.mask_head.use_separable_conv,
+        activation=model_config.norm_activation.activation,
+        norm_momentum=model_config.norm_activation.norm_momentum,
+        norm_epsilon=model_config.norm_activation.norm_epsilon,
+        kernel_regularizer=l2_regularizer,
+        class_agnostic=model_config.mask_head.class_agnostic)
+
+    # The mask target size is the aligner crop size times the mask head's
+    # upsample factor.
+    mask_sampler_obj = mask_sampler.MaskSampler(
+        mask_target_size=(
+            model_config.mask_roi_aligner.crop_size *
+            model_config.mask_head.upsample_factor),
+        num_sampled_masks=model_config.mask_sampler.num_sampled_masks)
+
+    mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
+        crop_size=model_config.mask_roi_aligner.crop_size,
+        sample_offset=model_config.mask_roi_aligner.sample_offset)
+  else:
+    mask_head = None
+    mask_sampler_obj = None
+    mask_roi_aligner_obj = None
+
+  model = maskrcnn_model.MaskRCNNModel(
+      backbone=backbone,
+      decoder=decoder,
+      rpn_head=rpn_head,
+      detection_head=detection_head,
+      roi_generator=roi_generator_obj,
+      roi_sampler=roi_sampler_cascade,
+      roi_aligner=roi_aligner_obj,
+      detection_generator=detection_generator_obj,
+      mask_head=mask_head,
+      mask_sampler=mask_sampler_obj,
+      mask_roi_aligner=mask_roi_aligner_obj,
+      class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
+      cascade_class_ensemble=detection_head_config.cascade_class_ensemble,
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_scales=model_config.anchor.num_scales,
+      aspect_ratios=model_config.anchor.aspect_ratios,
+      anchor_size=model_config.anchor.anchor_size)
+  return model
+
+
+def build_retinanet(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: retinanet_cfg.RetinaNet,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+    backbone: Optional[tf.keras.Model] = None,
+    decoder: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Builds RetinaNet model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+  backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
+
+  if not decoder:
+    decoder = decoders.factory.build_decoder(
+        input_specs=backbone.output_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+  head_config = model_config.head
+  generator_config = model_config.detection_generator
+  num_anchors_per_location = (
+      len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
+
+  head = dense_prediction_heads.RetinaNetHead(
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_classes=model_config.num_classes,
+      num_anchors_per_location=num_anchors_per_location,
+      num_convs=head_config.num_convs,
+      num_filters=head_config.num_filters,
+      attribute_heads=[
+          cfg.as_dict() for cfg in (head_config.attribute_heads or [])
+      ],
+      use_separable_conv=head_config.use_separable_conv,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+
+  # Builds decoder and head so that their trainable weights are initialized
+  if decoder:
+    decoder_features = decoder(backbone_features)
+    _ = head(decoder_features)
+
+  detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
+      apply_nms=generator_config.apply_nms,
+      pre_nms_top_k=generator_config.pre_nms_top_k,
+      pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
+      nms_iou_threshold=generator_config.nms_iou_threshold,
+      max_num_detections=generator_config.max_num_detections,
+      nms_version=generator_config.nms_version,
+      use_cpu_nms=generator_config.use_cpu_nms,
+      soft_nms_sigma=generator_config.soft_nms_sigma,
+      tflite_post_processing_config=generator_config.tflite_post_processing
+      .as_dict())
+
+  model = retinanet_model.RetinaNetModel(
+      backbone,
+      decoder,
+      head,
+      detection_generator_obj,
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_scales=model_config.anchor.num_scales,
+      aspect_ratios=model_config.anchor.aspect_ratios,
+      anchor_size=model_config.anchor.anchor_size)
+  return model
+
+
+def build_segmentation_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: segmentation_cfg.SemanticSegmentationModel,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+    backbone: Optional[tf.keras.regularizers.Regularizer] = None,
+    decoder: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Builds Segmentation model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+
+  if not decoder:
+    decoder = decoders.factory.build_decoder(
+        input_specs=backbone.output_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+  head_config = model_config.head
+
+  head = segmentation_heads.SegmentationHead(
+      num_classes=model_config.num_classes,
+      level=head_config.level,
+      num_convs=head_config.num_convs,
+      prediction_kernel_size=head_config.prediction_kernel_size,
+      num_filters=head_config.num_filters,
+      use_depthwise_convolution=head_config.use_depthwise_convolution,
+      upsample_factor=head_config.upsample_factor,
+      feature_fusion=head_config.feature_fusion,
+      low_level=head_config.low_level,
+      low_level_num_filters=head_config.low_level_num_filters,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+
+  mask_scoring_head = None
+  if model_config.mask_scoring_head:
+    mask_scoring_head = segmentation_heads.MaskScoring(
+        num_classes=model_config.num_classes,
+        **model_config.mask_scoring_head.as_dict(),
+        activation=norm_activation_config.activation,
+        use_sync_bn=norm_activation_config.use_sync_bn,
+        norm_momentum=norm_activation_config.norm_momentum,
+        norm_epsilon=norm_activation_config.norm_epsilon,
+        kernel_regularizer=l2_regularizer)
+
+  model = segmentation_model.SegmentationModel(
+      backbone, decoder, head, mask_scoring_head=mask_scoring_head)
+  return model
--- a/official/vision/modeling/factory_3d.py
+++ b/official/vision/modeling/factory_3d.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Factory methods to build models."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.core import registry
+from official.vision.configs import video_classification as video_classification_cfg
+from official.vision.modeling import video_classification_model
+from official.vision.modeling import backbones
+
+# Registry mapping a model-type key to its builder. It is filled through
+# `register_model_builder` and read by `build_model`.
+_REGISTERED_MODEL_CLS = {}
+
+
+def register_model_builder(key: str):
+  """Decorates a builder of model class.
+
+  The builder should be a Callable (a class or a function).
+  This decorator supports registration of model builder as follows:
+
+  ```
+  class MyModel(tf.keras.Model):
+    pass
+
+  @register_model_builder('mymodel')
+  def builder(input_specs, config, num_classes, l2_reg):
+    return MyModel(...)
+
+  # Builds a MyModel object.
+  my_model = build_model('mymodel', input_specs, config, num_classes, l2_reg)
+  ```
+
+  Args:
+    key: the key to look up the builder.
+
+  Returns:
+    A callable for use as class decorator that registers the decorated class
+    for creation from an instance of model class.
+  """
+  return registry.register(_REGISTERED_MODEL_CLS, key)
+
+
+def build_model(
+    model_type: str,
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: video_classification_cfg.hyperparams.Config,
+    num_classes: int,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
+  """Builds backbone from a config.
+
+  Args:
+    model_type: string name of model type. It should be consistent with
+      ModelConfig.model_type.
+    input_specs: tf.keras.layers.InputSpec.
+    model_config: a OneOfConfig. Model config.
+    num_classes: number of classes.
+    l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
+
+  Returns:
+    tf.keras.Model instance of the backbone.
+  """
+  model_builder = registry.lookup(_REGISTERED_MODEL_CLS, model_type)
+
+  return model_builder(input_specs, model_config, num_classes, l2_regularizer)
+
+
+@register_model_builder('video_classification')
+def build_video_classification_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: video_classification_cfg.VideoClassificationModel,
+    num_classes: int,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
+  """Builds the video classification model."""
+  input_specs_dict = {'image': input_specs}
+  norm_activation_config = model_config.norm_activation
+  backbone = backbones.factory.build_backbone(
+      input_specs=input_specs,
+      backbone_config=model_config.backbone,
+      norm_activation_config=norm_activation_config,
+      l2_regularizer=l2_regularizer)
+
+  model = video_classification_model.VideoClassificationModel(
+      backbone=backbone,
+      num_classes=num_classes,
+      input_specs=input_specs_dict,
+      dropout_rate=model_config.dropout_rate,
+      aggregate_endpoints=model_config.aggregate_endpoints,
+      kernel_regularizer=l2_regularizer,
+      require_endpoints=model_config.require_endpoints)
+  return model
--- a/official/vision/modeling/factory_test.py
+++ b/official/vision/modeling/factory_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for factory.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.configs import backbones
+from official.vision.configs import backbones_3d
+from official.vision.configs import image_classification as classification_cfg
+from official.vision.configs import maskrcnn as maskrcnn_cfg
+from official.vision.configs import retinanet as retinanet_cfg
+from official.vision.configs import video_classification as video_classification_cfg
+from official.vision.modeling import factory
+from official.vision.modeling import factory_3d
+
+
+class ClassificationModelBuilderTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet', (224, 224), 5e-5),
+      ('resnet', (224, 224), None),
+      ('resnet', (None, None), 5e-5),
+      ('resnet', (None, None), None),
+  )
+  def test_builder(self, backbone_type, input_size, weight_decay):
+    num_classes = 2
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], 3])
+    model_config = classification_cfg.ImageClassificationModel(
+        num_classes=num_classes,
+        backbone=backbones.Backbone(type=backbone_type))
+    l2_regularizer = (
+        tf.keras.regularizers.l2(weight_decay) if weight_decay else None)
+    _ = factory.build_classification_model(
+        input_specs=input_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+
+class MaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet', (640, 640)),
+      ('resnet', (None, None)),
+  )
+  def test_builder(self, backbone_type, input_size):
+    num_classes = 2
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], 3])
+    model_config = maskrcnn_cfg.MaskRCNN(
+        num_classes=num_classes,
+        backbone=backbones.Backbone(type=backbone_type))
+    l2_regularizer = tf.keras.regularizers.l2(5e-5)
+    _ = factory.build_maskrcnn(
+        input_specs=input_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+
+class RetinaNetBuilderTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet', (640, 640), False),
+      ('resnet', (None, None), True),
+  )
+  def test_builder(self, backbone_type, input_size, has_att_heads):
+    num_classes = 2
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], 3])
+    if has_att_heads:
+      attribute_heads_config = [
+          retinanet_cfg.AttributeHead(name='att1'),
+          retinanet_cfg.AttributeHead(
+              name='att2', type='classification', size=2),
+      ]
+    else:
+      attribute_heads_config = None
+    model_config = retinanet_cfg.RetinaNet(
+        num_classes=num_classes,
+        backbone=backbones.Backbone(type=backbone_type),
+        head=retinanet_cfg.RetinaNetHead(
+            attribute_heads=attribute_heads_config))
+    l2_regularizer = tf.keras.regularizers.l2(5e-5)
+    _ = factory.build_retinanet(
+        input_specs=input_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+    if has_att_heads:
+      self.assertEqual(model_config.head.attribute_heads[0].as_dict(),
+                       dict(name='att1', type='regression', size=1))
+      self.assertEqual(model_config.head.attribute_heads[1].as_dict(),
+                       dict(name='att2', type='classification', size=2))
+
+
+class VideoClassificationModelBuilderTest(parameterized.TestCase,
+                                          tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet_3d', (8, 224, 224), 5e-5),
+      ('resnet_3d', (None, None, None), 5e-5),
+  )
+  def test_builder(self, backbone_type, input_size, weight_decay):
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], input_size[2], 3])
+    model_config = video_classification_cfg.VideoClassificationModel(
+        backbone=backbones_3d.Backbone3D(type=backbone_type))
+    l2_regularizer = (
+        tf.keras.regularizers.l2(weight_decay) if weight_decay else None)
+    _ = factory_3d.build_video_classification_model(
+        input_specs=input_specs,
+        model_config=model_config,
+        num_classes=2,
+        l2_regularizer=l2_regularizer)
+
+
+# Run the tests in this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/heads/__init__.py
+++ b/official/vision/modeling/heads/__init__.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Heads package definition."""
+
+from official.vision.modeling.heads.dense_prediction_heads import RetinaNetHead
+from official.vision.modeling.heads.dense_prediction_heads import RPNHead
+from official.vision.modeling.heads.instance_heads import DetectionHead
+from official.vision.modeling.heads.instance_heads import MaskHead
+from official.vision.modeling.heads.segmentation_heads import SegmentationHead
--- a/official/vision/modeling/heads/dense_prediction_heads.py
+++ b/official/vision/modeling/heads/dense_prediction_heads.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of dense prediction heads."""
+
+from typing import Any, Dict, List, Mapping, Optional, Union
+
+# Import libraries
+
+import numpy as np
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class RetinaNetHead(tf.keras.layers.Layer):
+  """Creates a RetinaNet head.
+
+  For every feature level in [`min_level`, `max_level`] the head runs a class
+  tower, a box tower and, when `attribute_heads` is given, one tower per
+  attribute. Within a tower the conv layers are shared by all levels, while
+  each level owns its own normalization layers.
+  """
+
+  def __init__(
+      self,
+      min_level: int,
+      max_level: int,
+      num_classes: int,
+      num_anchors_per_location: int,
+      num_convs: int = 4,
+      num_filters: int = 256,
+      attribute_heads: Optional[List[Dict[str, Any]]] = None,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      num_params_per_anchor: int = 4,
+      **kwargs):
+    """Initializes a RetinaNet head.
+
+    Only the configuration is stored here; the sub-layers are created in
+    `build`.
+
+    Args:
+      min_level: An `int` number of minimum feature level.
+      max_level: An `int` number of maximum feature level.
+      num_classes: An `int` number of classes to predict.
+      num_anchors_per_location: An `int` number of anchors per pixel location.
+      num_convs: An `int` number that represents the number of the intermediate
+        conv layers before the prediction.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate conv layers.
+      attribute_heads: If not None, a list that contains a dict for each
+        additional attribute head. Each dict consists of 3 key-value pairs:
+        `name`, `type` ('regression' or 'classification'), and `size` (number
+        of predicted values for each instance).
+      use_separable_conv: A `bool` that indicates whether separable
+        convolution layers are used.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None. It is not passed to the layers when
+        `use_separable_conv` is True.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      num_params_per_anchor: Number of parameters required to specify an anchor
+        box. For example, `num_params_per_anchor` would be 4 for axis-aligned
+        anchor boxes specified by their y-centers, x-centers, heights, and
+        widths.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(RetinaNetHead, self).__init__(**kwargs)
+    # Constructor arguments are kept verbatim so that `get_config` /
+    # `from_config` can round-trip the layer.
+    self._config_dict = {
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_classes': num_classes,
+        'num_anchors_per_location': num_anchors_per_location,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'attribute_heads': attribute_heads,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+        'num_params_per_anchor': num_params_per_anchor,
+    }
+
+    # Normalize over the channel axis, whose position depends on the backend
+    # image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the variables of the head.
+
+    Args:
+      input_shape: Forwarded to the parent `build`; not otherwise used here.
+
+    Raises:
+      ValueError: If the `type` of an attribute head is neither 'regression'
+        nor 'classification'.
+    """
+    conv_op = (tf.keras.layers.SeparableConv2D
+               if self._config_dict['use_separable_conv']
+               else tf.keras.layers.Conv2D)
+    conv_kwargs = {
+        'filters': self._config_dict['num_filters'],
+        'kernel_size': 3,
+        'padding': 'same',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    # `kernel_*` arguments are only supplied for regular convolutions; with
+    # separable convolutions the layer's own kernel defaults are used.
+    if not self._config_dict['use_separable_conv']:
+      conv_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=0.01),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if self._config_dict['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+
+    # Class net.
+    # `_cls_convs` holds `num_convs` conv layers shared by all levels (they are
+    # created only while visiting `min_level`); `_cls_norms[k]` holds the
+    # normalization layers of the k-th level.
+    self._cls_convs = []
+    self._cls_norms = []
+    for level in range(
+        self._config_dict['min_level'], self._config_dict['max_level'] + 1):
+      this_level_cls_norms = []
+      for i in range(self._config_dict['num_convs']):
+        if level == self._config_dict['min_level']:
+          cls_conv_name = 'classnet-conv_{}'.format(i)
+          self._cls_convs.append(conv_op(name=cls_conv_name, **conv_kwargs))
+        cls_norm_name = 'classnet-conv-norm_{}_{}'.format(level, i)
+        this_level_cls_norms.append(bn_op(name=cls_norm_name, **bn_kwargs))
+      self._cls_norms.append(this_level_cls_norms)
+
+    # The classifier bias equals -log((1 - p) / p) with p = 0.01, i.e. the
+    # value whose sigmoid is 0.01.
+    # NOTE(review): presumably a low initial foreground prior — confirm.
+    classifier_kwargs = {
+        'filters': (
+            self._config_dict['num_classes'] *
+            self._config_dict['num_anchors_per_location']),
+        'kernel_size': 3,
+        'padding': 'same',
+        'bias_initializer': tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    if not self._config_dict['use_separable_conv']:
+      classifier_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(stddev=1e-5),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    self._classifier = conv_op(name='scores', **classifier_kwargs)
+
+    # Box net.
+    # Same layout as the class net: shared convs, per-level norms.
+    self._box_convs = []
+    self._box_norms = []
+    for level in range(
+        self._config_dict['min_level'], self._config_dict['max_level'] + 1):
+      this_level_box_norms = []
+      for i in range(self._config_dict['num_convs']):
+        if level == self._config_dict['min_level']:
+          box_conv_name = 'boxnet-conv_{}'.format(i)
+          self._box_convs.append(conv_op(name=box_conv_name, **conv_kwargs))
+        box_norm_name = 'boxnet-conv-norm_{}_{}'.format(level, i)
+        this_level_box_norms.append(bn_op(name=box_norm_name, **bn_kwargs))
+      self._box_norms.append(this_level_box_norms)
+
+    box_regressor_kwargs = {
+        'filters': (self._config_dict['num_params_per_anchor'] *
+                    self._config_dict['num_anchors_per_location']),
+        'kernel_size': 3,
+        'padding': 'same',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    if not self._config_dict['use_separable_conv']:
+      box_regressor_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=1e-5),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    self._box_regressor = conv_op(name='boxes', **box_regressor_kwargs)
+
+    # Attribute learning nets.
+    # The three dicts below are keyed by attribute name and are only created
+    # when at least one attribute head is configured.
+    if self._config_dict['attribute_heads']:
+      self._att_predictors = {}
+      self._att_convs = {}
+      self._att_norms = {}
+
+      for att_config in self._config_dict['attribute_heads']:
+        att_name = att_config['name']
+        att_type = att_config['type']
+        att_size = att_config['size']
+        att_convs_i = []
+        att_norms_i = []
+
+        # Build conv and norm layers.
+        for level in range(self._config_dict['min_level'],
+                           self._config_dict['max_level'] + 1):
+          this_level_att_norms = []
+          for i in range(self._config_dict['num_convs']):
+            if level == self._config_dict['min_level']:
+              att_conv_name = '{}-conv_{}'.format(att_name, i)
+              att_convs_i.append(conv_op(name=att_conv_name, **conv_kwargs))
+            att_norm_name = '{}-conv-norm_{}_{}'.format(att_name, level, i)
+            this_level_att_norms.append(bn_op(name=att_norm_name, **bn_kwargs))
+          att_norms_i.append(this_level_att_norms)
+        self._att_convs[att_name] = att_convs_i
+        self._att_norms[att_name] = att_norms_i
+
+        # Build the final prediction layer.
+        att_predictor_kwargs = {
+            'filters':
+                (att_size * self._config_dict['num_anchors_per_location']),
+            'kernel_size': 3,
+            'padding': 'same',
+            'bias_initializer': tf.zeros_initializer(),
+            'bias_regularizer': self._config_dict['bias_regularizer'],
+        }
+        # Regression keeps the zero bias already set above; classification
+        # uses the same bias value as the class net's classifier.
+        if att_type == 'regression':
+          att_predictor_kwargs.update(
+              {'bias_initializer': tf.zeros_initializer()})
+        elif att_type == 'classification':
+          att_predictor_kwargs.update({
+              'bias_initializer':
+                  tf.constant_initializer(-np.log((1 - 0.01) / 0.01))
+          })
+        else:
+          raise ValueError(
+              'Attribute head type {} not supported.'.format(att_type))
+
+        if not self._config_dict['use_separable_conv']:
+          att_predictor_kwargs.update({
+              'kernel_initializer':
+                  tf.keras.initializers.RandomNormal(stddev=1e-5),
+              'kernel_regularizer':
+                  self._config_dict['kernel_regularizer'],
+          })
+
+        self._att_predictors[att_name] = conv_op(
+            name='{}_attributes'.format(att_name), **att_predictor_kwargs)
+
+    super(RetinaNetHead, self).build(input_shape)
+
+  def call(self, features: Mapping[str, tf.Tensor]):
+    """Forward pass of the RetinaNet head.
+
+    Args:
+      features: A `dict` of `tf.Tensor` where
+        - key: A `str` of the level of the multilevel features.
+        - values: A `tf.Tensor`, the feature map tensors, whose shape is
+            [batch, height_l, width_l, channels].
+        It must contain an entry for every level from `min_level` to
+        `max_level`.
+
+    Returns:
+      scores: A `dict` of `tf.Tensor` which includes scores of the predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box scores predicted from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, num_classes * num_anchors_per_location].
+      boxes: A `dict` of `tf.Tensor` which includes coordinates of the
+        predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box predictions from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l,
+             num_params_per_anchor * num_anchors_per_location].
+      attributes: a dict of (attribute_name, attribute_prediction). Each
+        `attribute_prediction` is a dict of:
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the attribute predictions from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l,
+            attribute_size * num_anchors_per_location].
+        Can be an empty dictionary if no attribute learning is required.
+    """
+    scores = {}
+    boxes = {}
+    if self._config_dict['attribute_heads']:
+      attributes = {
+          att_config['name']: {}
+          for att_config in self._config_dict['attribute_heads']
+      }
+    else:
+      attributes = {}
+
+    # `i` indexes the per-level norm lists, `level` keys the feature dict.
+    for i, level in enumerate(
+        range(self._config_dict['min_level'],
+              self._config_dict['max_level'] + 1)):
+      this_level_features = features[str(level)]
+
+      # class net.
+      x = this_level_features
+      for conv, norm in zip(self._cls_convs, self._cls_norms[i]):
+        x = conv(x)
+        x = norm(x)
+        x = self._activation(x)
+      scores[str(level)] = self._classifier(x)
+
+      # box net.
+      x = this_level_features
+      for conv, norm in zip(self._box_convs, self._box_norms[i]):
+        x = conv(x)
+        x = norm(x)
+        x = self._activation(x)
+      boxes[str(level)] = self._box_regressor(x)
+
+      # attribute nets.
+      if self._config_dict['attribute_heads']:
+        for att_config in self._config_dict['attribute_heads']:
+          att_name = att_config['name']
+          x = this_level_features
+          for conv, norm in zip(self._att_convs[att_name],
+                                self._att_norms[att_name][i]):
+            x = conv(x)
+            x = norm(x)
+            x = self._activation(x)
+          attributes[att_name][str(level)] = self._att_predictors[att_name](x)
+
+    return scores, boxes, attributes
+
+  def get_config(self):
+    """Returns the dict of constructor arguments stored by `__init__`.
+
+    The internal dict itself is returned, not a copy, and it does not include
+    any `**kwargs` given to the constructor.
+    """
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a head by passing `config` as keyword arguments to `cls`."""
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class RPNHead(tf.keras.layers.Layer):
+  """Creates a Region Proposal Network (RPN) head.
+
+  For every feature level in [`min_level`, `max_level`] a single tower feeds
+  both the score and the box prediction layers. The tower's conv layers are
+  shared by all levels, while each level owns its own normalization layers.
+  """
+
+  def __init__(
+      self,
+      min_level: int,
+      max_level: int,
+      num_anchors_per_location: int,
+      num_convs: int = 1,
+      num_filters: int = 256,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a Region Proposal Network head.
+
+    Only the configuration is stored here; the sub-layers are created in
+    `build`.
+
+    Args:
+      min_level: An `int` number of minimum feature level.
+      max_level: An `int` number of maximum feature level.
+      num_anchors_per_location: An `int` number of anchors per pixel location.
+      num_convs: An `int` number that represents the number of the intermediate
+        convolution layers before the prediction.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate convolution layers.
+      use_separable_conv: A `bool` that indicates whether separable
+        convolution layers are used.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None. It is not passed to the layers when
+        `use_separable_conv` is True.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(RPNHead, self).__init__(**kwargs)
+    # Constructor arguments are kept verbatim so that `get_config` /
+    # `from_config` can round-trip the layer.
+    self._config_dict = {
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_anchors_per_location': num_anchors_per_location,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+
+    # Normalize over the channel axis, whose position depends on the backend
+    # image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the variables of the head.
+
+    Args:
+      input_shape: Forwarded to the parent `build`; not otherwise used here.
+    """
+    conv_op = (tf.keras.layers.SeparableConv2D
+               if self._config_dict['use_separable_conv']
+               else tf.keras.layers.Conv2D)
+    conv_kwargs = {
+        'filters': self._config_dict['num_filters'],
+        'kernel_size': 3,
+        'padding': 'same',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    # `kernel_*` arguments are only supplied for regular convolutions; with
+    # separable convolutions the layer's own kernel defaults are used.
+    if not self._config_dict['use_separable_conv']:
+      conv_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=0.01),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if self._config_dict['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+
+    # `_convs` holds `num_convs` conv layers shared by all levels (they are
+    # created only while visiting `min_level`); `_norms[k]` holds the
+    # normalization layers of the k-th level.
+    self._convs = []
+    self._norms = []
+    for level in range(
+        self._config_dict['min_level'], self._config_dict['max_level'] + 1):
+      this_level_norms = []
+      for i in range(self._config_dict['num_convs']):
+        if level == self._config_dict['min_level']:
+          conv_name = 'rpn-conv_{}'.format(i)
+          self._convs.append(conv_op(name=conv_name, **conv_kwargs))
+        norm_name = 'rpn-conv-norm_{}_{}'.format(level, i)
+        this_level_norms.append(bn_op(name=norm_name, **bn_kwargs))
+      self._norms.append(this_level_norms)
+
+    # One score channel per anchor, predicted by a 1x1 convolution.
+    classifier_kwargs = {
+        'filters': self._config_dict['num_anchors_per_location'],
+        'kernel_size': 1,
+        'padding': 'valid',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    if not self._config_dict['use_separable_conv']:
+      classifier_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=1e-5),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    self._classifier = conv_op(name='rpn-scores', **classifier_kwargs)
+
+    # Four box channels per anchor (hard-coded), also from a 1x1 convolution.
+    box_regressor_kwargs = {
+        'filters': 4 * self._config_dict['num_anchors_per_location'],
+        'kernel_size': 1,
+        'padding': 'valid',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    if not self._config_dict['use_separable_conv']:
+      box_regressor_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=1e-5),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    self._box_regressor = conv_op(name='rpn-boxes', **box_regressor_kwargs)
+
+    super(RPNHead, self).build(input_shape)
+
+  def call(self, features: Mapping[str, tf.Tensor]):
+    """Forward pass of the RPN head.
+
+    Args:
+      features: A `dict` of `tf.Tensor` where
+        - key: A `str` of the level of the multilevel features.
+        - values: A `tf.Tensor`, the feature map tensors, whose shape is [batch,
+          height_l, width_l, channels].
+        It must contain an entry for every level from `min_level` to
+        `max_level`.
+
+    Returns:
+      scores: A `dict` of `tf.Tensor` which includes scores of the predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box scores predicted from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, num_anchors_per_location].
+      boxes: A `dict` of `tf.Tensor` which includes coordinates of the
+        predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box predictions from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, 4 * num_anchors_per_location].
+    """
+    scores = {}
+    boxes = {}
+    # `i` indexes the per-level norm lists, `level` keys the feature dict.
+    for i, level in enumerate(
+        range(self._config_dict['min_level'],
+              self._config_dict['max_level'] + 1)):
+      x = features[str(level)]
+      for conv, norm in zip(self._convs, self._norms[i]):
+        x = conv(x)
+        x = norm(x)
+        x = self._activation(x)
+      # Both predictors consume the same tower output.
+      scores[str(level)] = self._classifier(x)
+      boxes[str(level)] = self._box_regressor(x)
+    return scores, boxes
+
+  def get_config(self):
+    """Returns the dict of constructor arguments stored by `__init__`.
+
+    The internal dict itself is returned, not a copy, and it does not include
+    any `**kwargs` given to the constructor.
+    """
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a head by passing `config` as keyword arguments to `cls`."""
+    return cls(**config)
--- a/official/vision/modeling/heads/dense_prediction_heads_test.py
+++ b/official/vision/modeling/heads/dense_prediction_heads_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for dense_prediction_heads.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.heads import dense_prediction_heads
+
+
+class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Unit tests for `dense_prediction_heads.RetinaNetHead`."""
+
+  @parameterized.parameters(
+      (False, False, False),
+      (False, True, False),
+      (True, False, True),
+      (True, True, True),
+  )
+  def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads):
+    """Checks the per-level output shapes of a forward pass."""
+    attribute_heads = (
+        [dict(name='depth', type='regression', size=1)]
+        if has_att_heads else None)
+    head_kwargs = dict(
+        min_level=3,
+        max_level=4,
+        num_classes=3,
+        num_anchors_per_location=3,
+        num_convs=2,
+        num_filters=256,
+        attribute_heads=attribute_heads,
+        use_separable_conv=use_separable_conv,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    head = dense_prediction_heads.RetinaNetHead(**head_kwargs)
+    inputs = {
+        '3': np.random.rand(2, 128, 128, 16),
+        '4': np.random.rand(2, 64, 64, 16),
+    }
+    scores, boxes, attributes = head(inputs)
+
+    # Outputs keep the batch and spatial dims of the inputs. Channels are
+    # 3 classes * 3 anchors for scores and 4 params * 3 anchors for boxes.
+    leading_dims = (('3', [2, 128, 128]), ('4', [2, 64, 64]))
+    for outputs, channels in ((scores, 9), (boxes, 12)):
+      for level, dims in leading_dims:
+        self.assertAllEqual(outputs[level].numpy().shape, dims + [channels])
+
+    if not has_att_heads:
+      return
+    # Each attribute head predicts size 1 * 3 anchors channels.
+    for att in attributes.values():
+      for level, dims in leading_dims:
+        self.assertAllEqual(att[level].numpy().shape, dims + [3])
+
+  def test_serialize_deserialize(self):
+    """Round-trips a head through `get_config` and `from_config`."""
+    head_kwargs = dict(
+        min_level=3,
+        max_level=7,
+        num_classes=3,
+        num_anchors_per_location=9,
+        num_convs=2,
+        num_filters=16,
+        attribute_heads=None,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    original = dense_prediction_heads.RetinaNetHead(**head_kwargs)
+    restored = dense_prediction_heads.RetinaNetHead.from_config(
+        original.get_config())
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+
+class RpnHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Unit tests for `dense_prediction_heads.RPNHead`."""
+
+  @parameterized.parameters(
+      (False, False),
+      (False, True),
+      (True, False),
+      (True, True),
+  )
+  def test_forward(self, use_separable_conv, use_sync_bn):
+    """Checks the per-level output shapes of a forward pass."""
+    head_kwargs = dict(
+        min_level=3,
+        max_level=4,
+        num_anchors_per_location=3,
+        num_convs=2,
+        num_filters=256,
+        use_separable_conv=use_separable_conv,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    head = dense_prediction_heads.RPNHead(**head_kwargs)
+    inputs = {
+        '3': np.random.rand(2, 128, 128, 16),
+        '4': np.random.rand(2, 64, 64, 16),
+    }
+    scores, boxes = head(inputs)
+
+    # Outputs keep the batch and spatial dims of the inputs. Channels are
+    # 3 anchors for scores and 4 * 3 anchors for boxes.
+    leading_dims = (('3', [2, 128, 128]), ('4', [2, 64, 64]))
+    for outputs, channels in ((scores, 3), (boxes, 12)):
+      for level, dims in leading_dims:
+        self.assertAllEqual(outputs[level].numpy().shape, dims + [channels])
+
+  def test_serialize_deserialize(self):
+    """Round-trips a head through `get_config` and `from_config`."""
+    head_kwargs = dict(
+        min_level=3,
+        max_level=7,
+        num_anchors_per_location=9,
+        num_convs=2,
+        num_filters=16,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    original = dense_prediction_heads.RPNHead(**head_kwargs)
+    restored = dense_prediction_heads.RPNHead.from_config(
+        original.get_config())
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+
+# Run the tests of this module via the TensorFlow test runner when the file is
+# executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/heads/instance_heads.py
+++ b/official/vision/modeling/heads/instance_heads.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of instance prediction heads."""
+
+from typing import List, Union, Optional
+# Import libraries
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DetectionHead(tf.keras.layers.Layer):
+  """Creates a detection head."""
+
+  def __init__(
+      self,
+      num_classes: int,
+      num_convs: int = 0,
+      num_filters: int = 256,
+      use_separable_conv: bool = False,
+      num_fcs: int = 2,
+      fc_dims: int = 1024,
+      class_agnostic_bbox_pred: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a detection head.
+
+    Args:
+      num_classes: An `int` for the number of classes.
+      num_convs: An `int` number that represents the number of the intermediate
+        convolution layers before the FC layers.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate convolution layers.
+      use_separable_conv: A `bool` that indicates whether the separable
+        convolution layers is used.
+      num_fcs: An `int` number that represents the number of FC layers before
+        the predictions.
+      fc_dims: An `int` number that represents the number of dimension of the FC
+        layers.
+      class_agnostic_bbox_pred: `bool`, indicating whether bboxes should be
+        predicted for every class or not.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(DetectionHead, self).__init__(**kwargs)
+    self._config_dict = {
+        'num_classes': num_classes,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_separable_conv': use_separable_conv,
+        'num_fcs': num_fcs,
+        'fc_dims': fc_dims,
+        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the sub-layers of the detection head.
+
+    The head is made of `num_convs` convolution + batch-norm pairs, `num_fcs`
+    dense + batch-norm pairs, a dense class-score layer and a dense
+    box-regression layer.
+
+    Every layer receives its own initializer object. Handing one unseeded
+    initializer instance to several layers can make them start from identical
+    values in some Keras releases, so no instance is shared here.
+
+    Args:
+      input_shape: Shape(s) of the layer input; forwarded to the parent
+        `build`.
+    """
+    config = self._config_dict
+    kernel_regularizer = config['kernel_regularizer']
+    bias_regularizer = config['bias_regularizer']
+
+    def conv_initializer():
+      # A new instance per call so that no two kernels share an initializer.
+      return tf.keras.initializers.VarianceScaling(
+          scale=2, mode='fan_out', distribution='untruncated_normal')
+
+    def make_conv(name):
+      # Builds one 3x3, 'same'-padded (optionally separable) convolution.
+      common_kwargs = {
+          'filters': config['num_filters'],
+          'kernel_size': 3,
+          'padding': 'same',
+          'bias_initializer': tf.zeros_initializer(),
+          'bias_regularizer': bias_regularizer,
+          'name': name,
+      }
+      if config['use_separable_conv']:
+        return tf.keras.layers.SeparableConv2D(
+            depthwise_initializer=conv_initializer(),
+            pointwise_initializer=conv_initializer(),
+            depthwise_regularizer=kernel_regularizer,
+            pointwise_regularizer=kernel_regularizer,
+            **common_kwargs)
+      return tf.keras.layers.Conv2D(
+          kernel_initializer=conv_initializer(),
+          kernel_regularizer=kernel_regularizer,
+          **common_kwargs)
+
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if config['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': config['norm_momentum'],
+        'epsilon': config['norm_epsilon'],
+    }
+
+    # Convolution tower applied to each ROI feature map.
+    self._convs = []
+    self._conv_norms = []
+    for i in range(config['num_convs']):
+      self._convs.append(make_conv('detection-conv_{}'.format(i)))
+      self._conv_norms.append(
+          bn_op(name='detection-conv-bn_{}'.format(i), **bn_kwargs))
+
+    # Fully connected tower applied to the flattened ROI features.
+    self._fcs = []
+    self._fc_norms = []
+    for i in range(config['num_fcs']):
+      self._fcs.append(
+          tf.keras.layers.Dense(
+              units=config['fc_dims'],
+              kernel_initializer=tf.keras.initializers.VarianceScaling(
+                  scale=1 / 3.0, mode='fan_out', distribution='uniform'),
+              kernel_regularizer=kernel_regularizer,
+              bias_regularizer=bias_regularizer,
+              name='detection-fc_{}'.format(i)))
+      self._fc_norms.append(
+          bn_op(name='detection-fc-bn_{}'.format(i), **bn_kwargs))
+
+    self._classifier = tf.keras.layers.Dense(
+        units=config['num_classes'],
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        name='detection-scores')
+
+    # Class-agnostic regression predicts one box per ROI; otherwise one box
+    # (4 values) per class.
+    num_box_outputs = (4 if config['class_agnostic_bbox_pred'] else
+                       config['num_classes'] * 4)
+    self._box_regressor = tf.keras.layers.Dense(
+        units=num_box_outputs,
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        name='detection-boxes')
+
+    super(DetectionHead, self).build(input_shape)
+
+  def call(self, inputs: tf.Tensor, training: bool = None):
+    """Runs the class and box branches on a batch of ROI features.
+
+    Args:
+      inputs: A `tf.Tensor` of shape [batch_size, num_rois, roi_height,
+        roi_width, roi_channels] holding the ROI features.
+      training: A `bool` indicating whether it is in `training` mode. It is not
+        referenced explicitly inside this method.
+
+    Returns:
+      class_outputs: A `tf.Tensor` of shape [batch_size, num_rois, num_classes]
+        with the class predictions.
+      box_outputs: A `tf.Tensor` of shape [batch_size, num_rois, D] with the
+        box predictions, where D is the number of units of the box regressor
+        (`num_classes * 4`, or 4 when `class_agnostic_bbox_pred` is set).
+    """
+    _, num_rois, height, width, channels = inputs.get_shape().as_list()
+
+    # Fold the ROI axis into the batch axis so the 2D layers see 4D input.
+    features = tf.reshape(inputs, [-1, height, width, channels])
+    for conv_layer, norm_layer in zip(self._convs, self._conv_norms):
+      features = self._activation(norm_layer(conv_layer(features)))
+
+    # The convolution tower may have changed the channel count.
+    _, _, _, channels = features.get_shape().as_list()
+    flattened_size = height * width * channels
+    features = tf.reshape(features, [-1, num_rois, flattened_size])
+
+    for dense_layer, norm_layer in zip(self._fcs, self._fc_norms):
+      features = self._activation(norm_layer(dense_layer(features)))
+
+    class_outputs = self._classifier(features)
+    box_outputs = self._box_regressor(features)
+    return class_outputs, box_outputs
+
+  def get_config(self):
+    """Returns the constructor arguments of this head as a new dict.
+
+    A shallow copy is returned so that callers editing the result cannot
+    silently alter the configuration stored on the layer.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a head by passing the entries of `config` as keyword arguments.
+
+    Args:
+      config: A mapping of constructor argument names to values, e.g. the
+        result of `get_config`.
+
+    Returns:
+      A new instance of `cls`.
+    """
+    head = cls(**config)
+    return head
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskHead(tf.keras.layers.Layer):
+  """Creates a mask head.
+
+  The head runs a convolution tower on every ROI feature map, upsamples the
+  result with a transposed convolution, predicts per-class (or class-agnostic)
+  mask logits and finally picks, for every ROI, the mask that belongs to the
+  class given for that ROI.
+  """
+
+  def __init__(
+      self,
+      num_classes: int,
+      upsample_factor: int = 2,
+      num_convs: int = 4,
+      num_filters: int = 256,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      class_agnostic: bool = False,
+      **kwargs):
+    """Initializes a mask head.
+
+    Args:
+      num_classes: An `int` of the number of classes.
+      upsample_factor: An `int` that indicates the upsample factor to generate
+        the final predicted masks. It should be >= 1.
+      num_convs: An `int` number that represents the number of the intermediate
+        convolution layers before the mask prediction layers.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate convolution layers.
+      use_separable_conv: A `bool` that indicates whether separable convolution
+        layers are used.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      class_agnostic: A `bool`. If set, we use a single channel mask head that
+        is shared between all classes.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(MaskHead, self).__init__(**kwargs)
+    self._config_dict = {
+        'num_classes': num_classes,
+        'upsample_factor': upsample_factor,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+        'class_agnostic': class_agnostic
+    }
+
+    # Batch norm normalizes over the channel axis, whose position depends on
+    # the configured image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the sub-layers of the head.
+
+    Every layer receives its own initializer object. Handing one unseeded
+    initializer instance to several layers can make them start from identical
+    values in some Keras releases, so no instance is shared here.
+
+    Args:
+      input_shape: Shape(s) of the layer input; forwarded to the parent
+        `build`.
+    """
+    config = self._config_dict
+    kernel_regularizer = config['kernel_regularizer']
+    bias_regularizer = config['bias_regularizer']
+
+    def conv_initializer():
+      # A new instance per call so that no two kernels share an initializer.
+      return tf.keras.initializers.VarianceScaling(
+          scale=2, mode='fan_out', distribution='untruncated_normal')
+
+    def make_conv(name, filters, kernel_size, padding):
+      # Builds one (optionally separable) convolution layer.
+      common_kwargs = {
+          'filters': filters,
+          'kernel_size': kernel_size,
+          'padding': padding,
+          'bias_initializer': tf.zeros_initializer(),
+          'bias_regularizer': bias_regularizer,
+          'name': name,
+      }
+      if config['use_separable_conv']:
+        return tf.keras.layers.SeparableConv2D(
+            depthwise_initializer=conv_initializer(),
+            pointwise_initializer=conv_initializer(),
+            depthwise_regularizer=kernel_regularizer,
+            pointwise_regularizer=kernel_regularizer,
+            **common_kwargs)
+      return tf.keras.layers.Conv2D(
+          kernel_initializer=conv_initializer(),
+          kernel_regularizer=kernel_regularizer,
+          **common_kwargs)
+
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if config['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': config['norm_momentum'],
+        'epsilon': config['norm_epsilon'],
+    }
+
+    # Convolution tower applied to each ROI feature map.
+    self._convs = []
+    self._conv_norms = []
+    for i in range(config['num_convs']):
+      self._convs.append(
+          make_conv(
+              name='mask-conv_{}'.format(i),
+              filters=config['num_filters'],
+              kernel_size=3,
+              padding='same'))
+      self._conv_norms.append(
+          bn_op(name='mask-conv-bn_{}'.format(i), **bn_kwargs))
+
+    # Kernel size equal to the stride: each input pixel produces one
+    # `upsample_factor` x `upsample_factor` output patch.
+    self._deconv = tf.keras.layers.Conv2DTranspose(
+        filters=config['num_filters'],
+        kernel_size=config['upsample_factor'],
+        strides=config['upsample_factor'],
+        padding='valid',
+        kernel_initializer=conv_initializer(),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        name='mask-upsampling')
+    self._deconv_bn = bn_op(name='mask-deconv-bn', **bn_kwargs)
+
+    # One logit channel in total when class-agnostic, otherwise one per class.
+    if config['class_agnostic']:
+      num_mask_channels = 1
+    else:
+      num_mask_channels = config['num_classes']
+    self._mask_regressor = make_conv(
+        name='mask-logits',
+        filters=num_mask_channels,
+        kernel_size=1,
+        padding='valid')
+
+    super(MaskHead, self).build(input_shape)
+
+  def call(self, inputs: List[tf.Tensor], training: bool = None):
+    """Forward pass of mask branch for the Mask-RCNN model.
+
+    Args:
+      inputs: A `list` of two tensors where
+        inputs[0]: A `tf.Tensor` of shape [batch_size, num_instances,
+          roi_height, roi_width, roi_channels], representing the ROI features.
+        inputs[1]: A `tf.Tensor` of shape [batch_size, num_instances],
+          representing the classes of the ROIs.
+      training: A `bool` indicating whether it is in `training` mode. It is not
+        referenced explicitly inside this method.
+
+    Returns:
+      mask_outputs: A `tf.Tensor` of shape
+        [batch_size, num_instances, roi_height * upsample_factor,
+         roi_width * upsample_factor], representing the mask predictions.
+    """
+    roi_features, roi_classes = inputs
+    batch_size, num_rois, height, width, filters = (
+        roi_features.get_shape().as_list())
+    # Fall back to the dynamic batch size when it is not known statically.
+    if batch_size is None:
+      batch_size = tf.shape(roi_features)[0]
+
+    # Fold the ROI axis into the batch axis so the 2D layers see 4D input.
+    x = tf.reshape(roi_features, [-1, height, width, filters])
+    for conv, bn in zip(self._convs, self._conv_norms):
+      x = conv(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    x = self._deconv(x)
+    x = self._deconv_bn(x)
+    x = self._activation(x)
+
+    logits = self._mask_regressor(x)
+
+    mask_height = height * self._config_dict['upsample_factor']
+    mask_width = width * self._config_dict['upsample_factor']
+
+    # Restore the separate batch and ROI axes.
+    if self._config_dict['class_agnostic']:
+      logits = tf.reshape(logits, [-1, num_rois, mask_height, mask_width, 1])
+    else:
+      logits = tf.reshape(
+          logits,
+          [-1, num_rois, mask_height, mask_width,
+           self._config_dict['num_classes']])
+
+    # Index grids of shape [batch_size, num_rois] addressing every ROI.
+    batch_indices = tf.tile(
+        tf.expand_dims(tf.range(batch_size), axis=1), [1, num_rois])
+    mask_indices = tf.tile(
+        tf.expand_dims(tf.range(num_rois), axis=0), [batch_size, 1])
+
+    # The class-agnostic head has a single channel, so always pick channel 0.
+    if self._config_dict['class_agnostic']:
+      class_gather_indices = tf.zeros_like(roi_classes, dtype=tf.int32)
+    else:
+      class_gather_indices = tf.cast(roi_classes, dtype=tf.int32)
+
+    gather_indices = tf.stack(
+        [batch_indices, mask_indices, class_gather_indices],
+        axis=2)
+    # Move the class axis in front of the spatial axes so that indexing with
+    # (batch, roi, class) yields one [mask_height, mask_width] map per ROI.
+    mask_outputs = tf.gather_nd(
+        tf.transpose(logits, [0, 1, 4, 2, 3]), gather_indices)
+    return mask_outputs
+
+  def get_config(self):
+    """Returns the constructor arguments of this head as a new dict.
+
+    A shallow copy is returned so that callers editing the result cannot
+    silently alter the configuration stored on the layer.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a head by passing the entries of `config` as keyword arguments."""
+    return cls(**config)
--- a/official/vision/modeling/heads/instance_heads_test.py
+++ b/official/vision/modeling/heads/instance_heads_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for instance_heads.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.heads import instance_heads
+
+
+class DetectionHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for `instance_heads.DetectionHead`."""
+
+  @parameterized.parameters(
+      *[(convs, fcs, False, False) for convs in (0, 1) for fcs in (0, 1)])
+  def test_forward(self, num_convs, num_fcs, use_separable_conv, use_sync_bn):
+    """Checks the shapes of the score and box outputs."""
+    head_kwargs = dict(
+        num_classes=3,
+        num_convs=num_convs,
+        num_filters=16,
+        use_separable_conv=use_separable_conv,
+        num_fcs=num_fcs,
+        fc_dims=4,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    head = instance_heads.DetectionHead(**head_kwargs)
+
+    # 2 images, 10 ROIs each, 128x128 ROI features with 16 channels.
+    scores, boxes = head(np.random.rand(2, 10, 128, 128, 16))
+
+    # 3 classes give 3 scores and 3 * 4 box values per ROI.
+    self.assertAllEqual(scores.numpy().shape, [2, 10, 3])
+    self.assertAllEqual(boxes.numpy().shape, [2, 10, 12])
+
+  def test_serialize_deserialize(self):
+    """Checks that a head rebuilt from its config reports the same config."""
+    head_kwargs = dict(
+        num_classes=91,
+        num_convs=0,
+        num_filters=256,
+        use_separable_conv=False,
+        num_fcs=2,
+        fc_dims=1024,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    original_head = instance_heads.DetectionHead(**head_kwargs)
+    restored_head = instance_heads.DetectionHead.from_config(
+        original_head.get_config())
+    self.assertAllEqual(
+        original_head.get_config(), restored_head.get_config())
+
+
+class MaskHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for `instance_heads.MaskHead`."""
+
+  @parameterized.parameters(
+      *[(factor, convs, False) for factor in (1, 2) for convs in (1, 2)])
+  def test_forward(self, upsample_factor, num_convs, use_sync_bn):
+    """Checks that the mask size scales with the upsample factor."""
+    head_kwargs = dict(
+        num_classes=3,
+        upsample_factor=upsample_factor,
+        num_convs=num_convs,
+        num_filters=16,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    head = instance_heads.MaskHead(**head_kwargs)
+
+    # 2 images, 10 ROIs each, 14x14 ROI features with 16 channels.
+    features = np.random.rand(2, 10, 14, 14, 16)
+    classes = np.zeros((2, 10))
+    masks = head([features, classes])
+
+    expected_size = 14 * upsample_factor
+    self.assertAllEqual(
+        masks.numpy().shape, [2, 10, expected_size, expected_size])
+
+  def test_serialize_deserialize(self):
+    """Checks that a head rebuilt from its config reports the same config."""
+    head_kwargs = dict(
+        num_classes=3,
+        upsample_factor=2,
+        num_convs=1,
+        num_filters=256,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    original_head = instance_heads.MaskHead(**head_kwargs)
+    restored_head = instance_heads.MaskHead.from_config(
+        original_head.get_config())
+    self.assertAllEqual(
+        original_head.get_config(), restored_head.get_config())
+
+  def test_forward_class_agnostic(self):
+    """Checks the output shape of a class-agnostic head with default sizes."""
+    head = instance_heads.MaskHead(num_classes=3, class_agnostic=True)
+    features = np.random.rand(2, 10, 14, 14, 16)
+    classes = np.zeros((2, 10))
+    masks = head([features, classes])
+    # The default upsample factor of 2 doubles the 14x14 ROI size.
+    self.assertAllEqual(masks.numpy().shape, [2, 10, 28, 28])
+
+
+# Runs the test cases of this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/heads/segmentation_heads.py
+++ b/official/vision/modeling/heads/segmentation_heads.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of segmentation heads."""
+from typing import List, Union, Optional, Mapping, Tuple, Any
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.modeling.layers import nn_layers
+from official.vision.ops import spatial_transform_ops
+
+
+class MaskScoring(tf.keras.Model):
+  """Creates a mask scoring layer.
+
+  This implements mask scoring layer from the paper:
+
+  Zhaojin Huang, Lichao Huang, Yongchao Gong, Chang Huang, Xinggang Wang.
+  Mask Scoring R-CNN.
+  (https://arxiv.org/pdf/1903.00241.pdf)
+  """
+
+  def __init__(
+      self,
+      num_classes: int,
+      fc_input_size: List[int],
+      num_convs: int = 3,
+      num_filters: int = 256,
+      fc_dims: int = 1024,
+      num_fcs: int = 2,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes mask scoring layer.
+
+    Args:
+      num_classes: An `int` for number of classes.
+      fc_input_size: A List of `int` for the input size of the
+        fully connected layers. The features are resized to this spatial size
+        before being flattened.
+      num_convs: An `int` for number of conv layers.
+      num_filters: An `int` for the number of filters for conv layers.
+      fc_dims: An `int` number of units for each fully connected layer.
+      num_fcs: An `int` for number of fully connected layers.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A bool, whether or not to use sync batch normalization.
+      norm_momentum: A float for the momentum in BatchNorm. Defaults to 0.99.
+      norm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
+        0.001.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(MaskScoring, self).__init__(**kwargs)
+
+    self._config_dict = {
+        'num_classes': num_classes,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'fc_input_size': fc_input_size,
+        'fc_dims': fc_dims,
+        'num_fcs': num_fcs,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'activation': activation,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+
+    # Batch norm normalizes over the channel axis, whose position depends on
+    # the configured image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the sub-layers of the mask scoring head.
+
+    Every layer receives its own initializer object. Handing one unseeded
+    initializer instance to several layers can make them start from identical
+    values in some Keras releases, so no instance is shared here.
+
+    Args:
+      input_shape: Shape(s) of the layer input; forwarded to the parent
+        `build`.
+    """
+    config = self._config_dict
+    kernel_regularizer = config['kernel_regularizer']
+    bias_regularizer = config['bias_regularizer']
+
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if config['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': config['norm_momentum'],
+        'epsilon': config['norm_epsilon'],
+    }
+
+    # Convolution tower; a new initializer is built for every layer.
+    self._convs = []
+    self._conv_norms = []
+    for i in range(config['num_convs']):
+      self._convs.append(
+          tf.keras.layers.Conv2D(
+              filters=config['num_filters'],
+              kernel_size=3,
+              padding='same',
+              kernel_initializer=tf.keras.initializers.VarianceScaling(
+                  scale=2, mode='fan_out', distribution='untruncated_normal'),
+              bias_initializer=tf.zeros_initializer(),
+              kernel_regularizer=kernel_regularizer,
+              bias_regularizer=bias_regularizer,
+              name='mask-scoring_{}'.format(i)))
+      self._conv_norms.append(
+          bn_op(name='mask-scoring-bn_{}'.format(i), **bn_kwargs))
+
+    # Fully connected tower applied to the flattened features.
+    self._fcs = []
+    self._fc_norms = []
+    for i in range(config['num_fcs']):
+      self._fcs.append(
+          tf.keras.layers.Dense(
+              units=config['fc_dims'],
+              kernel_initializer=tf.keras.initializers.VarianceScaling(
+                  scale=1 / 3.0, mode='fan_out', distribution='uniform'),
+              kernel_regularizer=kernel_regularizer,
+              bias_regularizer=bias_regularizer,
+              name='mask-scoring-fc_{}'.format(i)))
+      self._fc_norms.append(
+          bn_op(name='mask-scoring-fc-bn_{}'.format(i), **bn_kwargs))
+
+    self._classifier = tf.keras.layers.Dense(
+        units=config['num_classes'],
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        name='iou-scores')
+
+    super(MaskScoring, self).build(input_shape)
+
+  def call(self, inputs: tf.Tensor, training: bool = None):
+    """Forward pass mask scoring head.
+
+    Args:
+      inputs: A `tf.Tensor` of the shape [batch_size, height, width,
+        num_classes], representing the segmentation logits.
+      training: A `bool` indicating whether it is in `training` mode. It is not
+        referenced explicitly inside this method.
+
+    Returns:
+      mask_scores: A `tf.Tensor` of predicted mask scores
+        [batch_size, num_classes].
+    """
+    # Gradients of the scoring branch are not propagated into the logits.
+    x = tf.stop_gradient(inputs)
+    for conv, bn in zip(self._convs, self._conv_norms):
+      x = conv(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    # Casts feat to float32 so the resize op can be run on TPU.
+    x = tf.cast(x, tf.float32)
+    x = tf.image.resize(x, size=self._config_dict['fc_input_size'],
+                        method=tf.image.ResizeMethod.BILINEAR)
+    # Casts it back to be compatible with the rest of the operations.
+    x = tf.cast(x, inputs.dtype)
+
+    # Flatten every example into a single feature vector.
+    _, h, w, filters = x.get_shape().as_list()
+    x = tf.reshape(x, [-1, h * w * filters])
+
+    for fc, bn in zip(self._fcs, self._fc_norms):
+      x = fc(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    ious = self._classifier(x)
+    return ious
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the constructor arguments of this model as a new dict.
+
+    A shallow copy is returned so that callers editing the result cannot
+    silently alter the configuration stored on the model.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds a model by passing the entries of `config` as keyword arguments.
+
+    Args:
+      config: A mapping of constructor argument names to values.
+      custom_objects: Unused; accepted for signature compatibility.
+
+    Returns:
+      A new instance of `cls`.
+    """
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SegmentationHead(tf.keras.layers.Layer):
+  """Creates a segmentation head."""
+
+  def __init__(
+      self,
+      num_classes: int,
+      level: Union[int, str],
+      num_convs: int = 2,
+      num_filters: int = 256,
+      use_depthwise_convolution: bool = False,
+      prediction_kernel_size: int = 1,
+      upsample_factor: int = 1,
+      feature_fusion: Optional[str] = None,
+      decoder_min_level: Optional[int] = None,
+      decoder_max_level: Optional[int] = None,
+      low_level: int = 2,
+      low_level_num_filters: int = 48,
+      num_decoder_filters: int = 256,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Stores the configuration of a segmentation head.
+
+    Args:
+      num_classes: An `int` number of mask classification categories, not
+        counting the background class.
+      level: An `int` or `str`, the level used to build the segmentation head.
+      num_convs: An `int` number of convolutions stacked before the final
+        prediction layer.
+      num_filters: An `int` number of filters of those convolutions. Default is
+        256.
+      use_depthwise_convolution: A `bool`, whether depthwise separable
+        convolutions are used.
+      prediction_kernel_size: An `int` kernel size of the prediction layer.
+      upsample_factor: An `int` upsampling factor used to generate a finer
+        mask. The default of 1 means no upsampling is applied.
+      feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`,
+        `panoptic_fpn_fusion`, or None. With `deeplabv3plus`, features from
+        decoder_features[level] are fused with low level feature maps from the
+        backbone. With `pyramid_fusion`, multiscale features are resized and
+        fused at the target level.
+      decoder_min_level: An `int` minimum decoder level used in feature fusion.
+        Only used when feature_fusion is `panoptic_fpn_fusion`.
+      decoder_max_level: An `int` maximum decoder level used in feature fusion.
+        Only used when feature_fusion is `panoptic_fpn_fusion`.
+      low_level: An `int` backbone level used for feature fusion. Used when
+        feature_fusion is `deeplabv3plus`.
+      low_level_num_filters: An `int` reduced number of filters applied to the
+        low level features before they are fused with higher level features.
+        Only used when feature_fusion is `deeplabv3plus`.
+      num_decoder_filters: An `int` number of filters in the decoder outputs.
+        Only used when feature_fusion is `panoptic_fpn_fusion`.
+      activation: A `str` naming the activation, e.g. 'relu', 'swish', etc.
+      use_sync_bn: A `bool`, whether to use synchronized batch normalization
+        across different replicas.
+      norm_momentum: A `float` normalization momentum for the moving average.
+      norm_epsilon: A `float` added to the variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments forwarded to the parent class.
+    """
+    super(SegmentationHead, self).__init__(**kwargs)
+
+    # Constructor arguments, kept in signature order.
+    self._config_dict = dict(
+        num_classes=num_classes,
+        level=level,
+        num_convs=num_convs,
+        num_filters=num_filters,
+        use_depthwise_convolution=use_depthwise_convolution,
+        prediction_kernel_size=prediction_kernel_size,
+        upsample_factor=upsample_factor,
+        feature_fusion=feature_fusion,
+        decoder_min_level=decoder_min_level,
+        decoder_max_level=decoder_max_level,
+        low_level=low_level,
+        low_level_num_filters=low_level_num_filters,
+        num_decoder_filters=num_decoder_filters,
+        activation=activation,
+        use_sync_bn=use_sync_bn,
+        norm_momentum=norm_momentum,
+        norm_epsilon=norm_epsilon,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+    )
+
+    # Batch norm normalizes over the channel axis, whose position depends on
+    # the configured image data format.
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    self._bn_axis = -1 if channels_last else 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the sub-layers of the segmentation head.
+
+    Depending on `feature_fusion`, either the DeepLabV3+ fusion convolution and
+    norm or the panoptic FPN fusion layer is created first. Then follow
+    `num_convs` convolution + batch-norm pairs (each preceded by a depthwise
+    convolution + batch-norm pair when `use_depthwise_convolution` is set) and
+    the final prediction convolution.
+
+    Every layer receives its own initializer object. Handing one unseeded
+    initializer instance to several layers can make them start from identical
+    values in some Keras releases, so no instance is shared here.
+
+    Args:
+      input_shape: Shape(s) of the layer input; forwarded to the parent
+        `build`.
+    """
+    config = self._config_dict
+    use_depthwise_convolution = config['use_depthwise_convolution']
+    kernel_regularizer = config['kernel_regularizer']
+
+    def random_initializer():
+      # A new instance per call so that no two kernels share an initializer.
+      return tf.keras.initializers.RandomNormal(stddev=0.01)
+
+    conv_op = tf.keras.layers.Conv2D
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if config['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': config['norm_momentum'],
+        'epsilon': config['norm_epsilon'],
+    }
+
+    if config['feature_fusion'] == 'deeplabv3plus':
+      # Deeplabv3+ feature fusion layers.
+      self._dlv3p_conv = conv_op(
+          kernel_size=1,
+          padding='same',
+          use_bias=False,
+          kernel_initializer=random_initializer(),
+          kernel_regularizer=kernel_regularizer,
+          name='segmentation_head_deeplabv3p_fusion_conv',
+          filters=config['low_level_num_filters'])
+
+      self._dlv3p_norm = bn_op(
+          name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs)
+
+    elif config['feature_fusion'] == 'panoptic_fpn_fusion':
+      self._panoptic_fpn_fusion = nn_layers.PanopticFPNFusion(
+          min_level=config['decoder_min_level'],
+          max_level=config['decoder_max_level'],
+          target_level=config['level'],
+          num_filters=config['num_filters'],
+          num_fpn_filters=config['num_decoder_filters'],
+          activation=config['activation'],
+          kernel_regularizer=kernel_regularizer,
+          bias_regularizer=config['bias_regularizer'])
+
+    # Segmentation head layers.
+    self._convs = []
+    self._norms = []
+    for i in range(config['num_convs']):
+      if use_depthwise_convolution:
+        # The 3x3 depthwise convolution does the spatial filtering; the
+        # regular convolution that follows then uses a 1x1 kernel.
+        self._convs.append(
+            tf.keras.layers.DepthwiseConv2D(
+                name='segmentation_head_depthwise_conv_{}'.format(i),
+                kernel_size=3,
+                padding='same',
+                use_bias=False,
+                depthwise_initializer=random_initializer(),
+                depthwise_regularizer=kernel_regularizer,
+                depth_multiplier=1))
+        norm_name = 'segmentation_head_depthwise_norm_{}'.format(i)
+        self._norms.append(bn_op(name=norm_name, **bn_kwargs))
+      self._convs.append(
+          conv_op(
+              name='segmentation_head_conv_{}'.format(i),
+              filters=config['num_filters'],
+              kernel_size=3 if not use_depthwise_convolution else 1,
+              padding='same',
+              use_bias=False,
+              kernel_initializer=random_initializer(),
+              kernel_regularizer=kernel_regularizer))
+      norm_name = 'segmentation_head_norm_{}'.format(i)
+      self._norms.append(bn_op(name=norm_name, **bn_kwargs))
+
+    self._classifier = conv_op(
+        name='segmentation_output',
+        filters=config['num_classes'],
+        kernel_size=config['prediction_kernel_size'],
+        padding='same',
+        bias_initializer=tf.zeros_initializer(),
+        kernel_initializer=random_initializer(),
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=config['bias_regularizer'])
+
+    super().build(input_shape)
+
+  def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
+                               Union[tf.Tensor, Mapping[str, tf.Tensor]]]):
+    """Forward pass of the segmentation head.
+
+    It supports both a tuple of 2 tensors or 2 dictionaries. The first is
+    backbone endpoints, and the second is decoder endpoints. When inputs are
+    tensors, they are from a single level of feature maps. When inputs are
+    dictionaries, they contain multiple levels of feature maps, where the key
+    is the index of feature map.
+
+    Args:
+      inputs: A tuple of 2 feature map tensors of shape
+        [batch, height_l, width_l, channels] or 2 dictionaries of tensors:
+        - key: A `str` of the level of the multilevel features.
+        - values: A `tf.Tensor` of the feature map tensors, whose shape is
+            [batch, height_l, width_l, channels].
+        The first is backbone endpoints, and the second is decoder endpoints.
+    Returns:
+      segmentation prediction mask: A `tf.Tensor` of the segmentation mask
+        scores predicted from input features.
+    """
+
+    backbone_output = inputs[0]
+    decoder_output = inputs[1]
+    if self._config_dict['feature_fusion'] == 'deeplabv3plus':
+      # deeplabv3+ feature fusion
+      x = decoder_output[str(self._config_dict['level'])] if isinstance(
+          decoder_output, dict) else decoder_output
+      y = backbone_output[str(self._config_dict['low_level'])] if isinstance(
+          backbone_output, dict) else backbone_output
+      y = self._dlv3p_norm(self._dlv3p_conv(y))
+      y = self._activation(y)
+
+      x = tf.image.resize(
+          x, tf.shape(y)[1:3], method=tf.image.ResizeMethod.BILINEAR)
+      x = tf.cast(x, dtype=y.dtype)
+      x = tf.concat([x, y], axis=self._bn_axis)
+    elif self._config_dict['feature_fusion'] == 'pyramid_fusion':
+      if not isinstance(decoder_output, dict):
+        raise ValueError('Only support dictionary decoder_output.')
+      x = nn_layers.pyramid_feature_fusion(decoder_output,
+                                           self._config_dict['level'])
+    elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion':
+      x = self._panoptic_fpn_fusion(decoder_output)
+    else:
+      x = decoder_output[str(self._config_dict['level'])] if isinstance(
+          decoder_output, dict) else decoder_output
+
+    for conv, norm in zip(self._convs, self._norms):
+      x = conv(x)
+      x = norm(x)
+      x = self._activation(x)
+    if self._config_dict['upsample_factor'] > 1:
+      x = spatial_transform_ops.nearest_upsampling(
+          x, scale=self._config_dict['upsample_factor'])
+
+    return self._classifier(x)
+
+  def get_config(self):
+    base_config = super().get_config()
+    return dict(list(base_config.items()) + list(self._config_dict.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates an instance by passing `config` entries as keyword arguments."""
+    return cls(**config)
--- a/official/vision/modeling/heads/segmentation_heads_test.py
+++ b/official/vision/modeling/heads/segmentation_heads_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for segmentation_heads.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.heads import segmentation_heads
+
+
+class SegmentationHeadTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (2, 'pyramid_fusion', None, None),
+      (3, 'pyramid_fusion', None, None),
+      (2, 'panoptic_fpn_fusion', 2, 5),
+      (2, 'panoptic_fpn_fusion', 2, 6),
+      (3, 'panoptic_fpn_fusion', 3, 5),
+      (3, 'panoptic_fpn_fusion', 3, 6))
+  def test_forward(self, level, feature_fusion,
+                   decoder_min_level, decoder_max_level):
+    backbone_features = {
+        '3': np.random.rand(2, 128, 128, 16),
+        '4': np.random.rand(2, 64, 64, 16),
+        '5': np.random.rand(2, 32, 32, 16),
+    }
+    decoder_features = {
+        '3': np.random.rand(2, 128, 128, 64),
+        '4': np.random.rand(2, 64, 64, 64),
+        '5': np.random.rand(2, 32, 32, 64),
+        '6': np.random.rand(2, 16, 16, 64),
+    }
+
+    if feature_fusion == 'panoptic_fpn_fusion':
+      backbone_features['2'] = np.random.rand(2, 256, 256, 16)
+      decoder_features['2'] = np.random.rand(2, 256, 256, 64)
+
+    head = segmentation_heads.SegmentationHead(
+        num_classes=10,
+        level=level,
+        feature_fusion=feature_fusion,
+        decoder_min_level=decoder_min_level,
+        decoder_max_level=decoder_max_level,
+        num_decoder_filters=64)
+
+    logits = head((backbone_features, decoder_features))
+
+    if level in decoder_features:
+      self.assertAllEqual(logits.numpy().shape, [
+          2, decoder_features[str(level)].shape[1],
+          decoder_features[str(level)].shape[2], 10
+      ])
+
+  def test_serialize_deserialize(self):
+    head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
+    config = head.get_config()
+    new_head = segmentation_heads.SegmentationHead.from_config(config)
+    self.assertAllEqual(head.get_config(), new_head.get_config())
+
+
+class MaskScoringHeadTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (1, 1, 64, [4, 4]),
+      (2, 1, 64, [4, 4]),
+      (3, 1, 64, [4, 4]),
+      (1, 2, 32, [8, 8]),
+      (2, 2, 32, [8, 8]),
+      (3, 2, 32, [8, 8]),)
+  def test_forward(self, num_convs, num_fcs,
+                   num_filters, fc_input_size):
+    features = np.random.rand(2, 64, 64, 16)
+
+    head = segmentation_heads.MaskScoring(
+        num_classes=2,
+        num_convs=num_convs,
+        num_filters=num_filters,
+        fc_dims=128,
+        fc_input_size=fc_input_size)
+
+    scores = head(features)
+    self.assertAllEqual(scores.numpy().shape, [2, 2])
+
+  def test_serialize_deserialize(self):
+    head = segmentation_heads.MaskScoring(
+        num_classes=2, fc_input_size=[4, 4], fc_dims=128)
+    config = head.get_config()
+    new_head = segmentation_heads.MaskScoring.from_config(config)
+    self.assertAllEqual(head.get_config(), new_head.get_config())
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/__init__.py
+++ b/official/vision/modeling/layers/__init__.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Layers package definition."""
+
+from official.vision.modeling.layers.box_sampler import BoxSampler
+from official.vision.modeling.layers.detection_generator import DetectionGenerator
+from official.vision.modeling.layers.detection_generator import MultilevelDetectionGenerator
+from official.vision.modeling.layers.mask_sampler import MaskSampler
+from official.vision.modeling.layers.nn_blocks import BottleneckBlock
+from official.vision.modeling.layers.nn_blocks import BottleneckResidualInner
+from official.vision.modeling.layers.nn_blocks import DepthwiseSeparableConvBlock
+from official.vision.modeling.layers.nn_blocks import InvertedBottleneckBlock
+from official.vision.modeling.layers.nn_blocks import ResidualBlock
+from official.vision.modeling.layers.nn_blocks import ResidualInner
+from official.vision.modeling.layers.nn_blocks import ReversibleLayer
+from official.vision.modeling.layers.nn_blocks_3d import BottleneckBlock3D
+from official.vision.modeling.layers.nn_blocks_3d import SelfGating
+from official.vision.modeling.layers.nn_layers import CausalConvMixin
+from official.vision.modeling.layers.nn_layers import Conv2D
+from official.vision.modeling.layers.nn_layers import Conv3D
+from official.vision.modeling.layers.nn_layers import DepthwiseConv2D
+from official.vision.modeling.layers.nn_layers import GlobalAveragePool3D
+from official.vision.modeling.layers.nn_layers import PositionalEncoding
+from official.vision.modeling.layers.nn_layers import Scale
+from official.vision.modeling.layers.nn_layers import SpatialAveragePool3D
+from official.vision.modeling.layers.nn_layers import SqueezeExcitation
+from official.vision.modeling.layers.nn_layers import StochasticDepth
+from official.vision.modeling.layers.nn_layers import TemporalSoftmaxPool
+from official.vision.modeling.layers.roi_aligner import MultilevelROIAligner
+from official.vision.modeling.layers.roi_generator import MultilevelROIGenerator
+from official.vision.modeling.layers.roi_sampler import ROISampler
--- a/official/vision/modeling/layers/box_sampler.py
+++ b/official/vision/modeling/layers/box_sampler.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of box sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import sampling_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BoxSampler(tf.keras.layers.Layer):
+  """Creates a BoxSampler to sample positive and negative boxes."""
+
+  def __init__(self,
+               num_samples: int = 512,
+               foreground_fraction: float = 0.25,
+               **kwargs):
+    """Initializes a box sampler.
+
+    Args:
+      num_samples: An `int` of the number of sampled boxes per image.
+      foreground_fraction: A `float` in [0, 1], what percentage of boxes should
+        be sampled from the positive examples.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    self._config_dict = {
+        'num_samples': num_samples,
+        'foreground_fraction': foreground_fraction,
+    }
+    super(BoxSampler, self).__init__(**kwargs)
+
+  def call(self, positive_matches: tf.Tensor, negative_matches: tf.Tensor,
+           ignored_matches: tf.Tensor):
+    """Samples and selects positive and negative instances.
+
+    Args:
+      positive_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        corresponds to a positive example.
+      negative_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        corresponds to a negative example.
+      ignored_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance should
+        be ignored.
+
+    Returns:
+      A `tf.tensor` of shape of [batch_size, K], storing the indices of the
+        sampled examples, where K is `num_samples`.
+    """
+    sample_candidates = tf.logical_and(
+        tf.logical_or(positive_matches, negative_matches),
+        tf.logical_not(ignored_matches))
+
+    sampler = sampling_ops.BalancedPositiveNegativeSampler(
+        positive_fraction=self._config_dict['foreground_fraction'],
+        is_static=True)
+
+    batch_size = sample_candidates.shape[0]
+    sampled_indicators = []
+    for i in range(batch_size):
+      sampled_indicator = sampler.subsample(
+          sample_candidates[i],
+          self._config_dict['num_samples'],
+          positive_matches[i])
+      sampled_indicators.append(sampled_indicator)
+    sampled_indicators = tf.stack(sampled_indicators)
+    _, selected_indices = tf.nn.top_k(
+        tf.cast(sampled_indicators, dtype=tf.int32),
+        k=self._config_dict['num_samples'],
+        sorted=True)
+
+    return selected_indices
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/modeling/layers/deeplab.py
+++ b/official/vision/modeling/layers/deeplab.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Layers for DeepLabV3."""
+
+import tensorflow as tf
+
+
+class SpatialPyramidPooling(tf.keras.layers.Layer):
+  """Implements the Atrous Spatial Pyramid Pooling.
+
+  References:
+    [Rethinking Atrous Convolution for Semantic Image Segmentation](
+      https://arxiv.org/pdf/1706.05587.pdf)
+    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
+    Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
+  """
+
+  def __init__(
+      self,
+      output_channels,
+      dilation_rates,
+      pool_kernel_size=None,
+      use_sync_bn=False,
+      batchnorm_momentum=0.99,
+      batchnorm_epsilon=0.001,
+      activation='relu',
+      dropout=0.5,
+      kernel_initializer='glorot_uniform',
+      kernel_regularizer=None,
+      interpolation='bilinear',
+      use_depthwise_convolution=False,
+      **kwargs):
+    """Initializes `SpatialPyramidPooling`.
+
+    Args:
+      output_channels: Number of channels produced by SpatialPyramidPooling.
+      dilation_rates: A list of integers for parallel dilated conv.
+      pool_kernel_size: A list of integers or None. If None, global average
+        pooling is applied, otherwise an average pooling of pool_kernel_size
+        is applied.
+      use_sync_bn: A bool, whether or not to use sync batch normalization.
+      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
+        0.99.
+      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
+        0.001.
+      activation: A `str` for type of activation to be used. Defaults to 'relu'.
+      dropout: A float for the dropout rate before output. Defaults to 0.5.
+      kernel_initializer: Kernel initializer for conv layers. Defaults to
+        `glorot_uniform`.
+      kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
+      interpolation: The interpolation method for upsampling. Defaults to
+        `bilinear`.
+      use_depthwise_convolution: Allows spatial pooling to be separable
+         depthwise convolusions. [Encoder-Decoder with Atrous Separable
+         Convolution for Semantic Image Segmentation](
+         https://arxiv.org/pdf/1802.02611.pdf)
+      **kwargs: Other keyword arguments for the layer.
+    """
+    super(SpatialPyramidPooling, self).__init__(**kwargs)
+
+    self.output_channels = output_channels
+    self.dilation_rates = dilation_rates
+    self.use_sync_bn = use_sync_bn
+    self.batchnorm_momentum = batchnorm_momentum
+    self.batchnorm_epsilon = batchnorm_epsilon
+    self.activation = activation
+    self.dropout = dropout
+    self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
+    self.interpolation = interpolation
+    self.input_spec = tf.keras.layers.InputSpec(ndim=4)
+    self.pool_kernel_size = pool_kernel_size
+    self.use_depthwise_convolution = use_depthwise_convolution
+
+  def build(self, input_shape):
+    height = input_shape[1]
+    width = input_shape[2]
+    channels = input_shape[3]
+
+    self.aspp_layers = []
+
+    if self.use_sync_bn:
+      bn_op = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      bn_op = tf.keras.layers.BatchNormalization
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      bn_axis = -1
+    else:
+      bn_axis = 1
+
+    conv_sequential = tf.keras.Sequential([
+        tf.keras.layers.Conv2D(
+            filters=self.output_channels, kernel_size=(1, 1),
+            kernel_initializer=self.kernel_initializer,
+            kernel_regularizer=self.kernel_regularizer,
+            use_bias=False),
+        bn_op(
+            axis=bn_axis,
+            momentum=self.batchnorm_momentum,
+            epsilon=self.batchnorm_epsilon),
+        tf.keras.layers.Activation(self.activation)
+    ])
+    self.aspp_layers.append(conv_sequential)
+
+    for dilation_rate in self.dilation_rates:
+      leading_layers = []
+      kernel_size = (3, 3)
+      if self.use_depthwise_convolution:
+        leading_layers += [
+            tf.keras.layers.DepthwiseConv2D(
+                depth_multiplier=1, kernel_size=kernel_size,
+                padding='same', depthwise_regularizer=self.kernel_regularizer,
+                depthwise_initializer=self.kernel_initializer,
+                dilation_rate=dilation_rate, use_bias=False)
+        ]
+        kernel_size = (1, 1)
+      conv_sequential = tf.keras.Sequential(leading_layers + [
+          tf.keras.layers.Conv2D(
+              filters=self.output_channels, kernel_size=kernel_size,
+              padding='same', kernel_regularizer=self.kernel_regularizer,
+              kernel_initializer=self.kernel_initializer,
+              dilation_rate=dilation_rate, use_bias=False),
+          bn_op(axis=bn_axis, momentum=self.batchnorm_momentum,
+                epsilon=self.batchnorm_epsilon),
+          tf.keras.layers.Activation(self.activation)])
+      self.aspp_layers.append(conv_sequential)
+
+    if self.pool_kernel_size is None:
+      pool_sequential = tf.keras.Sequential([
+          tf.keras.layers.GlobalAveragePooling2D(),
+          tf.keras.layers.Reshape((1, 1, channels))])
+    else:
+      pool_sequential = tf.keras.Sequential([
+          tf.keras.layers.AveragePooling2D(self.pool_kernel_size)])
+
+    pool_sequential.add(
+        tf.keras.Sequential([
+            tf.keras.layers.Conv2D(
+                filters=self.output_channels,
+                kernel_size=(1, 1),
+                kernel_initializer=self.kernel_initializer,
+                kernel_regularizer=self.kernel_regularizer,
+                use_bias=False),
+            bn_op(
+                axis=bn_axis,
+                momentum=self.batchnorm_momentum,
+                epsilon=self.batchnorm_epsilon),
+            tf.keras.layers.Activation(self.activation),
+            tf.keras.layers.experimental.preprocessing.Resizing(
+                height,
+                width,
+                interpolation=self.interpolation,
+                dtype=tf.float32)
+        ]))
+
+    self.aspp_layers.append(pool_sequential)
+
+    self.projection = tf.keras.Sequential([
+        tf.keras.layers.Conv2D(
+            filters=self.output_channels, kernel_size=(1, 1),
+            kernel_initializer=self.kernel_initializer,
+            kernel_regularizer=self.kernel_regularizer,
+            use_bias=False),
+        bn_op(
+            axis=bn_axis,
+            momentum=self.batchnorm_momentum,
+            epsilon=self.batchnorm_epsilon),
+        tf.keras.layers.Activation(self.activation),
+        tf.keras.layers.Dropout(rate=self.dropout)])
+
+  def call(self, inputs, training=None):
+    if training is None:
+      training = tf.keras.backend.learning_phase()
+    result = []
+    for layer in self.aspp_layers:
+      result.append(tf.cast(layer(inputs, training=training), inputs.dtype))
+    result = tf.concat(result, axis=-1)
+    result = self.projection(result, training=training)
+    return result
+
+  def get_config(self):
+    config = {
+        'output_channels': self.output_channels,
+        'dilation_rates': self.dilation_rates,
+        'pool_kernel_size': self.pool_kernel_size,
+        'use_sync_bn': self.use_sync_bn,
+        'batchnorm_momentum': self.batchnorm_momentum,
+        'batchnorm_epsilon': self.batchnorm_epsilon,
+        'activation': self.activation,
+        'dropout': self.dropout,
+        'kernel_initializer': tf.keras.initializers.serialize(
+            self.kernel_initializer),
+        'kernel_regularizer': tf.keras.regularizers.serialize(
+            self.kernel_regularizer),
+        'interpolation': self.interpolation,
+    }
+    base_config = super(SpatialPyramidPooling, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
--- a/official/vision/modeling/layers/deeplab_test.py
+++ b/official/vision/modeling/layers/deeplab_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ASPP."""
+
+import tensorflow as tf
+
+from tensorflow.python.keras import keras_parameterized
+from official.vision.modeling.layers import deeplab
+
+
+@keras_parameterized.run_all_keras_modes
+class DeeplabTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.parameterized.parameters(
+      (None,),
+      ([32, 32],),
+      )
+  def test_aspp(self, pool_kernel_size):
+    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
+    layer = deeplab.SpatialPyramidPooling(output_channels=256,
+                                          dilation_rates=[6, 12, 18],
+                                          pool_kernel_size=None)
+    output = layer(inputs)
+    self.assertAllEqual([None, 64, 64, 256], output.shape)
+
+  def test_aspp_invalid_shape(self):
+    inputs = tf.keras.Input(shape=(64, 64), dtype=tf.float32)
+    layer = deeplab.SpatialPyramidPooling(output_channels=256,
+                                          dilation_rates=[6, 12, 18])
+    with self.assertRaises(ValueError):
+      _ = layer(inputs)
+
+  def test_config_with_custom_name(self):
+    layer = deeplab.SpatialPyramidPooling(256, [5], name='aspp')
+    config = layer.get_config()
+    layer_1 = deeplab.SpatialPyramidPooling.from_config(config)
+    self.assertEqual(layer_1.name, layer.name)
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/detection_generator.py
+++ b/official/vision/modeling/layers/detection_generator.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of generators to generate the final detections."""
+import contextlib
+from typing import Any, Dict, List, Optional, Mapping, Sequence
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import box_ops
+from official.vision.ops import nms
+from official.vision.ops import preprocess_ops
+
+
+def _generate_detections_v1(boxes: tf.Tensor,
+                            scores: tf.Tensor,
+                            attributes: Optional[Mapping[str,
+                                                         tf.Tensor]] = None,
+                            pre_nms_top_k: int = 5000,
+                            pre_nms_score_threshold: float = 0.05,
+                            nms_iou_threshold: float = 0.5,
+                            max_num_detections: int = 100,
+                            soft_nms_sigma: Optional[float] = None):
+  """Generates the final detections given the model outputs.
+
+  The implementation unrolls the batch dimension and process images one by one.
+  It required the batch dimension to be statically known and it is TPU
+  compatible.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]` for box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    attributes: None or a dict of (attribute_name, attributes) pairs. Each
+      attributes is a `tf.Tensor` with shape
+      `[batch_size, N, num_classes, attribute_size]` or
+      `[batch_size, N, 1, attribute_size]` for attribute predictions on all
+      feature levels. The N is the number of total anchors on all levels. Can
+      be None if no attribute learning is required.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A scalar representing maximum number of boxes retained
+      over all classes.
+    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+      When soft_nms_sigma=0.0 (which is default), we fall back to standard NMS.
+
+  Returns:
+    nms_boxes: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections, 4]` representing top detected boxes in
+      `[y1, x1, y2, x2]`.
+    nms_scores: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing sorted confidence scores
+      for detected boxes. The values are between `[0, 1]`.
+    nms_classes: An `int` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing classes for detected
+      boxes.
+    valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]` only the
+       top `valid_detections` boxes are valid detections.
+    nms_attributes: None or a dict of (attribute_name, attributes). Each
+      attribute is a `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections, attribute_size]` representing attribute
+      predictions for detected boxes. Can be an empty dict if no attribute
+      learning is required.
+  """
+  with tf.name_scope('generate_detections'):
+    batch_size = scores.get_shape().as_list()[0]
+    nmsed_boxes = []
+    nmsed_classes = []
+    nmsed_scores = []
+    valid_detections = []
+    if attributes:
+      nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
+    else:
+      nmsed_attributes = {}
+
+    for i in range(batch_size):
+      (nmsed_boxes_i, nmsed_scores_i, nmsed_classes_i, valid_detections_i,
+       nmsed_att_i) = _generate_detections_per_image(
+           boxes[i],
+           scores[i],
+           attributes={
+               att_name: att[i] for att_name, att in attributes.items()
+           } if attributes else {},
+           pre_nms_top_k=pre_nms_top_k,
+           pre_nms_score_threshold=pre_nms_score_threshold,
+           nms_iou_threshold=nms_iou_threshold,
+           max_num_detections=max_num_detections,
+           soft_nms_sigma=soft_nms_sigma)
+      nmsed_boxes.append(nmsed_boxes_i)
+      nmsed_scores.append(nmsed_scores_i)
+      nmsed_classes.append(nmsed_classes_i)
+      valid_detections.append(valid_detections_i)
+      if attributes:
+        for att_name in attributes.keys():
+          nmsed_attributes[att_name].append(nmsed_att_i[att_name])
+
+  nmsed_boxes = tf.stack(nmsed_boxes, axis=0)
+  nmsed_scores = tf.stack(nmsed_scores, axis=0)
+  nmsed_classes = tf.stack(nmsed_classes, axis=0)
+  valid_detections = tf.stack(valid_detections, axis=0)
+  if attributes:
+    for att_name in attributes.keys():
+      nmsed_attributes[att_name] = tf.stack(nmsed_attributes[att_name], axis=0)
+
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
+
+
+def _generate_detections_per_image(
+    boxes: tf.Tensor,
+    scores: tf.Tensor,
+    attributes: Optional[Mapping[str, tf.Tensor]] = None,
+    pre_nms_top_k: int = 5000,
+    pre_nms_score_threshold: float = 0.05,
+    nms_iou_threshold: float = 0.5,
+    max_num_detections: int = 100,
+    soft_nms_sigma: Optional[float] = None):
+  """Generates the final detections per image given the model outputs.
+
+  NMS is run independently for every class. The per-class results (each padded
+  to `max_num_detections` entries) are then concatenated and the
+  `max_num_detections` highest-scoring entries over all classes are kept.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which
+      stacks box predictions on all feature levels. The N is the number of
+      total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
+      probability on all feature levels. The N is the number of total anchors on
+      all levels. The num_classes is the number of classes predicted by the
+      model. Note that the class_outputs here is the raw score.
+    attributes: If not None, a dict of `tf.Tensor`. Each value is in shape
+      `[N, num_classes, attribute_size]` or `[N, 1, attribute_size]` of
+      attribute predictions on all feature levels. The N is the number of total
+      anchors on all levels.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A `scalar` representing maximum number of boxes retained
+      over all classes.
+    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+      When soft_nms_sigma=0.0, we fall back to standard NMS.
+      If set to None, `tf.image.non_max_suppression_padded` is called instead.
+
+  Returns:
+    nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
+      representing top detected boxes in `[y1, x1, y2, x2]`.
+    nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing
+      sorted confidence scores for detected boxes. Padded (invalid) entries
+      carry a score of -1.
+    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing
+      classes for detected boxes.
+    valid_detections: An `int` scalar tf.Tensor counting the entries whose
+      score is greater than -1; only the top `valid_detections` boxes are valid
+      detections.
+    nms_attributes: A dict. Each value is a `float` tf.Tensor of shape
+      `[max_num_detections, attribute_size]` representing attribute predictions
+      for detected boxes. It is an empty dict if `attributes` is None or empty.
+  """
+  nmsed_boxes = []
+  nmsed_scores = []
+  nmsed_classes = []
+  num_classes_for_box = boxes.get_shape().as_list()[1]
+  num_classes = scores.get_shape().as_list()[1]
+  if attributes:
+    nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
+  else:
+    nmsed_attributes = {}
+
+  for i in range(num_classes):
+    # When boxes are class-agnostic (second dimension of size 1) the index is
+    # clamped so that every class reuses the single set of boxes.
+    boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
+    scores_i = scores[:, i]
+    # Obtains pre_nms_top_k before running NMS.
+    scores_i, indices = tf.nn.top_k(
+        scores_i, k=tf.minimum(tf.shape(scores_i)[-1], pre_nms_top_k))
+    boxes_i = tf.gather(boxes_i, indices)
+
+    if soft_nms_sigma is not None:
+      # Soft-NMS path: the op returns (possibly rescored) scores directly, and
+      # its outputs are padded to a fixed size afterwards.
+      (nmsed_indices_i,
+       nmsed_scores_i) = tf.image.non_max_suppression_with_scores(
+           tf.cast(boxes_i, tf.float32),
+           tf.cast(scores_i, tf.float32),
+           max_num_detections,
+           iou_threshold=nms_iou_threshold,
+           score_threshold=pre_nms_score_threshold,
+           soft_nms_sigma=soft_nms_sigma,
+           name='nms_detections_' + str(i))
+      nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
+      nmsed_boxes_i = preprocess_ops.clip_or_pad_to_fixed_size(
+          nmsed_boxes_i, max_num_detections, 0.0)
+      nmsed_scores_i = preprocess_ops.clip_or_pad_to_fixed_size(
+          nmsed_scores_i, max_num_detections, -1.0)
+    else:
+      # Padded-NMS path: the op already returns `max_num_detections` indices
+      # together with the number of valid ones.
+      (nmsed_indices_i,
+       nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
+           tf.cast(boxes_i, tf.float32),
+           tf.cast(scores_i, tf.float32),
+           max_num_detections,
+           iou_threshold=nms_iou_threshold,
+           score_threshold=pre_nms_score_threshold,
+           pad_to_max_output_size=True,
+           name='nms_detections_' + str(i))
+      nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
+      nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
+      # Sets scores of invalid boxes to -1.
+      nmsed_scores_i = tf.where(
+          tf.less(tf.range(max_num_detections), [nmsed_num_valid_i]),
+          nmsed_scores_i, -tf.ones_like(nmsed_scores_i))
+
+    nmsed_classes_i = tf.fill([max_num_detections], i)
+    nmsed_boxes.append(nmsed_boxes_i)
+    nmsed_scores.append(nmsed_scores_i)
+    nmsed_classes.append(nmsed_classes_i)
+    if attributes:
+      # Applies the same pre-NMS top-k selection and NMS selection to every
+      # attribute so that attributes stay aligned with the kept boxes.
+      for att_name, att in attributes.items():
+        num_classes_for_attr = att.get_shape().as_list()[1]
+        att_i = att[:, min(num_classes_for_attr - 1, i)]
+        att_i = tf.gather(att_i, indices)
+        nmsed_att_i = tf.gather(att_i, nmsed_indices_i)
+        nmsed_att_i = preprocess_ops.clip_or_pad_to_fixed_size(
+            nmsed_att_i, max_num_detections, 0.0)
+        nmsed_attributes[att_name].append(nmsed_att_i)
+
+  # Concats results from all classes and sort them.
+  nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
+  nmsed_scores = tf.concat(nmsed_scores, axis=0)
+  nmsed_classes = tf.concat(nmsed_classes, axis=0)
+  nmsed_scores, indices = tf.nn.top_k(
+      nmsed_scores, k=max_num_detections, sorted=True)
+  nmsed_boxes = tf.gather(nmsed_boxes, indices)
+  nmsed_classes = tf.gather(nmsed_classes, indices)
+  # Padded entries were given a score of -1 above, so anything greater than -1
+  # is counted as a valid detection.
+  valid_detections = tf.reduce_sum(
+      tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
+  if attributes:
+    for att_name in attributes.keys():
+      nmsed_attributes[att_name] = tf.concat(nmsed_attributes[att_name], axis=0)
+      nmsed_attributes[att_name] = tf.gather(nmsed_attributes[att_name],
+                                             indices)
+
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
+
+
+def _select_top_k_scores(scores_in: tf.Tensor, pre_nms_num_detections: int):
+  """Picks the highest-scoring anchors independently for every class.
+
+  Args:
+    scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class logit outputs on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model.
+    pre_nms_num_detections: Number of candidates to keep per class before NMS.
+
+  Returns:
+    A `(scores, indices)` pair of `tf.Tensor`s, each with shape
+      `[batch_size, pre_nms_num_detections, num_classes]`.
+  """
+  static_batch, anchor_count, class_count = scores_in.get_shape().as_list()
+  # Falls back to the dynamic batch dimension when it is not statically known.
+  batch_dim = tf.shape(scores_in)[0] if static_batch is None else static_batch
+
+  # Moves classes next to the batch axis and folds both together so that a
+  # single top_k call ranks anchors for each (image, class) pair.
+  class_major = tf.transpose(scores_in, perm=[0, 2, 1])
+  flat_scores = tf.reshape(class_major, [-1, anchor_count])
+  best_scores, best_indices = tf.nn.top_k(
+      flat_scores, k=pre_nms_num_detections, sorted=True)
+
+  # Restores the [batch, class, k] layout before swapping back to
+  # [batch, k, class].
+  unfolded_shape = [batch_dim, class_count, pre_nms_num_detections]
+  best_scores = tf.reshape(best_scores, unfolded_shape)
+  best_indices = tf.reshape(best_indices, unfolded_shape)
+
+  scores_out = tf.transpose(best_scores, [0, 2, 1])
+  indices_out = tf.transpose(best_indices, [0, 2, 1])
+  return scores_out, indices_out
+
+
+def _generate_detections_v2(boxes: tf.Tensor,
+                            scores: tf.Tensor,
+                            pre_nms_top_k: int = 5000,
+                            pre_nms_score_threshold: float = 0.05,
+                            nms_iou_threshold: float = 0.5,
+                            max_num_detections: int = 100):
+  """Generates the final detections given the model outputs.
+
+  This implementation unrolls classes dimension while using the tf.while_loop
+  to implement the batched NMS, so that it can be parallelized at the batch
+  dimension. It should give better performance comparing to v1 implementation.
+  It is TPU compatible.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which stacks box predictions on all feature
+      levels. The N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A `scalar` representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    nmsed_boxes = []
+    nmsed_classes = []
+    nmsed_scores = []
+    # Placeholder only; rebound to a tensor after the per-class loop below.
+    valid_detections = []
+    batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
+    if batch_size is None:
+      batch_size = tf.shape(boxes)[0]
+    _, total_anchors, num_classes = scores.get_shape().as_list()
+    # Selects top pre_nms_num scores and indices before NMS.
+    # NOTE: Python's `min` is applied to the static shape, so `total_anchors`
+    # has to be statically known here.
+    scores, indices = _select_top_k_scores(
+        scores, min(total_anchors, pre_nms_top_k))
+    for i in range(num_classes):
+      # Class-agnostic boxes (third dimension of size 1) are shared by all
+      # classes through the clamped index.
+      boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
+      scores_i = scores[:, :, i]
+      # Obtains pre_nms_top_k before running NMS.
+      boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
+
+      # Filter out scores.
+      boxes_i, scores_i = box_ops.filter_boxes_by_scores(
+          boxes_i, scores_i, min_score_threshold=pre_nms_score_threshold)
+
+      (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
+          tf.cast(scores_i, tf.float32),
+          tf.cast(boxes_i, tf.float32),
+          max_num_detections,
+          iou_threshold=nms_iou_threshold)
+      nmsed_classes_i = tf.fill([batch_size, max_num_detections], i)
+      nmsed_boxes.append(nmsed_boxes_i)
+      nmsed_scores.append(nmsed_scores_i)
+      nmsed_classes.append(nmsed_classes_i)
+  # Merges the per-class results along the detection axis and keeps the
+  # `max_num_detections` best-scoring entries per image.
+  nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
+  nmsed_scores = tf.concat(nmsed_scores, axis=1)
+  nmsed_classes = tf.concat(nmsed_classes, axis=1)
+  nmsed_scores, indices = tf.nn.top_k(
+      nmsed_scores, k=max_num_detections, sorted=True)
+  nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
+  nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
+  # Only entries with a strictly positive score are counted as valid.
+  valid_detections = tf.reduce_sum(
+      input_tensor=tf.cast(tf.greater(nmsed_scores, 0.0), tf.int32), axis=1)
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+def _generate_detections_batched(boxes: tf.Tensor, scores: tf.Tensor,
+                                 pre_nms_score_threshold: float,
+                                 nms_iou_threshold: float,
+                                 max_num_detections: int):
+  """Runs batched, multi-class NMS through a single TensorFlow op.
+
+  This is a thin wrapper around `tf.image.combined_non_max_suppression`. Note
+  that this uses batched NMS, which is not supported on TPU currently.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which stacks box predictions on all feature
+      levels. The N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A `scalar` used both as the per-class limit and as the
+      limit on the total number of boxes retained over all classes.
+
+  Returns:
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    combined = tf.image.combined_non_max_suppression(
+        boxes,
+        scores,
+        max_output_size_per_class=max_num_detections,
+        max_total_size=max_num_detections,
+        iou_threshold=nms_iou_threshold,
+        score_threshold=pre_nms_score_threshold,
+        pad_per_class=False,
+        clip_boxes=False)
+    kept_boxes, kept_scores, kept_classes, num_valid = combined
+    # The result of the op is cast so that class ids are returned as int32.
+    kept_classes = tf.cast(kept_classes, tf.int32)
+  return kept_boxes, kept_scores, kept_classes, num_valid
+
+
+def _generate_detections_tflite_implements_signature(
+    config: Dict[str, Any]) -> str:
+  """Builds the `experimental_implements` string for TFLite's custom NMS op.
+
+  The string names the `TFLite_Detection_PostProcess` op and lists its
+  attributes, so that the MLIR converter can initialize the custom
+  post-processing op. For details on `experimental_implements` see here:
+  https://www.tensorflow.org/api_docs/python/tf/function
+
+  Args:
+    config: A dictionary of configs defining parameters for TFLite NMS op. The
+      keys read are `max_detections`, `max_classes_per_detection`,
+      `use_regular_nms`, `nms_score_threshold`, `nms_iou_threshold` and
+      `num_classes`.
+
+  Returns:
+    An `experimental_implements` signature string: the individual entries
+    joined by single spaces.
+  """
+  # All four box-coder scales are emitted with the same fixed value.
+  unit_scale = 1.0
+
+  entries = []
+  entries.append('name: "%s"' % 'TFLite_Detection_PostProcess')
+  entries.append('attr { key: "max_detections" value { i: %d } }' %
+                 config['max_detections'])
+  entries.append('attr { key: "max_classes_per_detection" value { i: %d } }' %
+                 config['max_classes_per_detection'])
+  # Booleans are written in lower case (`true` / `false`).
+  regular_nms_text = str(config['use_regular_nms']).lower()
+  entries.append('attr { key: "use_regular_nms" value { b: %s } }' %
+                 regular_nms_text)
+  entries.append('attr { key: "nms_score_threshold" value { f: %f } }' %
+                 config['nms_score_threshold'])
+  entries.append('attr { key: "nms_iou_threshold" value { f: %f } }' %
+                 config['nms_iou_threshold'])
+  entries.append('attr { key: "y_scale" value { f: %f } }' % unit_scale)
+  entries.append('attr { key: "x_scale" value { f: %f } }' % unit_scale)
+  entries.append('attr { key: "h_scale" value { f: %f } }' % unit_scale)
+  entries.append('attr { key: "w_scale" value { f: %f } }' % unit_scale)
+  entries.append('attr { key: "num_classes" value { i: %d } }' %
+                 config['num_classes'])
+  return ' '.join(entries)
+
+
+def _generate_detections_tflite(raw_boxes: Mapping[str, tf.Tensor],
+                                raw_scores: Mapping[str, tf.Tensor],
+                                anchor_boxes: Mapping[str, tf.Tensor],
+                                config: Dict[str, Any]) -> Sequence[Any]:
+  """Generate detections for conversion to TFLite.
+
+  Mathematically same as class-agnostic NMS, except that the last portion of
+  the TF graph constitutes a dummy `tf.function` that contains an annotation
+  for conversion to TFLite's custom NMS op. Using this custom op allows
+  features like post-training quantization & accelerator support.
+  NOTE: This function does NOT return a valid output, and is only meant to
+  generate a SavedModel for TFLite conversion via MLIR. The generated SavedModel
+  should not be used for inference.
+  For TFLite op details, see tensorflow/lite/kernels/detection_postprocess.cc
+
+  Args:
+    raw_boxes: A dictionary of tensors for raw boxes. Key is level of features
+      and value is a tensor denoting a level of boxes with shape [1, H, W, 4 *
+      num_anchors].
+    raw_scores: A dictionary of tensors for classes. Key is level of features
+      and value is a tensor denoting a level of logits with shape [1, H, W,
+      num_class * num_anchors].
+    anchor_boxes: A dictionary of tensors for anchor boxes. Key is level of
+      features and value is a tensor denoting a level of anchors with shape
+      [num_anchors, 4].
+    config: A dictionary of configs defining parameters for TFLite NMS op. It
+      is updated in place with the inferred `num_classes`.
+
+  Returns:
+    A (dummy) tuple of constant tensors. The inner function yields
+    (boxes, classes, scores, num_detections) and the tuple is reversed before
+    being returned, i.e. (num_detections, scores, classes, boxes).
+
+  Raises:
+    ValueError: If the last dimension of predicted boxes is not divisible by 4,
+      or the last dimension of predicted scores is not divisible by number of
+      anchors per location.
+  """
+  scores, boxes, anchors = [], [], []
+  # Level keys are strings; they are compared as integers so that multi-digit
+  # levels (e.g. '10') are ordered numerically rather than lexicographically.
+  levels = [int(level) for level in raw_scores.keys()]
+  min_level = min(levels)
+  max_level = max(levels)
+  batch_size = tf.shape(raw_scores[str(min_level)])[0]
+
+  num_anchors_per_locations_times_4 = raw_boxes[str(
+      min_level)].get_shape().as_list()[-1]
+  if num_anchors_per_locations_times_4 % 4 != 0:
+    raise ValueError(
+        'The last dimension of predicted boxes should be divisible by 4.')
+  num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
+  # Validates the scores tensor itself: its channel count has to be a whole
+  # multiple of the number of anchors per location.
+  num_scores_per_location = raw_scores[str(
+      min_level)].get_shape().as_list()[-1]
+  if num_scores_per_location % num_anchors_per_locations != 0:
+    raise ValueError(
+        f'The last dimension of predicted scores should be divisible by {num_anchors_per_locations}.'
+    )
+  num_classes = num_scores_per_location // num_anchors_per_locations
+  config.update({'num_classes': num_classes})
+
+  # Flattens every level to [batch, anchors_on_level, ...] and concatenates the
+  # levels in increasing order.
+  for i in range(min_level, max_level + 1):
+    scores.append(
+        tf.sigmoid(
+            tf.reshape(raw_scores[str(i)], [batch_size, -1, num_classes])))
+    boxes.append(tf.reshape(raw_boxes[str(i)], [batch_size, -1, 4]))
+    anchors.append(tf.reshape(anchor_boxes[str(i)], [-1, 4]))
+  scores = tf.concat(scores, 1)
+  boxes = tf.concat(boxes, 1)
+  anchors = tf.concat(anchors, 0)
+
+  # Converts anchors from corner form [y1, x1, y2, x2] to center-size form
+  # [ycenter, xcenter, height, width].
+  ycenter_a = (anchors[..., 0] + anchors[..., 2]) / 2
+  xcenter_a = (anchors[..., 1] + anchors[..., 3]) / 2
+  ha = anchors[..., 2] - anchors[..., 0]
+  wa = anchors[..., 3] - anchors[..., 1]
+  anchors = tf.stack([ycenter_a, xcenter_a, ha, wa], axis=-1)
+
+  # There is no TF equivalent for TFLite's custom post-processing op.
+  # So we add an 'empty' composite function here, that is legalized to the
+  # custom op with MLIR.
+  # For details, see: tensorflow/compiler/mlir/lite/utils/nms_utils.cc
+  @tf.function(
+      experimental_implements=_generate_detections_tflite_implements_signature(
+          config))
+  # pylint: disable=g-unused-argument,unused-argument
+  def dummy_post_processing(input_boxes, input_scores, input_anchors):
+    boxes = tf.constant(0.0, dtype=tf.float32, name='boxes')
+    scores = tf.constant(0.0, dtype=tf.float32, name='scores')
+    classes = tf.constant(0.0, dtype=tf.float32, name='classes')
+    num_detections = tf.constant(0.0, dtype=tf.float32, name='num_detections')
+    return boxes, classes, scores, num_detections
+
+  return dummy_post_processing(boxes, scores, anchors)[::-1]
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DetectionGenerator(tf.keras.layers.Layer):
+  """Generates the final detected boxes with scores and classes.
+
+  Raw box regressions are decoded against their anchors, clipped to the image,
+  and (unless `apply_nms` is False) passed through the NMS implementation
+  selected by `nms_version`.
+  """
+
+  def __init__(self,
+               apply_nms: bool = True,
+               pre_nms_top_k: int = 5000,
+               pre_nms_score_threshold: float = 0.05,
+               nms_iou_threshold: float = 0.5,
+               max_num_detections: int = 100,
+               nms_version: str = 'v2',
+               use_cpu_nms: bool = False,
+               soft_nms_sigma: Optional[float] = None,
+               **kwargs):
+    """Initializes a detection generator.
+
+    Args:
+      apply_nms: A `bool` of whether or not apply non maximum suppression.
+        If False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying  NMS. Proposals whose scores are below this threshold are
+        thrown away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      nms_version: A string of `batched`, `v1` or `v2` specifies NMS version.
+      use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
+      soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+        When soft_nms_sigma=0.0, we fall back to standard NMS. Only forwarded
+        to the `v1` NMS implementation.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    # All constructor arguments are kept in one dict, which is also what
+    # `get_config` returns.
+    self._config_dict = {
+        'apply_nms': apply_nms,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma,
+    }
+    super(DetectionGenerator, self).__init__(**kwargs)
+
+  def __call__(self,
+               raw_boxes: tf.Tensor,
+               raw_scores: tf.Tensor,
+               anchor_boxes: tf.Tensor,
+               image_shape: tf.Tensor,
+               regression_weights: Optional[List[float]] = None,
+               bbox_per_class: bool = True):
+    """Generates final detections.
+
+    Args:
+      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
+        representing the class-specific box coordinates relative to anchors.
+      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
+        representing the class logits before applying score activation.
+      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
+        `box_outputs` and `anchor_boxes`.
+      regression_weights: A list of four float numbers to scale coordinates.
+      bbox_per_class: A `bool`. If True, perform per-class box regression.
+
+    Returns:
+      If `apply_nms` = True, the return is a dictionary with keys:
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` `tf.Tensor` of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
+          `num_detections` boxes are valid detections
+      If `apply_nms` = False, the return is a dictionary with keys:
+        `decoded_boxes`: A `float` tf.Tensor representing all the decoded
+          boxes, of shape [batch, K, num_classes - 1, 4] when `bbox_per_class`
+          is True; otherwise the decoded boxes with a singleton class axis
+          inserted at axis 2.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, K, num_classes - 1] representing scores of all the decoded
+          boxes (the background class at index 0 is removed).
+
+    Raises:
+      ValueError: If `apply_nms` is True and `nms_version` is not one of
+        `batched`, `v1` or `v2`.
+    """
+    box_scores = tf.nn.softmax(raw_scores, axis=-1)
+
+    # Removes the background class.
+    box_scores_shape = tf.shape(box_scores)
+    box_scores_shape_list = box_scores.get_shape().as_list()
+    # The batch size is read dynamically; the other two dimensions come from
+    # the static shape.
+    batch_size = box_scores_shape[0]
+    num_locations = box_scores_shape_list[1]
+    num_classes = box_scores_shape_list[-1]
+
+    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
+
+    if bbox_per_class:
+      # Drops the background-class boxes and flattens (location, class) into a
+      # single axis; anchors are tiled so each class box has its own anchor.
+      num_detections = num_locations * (num_classes - 1)
+      raw_boxes = tf.reshape(raw_boxes,
+                             [batch_size, num_locations, num_classes, 4])
+      raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
+      anchor_boxes = tf.tile(
+          tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
+      raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
+      anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
+
+    # Box decoding.
+    decoded_boxes = box_ops.decode_boxes(
+        raw_boxes, anchor_boxes, weights=regression_weights)
+
+    # Box clipping
+    decoded_boxes = box_ops.clip_boxes(
+        decoded_boxes, tf.expand_dims(image_shape, axis=1))
+
+    if bbox_per_class:
+      # Restores the separate class axis that was flattened before decoding.
+      decoded_boxes = tf.reshape(
+          decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
+    else:
+      # Class-agnostic boxes get a singleton class axis.
+      decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)
+
+    if not self._config_dict['apply_nms']:
+      return {
+          'decoded_boxes': decoded_boxes,
+          'decoded_box_scores': box_scores,
+      }
+
+    # Optionally force the NMS be run on CPU.
+    if self._config_dict['use_cpu_nms']:
+      nms_context = tf.device('cpu:0')
+    else:
+      nms_context = contextlib.nullcontext()
+
+    with nms_context:
+      if self._config_dict['nms_version'] == 'batched':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_batched(
+                decoded_boxes, box_scores,
+                self._config_dict['pre_nms_score_threshold'],
+                self._config_dict['nms_iou_threshold'],
+                self._config_dict['max_num_detections']))
+      elif self._config_dict['nms_version'] == 'v1':
+        # The fifth return value (attributes) is not used by this generator.
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
+            _generate_detections_v1(
+                decoded_boxes,
+                box_scores,
+                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                pre_nms_score_threshold=self
+                ._config_dict['pre_nms_score_threshold'],
+                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                max_num_detections=self._config_dict['max_num_detections'],
+                soft_nms_sigma=self._config_dict['soft_nms_sigma']))
+      elif self._config_dict['nms_version'] == 'v2':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_v2(
+                decoded_boxes,
+                box_scores,
+                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                pre_nms_score_threshold=self
+                ._config_dict['pre_nms_score_threshold'],
+                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                max_num_detections=self._config_dict['max_num_detections']))
+      else:
+        raise ValueError('NMS version {} not supported.'.format(
+            self._config_dict['nms_version']))
+
+    # Adds 1 to offset the background class which has index 0.
+    nmsed_classes += 1
+
+    return {
+        'num_detections': valid_detections,
+        'detection_boxes': nmsed_boxes,
+        'detection_classes': nmsed_classes,
+        'detection_scores': nmsed_scores,
+    }
+
+  def get_config(self):
+    """Returns the dict of constructor arguments stored by `__init__`."""
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a generator by passing `config` as keyword arguments."""
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelDetectionGenerator(tf.keras.layers.Layer):
+  """Generates detected boxes with scores and classes for one-stage detector."""
+
+  def __init__(self,
+               apply_nms: bool = True,
+               pre_nms_top_k: int = 5000,
+               pre_nms_score_threshold: float = 0.05,
+               nms_iou_threshold: float = 0.5,
+               max_num_detections: int = 100,
+               nms_version: str = 'v1',
+               use_cpu_nms: bool = False,
+               soft_nms_sigma: Optional[float] = None,
+               tflite_post_processing_config: Optional[Dict[str, Any]] = None,
+               **kwargs):
+    """Builds a multi-level detection generator and records its settings.
+
+    Args:
+      apply_nms: A `bool` of whether or not apply non maximum suppression. If
+        False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying NMS. Proposals whose scores are below this threshold are thrown
+        away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      nms_version: A string of `batched`, `v1` or `v2` specifies NMS version.
+      use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
+      soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+        When soft_nms_sigma=0.0, we fall back to standard NMS.
+      tflite_post_processing_config: An optional dictionary containing
+        post-processing parameters used for TFLite custom NMS op.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    generator_config = dict(
+        apply_nms=apply_nms,
+        pre_nms_top_k=pre_nms_top_k,
+        pre_nms_score_threshold=pre_nms_score_threshold,
+        nms_iou_threshold=nms_iou_threshold,
+        max_num_detections=max_num_detections,
+        nms_version=nms_version,
+        use_cpu_nms=use_cpu_nms,
+        soft_nms_sigma=soft_nms_sigma)
+
+    # The TFLite settings are only recorded when they are explicitly provided.
+    if tflite_post_processing_config is not None:
+      generator_config[
+          'tflite_post_processing_config'] = tflite_post_processing_config
+
+    self._config_dict = generator_config
+    super(MultilevelDetectionGenerator, self).__init__(**kwargs)
+
+  def _decode_multilevel_outputs(
+      self,
+      raw_boxes: Mapping[str, tf.Tensor],
+      raw_scores: Mapping[str, tf.Tensor],
+      anchor_boxes: Mapping[str, tf.Tensor],
+      image_shape: tf.Tensor,
+      raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
+    """Collects dict of multilevel boxes, scores, attributes into lists.
+
+    Every level is flattened to one entry per anchor, boxes are decoded against
+    their anchors and clipped to the image, and all levels are concatenated in
+    increasing level order.
+
+    Args:
+      raw_boxes: A `dict` keyed by level (as a string) of 4-D box tensors whose
+        last dimension is `num_anchors_per_location * 4`.
+      raw_scores: A `dict` keyed by level of logit tensors whose last dimension
+        is `num_anchors_per_location * num_classes`.
+      anchor_boxes: A `dict` keyed by level of anchor tensors that can be
+        reshaped to `[batch, num_locations * num_anchors_per_location, 4]`.
+      image_shape: A `tf.Tensor` used to clip the decoded boxes.
+      raw_attributes: If not None, a `dict` mapping attribute name to a `dict`
+        keyed by level of attribute tensors.
+
+    Returns:
+      boxes: A `tf.Tensor` of decoded, clipped boxes with a singleton class
+        axis inserted at axis 2.
+      scores: A `tf.Tensor` of sigmoid scores with the class at index 0
+        (background) removed.
+      attributes: A `dict` of attribute tensors with a singleton axis inserted
+        at axis 2; an empty dict if `raw_attributes` is None or empty.
+    """
+    boxes = []
+    scores = []
+    if raw_attributes:
+      attributes = {att_name: [] for att_name in raw_attributes.keys()}
+    else:
+      attributes = {}
+
+    # Level keys are strings; they are compared as integers so that multi-digit
+    # levels (e.g. '10') are ordered numerically rather than lexicographically.
+    levels = [int(level) for level in raw_boxes.keys()]
+    min_level = min(levels)
+    max_level = max(levels)
+    for i in range(min_level, max_level + 1):
+      raw_boxes_i = raw_boxes[str(i)]
+      raw_scores_i = raw_scores[str(i)]
+      batch_size = tf.shape(raw_boxes_i)[0]
+      (_, feature_h_i, feature_w_i,
+       num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
+      num_locations = feature_h_i * feature_w_i
+      num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
+      num_classes = raw_scores_i.get_shape().as_list(
+      )[-1] // num_anchors_per_locations
+
+      # Applies score transformation and remove the implicit background class.
+      scores_i = tf.sigmoid(
+          tf.reshape(raw_scores_i, [
+              batch_size, num_locations * num_anchors_per_locations, num_classes
+          ]))
+      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
+
+      # Box decoding.
+      # The anchor boxes are shared for all data in a batch.
+      # One stage detector only supports class agnostic box regression.
+      anchor_boxes_i = tf.reshape(
+          anchor_boxes[str(i)],
+          [batch_size, num_locations * num_anchors_per_locations, 4])
+      raw_boxes_i = tf.reshape(
+          raw_boxes_i,
+          [batch_size, num_locations * num_anchors_per_locations, 4])
+      boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)
+
+      # Box clipping.
+      boxes_i = box_ops.clip_boxes(
+          boxes_i, tf.expand_dims(image_shape, axis=1))
+
+      boxes.append(boxes_i)
+      scores.append(scores_i)
+
+      if raw_attributes:
+        # Attributes are flattened the same way as boxes so that they stay
+        # aligned with the anchors of this level.
+        for att_name, raw_att in raw_attributes.items():
+          attribute_size = raw_att[str(
+              i)].get_shape().as_list()[-1] // num_anchors_per_locations
+          att_i = tf.reshape(raw_att[str(i)], [
+              batch_size, num_locations * num_anchors_per_locations,
+              attribute_size
+          ])
+          attributes[att_name].append(att_i)
+
+    # Concatenates all levels along the anchor axis; boxes get a singleton
+    # class axis because box regression is class agnostic here.
+    boxes = tf.concat(boxes, axis=1)
+    boxes = tf.expand_dims(boxes, axis=2)
+    scores = tf.concat(scores, axis=1)
+
+    if raw_attributes:
+      for att_name in raw_attributes.keys():
+        attributes[att_name] = tf.concat(attributes[att_name], axis=1)
+        attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)
+
+    return boxes, scores, attributes
+
+  def __call__(self,
+               raw_boxes: Mapping[str, tf.Tensor],
+               raw_scores: Mapping[str, tf.Tensor],
+               anchor_boxes: Mapping[str, tf.Tensor],
+               image_shape: tf.Tensor,
+               raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
+    """Generates final detections.
+
+    Args:
+      raw_boxes: A `dict` with keys representing FPN levels and values
+        representing box tenors of shape `[batch, feature_h, feature_w,
+        num_anchors * 4]`.
+      raw_scores: A `dict` with keys representing FPN levels and values
+        representing logit tensors of shape `[batch, feature_h, feature_w,
+        num_anchors]`.
+      anchor_boxes: A `dict` with keys representing FPN levels and values
+        representing anchor tenors of shape `[batch_size, K, 4]` representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
+        `box_outputs` and `anchor_boxes`.
+      raw_attributes: If not None, a `dict` of (attribute_name,
+        attribute_prediction) pairs. `attribute_prediction` is a dict that
+        contains keys representing FPN levels and values representing tenors of
+        shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.
+
+    Returns:
+      If `apply_nms` = True, the return is a dictionary with keys:
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` tf.Tensor of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
+          `num_detections` boxes are valid detections
+        `detection_attributes`: A dict. Values of the dict is a `float`
+          tf.Tensor of shape [batch, max_num_detections, attribute_size]
+          representing attribute predictions for detected boxes.
+      If `apply_nms` = False, the return is a dictionary with keys:
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
+          representing all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing socres of all the decoded boxes.
+        `decoded_box_attributes`: A dict. Values in the dict is a
+          `float` tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
+          representing attribute predictions of all the decoded boxes.
+    """
+    if self._config_dict['apply_nms'] and self._config_dict[
+        'nms_version'] == 'tflite':
+      boxes, classes, scores, num_detections = _generate_detections_tflite(
+          raw_boxes, raw_scores, anchor_boxes,
+          self.get_config()['tflite_post_processing_config'])
+      return {
+          'num_detections': num_detections,
+          'detection_boxes': boxes,
+          'detection_classes': classes,
+          'detection_scores': scores
+      }
+
+    boxes, scores, attributes = self._decode_multilevel_outputs(
+        raw_boxes, raw_scores, anchor_boxes, image_shape, raw_attributes)
+
+    if not self._config_dict['apply_nms']:
+      return {
+          'decoded_boxes': boxes,
+          'decoded_box_scores': scores,
+          'decoded_box_attributes': attributes,
+      }
+
+    # Optionally force the NMS to run on CPU.
+    if self._config_dict['use_cpu_nms']:
+      nms_context = tf.device('cpu:0')
+    else:
+      nms_context = contextlib.nullcontext()
+
+    with nms_context:
+      if raw_attributes and (self._config_dict['nms_version'] != 'v1'):
+        raise ValueError(
+            'Attribute learning is only supported for NMSv1 but NMS {} is used.'
+            .format(self._config_dict['nms_version']))
+      if self._config_dict['nms_version'] == 'batched':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_batched(
+                boxes, scores, self._config_dict['pre_nms_score_threshold'],
+                self._config_dict['nms_iou_threshold'],
+                self._config_dict['max_num_detections']))
+        # Set `nmsed_attributes` to None for batched NMS.
+        nmsed_attributes = {}
+      elif self._config_dict['nms_version'] == 'v1':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
+         nmsed_attributes) = (
+             _generate_detections_v1(
+                 boxes,
+                 scores,
+                 attributes=attributes if raw_attributes else None,
+                 pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                 pre_nms_score_threshold=self
+                 ._config_dict['pre_nms_score_threshold'],
+                 nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                 max_num_detections=self._config_dict['max_num_detections'],
+                 soft_nms_sigma=self._config_dict['soft_nms_sigma']))
+      elif self._config_dict['nms_version'] == 'v2':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_v2(
+                boxes,
+                scores,
+                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                pre_nms_score_threshold=self
+                ._config_dict['pre_nms_score_threshold'],
+                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                max_num_detections=self._config_dict['max_num_detections']))
+        # Set `nmsed_attributes` to None for v2.
+        nmsed_attributes = {}
+      else:
+        raise ValueError('NMS version {} not supported.'.format(
+            self._config_dict['nms_version']))
+
+    # Adds 1 to offset the background class which has index 0.
+    nmsed_classes += 1
+
+    return {
+        'num_detections': valid_detections,
+        'detection_boxes': nmsed_boxes,
+        'detection_classes': nmsed_classes,
+        'detection_scores': nmsed_scores,
+        'detection_attributes': nmsed_attributes,
+    }
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)