Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

0225b135 · Srihari Humbarwadi · GitHub · 7479dbb8 · 4c571a3c · 0225b135
Unverified Commit 0225b135 authored Mar 05, 2022 by Srihari Humbarwadi Committed by GitHub Mar 05, 2022
20 changed files
--- a/official/vision/modeling/factory.py
+++ b/official/vision/modeling/factory.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Factory methods to build models."""
+
+from typing import Optional
+
+import tensorflow as tf
+
+from official.vision.configs import image_classification as classification_cfg
+from official.vision.configs import maskrcnn as maskrcnn_cfg
+from official.vision.configs import retinanet as retinanet_cfg
+from official.vision.configs import semantic_segmentation as segmentation_cfg
+from official.vision.modeling import backbones
+from official.vision.modeling import classification_model
+from official.vision.modeling import decoders
+from official.vision.modeling import maskrcnn_model
+from official.vision.modeling import retinanet_model
+from official.vision.modeling import segmentation_model
+from official.vision.modeling.heads import dense_prediction_heads
+from official.vision.modeling.heads import instance_heads
+from official.vision.modeling.heads import segmentation_heads
+from official.vision.modeling.layers import detection_generator
+from official.vision.modeling.layers import mask_sampler
+from official.vision.modeling.layers import roi_aligner
+from official.vision.modeling.layers import roi_generator
+from official.vision.modeling.layers import roi_sampler
+
+
+def build_classification_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: classification_cfg.ImageClassificationModel,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+    skip_logits_layer: bool = False,
+    backbone: Optional[tf.keras.Model] = None) -> tf.keras.Model:
+  """Builds the classification model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+
+  model = classification_model.ClassificationModel(
+      backbone=backbone,
+      num_classes=model_config.num_classes,
+      input_specs=input_specs,
+      dropout_rate=model_config.dropout_rate,
+      kernel_initializer=model_config.kernel_initializer,
+      kernel_regularizer=l2_regularizer,
+      add_head_batch_norm=model_config.add_head_batch_norm,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      skip_logits_layer=skip_logits_layer)
+  return model
+
+
+def build_maskrcnn(input_specs: tf.keras.layers.InputSpec,
+                   model_config: maskrcnn_cfg.MaskRCNN,
+                   l2_regularizer: Optional[
+                       tf.keras.regularizers.Regularizer] = None,
+                   backbone: Optional[tf.keras.Model] = None,
+                   decoder: Optional[tf.keras.Model] = None) -> tf.keras.Model:
+  """Builds Mask R-CNN model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+  backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
+
+  if not decoder:
+    decoder = decoders.factory.build_decoder(
+        input_specs=backbone.output_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+  rpn_head_config = model_config.rpn_head
+  roi_generator_config = model_config.roi_generator
+  roi_sampler_config = model_config.roi_sampler
+  roi_aligner_config = model_config.roi_aligner
+  detection_head_config = model_config.detection_head
+  generator_config = model_config.detection_generator
+  num_anchors_per_location = (
+      len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
+
+  rpn_head = dense_prediction_heads.RPNHead(
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_anchors_per_location=num_anchors_per_location,
+      num_convs=rpn_head_config.num_convs,
+      num_filters=rpn_head_config.num_filters,
+      use_separable_conv=rpn_head_config.use_separable_conv,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+
+  detection_head = instance_heads.DetectionHead(
+      num_classes=model_config.num_classes,
+      num_convs=detection_head_config.num_convs,
+      num_filters=detection_head_config.num_filters,
+      use_separable_conv=detection_head_config.use_separable_conv,
+      num_fcs=detection_head_config.num_fcs,
+      fc_dims=detection_head_config.fc_dims,
+      class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer,
+      name='detection_head')
+
+  if decoder:
+    decoder_features = decoder(backbone_features)
+    rpn_head(decoder_features)
+
+  if roi_sampler_config.cascade_iou_thresholds:
+    detection_head_cascade = [detection_head]
+    for cascade_num in range(len(roi_sampler_config.cascade_iou_thresholds)):
+      detection_head = instance_heads.DetectionHead(
+          num_classes=model_config.num_classes,
+          num_convs=detection_head_config.num_convs,
+          num_filters=detection_head_config.num_filters,
+          use_separable_conv=detection_head_config.use_separable_conv,
+          num_fcs=detection_head_config.num_fcs,
+          fc_dims=detection_head_config.fc_dims,
+          class_agnostic_bbox_pred=detection_head_config
+          .class_agnostic_bbox_pred,
+          activation=norm_activation_config.activation,
+          use_sync_bn=norm_activation_config.use_sync_bn,
+          norm_momentum=norm_activation_config.norm_momentum,
+          norm_epsilon=norm_activation_config.norm_epsilon,
+          kernel_regularizer=l2_regularizer,
+          name='detection_head_{}'.format(cascade_num + 1))
+
+      detection_head_cascade.append(detection_head)
+    detection_head = detection_head_cascade
+
+  roi_generator_obj = roi_generator.MultilevelROIGenerator(
+      pre_nms_top_k=roi_generator_config.pre_nms_top_k,
+      pre_nms_score_threshold=roi_generator_config.pre_nms_score_threshold,
+      pre_nms_min_size_threshold=(
+          roi_generator_config.pre_nms_min_size_threshold),
+      nms_iou_threshold=roi_generator_config.nms_iou_threshold,
+      num_proposals=roi_generator_config.num_proposals,
+      test_pre_nms_top_k=roi_generator_config.test_pre_nms_top_k,
+      test_pre_nms_score_threshold=(
+          roi_generator_config.test_pre_nms_score_threshold),
+      test_pre_nms_min_size_threshold=(
+          roi_generator_config.test_pre_nms_min_size_threshold),
+      test_nms_iou_threshold=roi_generator_config.test_nms_iou_threshold,
+      test_num_proposals=roi_generator_config.test_num_proposals,
+      use_batched_nms=roi_generator_config.use_batched_nms)
+
+  roi_sampler_cascade = []
+  roi_sampler_obj = roi_sampler.ROISampler(
+      mix_gt_boxes=roi_sampler_config.mix_gt_boxes,
+      num_sampled_rois=roi_sampler_config.num_sampled_rois,
+      foreground_fraction=roi_sampler_config.foreground_fraction,
+      foreground_iou_threshold=roi_sampler_config.foreground_iou_threshold,
+      background_iou_high_threshold=(
+          roi_sampler_config.background_iou_high_threshold),
+      background_iou_low_threshold=(
+          roi_sampler_config.background_iou_low_threshold))
+  roi_sampler_cascade.append(roi_sampler_obj)
+  # Initialize additional ROI samplers for cascade heads.
+  if roi_sampler_config.cascade_iou_thresholds:
+    for iou in roi_sampler_config.cascade_iou_thresholds:
+      roi_sampler_obj = roi_sampler.ROISampler(
+          mix_gt_boxes=False,
+          num_sampled_rois=roi_sampler_config.num_sampled_rois,
+          foreground_iou_threshold=iou,
+          background_iou_high_threshold=iou,
+          background_iou_low_threshold=0.0,
+          skip_subsampling=True)
+      roi_sampler_cascade.append(roi_sampler_obj)
+
+  roi_aligner_obj = roi_aligner.MultilevelROIAligner(
+      crop_size=roi_aligner_config.crop_size,
+      sample_offset=roi_aligner_config.sample_offset)
+
+  detection_generator_obj = detection_generator.DetectionGenerator(
+      apply_nms=generator_config.apply_nms,
+      pre_nms_top_k=generator_config.pre_nms_top_k,
+      pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
+      nms_iou_threshold=generator_config.nms_iou_threshold,
+      max_num_detections=generator_config.max_num_detections,
+      nms_version=generator_config.nms_version,
+      use_cpu_nms=generator_config.use_cpu_nms,
+      soft_nms_sigma=generator_config.soft_nms_sigma)
+
+  if model_config.include_mask:
+    mask_head = instance_heads.MaskHead(
+        num_classes=model_config.num_classes,
+        upsample_factor=model_config.mask_head.upsample_factor,
+        num_convs=model_config.mask_head.num_convs,
+        num_filters=model_config.mask_head.num_filters,
+        use_separable_conv=model_config.mask_head.use_separable_conv,
+        activation=model_config.norm_activation.activation,
+        norm_momentum=model_config.norm_activation.norm_momentum,
+        norm_epsilon=model_config.norm_activation.norm_epsilon,
+        kernel_regularizer=l2_regularizer,
+        class_agnostic=model_config.mask_head.class_agnostic)
+
+    mask_sampler_obj = mask_sampler.MaskSampler(
+        mask_target_size=(
+            model_config.mask_roi_aligner.crop_size *
+            model_config.mask_head.upsample_factor),
+        num_sampled_masks=model_config.mask_sampler.num_sampled_masks)
+
+    mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
+        crop_size=model_config.mask_roi_aligner.crop_size,
+        sample_offset=model_config.mask_roi_aligner.sample_offset)
+  else:
+    mask_head = None
+    mask_sampler_obj = None
+    mask_roi_aligner_obj = None
+
+  model = maskrcnn_model.MaskRCNNModel(
+      backbone=backbone,
+      decoder=decoder,
+      rpn_head=rpn_head,
+      detection_head=detection_head,
+      roi_generator=roi_generator_obj,
+      roi_sampler=roi_sampler_cascade,
+      roi_aligner=roi_aligner_obj,
+      detection_generator=detection_generator_obj,
+      mask_head=mask_head,
+      mask_sampler=mask_sampler_obj,
+      mask_roi_aligner=mask_roi_aligner_obj,
+      class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
+      cascade_class_ensemble=detection_head_config.cascade_class_ensemble,
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_scales=model_config.anchor.num_scales,
+      aspect_ratios=model_config.anchor.aspect_ratios,
+      anchor_size=model_config.anchor.anchor_size)
+  return model
+
+
+def build_retinanet(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: retinanet_cfg.RetinaNet,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+    backbone: Optional[tf.keras.Model] = None,
+    decoder: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Builds RetinaNet model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+  backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
+
+  if not decoder:
+    decoder = decoders.factory.build_decoder(
+        input_specs=backbone.output_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+  head_config = model_config.head
+  generator_config = model_config.detection_generator
+  num_anchors_per_location = (
+      len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
+
+  head = dense_prediction_heads.RetinaNetHead(
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_classes=model_config.num_classes,
+      num_anchors_per_location=num_anchors_per_location,
+      num_convs=head_config.num_convs,
+      num_filters=head_config.num_filters,
+      attribute_heads=[
+          cfg.as_dict() for cfg in (head_config.attribute_heads or [])
+      ],
+      use_separable_conv=head_config.use_separable_conv,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+
+  # Builds decoder and head so that their trainable weights are initialized
+  if decoder:
+    decoder_features = decoder(backbone_features)
+    _ = head(decoder_features)
+
+  detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
+      apply_nms=generator_config.apply_nms,
+      pre_nms_top_k=generator_config.pre_nms_top_k,
+      pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
+      nms_iou_threshold=generator_config.nms_iou_threshold,
+      max_num_detections=generator_config.max_num_detections,
+      nms_version=generator_config.nms_version,
+      use_cpu_nms=generator_config.use_cpu_nms,
+      soft_nms_sigma=generator_config.soft_nms_sigma,
+      tflite_post_processing_config=generator_config.tflite_post_processing
+      .as_dict())
+
+  model = retinanet_model.RetinaNetModel(
+      backbone,
+      decoder,
+      head,
+      detection_generator_obj,
+      min_level=model_config.min_level,
+      max_level=model_config.max_level,
+      num_scales=model_config.anchor.num_scales,
+      aspect_ratios=model_config.anchor.aspect_ratios,
+      anchor_size=model_config.anchor.anchor_size)
+  return model
+
+
+def build_segmentation_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: segmentation_cfg.SemanticSegmentationModel,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+    backbone: Optional[tf.keras.regularizers.Regularizer] = None,
+    decoder: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Builds Segmentation model."""
+  norm_activation_config = model_config.norm_activation
+  if not backbone:
+    backbone = backbones.factory.build_backbone(
+        input_specs=input_specs,
+        backbone_config=model_config.backbone,
+        norm_activation_config=norm_activation_config,
+        l2_regularizer=l2_regularizer)
+
+  if not decoder:
+    decoder = decoders.factory.build_decoder(
+        input_specs=backbone.output_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+  head_config = model_config.head
+
+  head = segmentation_heads.SegmentationHead(
+      num_classes=model_config.num_classes,
+      level=head_config.level,
+      num_convs=head_config.num_convs,
+      prediction_kernel_size=head_config.prediction_kernel_size,
+      num_filters=head_config.num_filters,
+      use_depthwise_convolution=head_config.use_depthwise_convolution,
+      upsample_factor=head_config.upsample_factor,
+      feature_fusion=head_config.feature_fusion,
+      low_level=head_config.low_level,
+      low_level_num_filters=head_config.low_level_num_filters,
+      activation=norm_activation_config.activation,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+
+  mask_scoring_head = None
+  if model_config.mask_scoring_head:
+    mask_scoring_head = segmentation_heads.MaskScoring(
+        num_classes=model_config.num_classes,
+        **model_config.mask_scoring_head.as_dict(),
+        activation=norm_activation_config.activation,
+        use_sync_bn=norm_activation_config.use_sync_bn,
+        norm_momentum=norm_activation_config.norm_momentum,
+        norm_epsilon=norm_activation_config.norm_epsilon,
+        kernel_regularizer=l2_regularizer)
+
+  model = segmentation_model.SegmentationModel(
+      backbone, decoder, head, mask_scoring_head=mask_scoring_head)
+  return model
--- a/official/vision/modeling/factory_3d.py
+++ b/official/vision/modeling/factory_3d.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Factory methods to build models."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.core import registry
+from official.vision.configs import video_classification as video_classification_cfg
+from official.vision.modeling import video_classification_model
+from official.vision.modeling import backbones
+
+_REGISTERED_MODEL_CLS = {}
+
+
+def register_model_builder(key: str):
+  """Decorates a builder of model class.
+
+  The builder should be a Callable (a class or a function).
+  This decorator supports registration of a model builder as follows:
+
+  ```
+  class MyModel(tf.keras.Model):
+    pass
+
+  @register_model_builder('mymodel')
+  def builder(input_specs, config, num_classes, l2_reg):
+    return MyModel(...)
+
+  # Builds a MyModel object.
+  my_model = build_model('mymodel', input_specs, config, num_classes, l2_reg)
+  ```
+
+  Args:
+    key: the key to look up the builder.
+
+  Returns:
+    A callable for use as a decorator that registers the decorated builder
+    under `key` so that it can later be looked up by model type.
+  """
+  return registry.register(_REGISTERED_MODEL_CLS, key)
+
+
+def build_model(
+    model_type: str,
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: video_classification_cfg.hyperparams.Config,
+    num_classes: int,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
+  """Builds a model from a config.
+
+  Args:
+    model_type: string name of model type. It should be consistent with
+      ModelConfig.model_type.
+    input_specs: tf.keras.layers.InputSpec.
+    model_config: a OneOfConfig. Model config.
+    num_classes: number of classes.
+    l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
+
+  Returns:
+    A tf.keras.Model instance returned by the registered model builder.
+  """
+  model_builder = registry.lookup(_REGISTERED_MODEL_CLS, model_type)
+
+  return model_builder(input_specs, model_config, num_classes, l2_regularizer)
+
+
+@register_model_builder('video_classification')
+def build_video_classification_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: video_classification_cfg.VideoClassificationModel,
+    num_classes: int,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
+  """Builds the video classification model."""
+  input_specs_dict = {'image': input_specs}
+  norm_activation_config = model_config.norm_activation
+  backbone = backbones.factory.build_backbone(
+      input_specs=input_specs,
+      backbone_config=model_config.backbone,
+      norm_activation_config=norm_activation_config,
+      l2_regularizer=l2_regularizer)
+
+  model = video_classification_model.VideoClassificationModel(
+      backbone=backbone,
+      num_classes=num_classes,
+      input_specs=input_specs_dict,
+      dropout_rate=model_config.dropout_rate,
+      aggregate_endpoints=model_config.aggregate_endpoints,
+      kernel_regularizer=l2_regularizer,
+      require_endpoints=model_config.require_endpoints)
+  return model
--- a/official/vision/modeling/factory_test.py
+++ b/official/vision/modeling/factory_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for factory.py and factory_3d.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.configs import backbones
+from official.vision.configs import backbones_3d
+from official.vision.configs import image_classification as classification_cfg
+from official.vision.configs import maskrcnn as maskrcnn_cfg
+from official.vision.configs import retinanet as retinanet_cfg
+from official.vision.configs import video_classification as video_classification_cfg
+from official.vision.modeling import factory
+from official.vision.modeling import factory_3d
+
+
+class ClassificationModelBuilderTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet', (224, 224), 5e-5),
+      ('resnet', (224, 224), None),
+      ('resnet', (None, None), 5e-5),
+      ('resnet', (None, None), None),
+  )
+  def test_builder(self, backbone_type, input_size, weight_decay):
+    num_classes = 2
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], 3])
+    model_config = classification_cfg.ImageClassificationModel(
+        num_classes=num_classes,
+        backbone=backbones.Backbone(type=backbone_type))
+    l2_regularizer = (
+        tf.keras.regularizers.l2(weight_decay) if weight_decay else None)
+    _ = factory.build_classification_model(
+        input_specs=input_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+
+class MaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet', (640, 640)),
+      ('resnet', (None, None)),
+  )
+  def test_builder(self, backbone_type, input_size):
+    num_classes = 2
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], 3])
+    model_config = maskrcnn_cfg.MaskRCNN(
+        num_classes=num_classes,
+        backbone=backbones.Backbone(type=backbone_type))
+    l2_regularizer = tf.keras.regularizers.l2(5e-5)
+    _ = factory.build_maskrcnn(
+        input_specs=input_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+
+
+class RetinaNetBuilderTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet', (640, 640), False),
+      ('resnet', (None, None), True),
+  )
+  def test_builder(self, backbone_type, input_size, has_att_heads):
+    num_classes = 2
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], 3])
+    if has_att_heads:
+      attribute_heads_config = [
+          retinanet_cfg.AttributeHead(name='att1'),
+          retinanet_cfg.AttributeHead(
+              name='att2', type='classification', size=2),
+      ]
+    else:
+      attribute_heads_config = None
+    model_config = retinanet_cfg.RetinaNet(
+        num_classes=num_classes,
+        backbone=backbones.Backbone(type=backbone_type),
+        head=retinanet_cfg.RetinaNetHead(
+            attribute_heads=attribute_heads_config))
+    l2_regularizer = tf.keras.regularizers.l2(5e-5)
+    _ = factory.build_retinanet(
+        input_specs=input_specs,
+        model_config=model_config,
+        l2_regularizer=l2_regularizer)
+    if has_att_heads:
+      self.assertEqual(model_config.head.attribute_heads[0].as_dict(),
+                       dict(name='att1', type='regression', size=1))
+      self.assertEqual(model_config.head.attribute_heads[1].as_dict(),
+                       dict(name='att2', type='classification', size=2))
+
+
+class VideoClassificationModelBuilderTest(parameterized.TestCase,
+                                          tf.test.TestCase):
+
+  @parameterized.parameters(
+      ('resnet_3d', (8, 224, 224), 5e-5),
+      ('resnet_3d', (None, None, None), 5e-5),
+  )
+  def test_builder(self, backbone_type, input_size, weight_decay):
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None, input_size[0], input_size[1], input_size[2], 3])
+    model_config = video_classification_cfg.VideoClassificationModel(
+        backbone=backbones_3d.Backbone3D(type=backbone_type))
+    l2_regularizer = (
+        tf.keras.regularizers.l2(weight_decay) if weight_decay else None)
+    _ = factory_3d.build_video_classification_model(
+        input_specs=input_specs,
+        model_config=model_config,
+        num_classes=2,
+        l2_regularizer=l2_regularizer)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/heads/__init__.py
+++ b/official/vision/modeling/heads/__init__.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Heads package definition."""
+
+from official.vision.modeling.heads.dense_prediction_heads import RetinaNetHead
+from official.vision.modeling.heads.dense_prediction_heads import RPNHead
+from official.vision.modeling.heads.instance_heads import DetectionHead
+from official.vision.modeling.heads.instance_heads import MaskHead
+from official.vision.modeling.heads.segmentation_heads import SegmentationHead
--- a/official/vision/modeling/heads/dense_prediction_heads.py
+++ b/official/vision/modeling/heads/dense_prediction_heads.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of dense prediction heads."""
+
+from typing import Any, Dict, List, Mapping, Optional, Union
+
+# Import libraries
+
+import numpy as np
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class RetinaNetHead(tf.keras.layers.Layer):
+  """Creates a RetinaNet head."""
+
+  def __init__(
+      self,
+      min_level: int,
+      max_level: int,
+      num_classes: int,
+      num_anchors_per_location: int,
+      num_convs: int = 4,
+      num_filters: int = 256,
+      attribute_heads: Optional[List[Dict[str, Any]]] = None,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      num_params_per_anchor: int = 4,
+      **kwargs):
+    """Initializes a RetinaNet head.
+
+    Args:
+      min_level: An `int` number of minimum feature level.
+      max_level: An `int` number of maximum feature level.
+      num_classes: An `int` number of classes to predict.
+      num_anchors_per_location: An `int` number of anchors per pixel
+        location.
+      num_convs: An `int` number that represents the number of the intermediate
+        conv layers before the prediction.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate conv layers.
+      attribute_heads: If not None, a list that contains a dict for each
+        additional attribute head. Each dict consists of 3 key-value pairs:
+        `name`, `type` ('regression' or 'classification'), and `size` (number
+        of predicted values for each instance).
+      use_separable_conv: A `bool` that indicates whether separable
+        convolution layers are used.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      num_params_per_anchor: Number of parameters required to specify an anchor
+        box. For example, `num_params_per_anchor` would be 4 for axis-aligned
+        anchor boxes specified by their y-centers, x-centers, heights, and
+        widths.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(RetinaNetHead, self).__init__(**kwargs)
+    self._config_dict = {
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_classes': num_classes,
+        'num_anchors_per_location': num_anchors_per_location,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'attribute_heads': attribute_heads,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+        'num_params_per_anchor': num_params_per_anchor,
+    }
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Instantiates all sub-layers of the RetinaNet head.
+
+    The class tower, the box tower and every optional attribute tower each own
+    one list of conv layers that is created once, plus a separate
+    normalization layer for every (level, conv index) pair. Each tower is
+    followed by a single prediction conv layer.
+
+    Args:
+      input_shape: Shape(s) of the input features; forwarded unchanged to the
+        parent `build`.
+
+    Raises:
+      ValueError: If the `type` of an attribute head is neither 'regression'
+        nor 'classification'.
+    """
+    cfg = self._config_dict
+    separable = cfg['use_separable_conv']
+    levels = range(cfg['min_level'], cfg['max_level'] + 1)
+
+    if separable:
+      conv_op = tf.keras.layers.SeparableConv2D
+    else:
+      conv_op = tf.keras.layers.Conv2D
+    if cfg['use_sync_bn']:
+      bn_op = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      bn_op = tf.keras.layers.BatchNormalization
+
+    # Keyword arguments shared by the intermediate conv layers of all towers.
+    conv_kwargs = {
+        'filters': cfg['num_filters'],
+        'kernel_size': 3,
+        'padding': 'same',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': cfg['bias_regularizer'],
+    }
+    if not separable:
+      conv_kwargs['kernel_initializer'] = tf.keras.initializers.RandomNormal(
+          stddev=0.01)
+      conv_kwargs['kernel_regularizer'] = cfg['kernel_regularizer']
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': cfg['norm_momentum'],
+        'epsilon': cfg['norm_epsilon'],
+    }
+
+    def make_tower(conv_name_fn, norm_name_fn):
+      """Returns (convs created once, norms grouped per level) of a tower."""
+      convs = []
+      norms = []
+      for position, level in enumerate(levels):
+        if position == 0:
+          # Conv layers are only created while visiting the minimum level.
+          convs.extend(
+              conv_op(name=conv_name_fn(i), **conv_kwargs)
+              for i in range(cfg['num_convs']))
+        norms.append([
+            bn_op(name=norm_name_fn(level, i), **bn_kwargs)
+            for i in range(cfg['num_convs'])
+        ])
+      return convs, norms
+
+    def predictor_kwargs(filters, bias_initializer):
+      """Returns the keyword arguments of a final prediction conv layer."""
+      kwargs = {
+          'filters': filters,
+          'kernel_size': 3,
+          'padding': 'same',
+          'bias_initializer': bias_initializer,
+          'bias_regularizer': cfg['bias_regularizer'],
+      }
+      if not separable:
+        kwargs.update({
+            'kernel_initializer':
+                tf.keras.initializers.RandomNormal(stddev=1e-5),
+            'kernel_regularizer': cfg['kernel_regularizer'],
+        })
+      return kwargs
+
+    # Class net.
+    self._cls_convs, self._cls_norms = make_tower(
+        'classnet-conv_{}'.format, 'classnet-conv-norm_{}_{}'.format)
+    self._classifier = conv_op(
+        name='scores',
+        **predictor_kwargs(
+            cfg['num_classes'] * cfg['num_anchors_per_location'],
+            tf.constant_initializer(-np.log((1 - 0.01) / 0.01))))
+
+    # Box net.
+    self._box_convs, self._box_norms = make_tower(
+        'boxnet-conv_{}'.format, 'boxnet-conv-norm_{}_{}'.format)
+    self._box_regressor = conv_op(
+        name='boxes',
+        **predictor_kwargs(
+            cfg['num_params_per_anchor'] * cfg['num_anchors_per_location'],
+            tf.zeros_initializer()))
+
+    # Attribute learning nets.
+    if cfg['attribute_heads']:
+      self._att_predictors = {}
+      self._att_convs = {}
+      self._att_norms = {}
+
+      for att_config in cfg['attribute_heads']:
+        att_name = att_config['name']
+        att_type = att_config['type']
+        att_size = att_config['size']
+
+        # Build conv and norm layers.
+        tower_convs, tower_norms = make_tower(
+            lambda i, name=att_name: '{}-conv_{}'.format(name, i),
+            lambda level, i, name=att_name: '{}-conv-norm_{}_{}'.format(
+                name, level, i))
+        self._att_convs[att_name] = tower_convs
+        self._att_norms[att_name] = tower_norms
+
+        # Build the final prediction layer; its bias initializer depends on
+        # the attribute type.
+        att_filters = att_size * cfg['num_anchors_per_location']
+        if att_type == 'regression':
+          att_bias_initializer = tf.zeros_initializer()
+        elif att_type == 'classification':
+          att_bias_initializer = tf.constant_initializer(
+              -np.log((1 - 0.01) / 0.01))
+        else:
+          raise ValueError(
+              'Attribute head type {} not supported.'.format(att_type))
+
+        self._att_predictors[att_name] = conv_op(
+            name='{}_attributes'.format(att_name),
+            **predictor_kwargs(att_filters, att_bias_initializer))
+
+    super(RetinaNetHead, self).build(input_shape)
+
+  def call(self, features: Mapping[str, tf.Tensor]):
+    """Runs the class, box and attribute towers on every feature level.
+
+    Args:
+      features: A `dict` of `tf.Tensor` where
+        - key: A `str` of the level of the multilevel features.
+        - values: A `tf.Tensor`, the feature map tensors, whose shape is
+            [batch, height_l, width_l, channels].
+
+    Returns:
+      scores: A `dict` of `tf.Tensor` which includes scores of the predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box scores predicted from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, num_classes * num_anchors_per_location].
+      boxes: A `dict` of `tf.Tensor` which includes coordinates of the
+        predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box coordinates predicted from a
+            particular feature level, whose shape is
+            [batch, height_l, width_l,
+             num_params_per_anchor * num_anchors_per_location].
+      attributes: a dict of (attribute_name, attribute_prediction). Each
+        `attribute_prediction` is a dict of:
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the attribute values predicted from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l,
+            attribute_size * num_anchors_per_location].
+        Can be an empty dictionary if no attribute learning is required.
+    """
+    cfg = self._config_dict
+    att_configs = cfg['attribute_heads'] or []
+
+    def run_tower(inputs, convs, norms):
+      """Applies conv -> norm -> activation for each layer of a tower."""
+      outputs = inputs
+      for conv, norm in zip(convs, norms):
+        outputs = self._activation(norm(conv(outputs)))
+      return outputs
+
+    scores = {}
+    boxes = {}
+    attributes = {att_config['name']: {} for att_config in att_configs}
+
+    level_range = range(cfg['min_level'], cfg['max_level'] + 1)
+    for level_index, level in enumerate(level_range):
+      level_key = str(level)
+      level_features = features[level_key]
+
+      # class net.
+      cls_outputs = run_tower(
+          level_features, self._cls_convs, self._cls_norms[level_index])
+      scores[level_key] = self._classifier(cls_outputs)
+
+      # box net.
+      box_outputs = run_tower(
+          level_features, self._box_convs, self._box_norms[level_index])
+      boxes[level_key] = self._box_regressor(box_outputs)
+
+      # attribute nets.
+      for att_config in att_configs:
+        att_name = att_config['name']
+        att_outputs = run_tower(
+            level_features,
+            self._att_convs[att_name],
+            self._att_norms[att_name][level_index])
+        attributes[att_name][level_key] = (
+            self._att_predictors[att_name](att_outputs))
+
+    return scores, boxes, attributes
+
+  def get_config(self):
+    """Returns the constructor arguments of this head as a new `dict`.
+
+    A shallow copy is returned instead of the stored dictionary itself, so
+    that callers (or subclasses that extend the returned config in place)
+    cannot accidentally modify the configuration kept by this layer.
+
+    Returns:
+      A `dict` holding the same keys and values as the stored config.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a head by passing the entries of `config` as keyword args."""
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class RPNHead(tf.keras.layers.Layer):
+  """Creates a Region Proposal Network (RPN) head."""
+
+  def __init__(
+      self,
+      min_level: int,
+      max_level: int,
+      num_anchors_per_location: int,
+      num_convs: int = 1,
+      num_filters: int = 256,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a Region Proposal Network head.
+
+    Args:
+      min_level: An `int` number of minimum feature level.
+      max_level: An `int` number of maximum feature level.
+      num_anchors_per_location: An `int` number of anchors per pixel location.
+      num_convs: An `int` number that represents the number of the intermediate
+        convolution layers before the prediction.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate convolution layers.
+      use_separable_conv: A `bool` that indicates whether separable
+        convolution layers are used.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(RPNHead, self).__init__(**kwargs)
+    self._config_dict = {
+        'min_level': min_level,
+        'max_level': max_level,
+        'num_anchors_per_location': num_anchors_per_location,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+
+    # Normalize over the channel axis of the configured image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the sub-layers of the head.
+
+    The intermediate conv layers are created once, while a separate
+    normalization layer is created for every (level, conv index) pair.
+
+    Args:
+      input_shape: Shape(s) of the input features; forwarded unchanged to the
+        parent `build`.
+    """
+    conv_op = (tf.keras.layers.SeparableConv2D
+               if self._config_dict['use_separable_conv']
+               else tf.keras.layers.Conv2D)
+    conv_kwargs = {
+        'filters': self._config_dict['num_filters'],
+        'kernel_size': 3,
+        'padding': 'same',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    # The kernel initializer/regularizer are only set for regular Conv2D.
+    if not self._config_dict['use_separable_conv']:
+      conv_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=0.01),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if self._config_dict['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+
+    self._convs = []
+    self._norms = []
+    for level in range(
+        self._config_dict['min_level'], self._config_dict['max_level'] + 1):
+      this_level_norms = []
+      for i in range(self._config_dict['num_convs']):
+        # Conv layers are created only once, while visiting the minimum level.
+        if level == self._config_dict['min_level']:
+          conv_name = 'rpn-conv_{}'.format(i)
+          self._convs.append(conv_op(name=conv_name, **conv_kwargs))
+        norm_name = 'rpn-conv-norm_{}_{}'.format(level, i)
+        this_level_norms.append(bn_op(name=norm_name, **bn_kwargs))
+      self._norms.append(this_level_norms)
+
+    # One score channel per anchor.
+    classifier_kwargs = {
+        'filters': self._config_dict['num_anchors_per_location'],
+        'kernel_size': 1,
+        'padding': 'valid',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    if not self._config_dict['use_separable_conv']:
+      classifier_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=1e-5),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    self._classifier = conv_op(name='rpn-scores', **classifier_kwargs)
+
+    # Four box channels per anchor.
+    box_regressor_kwargs = {
+        'filters': 4 * self._config_dict['num_anchors_per_location'],
+        'kernel_size': 1,
+        'padding': 'valid',
+        'bias_initializer': tf.zeros_initializer(),
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    if not self._config_dict['use_separable_conv']:
+      box_regressor_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.RandomNormal(
+              stddev=1e-5),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+      })
+    self._box_regressor = conv_op(name='rpn-boxes', **box_regressor_kwargs)
+
+    super(RPNHead, self).build(input_shape)
+
+  def call(self, features: Mapping[str, tf.Tensor]):
+    """Forward pass of the RPN head.
+
+    Args:
+      features: A `dict` of `tf.Tensor` where
+        - key: A `str` of the level of the multilevel features.
+        - values: A `tf.Tensor`, the feature map tensors, whose shape is [batch,
+          height_l, width_l, channels].
+
+    Returns:
+      scores: A `dict` of `tf.Tensor` which includes scores of the predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box scores predicted from a particular
+            feature level, whose shape is
+            [batch, height_l, width_l, num_anchors_per_location].
+      boxes: A `dict` of `tf.Tensor` which includes coordinates of the
+        predictions.
+        - key: A `str` of the level of the multilevel predictions.
+        - values: A `tf.Tensor` of the box coordinates predicted from a
+            particular feature level, whose shape is
+            [batch, height_l, width_l, 4 * num_anchors_per_location].
+    """
+    scores = {}
+    boxes = {}
+    for i, level in enumerate(
+        range(self._config_dict['min_level'],
+              self._config_dict['max_level'] + 1)):
+      x = features[str(level)]
+      # The same conv layers run at every level; norms are level-specific.
+      for conv, norm in zip(self._convs, self._norms[i]):
+        x = conv(x)
+        x = norm(x)
+        x = self._activation(x)
+      scores[str(level)] = self._classifier(x)
+      boxes[str(level)] = self._box_regressor(x)
+    return scores, boxes
+
+  def get_config(self):
+    """Returns the constructor arguments of this head as a new `dict`.
+
+    A shallow copy is returned instead of the stored dictionary itself, so
+    that callers cannot accidentally modify the configuration kept by this
+    layer.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a head by passing the entries of `config` as keyword args."""
+    return cls(**config)
--- a/official/vision/modeling/heads/dense_prediction_heads_test.py
+++ b/official/vision/modeling/heads/dense_prediction_heads_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for dense_prediction_heads.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.heads import dense_prediction_heads
+
+
+class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape and config round-trip checks for `RetinaNetHead`."""
+
+  @parameterized.parameters(
+      (False, False, False),
+      (False, True, False),
+      (True, False, True),
+      (True, True, True),
+  )
+  def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads):
+    """Runs a forward pass and verifies the output shapes of each level."""
+    attribute_heads = (
+        [dict(name='depth', type='regression', size=1)]
+        if has_att_heads else None)
+    head_kwargs = dict(
+        min_level=3,
+        max_level=4,
+        num_classes=3,
+        num_anchors_per_location=3,
+        num_convs=2,
+        num_filters=256,
+        attribute_heads=attribute_heads,
+        use_separable_conv=use_separable_conv,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    retinanet_head = dense_prediction_heads.RetinaNetHead(**head_kwargs)
+
+    features = {
+        '3': np.random.rand(2, 128, 128, 16),
+        '4': np.random.rand(2, 64, 64, 16),
+    }
+    scores, boxes, attributes = retinanet_head(features)
+
+    # Batch and spatial dimensions expected at each feature level.
+    leading_dims = {'3': [2, 128, 128], '4': [2, 64, 64]}
+    for level, dims in leading_dims.items():
+      self.assertAllEqual(scores[level].numpy().shape, dims + [9])
+    for level, dims in leading_dims.items():
+      self.assertAllEqual(boxes[level].numpy().shape, dims + [12])
+    if has_att_heads:
+      for att in attributes.values():
+        for level, dims in leading_dims.items():
+          self.assertAllEqual(att[level].numpy().shape, dims + [3])
+
+  def test_serialize_deserialize(self):
+    """Checks that a head rebuilt from its config reports the same config."""
+    head_kwargs = dict(
+        min_level=3,
+        max_level=7,
+        num_classes=3,
+        num_anchors_per_location=9,
+        num_convs=2,
+        num_filters=16,
+        attribute_heads=None,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    retinanet_head = dense_prediction_heads.RetinaNetHead(**head_kwargs)
+    restored_head = dense_prediction_heads.RetinaNetHead.from_config(
+        retinanet_head.get_config())
+    self.assertAllEqual(
+        retinanet_head.get_config(), restored_head.get_config())
+
+
+class RpnHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape and config round-trip checks for `RPNHead`."""
+
+  @parameterized.parameters(
+      (False, False),
+      (False, True),
+      (True, False),
+      (True, True),
+  )
+  def test_forward(self, use_separable_conv, use_sync_bn):
+    """Runs a forward pass and verifies the output shapes of each level."""
+    head_kwargs = dict(
+        min_level=3,
+        max_level=4,
+        num_anchors_per_location=3,
+        num_convs=2,
+        num_filters=256,
+        use_separable_conv=use_separable_conv,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    rpn_head = dense_prediction_heads.RPNHead(**head_kwargs)
+
+    features = {
+        '3': np.random.rand(2, 128, 128, 16),
+        '4': np.random.rand(2, 64, 64, 16),
+    }
+    scores, boxes = rpn_head(features)
+
+    # Batch and spatial dimensions expected at each feature level.
+    leading_dims = {'3': [2, 128, 128], '4': [2, 64, 64]}
+    for level, dims in leading_dims.items():
+      self.assertAllEqual(scores[level].numpy().shape, dims + [3])
+    for level, dims in leading_dims.items():
+      self.assertAllEqual(boxes[level].numpy().shape, dims + [12])
+
+  def test_serialize_deserialize(self):
+    """Checks that a head rebuilt from its config reports the same config."""
+    head_kwargs = dict(
+        min_level=3,
+        max_level=7,
+        num_anchors_per_location=9,
+        num_convs=2,
+        num_filters=16,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    rpn_head = dense_prediction_heads.RPNHead(**head_kwargs)
+    restored_head = dense_prediction_heads.RPNHead.from_config(
+        rpn_head.get_config())
+    self.assertAllEqual(rpn_head.get_config(), restored_head.get_config())
+
+
+# Run the tests of this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/heads/instance_heads.py
+++ b/official/vision/modeling/heads/instance_heads.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of instance prediction heads."""
+
+from typing import List, Union, Optional
+# Import libraries
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DetectionHead(tf.keras.layers.Layer):
+  """Creates a detection head."""
+
+  def __init__(
+      self,
+      num_classes: int,
+      num_convs: int = 0,
+      num_filters: int = 256,
+      use_separable_conv: bool = False,
+      num_fcs: int = 2,
+      fc_dims: int = 1024,
+      class_agnostic_bbox_pred: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a detection head.
+
+    Args:
+      num_classes: An `int` for the number of classes.
+      num_convs: An `int` number that represents the number of the intermediate
+        convolution layers before the FC layers.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate convolution layers.
+      use_separable_conv: A `bool` that indicates whether separable
+        convolution layers are used.
+      num_fcs: An `int` number that represents the number of FC layers before
+        the predictions.
+      fc_dims: An `int` number that represents the number of dimension of the FC
+        layers.
+      class_agnostic_bbox_pred: `bool`, indicating whether a single box is
+        predicted per ROI (True) or one box for every class (False).
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(DetectionHead, self).__init__(**kwargs)
+    self._config_dict = {
+        'num_classes': num_classes,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_separable_conv': use_separable_conv,
+        'num_fcs': num_fcs,
+        'fc_dims': fc_dims,
+        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+
+    # Normalize over the channel axis of the configured image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the sub-layers of the head.
+
+    Builds `num_convs` conv + norm pairs, `num_fcs` dense + norm pairs, and the
+    final dense layers for class scores and box outputs.
+
+    Args:
+      input_shape: Shape(s) of the inputs; forwarded unchanged to the parent
+        `build`.
+    """
+    conv_op = (tf.keras.layers.SeparableConv2D
+               if self._config_dict['use_separable_conv']
+               else tf.keras.layers.Conv2D)
+    conv_kwargs = {
+        'filters': self._config_dict['num_filters'],
+        'kernel_size': 3,
+        'padding': 'same',
+    }
+    # Separable convs take depthwise/pointwise arguments in place of the
+    # single kernel initializer/regularizer of a regular conv.
+    if self._config_dict['use_separable_conv']:
+      conv_kwargs.update({
+          'depthwise_initializer': tf.keras.initializers.VarianceScaling(
+              scale=2, mode='fan_out', distribution='untruncated_normal'),
+          'pointwise_initializer': tf.keras.initializers.VarianceScaling(
+              scale=2, mode='fan_out', distribution='untruncated_normal'),
+          'bias_initializer': tf.zeros_initializer(),
+          'depthwise_regularizer': self._config_dict['kernel_regularizer'],
+          'pointwise_regularizer': self._config_dict['kernel_regularizer'],
+          'bias_regularizer': self._config_dict['bias_regularizer'],
+      })
+    else:
+      conv_kwargs.update({
+          'kernel_initializer': tf.keras.initializers.VarianceScaling(
+              scale=2, mode='fan_out', distribution='untruncated_normal'),
+          'bias_initializer': tf.zeros_initializer(),
+          'kernel_regularizer': self._config_dict['kernel_regularizer'],
+          'bias_regularizer': self._config_dict['bias_regularizer'],
+      })
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if self._config_dict['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+
+    self._convs = []
+    self._conv_norms = []
+    for i in range(self._config_dict['num_convs']):
+      conv_name = 'detection-conv_{}'.format(i)
+      self._convs.append(conv_op(name=conv_name, **conv_kwargs))
+      bn_name = 'detection-conv-bn_{}'.format(i)
+      self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs))
+
+    self._fcs = []
+    self._fc_norms = []
+    for i in range(self._config_dict['num_fcs']):
+      fc_name = 'detection-fc_{}'.format(i)
+      self._fcs.append(
+          tf.keras.layers.Dense(
+              units=self._config_dict['fc_dims'],
+              kernel_initializer=tf.keras.initializers.VarianceScaling(
+                  scale=1 / 3.0, mode='fan_out', distribution='uniform'),
+              kernel_regularizer=self._config_dict['kernel_regularizer'],
+              bias_regularizer=self._config_dict['bias_regularizer'],
+              name=fc_name))
+      bn_name = 'detection-fc-bn_{}'.format(i)
+      self._fc_norms.append(bn_op(name=bn_name, **bn_kwargs))
+
+    self._classifier = tf.keras.layers.Dense(
+        units=self._config_dict['num_classes'],
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=self._config_dict['kernel_regularizer'],
+        bias_regularizer=self._config_dict['bias_regularizer'],
+        name='detection-scores')
+
+    # A class-agnostic head predicts one box per ROI, otherwise one per class.
+    num_box_outputs = (4 if self._config_dict['class_agnostic_bbox_pred'] else
+                       self._config_dict['num_classes'] * 4)
+    self._box_regressor = tf.keras.layers.Dense(
+        units=num_box_outputs,
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=self._config_dict['kernel_regularizer'],
+        bias_regularizer=self._config_dict['bias_regularizer'],
+        name='detection-boxes')
+
+    super(DetectionHead, self).build(input_shape)
+
+  def call(self, inputs: tf.Tensor, training: Optional[bool] = None):
+    """Forward pass of box and class branches for the Mask-RCNN model.
+
+    Args:
+      inputs: A `tf.Tensor` of the shape [batch_size, num_rois, roi_height,
+        roi_width, roi_channels], representing the ROI features. All
+        dimensions except the first are read from the static shape.
+      training: a `bool` indicating whether it is in `training` mode. It is
+        not referenced directly in this method body.
+        NOTE(review): the sub-layers presumably pick the mode up from the
+        Keras call context - confirm.
+
+    Returns:
+      class_outputs: A `tf.Tensor` of the shape
+        [batch_size, num_rois, num_classes], representing the class predictions.
+      box_outputs: A `tf.Tensor` of the shape
+        [batch_size, num_rois, num_classes * 4], or [batch_size, num_rois, 4]
+        when `class_agnostic_bbox_pred` is set, representing the box
+        predictions.
+    """
+    roi_features = inputs
+    _, num_rois, height, width, filters = roi_features.get_shape().as_list()
+
+    # Fold the ROI dimension into the batch so the conv layers see 4-D input.
+    x = tf.reshape(roi_features, [-1, height, width, filters])
+    for conv, bn in zip(self._convs, self._conv_norms):
+      x = conv(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    # The convs may have changed the channel count; flatten each ROI.
+    _, _, _, filters = x.get_shape().as_list()
+    x = tf.reshape(x, [-1, num_rois, height * width * filters])
+
+    for fc, bn in zip(self._fcs, self._fc_norms):
+      x = fc(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    classes = self._classifier(x)
+    boxes = self._box_regressor(x)
+    return classes, boxes
+
+  def get_config(self):
+    """Returns the constructor arguments of this head as a new `dict`.
+
+    A shallow copy is returned instead of the stored dictionary itself, so
+    that callers cannot accidentally modify the configuration kept by this
+    layer.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a head by passing the entries of `config` as keyword args."""
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskHead(tf.keras.layers.Layer):
+  """Creates a mask head.
+
+  The head applies a stack of 3x3 convolutions to every ROI, upsamples the
+  result with a transposed convolution, predicts mask logits with a 1x1
+  convolution and returns, for each ROI, the mask of the class given for it.
+  """
+
+  def __init__(
+      self,
+      num_classes: int,
+      upsample_factor: int = 2,
+      num_convs: int = 4,
+      num_filters: int = 256,
+      use_separable_conv: bool = False,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      class_agnostic: bool = False,
+      **kwargs):
+    """Initializes a mask head.
+
+    Args:
+      num_classes: An `int` of the number of classes.
+      upsample_factor: An `int` that indicates the upsample factor to generate
+        the final predicted masks. It should be >= 1.
+      num_convs: An `int` number that represents the number of the intermediate
+        convolution layers before the mask prediction layers.
+      num_filters: An `int` number that represents the number of filters of the
+        intermediate convolution layers.
+      use_separable_conv: A `bool` that indicates whether the separable
+        convolution layers is used.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      class_agnostic: A `bool`. If set, we use a single channel mask head that
+        is shared between all classes.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(MaskHead, self).__init__(**kwargs)
+    self._config_dict = {
+        'num_classes': num_classes,
+        'upsample_factor': upsample_factor,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_separable_conv': use_separable_conv,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+        'class_agnostic': class_agnostic
+    }
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def _make_conv_kwargs(self, filters: int, kernel_size: int, padding: str):
+    """Returns the keyword arguments used to construct a convolution layer.
+
+    Args:
+      filters: An `int` number of output filters.
+      kernel_size: An `int` size of the convolution kernel.
+      padding: A `str` padding mode, e.g. 'same' or 'valid'.
+
+    Returns:
+      A `dict` of keyword arguments. It carries the depthwise/pointwise
+      arguments of `SeparableConv2D` when `use_separable_conv` is set and the
+      kernel arguments of `Conv2D` otherwise.
+    """
+    def kernel_initializer():
+      # A new initializer object is created for every argument that needs one.
+      return tf.keras.initializers.VarianceScaling(
+          scale=2, mode='fan_out', distribution='untruncated_normal')
+
+    kernel_regularizer = self._config_dict['kernel_regularizer']
+    bias_regularizer = self._config_dict['bias_regularizer']
+    conv_kwargs = {
+        'filters': filters,
+        'kernel_size': kernel_size,
+        'padding': padding,
+    }
+    if self._config_dict['use_separable_conv']:
+      conv_kwargs.update({
+          'depthwise_initializer': kernel_initializer(),
+          'pointwise_initializer': kernel_initializer(),
+          'bias_initializer': tf.zeros_initializer(),
+          'depthwise_regularizer': kernel_regularizer,
+          'pointwise_regularizer': kernel_regularizer,
+          'bias_regularizer': bias_regularizer,
+      })
+    else:
+      conv_kwargs.update({
+          'kernel_initializer': kernel_initializer(),
+          'bias_initializer': tf.zeros_initializer(),
+          'kernel_regularizer': kernel_regularizer,
+          'bias_regularizer': bias_regularizer,
+      })
+    return conv_kwargs
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the variables of the head."""
+    conv_op = (tf.keras.layers.SeparableConv2D
+               if self._config_dict['use_separable_conv']
+               else tf.keras.layers.Conv2D)
+    # The same kwargs dict is used for all intermediate convolutions.
+    conv_kwargs = self._make_conv_kwargs(
+        filters=self._config_dict['num_filters'],
+        kernel_size=3,
+        padding='same')
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if self._config_dict['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+
+    self._convs = []
+    self._conv_norms = []
+    for i in range(self._config_dict['num_convs']):
+      conv_name = 'mask-conv_{}'.format(i)
+      self._convs.append(conv_op(name=conv_name, **conv_kwargs))
+      bn_name = 'mask-conv-bn_{}'.format(i)
+      self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs))
+
+    # Kernel size equal to the stride with 'valid' padding: the transposed
+    # convolution scales height and width by exactly `upsample_factor`.
+    self._deconv = tf.keras.layers.Conv2DTranspose(
+        filters=self._config_dict['num_filters'],
+        kernel_size=self._config_dict['upsample_factor'],
+        strides=self._config_dict['upsample_factor'],
+        padding='valid',
+        kernel_initializer=tf.keras.initializers.VarianceScaling(
+            scale=2, mode='fan_out', distribution='untruncated_normal'),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=self._config_dict['kernel_regularizer'],
+        bias_regularizer=self._config_dict['bias_regularizer'],
+        name='mask-upsampling')
+    self._deconv_bn = bn_op(name='mask-deconv-bn', **bn_kwargs)
+
+    # A class-agnostic head predicts one mask channel shared by all classes.
+    if self._config_dict['class_agnostic']:
+      num_filters = 1
+    else:
+      num_filters = self._config_dict['num_classes']
+
+    self._mask_regressor = conv_op(
+        name='mask-logits',
+        **self._make_conv_kwargs(
+            filters=num_filters, kernel_size=1, padding='valid'))
+
+    super(MaskHead, self).build(input_shape)
+
+  def call(self, inputs: List[tf.Tensor], training: bool = None):
+    """Forward pass of mask branch for the Mask-RCNN model.
+
+    Args:
+      inputs: A `list` of two tensors where
+        inputs[0]: A `tf.Tensor` of shape [batch_size, num_instances,
+          roi_height, roi_width, roi_channels], representing the ROI features.
+        inputs[1]: A `tf.Tensor` of shape [batch_size, num_instances],
+          representing the classes of the ROIs.
+      training: A `bool` indicating whether it is in `training` mode.
+
+    Returns:
+      mask_outputs: A `tf.Tensor` of shape
+        [batch_size, num_instances, roi_height * upsample_factor,
+         roi_width * upsample_factor], representing the mask predictions.
+    """
+    roi_features, roi_classes = inputs
+    batch_size, num_rois, height, width, filters = (
+        roi_features.get_shape().as_list())
+    if batch_size is None:
+      # Fall back to the dynamic batch size when it is not known statically.
+      batch_size = tf.shape(roi_features)[0]
+
+    # Fold the ROI axis into the batch axis so that 2D layers can be applied.
+    x = tf.reshape(roi_features, [-1, height, width, filters])
+    for conv, bn in zip(self._convs, self._conv_norms):
+      x = conv(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    x = self._deconv(x)
+    x = self._deconv_bn(x)
+    x = self._activation(x)
+
+    logits = self._mask_regressor(x)
+
+    mask_height = height * self._config_dict['upsample_factor']
+    mask_width = width * self._config_dict['upsample_factor']
+
+    # Restore the separate batch and ROI axes.
+    if self._config_dict['class_agnostic']:
+      logits = tf.reshape(logits, [-1, num_rois, mask_height, mask_width, 1])
+    else:
+      logits = tf.reshape(
+          logits,
+          [-1, num_rois, mask_height, mask_width,
+           self._config_dict['num_classes']])
+
+    batch_indices = tf.tile(
+        tf.expand_dims(tf.range(batch_size), axis=1), [1, num_rois])
+    mask_indices = tf.tile(
+        tf.expand_dims(tf.range(num_rois), axis=0), [batch_size, 1])
+
+    # With a single shared channel every ROI reads channel 0.
+    if self._config_dict['class_agnostic']:
+      class_gather_indices = tf.zeros_like(roi_classes, dtype=tf.int32)
+    else:
+      class_gather_indices = tf.cast(roi_classes, dtype=tf.int32)
+
+    # Each index triple (batch, roi, class) selects one [height, width] mask
+    # once the class axis has been moved in front of the spatial axes.
+    gather_indices = tf.stack(
+        [batch_indices, mask_indices, class_gather_indices],
+        axis=2)
+    mask_outputs = tf.gather_nd(
+        tf.transpose(logits, [0, 1, 4, 2, 3]), gather_indices)
+    return mask_outputs
+
+  def get_config(self):
+    """Returns a copy of the constructor arguments of this head.
+
+    A shallow copy is returned so that callers editing the result do not
+    change the configuration stored on this layer.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a `MaskHead` from a config produced by `get_config`."""
+    return cls(**config)
--- a/official/vision/modeling/heads/instance_heads_test.py
+++ b/official/vision/modeling/heads/instance_heads_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for instance_heads.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.heads import instance_heads
+
+
+class DetectionHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Shape and serialization tests for `instance_heads.DetectionHead`."""
+
+  @parameterized.parameters(
+      (0, 0, False, False),
+      (0, 1, False, False),
+      (1, 0, False, False),
+      (1, 1, False, False),
+  )
+  def test_forward(self, num_convs, num_fcs, use_separable_conv, use_sync_bn):
+    # The class branch must emit one score per class and the box branch four
+    # values per class for every ROI.
+    batch_size, num_rois, num_classes = 2, 10, 3
+    head = instance_heads.DetectionHead(
+        num_classes=num_classes,
+        num_convs=num_convs,
+        num_filters=16,
+        use_separable_conv=use_separable_conv,
+        num_fcs=num_fcs,
+        fc_dims=4,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    roi_features = np.random.rand(batch_size, num_rois, 128, 128, 16)
+    class_outputs, box_outputs = head(roi_features)
+    self.assertAllEqual(
+        class_outputs.numpy().shape, [batch_size, num_rois, num_classes])
+    self.assertAllEqual(
+        box_outputs.numpy().shape, [batch_size, num_rois, num_classes * 4])
+
+  def test_serialize_deserialize(self):
+    # A head rebuilt from its own config must report that same config.
+    original = instance_heads.DetectionHead(
+        num_classes=91,
+        num_convs=0,
+        num_filters=256,
+        use_separable_conv=False,
+        num_fcs=2,
+        fc_dims=1024,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    restored = instance_heads.DetectionHead.from_config(original.get_config())
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+
+class MaskHeadTest(parameterized.TestCase, tf.test.TestCase):
+  """Shape and serialization tests for `instance_heads.MaskHead`."""
+
+  @parameterized.parameters(
+      (1, 1, False),
+      (1, 2, False),
+      (2, 1, False),
+      (2, 2, False),
+  )
+  def test_forward(self, upsample_factor, num_convs, use_sync_bn):
+    # The predicted masks must be `upsample_factor` times larger than the ROI
+    # features along both spatial axes.
+    batch_size, num_rois, roi_size = 2, 10, 14
+    head = instance_heads.MaskHead(
+        num_classes=3,
+        upsample_factor=upsample_factor,
+        num_convs=num_convs,
+        num_filters=16,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=use_sync_bn,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    roi_features = np.random.rand(batch_size, num_rois, roi_size, roi_size, 16)
+    roi_classes = np.zeros((batch_size, num_rois))
+    masks = head([roi_features, roi_classes])
+    mask_size = roi_size * upsample_factor
+    self.assertAllEqual(
+        masks.numpy().shape, [batch_size, num_rois, mask_size, mask_size])
+
+  def test_serialize_deserialize(self):
+    # A head rebuilt from its own config must report that same config.
+    original = instance_heads.MaskHead(
+        num_classes=3,
+        upsample_factor=2,
+        num_convs=1,
+        num_filters=256,
+        use_separable_conv=False,
+        activation='relu',
+        use_sync_bn=False,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        kernel_regularizer=None,
+        bias_regularizer=None,
+    )
+    restored = instance_heads.MaskHead.from_config(original.get_config())
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+  def test_forward_class_agnostic(self):
+    # With the default upsample factor of 2, 14x14 ROIs give 28x28 masks.
+    head = instance_heads.MaskHead(num_classes=3, class_agnostic=True)
+    roi_features = np.random.rand(2, 10, 14, 14, 16)
+    roi_classes = np.zeros((2, 10))
+    masks = head([roi_features, roi_classes])
+    self.assertAllEqual(masks.numpy().shape, [2, 10, 28, 28])
+
+
+# Hand control to the TensorFlow test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/heads/segmentation_heads.py
+++ b/official/vision/modeling/heads/segmentation_heads.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of segmentation heads."""
+from typing import List, Union, Optional, Mapping, Tuple, Any
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.modeling.layers import nn_layers
+from official.vision.ops import spatial_transform_ops
+
+
+class MaskScoring(tf.keras.Model):
+  """Creates a mask scoring layer.
+
+  This implements mask scoring layer from the paper:
+
+  Zhaojin Huang, Lichao Huang, Yongchao Gong, Chang Huang, Xinggang Wang.
+  Mask Scoring R-CNN.
+  (https://arxiv.org/pdf/1903.00241.pdf)
+  """
+
+  def __init__(
+      self,
+      num_classes: int,
+      fc_input_size: List[int],
+      num_convs: int = 3,
+      num_filters: int = 256,
+      fc_dims: int = 1024,
+      num_fcs: int = 2,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes mask scoring layer.
+
+    Args:
+      num_classes: An `int` for number of classes.
+      fc_input_size: A List of `int` for the input size of the
+        fully connected layers.
+      num_convs: An `int` for number of conv layers.
+      num_filters: An `int` for the number of filters for conv layers.
+      fc_dims: An `int` number of filters for each fully connected layers.
+      num_fcs: An `int` for number of fully connected layers.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A bool, whether or not to use sync batch normalization.
+      norm_momentum: A float for the momentum in BatchNorm. Defaults to 0.99.
+      norm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
+        0.001.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(MaskScoring, self).__init__(**kwargs)
+
+    self._config_dict = {
+        'num_classes': num_classes,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'fc_input_size': fc_input_size,
+        'fc_dims': fc_dims,
+        'num_fcs': num_fcs,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'activation': activation,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the variables of the mask scoring head."""
+    conv_op = tf.keras.layers.Conv2D
+    # The same kwargs dict is used for every convolution of the stack.
+    conv_kwargs = {
+        'filters': self._config_dict['num_filters'],
+        'kernel_size': 3,
+        'padding': 'same',
+        'kernel_initializer': tf.keras.initializers.VarianceScaling(
+            scale=2, mode='fan_out', distribution='untruncated_normal'),
+        'bias_initializer': tf.zeros_initializer(),
+        'kernel_regularizer': self._config_dict['kernel_regularizer'],
+        'bias_regularizer': self._config_dict['bias_regularizer'],
+    }
+    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
+             if self._config_dict['use_sync_bn']
+             else tf.keras.layers.BatchNormalization)
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': self._config_dict['norm_momentum'],
+        'epsilon': self._config_dict['norm_epsilon'],
+    }
+
+    self._convs = []
+    self._conv_norms = []
+    for i in range(self._config_dict['num_convs']):
+      conv_name = 'mask-scoring_{}'.format(i)
+      self._convs.append(conv_op(name=conv_name, **conv_kwargs))
+      bn_name = 'mask-scoring-bn_{}'.format(i)
+      self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs))
+
+    self._fcs = []
+    self._fc_norms = []
+    for i in range(self._config_dict['num_fcs']):
+      fc_name = 'mask-scoring-fc_{}'.format(i)
+      self._fcs.append(
+          tf.keras.layers.Dense(
+              units=self._config_dict['fc_dims'],
+              kernel_initializer=tf.keras.initializers.VarianceScaling(
+                  scale=1 / 3.0, mode='fan_out', distribution='uniform'),
+              kernel_regularizer=self._config_dict['kernel_regularizer'],
+              bias_regularizer=self._config_dict['bias_regularizer'],
+              name=fc_name))
+      bn_name = 'mask-scoring-fc-bn_{}'.format(i)
+      self._fc_norms.append(bn_op(name=bn_name, **bn_kwargs))
+
+    self._classifier = tf.keras.layers.Dense(
+        units=self._config_dict['num_classes'],
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        bias_initializer=tf.zeros_initializer(),
+        kernel_regularizer=self._config_dict['kernel_regularizer'],
+        bias_regularizer=self._config_dict['bias_regularizer'],
+        name='iou-scores')
+
+    super(MaskScoring, self).build(input_shape)
+
+  def call(self, inputs: tf.Tensor, training: bool = None):
+    """Forward pass mask scoring head.
+
+    Args:
+      inputs: A `tf.Tensor` of the shape
+        [batch_size, height, width, num_classes], representing the
+        segmentation logits.
+      training: a `bool` indicating whether it is in `training` mode.
+
+    Returns:
+      mask_scores: A `tf.Tensor` of predicted mask scores
+        [batch_size, num_classes].
+    """
+    # Gradients of the scoring branch do not flow back into its inputs.
+    x = tf.stop_gradient(inputs)
+    for conv, bn in zip(self._convs, self._conv_norms):
+      x = conv(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    # Casts feat to float32 so the resize op can be run on TPU.
+    x = tf.cast(x, tf.float32)
+    x = tf.image.resize(x, size=self._config_dict['fc_input_size'],
+                        method=tf.image.ResizeMethod.BILINEAR)
+    # Casts it back to be compatible with the rest of the operations.
+    x = tf.cast(x, inputs.dtype)
+
+    # Flatten each resized feature map into one vector per example.
+    _, h, w, filters = x.get_shape().as_list()
+    x = tf.reshape(x, [-1, h * w * filters])
+
+    for fc, bn in zip(self._fcs, self._fc_norms):
+      x = fc(x)
+      x = bn(x)
+      x = self._activation(x)
+
+    ious = self._classifier(x)
+    return ious
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns a copy of the constructor arguments of this layer.
+
+    A shallow copy is returned so that callers editing the result do not
+    change the configuration stored on this layer.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds a `MaskScoring` from a config; `custom_objects` is unused."""
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SegmentationHead(tf.keras.layers.Layer):
+  """Creates a segmentation head.
+
+  The head optionally fuses the incoming feature maps, applies a stack of
+  convolution / normalization / activation layers, optionally upsamples and
+  predicts `num_classes` score maps with a final convolution.
+  """
+
+  def __init__(
+      self,
+      num_classes: int,
+      level: Union[int, str],
+      num_convs: int = 2,
+      num_filters: int = 256,
+      use_depthwise_convolution: bool = False,
+      prediction_kernel_size: int = 1,
+      upsample_factor: int = 1,
+      feature_fusion: Optional[str] = None,
+      decoder_min_level: Optional[int] = None,
+      decoder_max_level: Optional[int] = None,
+      low_level: int = 2,
+      low_level_num_filters: int = 48,
+      num_decoder_filters: int = 256,
+      activation: str = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes a segmentation head.
+
+    Args:
+      num_classes: An `int` number of mask classification categories. The number
+        of classes does not include background class.
+      level: An `int` or `str`, level to use to build segmentation head.
+      num_convs: An `int` number of stacked convolution before the last
+        prediction layer.
+      num_filters: An `int` number to specify the number of filters used.
+        Default is 256.
+      use_depthwise_convolution: A bool to specify if use depthwise separable
+        convolutions.
+      prediction_kernel_size: An `int` number to specify the kernel size of the
+        prediction layer.
+      upsample_factor: An `int` number to specify the upsampling factor to
+        generate finer mask. Default 1 means no upsampling is applied.
+      feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`,
+        `panoptic_fpn_fusion`, or None. If `deeplabv3plus`, features from
+        decoder_features[level] will be fused with low level feature maps from
+        backbone. If `pyramid_fusion`, multiscale features will be resized and
+        fused at the target level.
+      decoder_min_level: An `int` of minimum level from decoder to use in
+        feature fusion. It is only used when feature_fusion is set to
+        `panoptic_fpn_fusion`.
+      decoder_max_level: An `int` of maximum level from decoder to use in
+        feature fusion. It is only used when feature_fusion is set to
+        `panoptic_fpn_fusion`.
+      low_level: An `int` of backbone level to be used for feature fusion. It is
+        used when feature_fusion is set to `deeplabv3plus`.
+      low_level_num_filters: An `int` of reduced number of filters for the low
+        level features before fusing it with higher level features. It is only
+        used when feature_fusion is set to `deeplabv3plus`.
+      num_decoder_filters: An `int` of number of filters in the decoder outputs.
+        It is only used when feature_fusion is set to `panoptic_fpn_fusion`.
+      activation: A `str` that indicates which activation is used, e.g. 'relu',
+        'swish', etc.
+      use_sync_bn: A `bool` that indicates whether to use synchronized batch
+        normalization across different replicas.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(SegmentationHead, self).__init__(**kwargs)
+
+    self._config_dict = {
+        'num_classes': num_classes,
+        'level': level,
+        'num_convs': num_convs,
+        'num_filters': num_filters,
+        'use_depthwise_convolution': use_depthwise_convolution,
+        'prediction_kernel_size': prediction_kernel_size,
+        'upsample_factor': upsample_factor,
+        'feature_fusion': feature_fusion,
+        'decoder_min_level': decoder_min_level,
+        'decoder_max_level': decoder_max_level,
+        'low_level': low_level,
+        'low_level_num_filters': low_level_num_filters,
+        'num_decoder_filters': num_decoder_filters,
+        'activation': activation,
+        'use_sync_bn': use_sync_bn,
+        'norm_momentum': norm_momentum,
+        'norm_epsilon': norm_epsilon,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer
+    }
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    self._bn_axis = -1 if channels_last else 1
+    self._activation = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
+    """Creates the variables of the segmentation head."""
+    cfg = self._config_dict
+    use_depthwise_convolution = cfg['use_depthwise_convolution']
+    feature_fusion = cfg['feature_fusion']
+    random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
+    conv_op = tf.keras.layers.Conv2D
+    # When depthwise convolutions are used, the regular convolution that
+    # follows each of them is a 1x1 one.
+    conv_kwargs = {
+        'kernel_size': 1 if use_depthwise_convolution else 3,
+        'padding': 'same',
+        'use_bias': False,
+        'kernel_initializer': random_initializer,
+        'kernel_regularizer': cfg['kernel_regularizer'],
+    }
+    if cfg['use_sync_bn']:
+      bn_op = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      bn_op = tf.keras.layers.BatchNormalization
+    bn_kwargs = {
+        'axis': self._bn_axis,
+        'momentum': cfg['norm_momentum'],
+        'epsilon': cfg['norm_epsilon'],
+    }
+
+    if feature_fusion == 'deeplabv3plus':
+      # Deeplabv3+ feature fusion layers.
+      self._dlv3p_conv = conv_op(
+          kernel_size=1,
+          padding='same',
+          use_bias=False,
+          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+          kernel_regularizer=cfg['kernel_regularizer'],
+          name='segmentation_head_deeplabv3p_fusion_conv',
+          filters=cfg['low_level_num_filters'])
+
+      self._dlv3p_norm = bn_op(
+          name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs)
+
+    elif feature_fusion == 'panoptic_fpn_fusion':
+      self._panoptic_fpn_fusion = nn_layers.PanopticFPNFusion(
+          min_level=cfg['decoder_min_level'],
+          max_level=cfg['decoder_max_level'],
+          target_level=cfg['level'],
+          num_filters=cfg['num_filters'],
+          num_fpn_filters=cfg['num_decoder_filters'],
+          activation=cfg['activation'],
+          kernel_regularizer=cfg['kernel_regularizer'],
+          bias_regularizer=cfg['bias_regularizer'])
+
+    # Segmentation head layers. `_convs` and `_norms` grow in lockstep so that
+    # they can be zipped in `call`.
+    self._convs = []
+    self._norms = []
+    for i in range(cfg['num_convs']):
+      if use_depthwise_convolution:
+        depthwise_conv = tf.keras.layers.DepthwiseConv2D(
+            name='segmentation_head_depthwise_conv_{}'.format(i),
+            kernel_size=3,
+            padding='same',
+            use_bias=False,
+            depthwise_initializer=random_initializer,
+            depthwise_regularizer=cfg['kernel_regularizer'],
+            depth_multiplier=1)
+        self._convs.append(depthwise_conv)
+        self._norms.append(
+            bn_op(
+                name='segmentation_head_depthwise_norm_{}'.format(i),
+                **bn_kwargs))
+      self._convs.append(
+          conv_op(
+              name='segmentation_head_conv_{}'.format(i),
+              filters=cfg['num_filters'],
+              **conv_kwargs))
+      self._norms.append(
+          bn_op(name='segmentation_head_norm_{}'.format(i), **bn_kwargs))
+
+    self._classifier = conv_op(
+        name='segmentation_output',
+        filters=cfg['num_classes'],
+        kernel_size=cfg['prediction_kernel_size'],
+        padding='same',
+        bias_initializer=tf.zeros_initializer(),
+        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
+        kernel_regularizer=cfg['kernel_regularizer'],
+        bias_regularizer=cfg['bias_regularizer'])
+
+    super().build(input_shape)
+
+  def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
+                               Union[tf.Tensor, Mapping[str, tf.Tensor]]]):
+    """Forward pass of the segmentation head.
+
+    It supports both a tuple of 2 tensors or 2 dictionaries. The first is
+    backbone endpoints, and the second is decoder endpoints. When inputs are
+    tensors, they are from a single level of feature maps. When inputs are
+    dictionaries, they contain multiple levels of feature maps, where the key
+    is the index of feature map.
+
+    Args:
+      inputs: A tuple of 2 feature map tensors of shape
+        [batch, height_l, width_l, channels] or 2 dictionaries of tensors:
+        - key: A `str` of the level of the multilevel features.
+        - values: A `tf.Tensor` of the feature map tensors, whose shape is
+            [batch, height_l, width_l, channels].
+        The first is backbone endpoints, and the second is decoder endpoints.
+
+    Returns:
+      segmentation prediction mask: A `tf.Tensor` of the segmentation mask
+        scores predicted from input features.
+
+    Raises:
+      ValueError: If `feature_fusion` is `pyramid_fusion` and the decoder
+        endpoints are not given as a dictionary.
+    """
+
+    def select_level(features, level):
+      # Dictionaries are keyed by the level as a string; a bare tensor is
+      # used as is.
+      if isinstance(features, dict):
+        return features[str(level)]
+      return features
+
+    cfg = self._config_dict
+    feature_fusion = cfg['feature_fusion']
+    backbone_output = inputs[0]
+    decoder_output = inputs[1]
+
+    if feature_fusion == 'deeplabv3plus':
+      # deeplabv3+ feature fusion
+      x = select_level(decoder_output, cfg['level'])
+      y = select_level(backbone_output, cfg['low_level'])
+      y = self._dlv3p_conv(y)
+      y = self._dlv3p_norm(y)
+      y = self._activation(y)
+
+      # Bring the decoder features to the spatial size of the low level
+      # features before concatenating the two.
+      x = tf.image.resize(
+          x, tf.shape(y)[1:3], method=tf.image.ResizeMethod.BILINEAR)
+      x = tf.cast(x, dtype=y.dtype)
+      x = tf.concat([x, y], axis=self._bn_axis)
+    elif feature_fusion == 'pyramid_fusion':
+      if not isinstance(decoder_output, dict):
+        raise ValueError('Only support dictionary decoder_output.')
+      x = nn_layers.pyramid_feature_fusion(decoder_output, cfg['level'])
+    elif feature_fusion == 'panoptic_fpn_fusion':
+      x = self._panoptic_fpn_fusion(decoder_output)
+    else:
+      x = select_level(decoder_output, cfg['level'])
+
+    for conv, norm in zip(self._convs, self._norms):
+      x = self._activation(norm(conv(x)))
+
+    upsample_factor = cfg['upsample_factor']
+    if upsample_factor > 1:
+      x = spatial_transform_ops.nearest_upsampling(x, scale=upsample_factor)
+
+    return self._classifier(x)
+
+  def get_config(self):
+    """Returns the base layer config merged with this head's arguments."""
+    base_config = super().get_config()
+    return {**base_config, **self._config_dict}
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a `SegmentationHead` from a config dictionary."""
+    return cls(**config)
--- a/official/vision/modeling/heads/segmentation_heads_test.py
+++ b/official/vision/modeling/heads/segmentation_heads_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for segmentation_heads.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.heads import segmentation_heads
+
+
+class SegmentationHeadTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (2, 'pyramid_fusion', None, None),
+      (3, 'pyramid_fusion', None, None),
+      (2, 'panoptic_fpn_fusion', 2, 5),
+      (2, 'panoptic_fpn_fusion', 2, 6),
+      (3, 'panoptic_fpn_fusion', 3, 5),
+      (3, 'panoptic_fpn_fusion', 3, 6))
+  def test_forward(self, level, feature_fusion,
+                   decoder_min_level, decoder_max_level):
+    backbone_features = {
+        '3': np.random.rand(2, 128, 128, 16),
+        '4': np.random.rand(2, 64, 64, 16),
+        '5': np.random.rand(2, 32, 32, 16),
+    }
+    decoder_features = {
+        '3': np.random.rand(2, 128, 128, 64),
+        '4': np.random.rand(2, 64, 64, 64),
+        '5': np.random.rand(2, 32, 32, 64),
+        '6': np.random.rand(2, 16, 16, 64),
+    }
+
+    if feature_fusion == 'panoptic_fpn_fusion':
+      backbone_features['2'] = np.random.rand(2, 256, 256, 16)
+      decoder_features['2'] = np.random.rand(2, 256, 256, 64)
+
+    head = segmentation_heads.SegmentationHead(
+        num_classes=10,
+        level=level,
+        feature_fusion=feature_fusion,
+        decoder_min_level=decoder_min_level,
+        decoder_max_level=decoder_max_level,
+        num_decoder_filters=64)
+
+    logits = head((backbone_features, decoder_features))
+
+    if level in decoder_features:
+      self.assertAllEqual(logits.numpy().shape, [
+          2, decoder_features[str(level)].shape[1],
+          decoder_features[str(level)].shape[2], 10
+      ])
+
+  def test_serialize_deserialize(self):
+    head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
+    config = head.get_config()
+    new_head = segmentation_heads.SegmentationHead.from_config(config)
+    self.assertAllEqual(head.get_config(), new_head.get_config())
+
+
+class MaskScoringHeadTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (1, 1, 64, [4, 4]),
+      (2, 1, 64, [4, 4]),
+      (3, 1, 64, [4, 4]),
+      (1, 2, 32, [8, 8]),
+      (2, 2, 32, [8, 8]),
+      (3, 2, 32, [8, 8]),)
+  def test_forward(self, num_convs, num_fcs,
+                   num_filters, fc_input_size):
+    features = np.random.rand(2, 64, 64, 16)
+
+    head = segmentation_heads.MaskScoring(
+        num_classes=2,
+        num_convs=num_convs,
+        num_filters=num_filters,
+        fc_dims=128,
+        fc_input_size=fc_input_size)
+
+    scores = head(features)
+    self.assertAllEqual(scores.numpy().shape, [2, 2])
+
+  def test_serialize_deserialize(self):
+    head = segmentation_heads.MaskScoring(
+        num_classes=2, fc_input_size=[4, 4], fc_dims=128)
+    config = head.get_config()
+    new_head = segmentation_heads.MaskScoring.from_config(config)
+    self.assertAllEqual(head.get_config(), new_head.get_config())
+
+# Hands control to the TensorFlow test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/__init__.py
+++ b/official/vision/modeling/layers/__init__.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Layers package definition."""
+
+from official.vision.modeling.layers.box_sampler import BoxSampler
+from official.vision.modeling.layers.detection_generator import DetectionGenerator
+from official.vision.modeling.layers.detection_generator import MultilevelDetectionGenerator
+from official.vision.modeling.layers.mask_sampler import MaskSampler
+from official.vision.modeling.layers.nn_blocks import BottleneckBlock
+from official.vision.modeling.layers.nn_blocks import BottleneckResidualInner
+from official.vision.modeling.layers.nn_blocks import DepthwiseSeparableConvBlock
+from official.vision.modeling.layers.nn_blocks import InvertedBottleneckBlock
+from official.vision.modeling.layers.nn_blocks import ResidualBlock
+from official.vision.modeling.layers.nn_blocks import ResidualInner
+from official.vision.modeling.layers.nn_blocks import ReversibleLayer
+from official.vision.modeling.layers.nn_blocks_3d import BottleneckBlock3D
+from official.vision.modeling.layers.nn_blocks_3d import SelfGating
+from official.vision.modeling.layers.nn_layers import CausalConvMixin
+from official.vision.modeling.layers.nn_layers import Conv2D
+from official.vision.modeling.layers.nn_layers import Conv3D
+from official.vision.modeling.layers.nn_layers import DepthwiseConv2D
+from official.vision.modeling.layers.nn_layers import GlobalAveragePool3D
+from official.vision.modeling.layers.nn_layers import PositionalEncoding
+from official.vision.modeling.layers.nn_layers import Scale
+from official.vision.modeling.layers.nn_layers import SpatialAveragePool3D
+from official.vision.modeling.layers.nn_layers import SqueezeExcitation
+from official.vision.modeling.layers.nn_layers import StochasticDepth
+from official.vision.modeling.layers.nn_layers import TemporalSoftmaxPool
+from official.vision.modeling.layers.roi_aligner import MultilevelROIAligner
+from official.vision.modeling.layers.roi_generator import MultilevelROIGenerator
+from official.vision.modeling.layers.roi_sampler import ROISampler
--- a/official/vision/modeling/layers/box_sampler.py
+++ b/official/vision/modeling/layers/box_sampler.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of box sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import sampling_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BoxSampler(tf.keras.layers.Layer):
+  """Creates a BoxSampler to sample positive and negative boxes."""
+
+  def __init__(self,
+               num_samples: int = 512,
+               foreground_fraction: float = 0.25,
+               **kwargs):
+    """Initializes a box sampler.
+
+    Args:
+      num_samples: An `int` of the number of sampled boxes per image.
+      foreground_fraction: A `float` in [0, 1], what percentage of boxes should
+        be sampled from the positive examples.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    self._config_dict = {
+        'num_samples': num_samples,
+        'foreground_fraction': foreground_fraction,
+    }
+    super(BoxSampler, self).__init__(**kwargs)
+
+  def call(self, positive_matches: tf.Tensor, negative_matches: tf.Tensor,
+           ignored_matches: tf.Tensor):
+    """Samples and selects positive and negative instances.
+
+    Args:
+      positive_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        corresponds to a positive example.
+      negative_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        corresponds to a negative example.
+      ignored_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance should
+        be ignored.
+
+    Returns:
+      A `tf.tensor` of shape of [batch_size, K], storing the indices of the
+        sampled examples, where K is `num_samples`.
+    """
+    sample_candidates = tf.logical_and(
+        tf.logical_or(positive_matches, negative_matches),
+        tf.logical_not(ignored_matches))
+
+    sampler = sampling_ops.BalancedPositiveNegativeSampler(
+        positive_fraction=self._config_dict['foreground_fraction'],
+        is_static=True)
+
+    batch_size = sample_candidates.shape[0]
+    sampled_indicators = []
+    for i in range(batch_size):
+      sampled_indicator = sampler.subsample(
+          sample_candidates[i],
+          self._config_dict['num_samples'],
+          positive_matches[i])
+      sampled_indicators.append(sampled_indicator)
+    sampled_indicators = tf.stack(sampled_indicators)
+    _, selected_indices = tf.nn.top_k(
+        tf.cast(sampled_indicators, dtype=tf.int32),
+        k=self._config_dict['num_samples'],
+        sorted=True)
+
+    return selected_indices
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/modeling/layers/deeplab.py
+++ b/official/vision/modeling/layers/deeplab.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Layers for DeepLabV3."""
+
+import tensorflow as tf
+
+
+class SpatialPyramidPooling(tf.keras.layers.Layer):
+  """Implements the Atrous Spatial Pyramid Pooling.
+
+  References:
+    [Rethinking Atrous Convolution for Semantic Image Segmentation](
+      https://arxiv.org/pdf/1706.05587.pdf)
+    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
+    Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
+  """
+
+  def __init__(
+      self,
+      output_channels,
+      dilation_rates,
+      pool_kernel_size=None,
+      use_sync_bn=False,
+      batchnorm_momentum=0.99,
+      batchnorm_epsilon=0.001,
+      activation='relu',
+      dropout=0.5,
+      kernel_initializer='glorot_uniform',
+      kernel_regularizer=None,
+      interpolation='bilinear',
+      use_depthwise_convolution=False,
+      **kwargs):
+    """Initializes `SpatialPyramidPooling`.
+
+    Args:
+      output_channels: Number of channels produced by SpatialPyramidPooling.
+      dilation_rates: A list of integers for parallel dilated conv.
+      pool_kernel_size: A list of integers or None. If None, global average
+        pooling is applied, otherwise an average pooling of pool_kernel_size
+        is applied.
+      use_sync_bn: A bool, whether or not to use sync batch normalization.
+      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
+        0.99.
+      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
+        0.001.
+      activation: A `str` for type of activation to be used. Defaults to 'relu'.
+      dropout: A float for the dropout rate before output. Defaults to 0.5.
+      kernel_initializer: Kernel initializer for conv layers. Defaults to
+        `glorot_uniform`.
+      kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
+      interpolation: The interpolation method for upsampling. Defaults to
+        `bilinear`.
+      use_depthwise_convolution: Allows spatial pooling to be separable
+         depthwise convolusions. [Encoder-Decoder with Atrous Separable
+         Convolution for Semantic Image Segmentation](
+         https://arxiv.org/pdf/1802.02611.pdf)
+      **kwargs: Other keyword arguments for the layer.
+    """
+    super(SpatialPyramidPooling, self).__init__(**kwargs)
+
+    self.output_channels = output_channels
+    self.dilation_rates = dilation_rates
+    self.use_sync_bn = use_sync_bn
+    self.batchnorm_momentum = batchnorm_momentum
+    self.batchnorm_epsilon = batchnorm_epsilon
+    self.activation = activation
+    self.dropout = dropout
+    self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
+    self.interpolation = interpolation
+    self.input_spec = tf.keras.layers.InputSpec(ndim=4)
+    self.pool_kernel_size = pool_kernel_size
+    self.use_depthwise_convolution = use_depthwise_convolution
+
+  def build(self, input_shape):
+    height = input_shape[1]
+    width = input_shape[2]
+    channels = input_shape[3]
+
+    self.aspp_layers = []
+
+    if self.use_sync_bn:
+      bn_op = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      bn_op = tf.keras.layers.BatchNormalization
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      bn_axis = -1
+    else:
+      bn_axis = 1
+
+    conv_sequential = tf.keras.Sequential([
+        tf.keras.layers.Conv2D(
+            filters=self.output_channels, kernel_size=(1, 1),
+            kernel_initializer=self.kernel_initializer,
+            kernel_regularizer=self.kernel_regularizer,
+            use_bias=False),
+        bn_op(
+            axis=bn_axis,
+            momentum=self.batchnorm_momentum,
+            epsilon=self.batchnorm_epsilon),
+        tf.keras.layers.Activation(self.activation)
+    ])
+    self.aspp_layers.append(conv_sequential)
+
+    for dilation_rate in self.dilation_rates:
+      leading_layers = []
+      kernel_size = (3, 3)
+      if self.use_depthwise_convolution:
+        leading_layers += [
+            tf.keras.layers.DepthwiseConv2D(
+                depth_multiplier=1, kernel_size=kernel_size,
+                padding='same', depthwise_regularizer=self.kernel_regularizer,
+                depthwise_initializer=self.kernel_initializer,
+                dilation_rate=dilation_rate, use_bias=False)
+        ]
+        kernel_size = (1, 1)
+      conv_sequential = tf.keras.Sequential(leading_layers + [
+          tf.keras.layers.Conv2D(
+              filters=self.output_channels, kernel_size=kernel_size,
+              padding='same', kernel_regularizer=self.kernel_regularizer,
+              kernel_initializer=self.kernel_initializer,
+              dilation_rate=dilation_rate, use_bias=False),
+          bn_op(axis=bn_axis, momentum=self.batchnorm_momentum,
+                epsilon=self.batchnorm_epsilon),
+          tf.keras.layers.Activation(self.activation)])
+      self.aspp_layers.append(conv_sequential)
+
+    if self.pool_kernel_size is None:
+      pool_sequential = tf.keras.Sequential([
+          tf.keras.layers.GlobalAveragePooling2D(),
+          tf.keras.layers.Reshape((1, 1, channels))])
+    else:
+      pool_sequential = tf.keras.Sequential([
+          tf.keras.layers.AveragePooling2D(self.pool_kernel_size)])
+
+    pool_sequential.add(
+        tf.keras.Sequential([
+            tf.keras.layers.Conv2D(
+                filters=self.output_channels,
+                kernel_size=(1, 1),
+                kernel_initializer=self.kernel_initializer,
+                kernel_regularizer=self.kernel_regularizer,
+                use_bias=False),
+            bn_op(
+                axis=bn_axis,
+                momentum=self.batchnorm_momentum,
+                epsilon=self.batchnorm_epsilon),
+            tf.keras.layers.Activation(self.activation),
+            tf.keras.layers.experimental.preprocessing.Resizing(
+                height,
+                width,
+                interpolation=self.interpolation,
+                dtype=tf.float32)
+        ]))
+
+    self.aspp_layers.append(pool_sequential)
+
+    self.projection = tf.keras.Sequential([
+        tf.keras.layers.Conv2D(
+            filters=self.output_channels, kernel_size=(1, 1),
+            kernel_initializer=self.kernel_initializer,
+            kernel_regularizer=self.kernel_regularizer,
+            use_bias=False),
+        bn_op(
+            axis=bn_axis,
+            momentum=self.batchnorm_momentum,
+            epsilon=self.batchnorm_epsilon),
+        tf.keras.layers.Activation(self.activation),
+        tf.keras.layers.Dropout(rate=self.dropout)])
+
+  def call(self, inputs, training=None):
+    if training is None:
+      training = tf.keras.backend.learning_phase()
+    result = []
+    for layer in self.aspp_layers:
+      result.append(tf.cast(layer(inputs, training=training), inputs.dtype))
+    result = tf.concat(result, axis=-1)
+    result = self.projection(result, training=training)
+    return result
+
+  def get_config(self):
+    config = {
+        'output_channels': self.output_channels,
+        'dilation_rates': self.dilation_rates,
+        'pool_kernel_size': self.pool_kernel_size,
+        'use_sync_bn': self.use_sync_bn,
+        'batchnorm_momentum': self.batchnorm_momentum,
+        'batchnorm_epsilon': self.batchnorm_epsilon,
+        'activation': self.activation,
+        'dropout': self.dropout,
+        'kernel_initializer': tf.keras.initializers.serialize(
+            self.kernel_initializer),
+        'kernel_regularizer': tf.keras.regularizers.serialize(
+            self.kernel_regularizer),
+        'interpolation': self.interpolation,
+    }
+    base_config = super(SpatialPyramidPooling, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
--- a/official/vision/modeling/layers/deeplab_test.py
+++ b/official/vision/modeling/layers/deeplab_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ASPP."""
+
+import tensorflow as tf
+
+from tensorflow.python.keras import keras_parameterized
+from official.vision.modeling.layers import deeplab
+
+
+@keras_parameterized.run_all_keras_modes
+class DeeplabTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.parameterized.parameters(
+      (None,),
+      ([32, 32],),
+      )
+  def test_aspp(self, pool_kernel_size):
+    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
+    layer = deeplab.SpatialPyramidPooling(output_channels=256,
+                                          dilation_rates=[6, 12, 18],
+                                          pool_kernel_size=None)
+    output = layer(inputs)
+    self.assertAllEqual([None, 64, 64, 256], output.shape)
+
+  def test_aspp_invalid_shape(self):
+    inputs = tf.keras.Input(shape=(64, 64), dtype=tf.float32)
+    layer = deeplab.SpatialPyramidPooling(output_channels=256,
+                                          dilation_rates=[6, 12, 18])
+    with self.assertRaises(ValueError):
+      _ = layer(inputs)
+
+  def test_config_with_custom_name(self):
+    layer = deeplab.SpatialPyramidPooling(256, [5], name='aspp')
+    config = layer.get_config()
+    layer_1 = deeplab.SpatialPyramidPooling.from_config(config)
+    self.assertEqual(layer_1.name, layer.name)
+
+
+# Hands control to the TensorFlow test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/detection_generator.py
+++ b/official/vision/modeling/layers/detection_generator.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of generators to generate the final detections."""
+import contextlib
+from typing import Any, Dict, List, Optional, Mapping, Sequence
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import box_ops
+from official.vision.ops import nms
+from official.vision.ops import preprocess_ops
+
+
+def _generate_detections_v1(boxes: tf.Tensor,
+                            scores: tf.Tensor,
+                            attributes: Optional[Mapping[str,
+                                                         tf.Tensor]] = None,
+                            pre_nms_top_k: int = 5000,
+                            pre_nms_score_threshold: float = 0.05,
+                            nms_iou_threshold: float = 0.5,
+                            max_num_detections: int = 100,
+                            soft_nms_sigma: Optional[float] = None):
+  """Generates the final detections given the model outputs.
+
+  The implementation unrolls the batch dimension and process images one by one.
+  It required the batch dimension to be statically known and it is TPU
+  compatible.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]` for box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    attributes: None or a dict of (attribute_name, attributes) pairs. Each
+      attributes is a `tf.Tensor` with shape
+      `[batch_size, N, num_classes, attribute_size]` or
+      `[batch_size, N, 1, attribute_size]` for attribute predictions on all
+      feature levels. The N is the number of total anchors on all levels. Can
+      be None if no attribute learning is required.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A scalar representing maximum number of boxes retained
+      over all classes.
+    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+      When soft_nms_sigma=0.0 (which is default), we fall back to standard NMS.
+
+  Returns:
+    nms_boxes: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections, 4]` representing top detected boxes in
+      `[y1, x1, y2, x2]`.
+    nms_scores: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing sorted confidence scores
+      for detected boxes. The values are between `[0, 1]`.
+    nms_classes: An `int` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing classes for detected
+      boxes.
+    valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]` only the
+       top `valid_detections` boxes are valid detections.
+    nms_attributes: None or a dict of (attribute_name, attributes). Each
+      attribute is a `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections, attribute_size]` representing attribute
+      predictions for detected boxes. Can be an empty dict if no attribute
+      learning is required.
+  """
+  with tf.name_scope('generate_detections'):
+    batch_size = scores.get_shape().as_list()[0]
+    nmsed_boxes = []
+    nmsed_classes = []
+    nmsed_scores = []
+    valid_detections = []
+    if attributes:
+      nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
+    else:
+      nmsed_attributes = {}
+
+    for i in range(batch_size):
+      (nmsed_boxes_i, nmsed_scores_i, nmsed_classes_i, valid_detections_i,
+       nmsed_att_i) = _generate_detections_per_image(
+           boxes[i],
+           scores[i],
+           attributes={
+               att_name: att[i] for att_name, att in attributes.items()
+           } if attributes else {},
+           pre_nms_top_k=pre_nms_top_k,
+           pre_nms_score_threshold=pre_nms_score_threshold,
+           nms_iou_threshold=nms_iou_threshold,
+           max_num_detections=max_num_detections,
+           soft_nms_sigma=soft_nms_sigma)
+      nmsed_boxes.append(nmsed_boxes_i)
+      nmsed_scores.append(nmsed_scores_i)
+      nmsed_classes.append(nmsed_classes_i)
+      valid_detections.append(valid_detections_i)
+      if attributes:
+        for att_name in attributes.keys():
+          nmsed_attributes[att_name].append(nmsed_att_i[att_name])
+
+  nmsed_boxes = tf.stack(nmsed_boxes, axis=0)
+  nmsed_scores = tf.stack(nmsed_scores, axis=0)
+  nmsed_classes = tf.stack(nmsed_classes, axis=0)
+  valid_detections = tf.stack(valid_detections, axis=0)
+  if attributes:
+    for att_name in attributes.keys():
+      nmsed_attributes[att_name] = tf.stack(nmsed_attributes[att_name], axis=0)
+
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
+
+
+def _generate_detections_per_image(
+    boxes: tf.Tensor,
+    scores: tf.Tensor,
+    attributes: Optional[Mapping[str, tf.Tensor]] = None,
+    pre_nms_top_k: int = 5000,
+    pre_nms_score_threshold: float = 0.05,
+    nms_iou_threshold: float = 0.5,
+    max_num_detections: int = 100,
+    soft_nms_sigma: Optional[float] = None):
+  """Generates the final detections per image given the model outputs.
+
+  NMS is run independently for every class; the per-class results are then
+  concatenated and the `max_num_detections` highest scoring entries are kept.
+
+  Args:
+    boxes: A  `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which
+      box predictions on all feature levels. The N is the number of total
+      anchors on all levels.
+    scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
+      probability on all feature levels. The N is the number of total anchors on
+      all levels. The num_classes is the number of classes predicted by the
+      model. Note that the class_outputs here is the raw score.
+    attributes: If not None, a dict of `tf.Tensor`. Each value is in shape
+      `[N, num_classes, attribute_size]` or `[N, 1, attribute_size]` of
+      attribute predictions on all feature levels. The N is the number of total
+      anchors on all levels.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A `scalar` representing maximum number of boxes retained
+      over all classes.
+    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+      When soft_nms_sigma=0.0, we fall back to standard NMS.
+      If set to None, `tf.image.non_max_suppression_padded` is called instead.
+
+  Returns:
+    nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
+      representing top detected boxes in `[y1, x1, y2, x2]`.
+    nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing
+      sorted confidence scores for detected boxes. Entries that do not
+      correspond to a valid detection are set to -1.
+    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing
+      classes for detected boxes.
+    valid_detections: A scalar `int` tf.Tensor counting the returned scores that
+      are greater than -1; only the top `valid_detections` boxes are valid
+      detections.
+    nms_attributes: A dict. Each value is a `float` tf.Tensor of shape
+      `[max_num_detections, attribute_size]` representing attribute predictions
+      for detected boxes. An empty dict if `attributes` is None or empty.
+  """
+  nmsed_boxes = []
+  nmsed_scores = []
+  nmsed_classes = []
+  num_classes_for_box = boxes.get_shape().as_list()[1]
+  num_classes = scores.get_shape().as_list()[1]
+  if attributes:
+    nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
+  else:
+    nmsed_attributes = {}
+
+  for i in range(num_classes):
+    # When boxes are shared across classes (second dimension of size 1), the
+    # `min` clamps the index so that every class reads the same boxes.
+    boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
+    scores_i = scores[:, i]
+    # Obtains pre_nms_top_k before running NMS.
+    scores_i, indices = tf.nn.top_k(
+        scores_i, k=tf.minimum(tf.shape(scores_i)[-1], pre_nms_top_k))
+    boxes_i = tf.gather(boxes_i, indices)
+
+    if soft_nms_sigma is not None:
+      # This NMS op returns the selected scores itself; boxes and scores are
+      # then clipped or padded to exactly `max_num_detections` entries, using
+      # -1 as the padding score.
+      (nmsed_indices_i,
+       nmsed_scores_i) = tf.image.non_max_suppression_with_scores(
+           tf.cast(boxes_i, tf.float32),
+           tf.cast(scores_i, tf.float32),
+           max_num_detections,
+           iou_threshold=nms_iou_threshold,
+           score_threshold=pre_nms_score_threshold,
+           soft_nms_sigma=soft_nms_sigma,
+           name='nms_detections_' + str(i))
+      nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
+      nmsed_boxes_i = preprocess_ops.clip_or_pad_to_fixed_size(
+          nmsed_boxes_i, max_num_detections, 0.0)
+      nmsed_scores_i = preprocess_ops.clip_or_pad_to_fixed_size(
+          nmsed_scores_i, max_num_detections, -1.0)
+    else:
+      # The padded NMS op returns indices padded to `max_num_detections` plus
+      # the number of valid entries, which is used below to mask the padding.
+      (nmsed_indices_i,
+       nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
+           tf.cast(boxes_i, tf.float32),
+           tf.cast(scores_i, tf.float32),
+           max_num_detections,
+           iou_threshold=nms_iou_threshold,
+           score_threshold=pre_nms_score_threshold,
+           pad_to_max_output_size=True,
+           name='nms_detections_' + str(i))
+      nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
+      nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
+      # Sets scores of invalid boxes to -1.
+      nmsed_scores_i = tf.where(
+          tf.less(tf.range(max_num_detections), [nmsed_num_valid_i]),
+          nmsed_scores_i, -tf.ones_like(nmsed_scores_i))
+
+    nmsed_classes_i = tf.fill([max_num_detections], i)
+    nmsed_boxes.append(nmsed_boxes_i)
+    nmsed_scores.append(nmsed_scores_i)
+    nmsed_classes.append(nmsed_classes_i)
+    if attributes:
+      for att_name, att in attributes.items():
+        # Attributes follow the same two-step selection as the boxes: first
+        # the pre-NMS top-k indices, then the indices selected by NMS.
+        num_classes_for_attr = att.get_shape().as_list()[1]
+        att_i = att[:, min(num_classes_for_attr - 1, i)]
+        att_i = tf.gather(att_i, indices)
+        nmsed_att_i = tf.gather(att_i, nmsed_indices_i)
+        nmsed_att_i = preprocess_ops.clip_or_pad_to_fixed_size(
+            nmsed_att_i, max_num_detections, 0.0)
+        nmsed_attributes[att_name].append(nmsed_att_i)
+
+  # Concats results from all classes and sort them.
+  nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
+  nmsed_scores = tf.concat(nmsed_scores, axis=0)
+  nmsed_classes = tf.concat(nmsed_classes, axis=0)
+  nmsed_scores, indices = tf.nn.top_k(
+      nmsed_scores, k=max_num_detections, sorted=True)
+  nmsed_boxes = tf.gather(nmsed_boxes, indices)
+  nmsed_classes = tf.gather(nmsed_classes, indices)
+  # Invalid entries carry a score of -1, so counting the scores above -1 gives
+  # the number of valid detections.
+  valid_detections = tf.reduce_sum(
+      tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
+  if attributes:
+    for att_name in attributes.keys():
+      nmsed_attributes[att_name] = tf.concat(nmsed_attributes[att_name], axis=0)
+      nmsed_attributes[att_name] = tf.gather(nmsed_attributes[att_name],
+                                             indices)
+
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
+
+
+def _select_top_k_scores(scores_in: tf.Tensor, pre_nms_num_detections: int):
+  """Selects top_k scores and indices for each class.
+
+  Args:
+    scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class logit outputs on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model.
+    pre_nms_num_detections: Number of candidates before NMS.
+
+  Returns:
+    scores and indices: A `tf.Tensor` with shape
+      `[batch_size, pre_nms_num_detections, num_classes]`.
+  """
+  batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
+  if batch_size is None:
+    batch_size = tf.shape(scores_in)[0]
+  scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
+  scores_trans = tf.reshape(scores_trans, [-1, num_anchors])
+
+  top_k_scores, top_k_indices = tf.nn.top_k(
+      scores_trans, k=pre_nms_num_detections, sorted=True)
+
+  top_k_scores = tf.reshape(top_k_scores,
+                            [batch_size, num_class, pre_nms_num_detections])
+  top_k_indices = tf.reshape(top_k_indices,
+                             [batch_size, num_class, pre_nms_num_detections])
+
+  return tf.transpose(top_k_scores,
+                      [0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
+
+
+def _generate_detections_v2(boxes: tf.Tensor,
+                            scores: tf.Tensor,
+                            pre_nms_top_k: int = 5000,
+                            pre_nms_score_threshold: float = 0.05,
+                            nms_iou_threshold: float = 0.5,
+                            max_num_detections: int = 100):
+  """Generates the final detections given the model outputs.
+
+  This implementation unrolls classes dimension while using the tf.while_loop
+  to implement the batched NMS, so that it can be parallelized at the batch
+  dimension. It should give better performance comparing to v1 implementation.
+  It is TPU compatible.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A `scalar` representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    nmsed_boxes = []
+    nmsed_classes = []
+    nmsed_scores = []
+    valid_detections = []
+    batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
+    if batch_size is None:
+      batch_size = tf.shape(boxes)[0]
+    _, total_anchors, num_classes = scores.get_shape().as_list()
+    # Selects top pre_nms_num scores and indices before NMS.
+    scores, indices = _select_top_k_scores(
+        scores, min(total_anchors, pre_nms_top_k))
+    for i in range(num_classes):
+      boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
+      scores_i = scores[:, :, i]
+      # Obtains pre_nms_top_k before running NMS.
+      boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
+
+      # Filter out scores.
+      boxes_i, scores_i = box_ops.filter_boxes_by_scores(
+          boxes_i, scores_i, min_score_threshold=pre_nms_score_threshold)
+
+      (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
+          tf.cast(scores_i, tf.float32),
+          tf.cast(boxes_i, tf.float32),
+          max_num_detections,
+          iou_threshold=nms_iou_threshold)
+      nmsed_classes_i = tf.fill([batch_size, max_num_detections], i)
+      nmsed_boxes.append(nmsed_boxes_i)
+      nmsed_scores.append(nmsed_scores_i)
+      nmsed_classes.append(nmsed_classes_i)
+  nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
+  nmsed_scores = tf.concat(nmsed_scores, axis=1)
+  nmsed_classes = tf.concat(nmsed_classes, axis=1)
+  nmsed_scores, indices = tf.nn.top_k(
+      nmsed_scores, k=max_num_detections, sorted=True)
+  nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
+  nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
+  valid_detections = tf.reduce_sum(
+      input_tensor=tf.cast(tf.greater(nmsed_scores, 0.0), tf.int32), axis=1)
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+def _generate_detections_batched(boxes: tf.Tensor, scores: tf.Tensor,
+                                 pre_nms_score_threshold: float,
+                                 nms_iou_threshold: float,
+                                 max_num_detections: int):
+  """Generates detected boxes with scores and classes for one-stage detector.
+
+  The function takes output of multi-level ConvNets and anchor boxes and
+  generates detected boxes. Note that this used batched nms, which is not
+  supported on TPU currently.
+
+  Args:
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: A `scalar` representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+        tf.image.combined_non_max_suppression(
+            boxes,
+            scores,
+            max_output_size_per_class=max_num_detections,
+            max_total_size=max_num_detections,
+            iou_threshold=nms_iou_threshold,
+            score_threshold=pre_nms_score_threshold,
+            pad_per_class=False,
+            clip_boxes=False))
+    nmsed_classes = tf.cast(nmsed_classes, tf.int32)
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+def _generate_detections_tflite_implements_signature(
+    config: Dict[str, Any]) -> str:
+  """Returns `experimental_implements` signature for TFLite's custom NMS op.
+
+  This signature encodes the arguments to correctly initialize TFLite's custom
+  post-processing op in the MLIR converter.
+  For details on `experimental_implements` see here:
+  https://www.tensorflow.org/api_docs/python/tf/function
+
+  Args:
+    config: A dictionary of configs defining parameters for TFLite NMS op.
+
+  Returns:
+    An `experimental_implements` signature string.
+  """
+  scale_value = 1.0
+
+  implements_signature = [
+      'name: "%s"' % 'TFLite_Detection_PostProcess',
+      'attr { key: "max_detections" value { i: %d } }' %
+      config['max_detections'],
+      'attr { key: "max_classes_per_detection" value { i: %d } }' %
+      config['max_classes_per_detection'],
+      'attr { key: "use_regular_nms" value { b: %s } }' %
+      str(config['use_regular_nms']).lower(),
+      'attr { key: "nms_score_threshold" value { f: %f } }' %
+      config['nms_score_threshold'],
+      'attr { key: "nms_iou_threshold" value { f: %f } }' %
+      config['nms_iou_threshold'],
+      'attr { key: "y_scale" value { f: %f } }' % scale_value,
+      'attr { key: "x_scale" value { f: %f } }' % scale_value,
+      'attr { key: "h_scale" value { f: %f } }' % scale_value,
+      'attr { key: "w_scale" value { f: %f } }' % scale_value,
+      'attr { key: "num_classes" value { i: %d } }' % config['num_classes']
+  ]
+  implements_signature = ' '.join(implements_signature)
+  return implements_signature
+
+
+def _generate_detections_tflite(raw_boxes: Mapping[str, tf.Tensor],
+                                raw_scores: Mapping[str, tf.Tensor],
+                                anchor_boxes: Mapping[str, tf.Tensor],
+                                config: Dict[str, Any]) -> Sequence[Any]:
+  """Generate detections for conversion to TFLite.
+
+  Mathematically same as class-agnostic NMS, except that the last portion of
+  the TF graph constitutes a dummy `tf.function` that contains an annotation
+  for conversion to TFLite's custom NMS op. Using this custom op allows
+  features like post-training quantization & accelerator support.
+  NOTE: This function does NOT return a valid output, and is only meant to
+  generate a SavedModel for TFLite conversion via MLIR. The generated SavedModel
+  should not be used for inference.
+  For TFLite op details, see tensorflow/lite/kernels/detection_postprocess.cc
+
+  Args:
+    raw_boxes: A dictionary of tensors for raw boxes. Key is level of features
+      and value is a tensor denoting a level of boxes with shape [1, H, W, 4 *
+      num_anchors].
+    raw_scores: A dictionary of tensors for classes. Key is level of features
+      and value is a tensor denoting a level of logits with shape [1, H, W,
+      num_class * num_anchors].
+    anchor_boxes: A dictionary of tensors for anchor boxes. Key is level of
+      features and value is a tensor denoting a level of anchors with shape
+      [num_anchors, 4].
+    config: A dictionary of configs defining parameters for TFLite NMS op.
+
+  Returns:
+    A (dummy) tuple of (boxes, scores, classess, num_detections).
+
+  Raises:
+    ValueError: If the last dimension of predicted boxes is not divisible by 4,
+      or the last dimension of predicted scores is not divisible by number of
+      anchors per location.
+  """
+  scores, boxes, anchors = [], [], []
+  levels = list(raw_scores.keys())
+  min_level = int(min(levels))
+  max_level = int(max(levels))
+  batch_size = tf.shape(raw_scores[str(min_level)])[0]
+
+  num_anchors_per_locations_times_4 = raw_boxes[str(
+      min_level)].get_shape().as_list()[-1]
+  if num_anchors_per_locations_times_4 % 4 != 0:
+    raise ValueError(
+        'The last dimension of predicted boxes should be divisible by 4.')
+  num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
+  if num_anchors_per_locations_times_4 % 4 != 0:
+    raise ValueError(
+        f'The last dimension of predicted scores should be divisible by {num_anchors_per_locations}.'
+    )
+  num_classes = raw_scores[str(
+      min_level)].get_shape().as_list()[-1] // num_anchors_per_locations
+  config.update({'num_classes': num_classes})
+
+  for i in range(min_level, max_level + 1):
+    scores.append(
+        tf.sigmoid(
+            tf.reshape(raw_scores[str(i)], [batch_size, -1, num_classes])))
+    boxes.append(tf.reshape(raw_boxes[str(i)], [batch_size, -1, 4]))
+    anchors.append(tf.reshape(anchor_boxes[str(i)], [-1, 4]))
+  scores = tf.concat(scores, 1)
+  boxes = tf.concat(boxes, 1)
+  anchors = tf.concat(anchors, 0)
+
+  ycenter_a = (anchors[..., 0] + anchors[..., 2]) / 2
+  xcenter_a = (anchors[..., 1] + anchors[..., 3]) / 2
+  ha = anchors[..., 2] - anchors[..., 0]
+  wa = anchors[..., 3] - anchors[..., 1]
+  anchors = tf.stack([ycenter_a, xcenter_a, ha, wa], axis=-1)
+
+  # There is no TF equivalent for TFLite's custom post-processing op.
+  # So we add an 'empty' composite function here, that is legalized to the
+  # custom op with MLIR.
+  # For details, see: tensorflow/compiler/mlir/lite/utils/nms_utils.cc
+  @tf.function(
+      experimental_implements=_generate_detections_tflite_implements_signature(
+          config))
+  # pylint: disable=g-unused-argument,unused-argument
+  def dummy_post_processing(input_boxes, input_scores, input_anchors):
+    boxes = tf.constant(0.0, dtype=tf.float32, name='boxes')
+    scores = tf.constant(0.0, dtype=tf.float32, name='scores')
+    classes = tf.constant(0.0, dtype=tf.float32, name='classes')
+    num_detections = tf.constant(0.0, dtype=tf.float32, name='num_detections')
+    return boxes, classes, scores, num_detections
+
+  return dummy_post_processing(boxes, scores, anchors)[::-1]
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DetectionGenerator(tf.keras.layers.Layer):
+  """Generates the final detected boxes with scores and classes."""
+
+  def __init__(self,
+               apply_nms: bool = True,
+               pre_nms_top_k: int = 5000,
+               pre_nms_score_threshold: float = 0.05,
+               nms_iou_threshold: float = 0.5,
+               max_num_detections: int = 100,
+               nms_version: str = 'v2',
+               use_cpu_nms: bool = False,
+               soft_nms_sigma: Optional[float] = None,
+               **kwargs):
+    """Initializes a detection generator.
+
+    Args:
+      apply_nms: A `bool` of whether or not apply non maximum suppression.
+        If False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying  NMS. Proposals whose scores are below this threshold are
+        thrown away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      nms_version: A string of `batched`, `v1` or `v2` specifies NMS version.
+      use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
+      soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+        When soft_nms_sigma=0.0, we fall back to standard NMS.
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    self._config_dict = {
+        'apply_nms': apply_nms,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma,
+    }
+    super(DetectionGenerator, self).__init__(**kwargs)
+
+  def __call__(self,
+               raw_boxes: tf.Tensor,
+               raw_scores: tf.Tensor,
+               anchor_boxes: tf.Tensor,
+               image_shape: tf.Tensor,
+               regression_weights: Optional[List[float]] = None,
+               bbox_per_class: bool = True):
+    """Generates final detections.
+
+    Args:
+      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
+        representing the class-specific box coordinates relative to anchors.
+      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
+        representing the class logits before applying score activiation.
+      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
+        `box_outputs` and `anchor_boxes`.
+      regression_weights: A list of four float numbers to scale coordinates.
+      bbox_per_class: A `bool`. If True, perform per-class box regression.
+
+    Returns:
+      If `apply_nms` = True, the return is a dictionary with keys:
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` `tf.Tensor` of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
+          `num_detections` boxes are valid detections
+      If `apply_nms` = False, the return is a dictionary with keys:
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
+          representing all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing socres of all the decoded boxes.
+    """
+    box_scores = tf.nn.softmax(raw_scores, axis=-1)
+
+    # Removes the background class.
+    box_scores_shape = tf.shape(box_scores)
+    box_scores_shape_list = box_scores.get_shape().as_list()
+    batch_size = box_scores_shape[0]
+    num_locations = box_scores_shape_list[1]
+    num_classes = box_scores_shape_list[-1]
+
+    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
+
+    if bbox_per_class:
+      num_detections = num_locations * (num_classes - 1)
+      raw_boxes = tf.reshape(raw_boxes,
+                             [batch_size, num_locations, num_classes, 4])
+      raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
+      anchor_boxes = tf.tile(
+          tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
+      raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
+      anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
+
+    # Box decoding.
+    decoded_boxes = box_ops.decode_boxes(
+        raw_boxes, anchor_boxes, weights=regression_weights)
+
+    # Box clipping
+    decoded_boxes = box_ops.clip_boxes(
+        decoded_boxes, tf.expand_dims(image_shape, axis=1))
+
+    if bbox_per_class:
+      decoded_boxes = tf.reshape(
+          decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
+    else:
+      decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)
+
+    if not self._config_dict['apply_nms']:
+      return {
+          'decoded_boxes': decoded_boxes,
+          'decoded_box_scores': box_scores,
+      }
+
+    # Optionally force the NMS be run on CPU.
+    if self._config_dict['use_cpu_nms']:
+      nms_context = tf.device('cpu:0')
+    else:
+      nms_context = contextlib.nullcontext()
+
+    with nms_context:
+      if self._config_dict['nms_version'] == 'batched':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_batched(
+                decoded_boxes, box_scores,
+                self._config_dict['pre_nms_score_threshold'],
+                self._config_dict['nms_iou_threshold'],
+                self._config_dict['max_num_detections']))
+      elif self._config_dict['nms_version'] == 'v1':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
+            _generate_detections_v1(
+                decoded_boxes,
+                box_scores,
+                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                pre_nms_score_threshold=self
+                ._config_dict['pre_nms_score_threshold'],
+                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                max_num_detections=self._config_dict['max_num_detections'],
+                soft_nms_sigma=self._config_dict['soft_nms_sigma']))
+      elif self._config_dict['nms_version'] == 'v2':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_v2(
+                decoded_boxes,
+                box_scores,
+                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                pre_nms_score_threshold=self
+                ._config_dict['pre_nms_score_threshold'],
+                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                max_num_detections=self._config_dict['max_num_detections']))
+      else:
+        raise ValueError('NMS version {} not supported.'.format(
+            self._config_dict['nms_version']))
+
+    # Adds 1 to offset the background class which has index 0.
+    nmsed_classes += 1
+
+    return {
+        'num_detections': valid_detections,
+        'detection_boxes': nmsed_boxes,
+        'detection_classes': nmsed_classes,
+        'detection_scores': nmsed_scores,
+    }
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelDetectionGenerator(tf.keras.layers.Layer):
+  """Generates detected boxes with scores and classes for one-stage detector."""
+
+  def __init__(self,
+               apply_nms: bool = True,
+               pre_nms_top_k: int = 5000,
+               pre_nms_score_threshold: float = 0.05,
+               nms_iou_threshold: float = 0.5,
+               max_num_detections: int = 100,
+               nms_version: str = 'v1',
+               use_cpu_nms: bool = False,
+               soft_nms_sigma: Optional[float] = None,
+               tflite_post_processing_config: Optional[Dict[str, Any]] = None,
+               **kwargs):
+    """Initializes a multi-level detection generator.
+
+    Args:
+      apply_nms: A `bool` of whether or not apply non maximum suppression. If
+        False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying NMS. Proposals whose scores are below this threshold are thrown
+        away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      nms_version: A string of `batched`, `v1` or `v2` specifies NMS version
+      use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
+      soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
+        When soft_nms_sigma=0.0, we fall back to standard NMS.
+      tflite_post_processing_config: An optional dictionary containing
+        post-processing parameters used for TFLite custom NMS op.
+
+      **kwargs: Additional keyword arguments passed to Layer.
+    """
+    self._config_dict = {
+        'apply_nms': apply_nms,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma
+    }
+
+    if tflite_post_processing_config is not None:
+      self._config_dict.update(
+          {'tflite_post_processing_config': tflite_post_processing_config})
+    super(MultilevelDetectionGenerator, self).__init__(**kwargs)
+
+  def _decode_multilevel_outputs(
+      self,
+      raw_boxes: Mapping[str, tf.Tensor],
+      raw_scores: Mapping[str, tf.Tensor],
+      anchor_boxes: Mapping[str, tf.Tensor],
+      image_shape: tf.Tensor,
+      raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
+    """Collects dict of multilevel boxes, scores, attributes into lists."""
+    boxes = []
+    scores = []
+    if raw_attributes:
+      attributes = {att_name: [] for att_name in raw_attributes.keys()}
+    else:
+      attributes = {}
+
+    levels = list(raw_boxes.keys())
+    min_level = int(min(levels))
+    max_level = int(max(levels))
+    for i in range(min_level, max_level + 1):
+      raw_boxes_i = raw_boxes[str(i)]
+      raw_scores_i = raw_scores[str(i)]
+      batch_size = tf.shape(raw_boxes_i)[0]
+      (_, feature_h_i, feature_w_i,
+       num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
+      num_locations = feature_h_i * feature_w_i
+      num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
+      num_classes = raw_scores_i.get_shape().as_list(
+      )[-1] // num_anchors_per_locations
+
+      # Applies score transformation and remove the implicit background class.
+      scores_i = tf.sigmoid(
+          tf.reshape(raw_scores_i, [
+              batch_size, num_locations * num_anchors_per_locations, num_classes
+          ]))
+      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
+
+      # Box decoding.
+      # The anchor boxes are shared for all data in a batch.
+      # One stage detector only supports class agnostic box regression.
+      anchor_boxes_i = tf.reshape(
+          anchor_boxes[str(i)],
+          [batch_size, num_locations * num_anchors_per_locations, 4])
+      raw_boxes_i = tf.reshape(
+          raw_boxes_i,
+          [batch_size, num_locations * num_anchors_per_locations, 4])
+      boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)
+
+      # Box clipping.
+      boxes_i = box_ops.clip_boxes(
+          boxes_i, tf.expand_dims(image_shape, axis=1))
+
+      boxes.append(boxes_i)
+      scores.append(scores_i)
+
+      if raw_attributes:
+        for att_name, raw_att in raw_attributes.items():
+          attribute_size = raw_att[str(
+              i)].get_shape().as_list()[-1] // num_anchors_per_locations
+          att_i = tf.reshape(raw_att[str(i)], [
+              batch_size, num_locations * num_anchors_per_locations,
+              attribute_size
+          ])
+          attributes[att_name].append(att_i)
+
+    boxes = tf.concat(boxes, axis=1)
+    boxes = tf.expand_dims(boxes, axis=2)
+    scores = tf.concat(scores, axis=1)
+
+    if raw_attributes:
+      for att_name in raw_attributes.keys():
+        attributes[att_name] = tf.concat(attributes[att_name], axis=1)
+        attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)
+
+    return boxes, scores, attributes
+
+  def __call__(self,
+               raw_boxes: Mapping[str, tf.Tensor],
+               raw_scores: Mapping[str, tf.Tensor],
+               anchor_boxes: Mapping[str, tf.Tensor],
+               image_shape: tf.Tensor,
+               raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
+    """Generates final detections.
+
+    Args:
+      raw_boxes: A `dict` with keys representing FPN levels and values
+        representing box tenors of shape `[batch, feature_h, feature_w,
+        num_anchors * 4]`.
+      raw_scores: A `dict` with keys representing FPN levels and values
+        representing logit tensors of shape `[batch, feature_h, feature_w,
+        num_anchors]`.
+      anchor_boxes: A `dict` with keys representing FPN levels and values
+        representing anchor tenors of shape `[batch_size, K, 4]` representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
+        `box_outputs` and `anchor_boxes`.
+      raw_attributes: If not None, a `dict` of (attribute_name,
+        attribute_prediction) pairs. `attribute_prediction` is a dict that
+        contains keys representing FPN levels and values representing tenors of
+        shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.
+
+    Returns:
+      If `apply_nms` = True, the return is a dictionary with keys:
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` tf.Tensor of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
+          `num_detections` boxes are valid detections
+        `detection_attributes`: A dict. Values of the dict is a `float`
+          tf.Tensor of shape [batch, max_num_detections, attribute_size]
+          representing attribute predictions for detected boxes.
+      If `apply_nms` = False, the return is a dictionary with keys:
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
+          representing all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing socres of all the decoded boxes.
+        `decoded_box_attributes`: A dict. Values in the dict is a
+          `float` tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
+          representing attribute predictions of all the decoded boxes.
+    """
+    if self._config_dict['apply_nms'] and self._config_dict[
+        'nms_version'] == 'tflite':
+      boxes, classes, scores, num_detections = _generate_detections_tflite(
+          raw_boxes, raw_scores, anchor_boxes,
+          self.get_config()['tflite_post_processing_config'])
+      return {
+          'num_detections': num_detections,
+          'detection_boxes': boxes,
+          'detection_classes': classes,
+          'detection_scores': scores
+      }
+
+    boxes, scores, attributes = self._decode_multilevel_outputs(
+        raw_boxes, raw_scores, anchor_boxes, image_shape, raw_attributes)
+
+    if not self._config_dict['apply_nms']:
+      return {
+          'decoded_boxes': boxes,
+          'decoded_box_scores': scores,
+          'decoded_box_attributes': attributes,
+      }
+
+    # Optionally force the NMS to run on CPU.
+    if self._config_dict['use_cpu_nms']:
+      nms_context = tf.device('cpu:0')
+    else:
+      nms_context = contextlib.nullcontext()
+
+    with nms_context:
+      if raw_attributes and (self._config_dict['nms_version'] != 'v1'):
+        raise ValueError(
+            'Attribute learning is only supported for NMSv1 but NMS {} is used.'
+            .format(self._config_dict['nms_version']))
+      if self._config_dict['nms_version'] == 'batched':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_batched(
+                boxes, scores, self._config_dict['pre_nms_score_threshold'],
+                self._config_dict['nms_iou_threshold'],
+                self._config_dict['max_num_detections']))
+        # Set `nmsed_attributes` to None for batched NMS.
+        nmsed_attributes = {}
+      elif self._config_dict['nms_version'] == 'v1':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
+         nmsed_attributes) = (
+             _generate_detections_v1(
+                 boxes,
+                 scores,
+                 attributes=attributes if raw_attributes else None,
+                 pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                 pre_nms_score_threshold=self
+                 ._config_dict['pre_nms_score_threshold'],
+                 nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                 max_num_detections=self._config_dict['max_num_detections'],
+                 soft_nms_sigma=self._config_dict['soft_nms_sigma']))
+      elif self._config_dict['nms_version'] == 'v2':
+        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
+            _generate_detections_v2(
+                boxes,
+                scores,
+                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+                pre_nms_score_threshold=self
+                ._config_dict['pre_nms_score_threshold'],
+                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+                max_num_detections=self._config_dict['max_num_detections']))
+        # Set `nmsed_attributes` to None for v2.
+        nmsed_attributes = {}
+      else:
+        raise ValueError('NMS version {} not supported.'.format(
+            self._config_dict['nms_version']))
+
+    # Adds 1 to offset the background class which has index 0.
+    nmsed_classes += 1
+
+    return {
+        'num_detections': valid_detections,
+        'detection_boxes': nmsed_boxes,
+        'detection_classes': nmsed_classes,
+        'detection_scores': nmsed_scores,
+        'detection_attributes': nmsed_attributes,
+    }
+
+  def get_config(self):
+    """Returns the configuration dictionary of this object.
+
+    A shallow copy is returned so that a caller adding, removing or rebinding
+    top-level keys on the result cannot alter the configuration stored on the
+    instance. Nested values (for example a nested `dict`) are still shared
+    with the internal state.
+
+    Returns:
+      A `dict` with the same key/value pairs as `self._config_dict`.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds an instance of `cls` from a configuration mapping.
+
+    Args:
+      config: A mapping whose items are forwarded to the constructor as
+        keyword arguments.
+
+    Returns:
+      The newly constructed `cls` instance.
+    """
+    instance = cls(**config)
+    return instance
--- a/official/vision/modeling/layers/detection_generator_test.py
+++ b/official/vision/modeling/layers/detection_generator_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for detection_generator.py."""
+# Import libraries
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.modeling.layers import detection_generator
+from official.vision.ops import anchor
+
+
+class SelectTopKScoresTest(tf.test.TestCase):
+  """Checks `detection_generator._select_top_k_scores` on a tiny input."""
+
+  def testSelectTopKScores(self):
+    """Selects the top 2 entries from a `[1, 4, 2]` score tensor."""
+    num_kept = 2
+    scores_in = tf.constant(
+        [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]], dtype=tf.float32)
+
+    selected = detection_generator._select_top_k_scores(
+        scores_in, pre_nms_num_detections=num_kept)
+    top_k_scores, top_k_indices = selected
+
+    # For each column of the input, the two largest values along axis 1 and
+    # the row positions they were taken from.
+    want_scores = np.array([[[0.5, 0.9], [0.3, 0.5]]], dtype=np.float32)
+    want_indices = [[[2, 1], [3, 3]]]
+
+    self.assertAllEqual(top_k_scores.numpy(), want_scores)
+    self.assertAllEqual(top_k_indices.numpy(), want_indices)
+
+
+class DetectionGeneratorTest(
+    parameterized.TestCase, tf.test.TestCase):
+  """Output-shape and config round-trip tests for `DetectionGenerator`."""
+
+  @parameterized.product(
+      nms_version=['batched', 'v1', 'v2'],
+      use_cpu_nms=[True, False],
+      soft_nms_sigma=[None, 0.1])
+  def testDetectionsOutputShape(self, nms_version, use_cpu_nms, soft_nms_sigma):
+    """Checks the shapes of all four outputs for each parameter combination."""
+    batch_size = 1
+    num_rows = 84
+    num_classes = 4
+    max_num_detections = 10
+
+    generator_kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 5000,
+        'pre_nms_score_threshold': 0.01,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma,
+    }
+    generator = detection_generator.DetectionGenerator(**generator_kwargs)
+
+    def with_batch_axis(flat_values):
+      # Converts a [num_rows, depth] array into a float32 [1, num_rows, depth]
+      # tensor.
+      as_tensor = tf.convert_to_tensor(flat_values, dtype=tf.float32)
+      return tf.reshape(as_tensor, [1, num_rows, flat_values.shape[-1]])
+
+    # Random inputs: class outputs of width `num_classes` scaled into
+    # [-1.5, 1.5), box outputs of width 4 * num_classes, anchors of width 4.
+    class_outputs = with_batch_axis(
+        (np.random.rand(num_rows, num_classes) - 0.5) * 3)
+    box_outputs = with_batch_axis(np.random.rand(num_rows, 4 * num_classes))
+    anchor_boxes = with_batch_axis(np.random.rand(num_rows, 4))
+
+    image_info = tf.constant(
+        [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
+        dtype=tf.float32)
+    info_row = image_info[:, 1, :]
+
+    results = generator(box_outputs, class_outputs, anchor_boxes, info_row)
+
+    expected_shapes = {
+        'detection_boxes': (batch_size, max_num_detections, 4),
+        'detection_scores': (batch_size, max_num_detections,),
+        'detection_classes': (batch_size, max_num_detections,),
+        'num_detections': (batch_size,),
+    }
+    for output_key, want_shape in expected_shapes.items():
+      self.assertEqual(results[output_key].numpy().shape, want_shape)
+
+  def test_serialize_deserialize(self):
+    """`get_config` equals the constructor kwargs and survives a round trip."""
+    init_kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 1000,
+        'pre_nms_score_threshold': 0.1,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': 10,
+        'nms_version': 'v2',
+        'use_cpu_nms': False,
+        'soft_nms_sigma': None,
+    }
+    generator = detection_generator.DetectionGenerator(**init_kwargs)
+    self.assertEqual(generator.get_config(), dict(init_kwargs))
+
+    rebuilt = detection_generator.DetectionGenerator.from_config(
+        generator.get_config())
+    self.assertAllEqual(generator.get_config(), rebuilt.get_config())
+
+
+class MultilevelDetectionGeneratorTest(
+    parameterized.TestCase, tf.test.TestCase):
+  """Output-shape and config tests for `MultilevelDetectionGenerator`."""
+
+  @parameterized.parameters(
+      ('batched', False, True, None, None),
+      ('batched', False, False, None, None),
+      ('v2', False, True, None, None),
+      ('v2', False, False, None, None),
+      ('v1', True, True, 0.0, None),
+      ('v1', True, False, 0.1, None),
+      ('v1', True, False, None, None),
+      ('tflite', False, False, None, True),
+      ('tflite', False, False, None, False),
+  )
+  def testDetectionsOutputShape(self, nms_version, has_att_heads, use_cpu_nms,
+                                soft_nms_sigma, use_regular_nms):
+    """Checks output shapes for each NMS version, with and without attributes."""
+    batch_size = 1
+    num_classes = 4
+    max_num_detections = 10
+    num_rows = 84
+
+    # Anchor generator settings.
+    min_level = 4
+    max_level = 6
+    num_scales = 2
+    aspect_ratios = [1.0, 2.0]
+    anchor_scale = 2.0
+    output_size = [64, 64]
+
+    generator_kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 5000,
+        'pre_nms_score_threshold': 0.01,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': max_num_detections,
+        'nms_version': nms_version,
+        'use_cpu_nms': use_cpu_nms,
+        'soft_nms_sigma': soft_nms_sigma,
+        'tflite_post_processing_config': {
+            'max_detections': max_num_detections,
+            'max_classes_per_detection': 1,
+            'use_regular_nms': use_regular_nms,
+            'nms_score_threshold': 0.01,
+            'nms_iou_threshold': 0.5
+        },
+    }
+
+    input_anchor = anchor.build_anchor_generator(
+        min_level, max_level, num_scales, aspect_ratios, anchor_scale)
+    anchor_boxes = input_anchor(output_size)
+
+    # (level key, first row, end row, spatial size). The row ranges split the
+    # 84 flat rows into 8*8 + 4*4 + 2*2.
+    level_layout = (('4', 0, 64, 8), ('5', 64, 80, 4), ('6', 80, 84, 2))
+
+    def split_by_level(flat_values):
+      # Slices a [num_rows, depth] array per level and reshapes every slice to
+      # a float32 [1, size, size, depth] tensor.
+      depth = flat_values.shape[-1]
+      return {
+          level: tf.reshape(
+              tf.convert_to_tensor(
+                  flat_values[first:end], dtype=tf.float32),
+              [1, size, size, depth])
+          for level, first, end, size in level_layout
+      }
+
+    # Random class outputs scaled into [-1.5, 1.5) and random box outputs.
+    class_outputs = split_by_level(
+        (np.random.rand(num_rows, num_classes) - 0.5) * 3)
+    box_outputs = split_by_level(np.random.rand(num_rows, 4))
+
+    att_outputs = None
+    if has_att_heads:
+      # A single random attribute named 'depth' with one value per row.
+      att_outputs = {'depth': split_by_level(np.random.rand(num_rows, 1))}
+
+    image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
+                             dtype=tf.float32)
+    generator = detection_generator.MultilevelDetectionGenerator(
+        **generator_kwargs)
+    results = generator(box_outputs, class_outputs, anchor_boxes,
+                        image_info[:, 1, :], att_outputs)
+
+    if nms_version == 'tflite':
+      # When nms_version is `tflite`, all output tensors are empty as the actual
+      # post-processing happens in the TFLite model.
+      expected_shapes = {
+          'detection_boxes': (),
+          'detection_scores': (),
+          'detection_classes': (),
+          'num_detections': (),
+      }
+    else:
+      expected_shapes = {
+          'detection_boxes': (batch_size, max_num_detections, 4),
+          'detection_scores': (batch_size, max_num_detections),
+          'detection_classes': (batch_size, max_num_detections),
+          'num_detections': (batch_size,),
+      }
+
+    for output_key, want_shape in expected_shapes.items():
+      self.assertEqual(results[output_key].numpy().shape, want_shape)
+
+    if nms_version != 'tflite' and has_att_heads:
+      for att in results['detection_attributes'].values():
+        self.assertEqual(att.numpy().shape,
+                         (batch_size, max_num_detections, 1))
+
+  def test_serialize_deserialize(self):
+    """`get_config` equals the constructor kwargs and survives a round trip."""
+    init_kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 1000,
+        'pre_nms_score_threshold': 0.1,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': 10,
+        'nms_version': 'v2',
+        'use_cpu_nms': False,
+        'soft_nms_sigma': None,
+        'tflite_post_processing_config': {
+            'max_detections': 100,
+            'max_classes_per_detection': 1,
+            'use_regular_nms': True,
+            'nms_score_threshold': 0.01,
+            'nms_iou_threshold': 0.5
+        },
+    }
+    generator = detection_generator.MultilevelDetectionGenerator(**init_kwargs)
+    self.assertEqual(generator.get_config(), dict(init_kwargs))
+
+    rebuilt = detection_generator.MultilevelDetectionGenerator.from_config(
+        generator.get_config())
+    self.assertAllEqual(generator.get_config(), rebuilt.get_config())
+
+
+# Hands control to `tf.test.main()` when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/modeling/layers/mask_sampler.py
+++ b/official/vision/modeling/layers/mask_sampler.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains definitions of mask sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.ops import spatial_transform_ops
+
+
+def _sample_and_crop_foreground_masks(candidate_rois: tf.Tensor,
+                                      candidate_gt_boxes: tf.Tensor,
+                                      candidate_gt_classes: tf.Tensor,
+                                      candidate_gt_indices: tf.Tensor,
+                                      gt_masks: tf.Tensor,
+                                      num_sampled_masks: int = 128,
+                                      mask_target_size: int = 28):
+  """Picks foreground RoIs and crops their groundtruth masks for training.
+
+  Args:
+    candidate_rois: A `tf.Tensor` of shape [batch_size, N, 4], where N is the
+      number of candidate RoIs considered for mask sampling. It contains both
+      positive and negative RoIs.
+    candidate_gt_boxes: A `tf.Tensor` of shape [batch_size, N, 4] holding the
+      groundtruth box matched to each entry of `candidate_rois`.
+    candidate_gt_classes: A `tf.Tensor` of shape [batch_size, N] holding the
+      groundtruth class matched to each entry of `candidate_rois`. Entries
+      greater than 0 are treated as foreground; 0 is the background class,
+      i.e. negative RoIs.
+    candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N] holding, for
+      each candidate, the index of its groundtruth instance along the
+      MAX_INSTANCES axis of `gt_masks`. Entries equal to -1 are replaced by 0
+      before the lookup.
+    gt_masks: A `tf.Tensor` of shape [batch_size, MAX_INSTANCES, mask_height,
+      mask_width] containing all groundtruth masks that the sampled masks are
+      drawn from.
+    num_sampled_masks: An `int` number of masks to sample per image, called K
+      below.
+    mask_target_size: An `int` side length of the cropped output masks.
+
+  Returns:
+    foreground_rois: A `tf.Tensor` of shape [batch_size, K, 4] with the RoIs
+      of the sampled masks.
+    foreground_classes: A `tf.Tensor` of shape [batch_size, K] with the
+      classes of the sampled masks.
+    cropped_foreground_masks: A `tf.Tensor` of shape
+      [batch_size, K, mask_target_size, mask_target_size] with the cropped
+      masks used for training.
+  """
+
+  def _with_batch_index(per_image_indices):
+    # Pairs every entry of a [batch, K] index tensor with its batch position,
+    # giving [batch, K, 2] indices for `tf.gather_nd`.
+    dims = tf.shape(per_image_indices)
+    batch_column = tf.expand_dims(tf.range(dims[0]), axis=-1)
+    batch_grid = batch_column * tf.ones([1, dims[-1]], dtype=tf.int32)
+    return tf.stack([batch_grid, per_image_indices], axis=-1)
+
+  # `top_k` over a 0/1 foreground indicator lists foreground positions first.
+  is_foreground = tf.cast(
+      tf.greater(candidate_gt_classes, 0), dtype=tf.int32)
+  _, sampled_positions = tf.nn.top_k(is_foreground, k=num_sampled_masks)
+  candidate_lookup = _with_batch_index(sampled_positions)
+
+  foreground_rois = tf.gather_nd(candidate_rois, candidate_lookup)
+  foreground_boxes = tf.gather_nd(candidate_gt_boxes, candidate_lookup)
+  foreground_classes = tf.gather_nd(candidate_gt_classes, candidate_lookup)
+  sampled_gt_indices = tf.gather_nd(candidate_gt_indices, candidate_lookup)
+
+  # Replace -1 entries with 0 before they are used as gather indices.
+  sampled_gt_indices = tf.where(
+      tf.equal(sampled_gt_indices, -1),
+      tf.zeros_like(sampled_gt_indices),
+      sampled_gt_indices)
+  foreground_masks = tf.gather_nd(
+      gt_masks, _with_batch_index(sampled_gt_indices))
+
+  cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
+      foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
+      sample_offset=0.5)
+
+  return foreground_rois, foreground_classes, cropped_foreground_masks
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskSampler(tf.keras.layers.Layer):
+  """Samples and creates mask training targets."""
+
+  def __init__(self, mask_target_size: int, num_sampled_masks: int, **kwargs):
+    """Initializes the mask sampler.
+
+    Args:
+      mask_target_size: An `int` side length of the cropped output masks.
+      num_sampled_masks: An `int` number of masks to sample per image.
+      **kwargs: Additional keyword arguments passed to the base layer.
+    """
+    self._config_dict = {
+        'mask_target_size': mask_target_size,
+        'num_sampled_masks': num_sampled_masks,
+    }
+    super(MaskSampler, self).__init__(**kwargs)
+
+  def call(self, candidate_rois: tf.Tensor, candidate_gt_boxes: tf.Tensor,
+           candidate_gt_classes: tf.Tensor, candidate_gt_indices: tf.Tensor,
+           gt_masks: tf.Tensor):
+    """Samples and creates mask targets for training.
+
+    Args:
+      candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is
+        the number of candidate RoIs to be considered for mask sampling. It
+        includes both positive and negative RoIs. `num_sampled_masks` RoIs
+        will be sampled to create mask training targets.
+      candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
+        the corresponding groundtruth boxes to the `candidate_rois`.
+      candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing
+        the corresponding groundtruth classes to the `candidate_rois`. 0 in the
+        tensor corresponds to the background class, i.e. negative RoIs.
+      candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
+        corresponding groundtruth instance indices to the `candidate_gt_boxes`,
+        i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
+        where gt_boxes, of shape [batch_size, MAX_INSTANCES, 4], is the
+        superset of candidate_gt_boxes.
+      gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
+        mask_width] containing all the groundtruth masks which sample masks are
+        drawn from.
+
+    Returns:
+      foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
+        RoI that corresponds to the sampled foreground masks, where
+        K = num_sampled_masks.
+      foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
+        classes corresponding to the sampled foreground masks.
+      cropped_foreground_masks: A `tf.Tensor` of shape of
+        [batch_size, K, mask_target_size, mask_target_size] storing the
+        cropped foreground masks used for training.
+    """
+    foreground_rois, foreground_classes, cropped_foreground_masks = (
+        _sample_and_crop_foreground_masks(
+            candidate_rois,
+            candidate_gt_boxes,
+            candidate_gt_classes,
+            candidate_gt_indices,
+            gt_masks,
+            self._config_dict['num_sampled_masks'],
+            self._config_dict['mask_target_size']))
+    return foreground_rois, foreground_classes, cropped_foreground_masks
+
+  def get_config(self):
+    """Returns the constructor arguments stored by `__init__`.
+
+    A shallow copy is returned so that callers mutating the result cannot
+    change the configuration held by this layer.
+
+    Returns:
+      A `dict` with the keys `mask_target_size` and `num_sampled_masks`.
+    """
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a `MaskSampler` from a `dict` produced by `get_config`."""
+    return cls(**config)
--- a/official/vision/modeling/layers/nn_blocks.py
+++ b/official/vision/modeling/layers/nn_blocks.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common building blocks for neural networks."""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text
+
+# Import libraries
+from absl import logging
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.modeling.layers import nn_layers
+
+
+def _pad_strides(strides: int, axis: int) -> Tuple[int, int, int, int]:
+  """Expands an int stride to the length-4 form used by `tf.nn.avg_pool`.
+
+  Args:
+    strides: An `int` stride repeated in two adjacent positions of the result.
+    axis: An `int`. When it equals 1 the stride occupies the last two
+      positions; for any other value it occupies the two middle positions.
+
+  Returns:
+    A 4-tuple: `(1, 1, strides, strides)` if `axis == 1`, otherwise
+    `(1, strides, strides, 1)`.
+  """
+  axis_is_one = axis == 1
+  return ((1, 1, strides, strides) if axis_is_one
+          else (1, strides, strides, 1))
+
+
+def _maybe_downsample(x: tf.Tensor, out_filter: int, strides: int,
+                      axis: int) -> tf.Tensor:
+  """Downsamples feature map and 0-pads channels if in_filter < out_filter.
+
+  Args:
+    x: A `tf.Tensor` feature map, read as 'NCHW' when `axis == 1` and as
+      'NHWC' otherwise.
+    out_filter: An `int` number of channels the output should have.
+    strides: An `int` used as both the window size and the stride of the
+      average pooling.
+    axis: An `int` index of the channel dimension of `x`.
+
+  Returns:
+    The average-pooled tensor. When it has fewer than `out_filter` channels,
+    the channel dimension is zero-padded up to exactly `out_filter`; if the
+    number of missing channels is odd, the extra channel is appended at the
+    end. No channels are removed when the input already has `out_filter` or
+    more.
+  """
+  data_format = 'NCHW' if axis == 1 else 'NHWC'
+  strides = _pad_strides(strides, axis=axis)
+
+  x = tf.nn.avg_pool(x, strides, strides, 'VALID', data_format=data_format)
+
+  in_filter = x.shape[axis]
+  if in_filter < out_filter:
+    # Pad on channel dimension with 0s, split as evenly as possible. Using
+    # `// 2` for both sides would leave the result one channel short whenever
+    # the difference is odd, so the remainder goes to the trailing side.
+    missing = out_filter - in_filter
+    pad_before = missing // 2
+    pad_size = [pad_before, missing - pad_before]
+    if axis == 1:
+      x = tf.pad(x, [[0, 0], pad_size, [0, 0], [0, 0]])
+    else:
+      x = tf.pad(x, [[0, 0], [0, 0], [0, 0], pad_size])
+
+  # NOTE(review): adding 0. looks like a deliberate identity op so that a new
+  # tensor is always returned; confirm before removing.
+  return x + 0.
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualBlock(tf.keras.layers.Layer):
+  """A residual block.
+
+  The main branch is conv3x3 -> norm -> activation -> conv3x3 -> norm, followed
+  by optional squeeze-and-excitation and optional stochastic depth. Its output
+  is added to the shortcut and passed through the activation.
+  """
+
+  def __init__(self,
+               filters,
+               strides,
+               use_projection=False,
+               se_ratio=None,
+               resnetd_shortcut=False,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_explicit_padding: bool = False,
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               bn_trainable=True,
+               **kwargs):
+    """Initializes a residual block with BN after convolutions.
+
+    Args:
+      filters: An `int` number of filters used by both 3x3 convolutions and,
+        when enabled, by the 1x1 projection shortcut.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      use_projection: A `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+        The layer is only created when the ratio is in (0, 1].
+      resnetd_shortcut: A `bool` if True, apply the resnetd style modification
+        to the shortcut connection. Not implemented in residual blocks; the
+        value is only stored and reported by `get_config`.
+      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_explicit_padding: Use 'VALID' padding for the first convolution, but
+        prepad inputs so that the output dimensions are the same as if 'SAME'
+        padding were used.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      bn_trainable: A `bool` that indicates whether batch norm layers should be
+        trainable. Default to True.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(ResidualBlock, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._strides = strides
+    self._use_projection = use_projection
+    self._se_ratio = se_ratio
+    self._resnetd_shortcut = resnetd_shortcut
+    self._use_explicit_padding = use_explicit_padding
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    # Normalization layer class, instantiated once per convolution in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the active Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+    self._bn_trainable = bn_trainable
+
+  def build(self, input_shape):
+    """Creates the sub-layers of the block.
+
+    Args:
+      input_shape: The shape of the input, forwarded to the base class `build`.
+    """
+    if self._use_projection:
+      # 1x1 strided convolution plus normalization on the shortcut branch.
+      self._shortcut = tf.keras.layers.Conv2D(
+          filters=self._filters,
+          kernel_size=1,
+          strides=self._strides,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon,
+          trainable=self._bn_trainable)
+
+    conv1_padding = 'same'
+    # explicit padding here is added for centernet
+    if self._use_explicit_padding:
+      self._pad = tf.keras.layers.ZeroPadding2D(padding=(1, 1))
+      conv1_padding = 'valid'
+
+    # First 3x3 convolution; it carries the block stride.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        padding=conv1_padding,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+
+    # Second 3x3 convolution; always stride 1 with 'same' padding.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+
+    # Squeeze-and-excitation is only enabled for a ratio in (0, 1].
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._filters,
+          out_filters=self._filters,
+          se_ratio=self._se_ratio,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    # A drop rate of None or 0 disables stochastic depth.
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+
+    super(ResidualBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config merged with this block's arguments."""
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'use_projection': self._use_projection,
+        'se_ratio': self._se_ratio,
+        'resnetd_shortcut': self._resnetd_shortcut,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_explicit_padding': self._use_explicit_padding,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'bn_trainable': self._bn_trainable
+    }
+    base_config = super(ResidualBlock, self).get_config()
+    # Entries of `config` take precedence over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Runs the block on `inputs`.
+
+    Args:
+      inputs: The input tensor of the block.
+      training: Optional training flag. Here it is passed explicitly only to
+        the stochastic depth layer.
+
+    Returns:
+      The activation applied to the sum of the main branch and the shortcut.
+    """
+    shortcut = inputs
+    if self._use_projection:
+      shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    # The shortcut above uses the unpadded input; only conv1 sees the padding.
+    if self._use_explicit_padding:
+      inputs = self._pad(inputs)
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    return self._activation_fn(x + shortcut)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock(tf.keras.layers.Layer):
+  """A standard bottleneck block.
+
+  The main branch is a 1x1 convolution, a 3x3 convolution (carrying the block
+  stride and dilation rate) and a final 1x1 convolution producing
+  `4 * filters` channels, each followed by batch normalization. Optional
+  squeeze-and-excitation and stochastic depth are applied before the result is
+  added to the shortcut and passed through the final activation.
+  """
+
+  def __init__(self,
+               filters,
+               strides,
+               dilation_rate=1,
+               use_projection=False,
+               se_ratio=None,
+               resnetd_shortcut=False,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               bn_trainable=True,
+               **kwargs):
+    """Initializes a standard bottleneck block with BN after convolutions.
+
+    Args:
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      dilation_rate: An `int` dilation_rate of convolutions. Default to 1.
+      use_projection: A `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+        The layer is only created when the ratio is in the range (0, 1].
+      resnetd_shortcut: A `bool`. If True, apply the resnetd style modification
+        to the shortcut connection. Only takes effect when `use_projection` is
+        True.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      bn_trainable: A `bool` that indicates whether batch norm layers should be
+        trainable. Default to True.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(BottleneckBlock, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._strides = strides
+    self._dilation_rate = dilation_rate
+    self._use_projection = use_projection
+    self._se_ratio = se_ratio
+    self._resnetd_shortcut = resnetd_shortcut
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    # Only the normalization layer class is stored here; the instances are
+    # created in build().
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the active image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._bn_trainable = bn_trainable
+
+  def build(self, input_shape):
+    """Creates the sub-layers of the block.
+
+    Args:
+      input_shape: The shape of the input, forwarded to the parent `build`.
+    """
+    # Shortcut branch; only created for projection shortcuts.
+    if self._use_projection:
+      if self._resnetd_shortcut:
+        # ResNet-D style: the average pooling carries the stride and the 1x1
+        # projection convolution runs with stride 1.
+        self._shortcut0 = tf.keras.layers.AveragePooling2D(
+            pool_size=2, strides=self._strides, padding='same')
+        self._shortcut1 = tf.keras.layers.Conv2D(
+            filters=self._filters * 4,
+            kernel_size=1,
+            strides=1,
+            use_bias=False,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer)
+      else:
+        # Plain projection: a single strided 1x1 convolution.
+        self._shortcut = tf.keras.layers.Conv2D(
+            filters=self._filters * 4,
+            kernel_size=1,
+            strides=self._strides,
+            use_bias=False,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer)
+
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon,
+          trainable=self._bn_trainable)
+
+    # First 1x1 convolution with `filters` output channels.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+    self._activation1 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # 3x3 convolution; this is the layer that applies the block stride and the
+    # dilation rate.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        dilation_rate=self._dilation_rate,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+    self._activation2 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # Final 1x1 convolution expanding to `4 * filters` channels, matching the
+    # projection shortcut.
+    self._conv3 = tf.keras.layers.Conv2D(
+        filters=self._filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm3 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon,
+        trainable=self._bn_trainable)
+    # Applied after the residual addition in call().
+    self._activation3 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # Squeeze-and-excitation is enabled only for ratios in (0, 1].
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._filters * 4,
+          out_filters=self._filters * 4,
+          se_ratio=self._se_ratio,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    # A drop rate of None or 0 disables stochastic depth.
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+    self._add = tf.keras.layers.Add()
+
+    super(BottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config extended with the constructor arguments."""
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'dilation_rate': self._dilation_rate,
+        'use_projection': self._use_projection,
+        'se_ratio': self._se_ratio,
+        'resnetd_shortcut': self._resnetd_shortcut,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'bn_trainable': self._bn_trainable
+    }
+    base_config = super(BottleneckBlock, self).get_config()
+    # Entries of `config` override same-named entries of the base config.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Applies the bottleneck block to `inputs`.
+
+    Args:
+      inputs: The input tensor of the block.
+      training: Optional flag forwarded to the stochastic depth layer.
+
+    Returns:
+      The final activation applied to the sum of the main branch output and
+      the shortcut.
+    """
+    # Shortcut branch: identity unless a projection shortcut is configured.
+    shortcut = inputs
+    if self._use_projection:
+      if self._resnetd_shortcut:
+        shortcut = self._shortcut0(shortcut)
+        shortcut = self._shortcut1(shortcut)
+      else:
+        shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation1(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+    x = self._activation2(x)
+
+    # No activation after the last normalization; it is applied after the
+    # residual addition below.
+    x = self._conv3(x)
+    x = self._norm3(x)
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    x = self._add([x, shortcut])
+    return self._activation3(x)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class InvertedBottleneckBlock(tf.keras.layers.Layer):
+  """An inverted bottleneck block."""
+
+  def __init__(self,
+               in_filters,
+               out_filters,
+               expand_ratio,
+               strides,
+               kernel_size=3,
+               se_ratio=None,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               se_inner_activation='relu',
+               se_gating_activation='sigmoid',
+               se_round_down_protect=True,
+               expand_se_in_filters=False,
+               depthwise_activation=None,
+               use_sync_bn=False,
+               dilation_rate=1,
+               divisible_by=1,
+               regularize_depthwise=False,
+               use_depthwise=True,
+               use_residual=True,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               output_intermediate_endpoints=False,
+               **kwargs):
+    """Initializes an inverted bottleneck block with BN after convolutions.
+
+    Args:
+      in_filters: An `int` number of filters of the input tensor.
+      out_filters: An `int` number of filters of the output tensor.
+      expand_ratio: An `int` of expand_ratio for an inverted bottleneck block.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: An `int` kernel_size of the depthwise conv layer.
+      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
+        excitation layer.
+      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      se_inner_activation: A `str` name of squeeze-excitation inner activation.
+      se_gating_activation: A `str` name of squeeze-excitation gating
+        activation.
+      se_round_down_protect: A `bool` of whether round down more than 10%
+        will be allowed in SE layer.
+      expand_se_in_filters: A `bool` of whether or not to expand in_filter in
+        squeeze and excitation layer.
+      depthwise_activation: A `str` name of the activation function for
+        depthwise only.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      dilation_rate: An `int` that specifies the dilation rate to use for.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
+        this number.
+      dilated convolution: An `int` to specify the same value for all spatial
+        dimensions.
+      regularize_depthwise: A `bool` of whether or not apply regularization on
+        depthwise.
+      use_depthwise: A `bool` of whether to uses fused convolutions instead of
+        depthwise.
+      use_residual: A `bool` of whether to include residual connection between
+        input and output.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      output_intermediate_endpoints: A `bool` of whether or not output the
+        intermediate endpoints.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(InvertedBottleneckBlock, self).__init__(**kwargs)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._expand_ratio = expand_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._dilation_rate = dilation_rate
+    self._use_sync_bn = use_sync_bn
+    self._regularize_depthwise = regularize_depthwise
+    self._use_depthwise = use_depthwise
+    self._use_residual = use_residual
+    self._activation = activation
+    self._se_inner_activation = se_inner_activation
+    self._se_gating_activation = se_gating_activation
+    self._depthwise_activation = depthwise_activation
+    self._se_round_down_protect = se_round_down_protect
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._expand_se_in_filters = expand_se_in_filters
+    self._output_intermediate_endpoints = output_intermediate_endpoints
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    if not depthwise_activation:
+      self._depthwise_activation = activation
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None
+
+  def build(self, input_shape):
+    expand_filters = self._in_filters
+    if self._expand_ratio > 1:
+      # First 1x1 conv for channel expansion.
+      expand_filters = nn_layers.make_divisible(
+          self._in_filters * self._expand_ratio, self._divisible_by)
+
+      expand_kernel = 1 if self._use_depthwise else self._kernel_size
+      expand_stride = 1 if self._use_depthwise else self._strides
+
+      self._conv0 = tf.keras.layers.Conv2D(
+          filters=expand_filters,
+          kernel_size=expand_kernel,
+          strides=expand_stride,
+          padding='same',
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+      self._activation_layer = tf_utils.get_activation(
+          self._activation, use_keras_layer=True)
+
+    if self._use_depthwise:
+      # Depthwise conv.
+      self._conv1 = tf.keras.layers.DepthwiseConv2D(
+          kernel_size=(self._kernel_size, self._kernel_size),
+          strides=self._strides,
+          padding='same',
+          depth_multiplier=1,
+          dilation_rate=self._dilation_rate,
+          use_bias=False,
+          depthwise_initializer=self._kernel_initializer,
+          depthwise_regularizer=self._depthsize_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm1 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+      self._depthwise_activation_layer = tf_utils.get_activation(
+          self._depthwise_activation, use_keras_layer=True)
+
+    # Squeeze and excitation.
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      logging.info('Use Squeeze and excitation.')
+      in_filters = self._in_filters
+      if self._expand_se_in_filters:
+        in_filters = expand_filters
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=in_filters,
+          out_filters=expand_filters,
+          se_ratio=self._se_ratio,
+          divisible_by=self._divisible_by,
+          round_down_protect=self._se_round_down_protect,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          activation=self._se_inner_activation,
+          gating_activation=self._se_gating_activation)
+    else:
+      self._squeeze_excitation = None
+
+    # Last 1x1 conv.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+    self._add = tf.keras.layers.Add()
+
+    super(InvertedBottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
+        'expand_ratio': self._expand_ratio,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'se_ratio': self._se_ratio,
+        'divisible_by': self._divisible_by,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'se_inner_activation': self._se_inner_activation,
+        'se_gating_activation': self._se_gating_activation,
+        'se_round_down_protect': self._se_round_down_protect,
+        'expand_se_in_filters': self._expand_se_in_filters,
+        'depthwise_activation': self._depthwise_activation,
+        'dilation_rate': self._dilation_rate,
+        'use_sync_bn': self._use_sync_bn,
+        'regularize_depthwise': self._regularize_depthwise,
+        'use_depthwise': self._use_depthwise,
+        'use_residual': self._use_residual,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(InvertedBottleneckBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    endpoints = {}
+    shortcut = inputs
+    if self._expand_ratio > 1:
+      x = self._conv0(inputs)
+      x = self._norm0(x)
+      x = self._activation_layer(x)
+    else:
+      x = inputs
+
+    if self._use_depthwise:
+      x = self._conv1(x)
+      x = self._norm1(x)
+      x = self._depthwise_activation_layer(x)
+      if self._output_intermediate_endpoints:
+        endpoints['depthwise'] = x
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    if (self._use_residual and self._in_filters == self._out_filters and
+        self._strides == 1):
+      if self._stochastic_depth:
+        x = self._stochastic_depth(x, training=training)
+      x = self._add([x, shortcut])
+
+    if self._output_intermediate_endpoints:
+      return x, endpoints
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualInner(tf.keras.layers.Layer):
+  """Creates a single inner block of a residual.
+
+  This corresponds to `F`/`G` functions in the RevNet paper:
+  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+  The Reversible Residual Network: Backpropagation Without Storing Activations.
+  (https://arxiv.org/pdf/1707.04585.pdf)
+
+  The block consists of two 3x3 convolutions, each preceded by batch
+  normalization and activation (the first pair is optional). It does not add a
+  skip connection itself.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      strides: int,
+      kernel_initializer: Union[str, Callable[
+          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      batch_norm_first: bool = True,
+      **kwargs):
+    """Initializes a ResidualInner.
+
+    Args:
+      filters: An `int` of output filter size.
+      strides: An `int` of stride size for convolution for the residual block.
+      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
+        instance for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: A `str` or `callable` instance of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      batch_norm_first: A `bool` of whether to apply activation and batch norm
+        before conv.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(ResidualInner, self).__init__(**kwargs)
+
+    # `strides` and `filters` are public attributes; `ReversibleLayer.call`
+    # reads `strides` from its inner layers.
+    self.strides = strides
+    self.filters = filters
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self._kernel_regularizer = kernel_regularizer
+    # Resolved Keras activation; only reported by get_config(). The function
+    # applied in call() is `self._activation_fn` below.
+    self._activation = tf.keras.activations.get(activation)
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._batch_norm_first = batch_norm_first
+
+    # Only the normalization layer class is stored here; the instances are
+    # created in build().
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+
+    # Normalize over the channel axis of the active image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: tf.TensorShape):
+    """Creates the normalization and convolution sub-layers.
+
+    Args:
+      input_shape: The shape of the input, forwarded to the parent `build`.
+    """
+    # The leading batch norm only exists when `batch_norm_first` is set.
+    if self._batch_norm_first:
+      self._batch_norm_0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    # First 3x3 convolution; carries the stride of the block.
+    self._conv2d_1 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=self.strides,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    self._batch_norm_1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Second 3x3 convolution with stride 1.
+    self._conv2d_2 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    super(ResidualInner, self).build(input_shape)
+
+  def get_config(self) -> Dict[str, Any]:
+    """Returns the base layer config extended with the constructor arguments.
+
+    Note that `kernel_initializer` and `activation` are reported as the
+    objects resolved in `__init__`, not as the values originally passed in.
+    """
+    config = {
+        'filters': self.filters,
+        'strides': self.strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'batch_norm_first': self._batch_norm_first,
+    }
+    base_config = super(ResidualInner, self).get_config()
+    # Entries of `config` override same-named entries of the base config.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+    """Applies (BN -> activation ->) conv -> BN -> activation -> conv.
+
+    Args:
+      inputs: The input tensor.
+      training: Optional flag forwarded to the batch normalization layers.
+
+    Returns:
+      The output of the second convolution; no skip connection is added here.
+    """
+    x = inputs
+    if self._batch_norm_first:
+      x = self._batch_norm_0(x, training=training)
+      x = self._activation_fn(x)
+    x = self._conv2d_1(x)
+
+    x = self._batch_norm_1(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_2(x)
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckResidualInner(tf.keras.layers.Layer):
+  """Creates a single inner block of a bottleneck.
+
+  This corresponds to `F`/`G` functions in the RevNet paper:
+  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+  The Reversible Residual Network: Backpropagation Without Storing Activations.
+  (https://arxiv.org/pdf/1707.04585.pdf)
+
+  The block consists of a 1x1, a 3x3 and a 1x1 convolution, each preceded by
+  batch normalization and activation (the first pair is optional). It does not
+  add a skip connection itself.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      strides: int,
+      kernel_initializer: Union[str, Callable[
+          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      batch_norm_first: bool = True,
+      **kwargs):
+    """Initializes a BottleneckResidualInner.
+
+    Args:
+      filters: An `int` number of filters for the first 2 convolutions. The
+        last convolution, and thus the number of output channels from the
+        bottleneck block, uses `4*filters`.
+      strides: An `int` of stride size for convolution for the residual block.
+      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
+        instance for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: A `str` or `callable` instance of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      batch_norm_first: A `bool` of whether to apply activation and batch norm
+        before conv.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(BottleneckResidualInner, self).__init__(**kwargs)
+
+    # `strides` and `filters` are public attributes; `ReversibleLayer.call`
+    # reads `strides` from its inner layers.
+    self.strides = strides
+    self.filters = filters
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self._kernel_regularizer = kernel_regularizer
+    # Resolved Keras activation; only reported by get_config(). The function
+    # applied in call() is `self._activation_fn` below.
+    self._activation = tf.keras.activations.get(activation)
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._batch_norm_first = batch_norm_first
+
+    # Only the normalization layer class is stored here; the instances are
+    # created in build().
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+
+    # Normalize over the channel axis of the active image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: tf.TensorShape):
+    """Creates the normalization and convolution sub-layers.
+
+    Args:
+      input_shape: The shape of the input, forwarded to the parent `build`.
+    """
+    # The leading batch norm only exists when `batch_norm_first` is set.
+    if self._batch_norm_first:
+      self._batch_norm_0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+    # First 1x1 convolution; carries the stride of the block.
+    self._conv2d_1 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=1,
+        strides=self.strides,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._batch_norm_1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    # 3x3 convolution with stride 1.
+    self._conv2d_2 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._batch_norm_2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    # Final 1x1 convolution expanding to `4 * filters` output channels.
+    self._conv2d_3 = tf.keras.layers.Conv2D(
+        filters=self.filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    super(BottleneckResidualInner, self).build(input_shape)
+
+  def get_config(self) -> Dict[str, Any]:
+    """Returns the base layer config extended with the constructor arguments.
+
+    Note that `kernel_initializer` and `activation` are reported as the
+    objects resolved in `__init__`, not as the values originally passed in.
+    """
+    config = {
+        'filters': self.filters,
+        'strides': self.strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'batch_norm_first': self._batch_norm_first,
+    }
+    base_config = super(BottleneckResidualInner, self).get_config()
+    # Entries of `config` override same-named entries of the base config.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+    """Applies the three (BN -> activation -> conv) stages of the block.
+
+    Args:
+      inputs: The input tensor.
+      training: Optional flag forwarded to the batch normalization layers.
+
+    Returns:
+      The output of the last convolution; no skip connection is added here.
+    """
+    x = inputs
+    # The first BN/activation pair is skipped unless `batch_norm_first` is set.
+    if self._batch_norm_first:
+      x = self._batch_norm_0(x, training=training)
+      x = self._activation_fn(x)
+    x = self._conv2d_1(x)
+
+    x = self._batch_norm_1(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_2(x)
+
+    x = self._batch_norm_2(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_3(x)
+
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ReversibleLayer(tf.keras.layers.Layer):
+  """Creates a reversible layer.
+
+  Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
+  layers that are stateless, which in this case are `ResidualInner` layers.
+
+  The forward computation is wrapped in `tf.custom_gradient`. When the layer is
+  reversible (f and g both have stride 1 and the output channel count equals
+  the input channel count) and `manual_grads` is True, the backward pass
+  reconstructs the inputs of f and g from the layer outputs and differentiates
+  them with local gradient tapes. Otherwise it falls back to the gradient tape
+  recorded during the forward pass.
+  """
+
+  def __init__(self,
+               f: tf.keras.layers.Layer,
+               g: tf.keras.layers.Layer,
+               manual_grads: bool = True,
+               **kwargs):
+    """Initializes a ReversibleLayer.
+
+    Args:
+      f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in
+        paper. Each reversible layer consists of two inner functions. For
+        example, in RevNet the reversible residual consists of two f/g inner
+        (bottleneck) residual functions. Where the input to the reversible layer
+        is x, the input gets partitioned in the channel dimension and the
+        forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2),
+        y2 = x2 + g(z1), y1 = stop_gradient(z1).
+      g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in
+        paper. Detailed explanation same as above as `f` arg.
+      manual_grads: A `bool` [Testing Only] of whether to manually take
+        gradients as in Algorithm 1 or defer to autograd.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(ReversibleLayer, self).__init__(**kwargs)
+
+    self._f = f
+    self._g = g
+    self._manual_grads = manual_grads
+
+    # Channel axis along which inputs are split and outputs are concatenated.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._axis = -1
+    else:
+      self._axis = 1
+
+  def get_config(self) -> Dict[str, Any]:
+    """Returns the layer config; `f` and `g` are stored as layer instances."""
+    config = {
+        'f': self._f,
+        'g': self._g,
+        'manual_grads': self._manual_grads,
+    }
+    base_config = super(ReversibleLayer, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def _ckpt_non_trainable_vars(self):
+    """Snapshots the current values of the non-trainable variables of f/g."""
+    self._f_non_trainable_vars = [
+        v.read_value() for v in self._f.non_trainable_variables
+    ]
+    self._g_non_trainable_vars = [
+        v.read_value() for v in self._g.non_trainable_variables
+    ]
+
+  def _load_ckpt_non_trainable_vars(self):
+    """Assigns the snapshotted values back to the non-trainable variables.
+
+    Reads the snapshots written by `_ckpt_non_trainable_vars`, so that method
+    must have been called first.
+    """
+    for v, v_chkpt in zip(self._f.non_trainable_variables,
+                          self._f_non_trainable_vars):
+      v.assign(v_chkpt)
+    for v, v_chkpt in zip(self._g.non_trainable_variables,
+                          self._g_non_trainable_vars):
+      v.assign(v_chkpt)
+
+  def call(self,
+           inputs: tf.Tensor,
+           training: Optional[bool] = None) -> tf.Tensor:
+    """Runs the reversible forward pass with a custom gradient.
+
+    Args:
+      inputs: A `tf.Tensor` that is split into two halves along the channel
+        axis.
+      training: An optional `bool` forwarded to the `f` and `g` layers.
+
+    Returns:
+      A `tf.Tensor` holding [y1; y2] concatenated along the channel axis.
+    """
+
+    @tf.custom_gradient
+    def reversible(
+        x: tf.Tensor
+    ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor],
+                                                List[tf.Tensor]]]]:
+      """Implements Algorithm 1 in the RevNet paper.
+
+         Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+         The Reversible Residual Network: Backpropagation Without Storing
+         Activations.
+         (https://arxiv.org/pdf/1707.04585.pdf)
+
+      Args:
+        x: An input `tf.Tensor`.
+
+      Returns:
+        y: The output [y1; y2] in Algorithm 1.
+        grad_fn: A callable function that computes the gradients.
+      """
+      # The forward tape is kept so that grad_fn can fall back to plain
+      # autodiff when the layer is irreversible or manual grads are disabled.
+      with tf.GradientTape() as fwdtape:
+        fwdtape.watch(x)
+        x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis)
+        f_x2 = self._f(x2, training=training)
+        x1_down = _maybe_downsample(x1, f_x2.shape[self._axis], self._f.strides,
+                                    self._axis)
+        z1 = f_x2 + x1_down
+        g_z1 = self._g(z1, training=training)
+        # NOTE(review): x2 is downsampled with f's strides here as well;
+        # presumably g is expected to have stride 1 -- confirm.
+        x2_down = _maybe_downsample(x2, g_z1.shape[self._axis], self._f.strides,
+                                    self._axis)
+        y2 = x2_down + g_z1
+
+        # Equation 8: https://arxiv.org/pdf/1707.04585.pdf
+        # Decouple y1 and z1 so that their derivatives are different.
+        y1 = tf.identity(z1)
+        y = tf.concat([y1, y2], axis=self._axis)
+
+        # The inputs can only be recovered from the outputs when neither inner
+        # block strides and the channel count is unchanged.
+        irreversible = ((self._f.strides != 1 or self._g.strides != 1) or
+                        (y.shape[self._axis] != inputs.shape[self._axis]))
+
+        # Checkpointing moving mean/variance for batch normalization layers
+        # as they shouldn't be updated during the custom gradient pass of f/g.
+        self._ckpt_non_trainable_vars()
+
+      def grad_fn(
+          dy: tf.Tensor,
+          variables: Optional[List[tf.Variable]] = None,
+      ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
+        """Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
+        # NOTE(review): `variables` defaults to None but is concatenated and
+        # iterated below; this assumes it is always supplied as a list (i.e.
+        # f/g own variables) -- confirm.
+        if irreversible or not self._manual_grads:
+          grads_combined = fwdtape.gradient(
+              y, [x] + variables, output_gradients=dy)
+          dx = grads_combined[0]
+          grad_vars = grads_combined[1:]
+        else:
+          y1_nograd = tf.stop_gradient(y1)
+          y2_nograd = tf.stop_gradient(y2)
+          dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis)
+
+          # Index mapping from self.f/g.trainable_variables to grad_fn
+          # input `variables` kwarg so that we can reorder dwf + dwg
+          # variable gradient list to match `variables` order.
+          f_var_refs = [v.ref() for v in self._f.trainable_variables]
+          g_var_refs = [v.ref() for v in self._g.trainable_variables]
+          fg_var_refs = f_var_refs + g_var_refs
+          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]
+
+          # Algorithm 1 in paper (line # documented in-line)
+          z1 = y1_nograd  # line 2
+          with tf.GradientTape() as gtape:
+            gtape.watch(z1)
+            g_z1 = self._g(z1, training=training)
+          x2 = y2_nograd - g_z1  # line 3
+
+          with tf.GradientTape() as ftape:
+            ftape.watch(x2)
+            f_x2 = self._f(x2, training=training)
+          x1 = z1 - f_x2  # pylint: disable=unused-variable      # line 4
+
+          # Compute gradients
+          g_grads_combined = gtape.gradient(
+              g_z1, [z1] + self._g.trainable_variables, output_gradients=dy2)
+          dz1 = dy1 + g_grads_combined[0]  # line 5
+          dwg = g_grads_combined[1:]  # line 9
+
+          f_grads_combined = ftape.gradient(
+              f_x2, [x2] + self._f.trainable_variables, output_gradients=dz1)
+          dx2 = dy2 + f_grads_combined[0]  # line 6
+          dwf = f_grads_combined[1:]  # line 8
+          dx1 = dz1  # line 7
+
+          # Pack the input and variable gradients.
+          dx = tf.concat([dx1, dx2], axis=self._axis)
+          grad_vars = dwf + dwg
+          # Reorder gradients (trainable_variables to variables kwarg order)
+          grad_vars = [grad_vars[i] for i in self_to_var_index]
+
+          # Restore batch normalization moving mean/variance for correctness.
+          # f and g were re-run above, so their non-trainable variables are
+          # reset to the values snapshotted in the forward pass.
+          self._load_ckpt_non_trainable_vars()
+
+        return dx, grad_vars  # grad_fn end
+
+      return y, grad_fn  # reversible end
+
+    activations = reversible(inputs)
+    return activations
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
+  """Creates a depthwise separable convolution block with batch normalization.
+
+  The block applies a depthwise convolution followed by a pointwise (1x1)
+  convolution; each convolution is followed by batch normalization and the
+  activation function.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      kernel_size: int = 3,
+      strides: int = 1,
+      regularize_depthwise: bool = False,
+      activation: Text = 'relu6',
+      kernel_initializer: Text = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      dilation_rate: int = 1,
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      **kwargs):
+    """Initializes a depthwise separable convolution block.
+
+    Args:
+      filters: An `int` number of output filters of the pointwise (1x1)
+        convolution.
+      kernel_size: An `int` that specifies the height and width of the
+        depthwise convolution window.
+      strides: An `int` stride of the depthwise convolution. If greater than 1,
+        this block will ultimately downsample the input.
+      regularize_depthwise: A `bool`. If True, apply `kernel_regularizer` to
+        the depthwise convolution kernel as well.
+      activation: A `str` name of the activation function.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation
+        rate to use for dilated convolution. Can be a single integer to specify
+        the same value for all spatial dimensions.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
+    self._filters = filters
+    self._kernel_size = kernel_size
+    self._strides = strides
+    self._activation = activation
+    self._regularize_depthwise = regularize_depthwise
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._dilation_rate = dilation_rate
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+    # The depthwise kernel is only regularized when explicitly requested.
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None
+
+  def get_config(self):
+    """Returns the config needed to re-create this block via `from_config`."""
+    config = {
+        'filters': self._filters,
+        # `kernel_size` and `dilation_rate` must be part of the config,
+        # otherwise a round trip silently resets them to their defaults.
+        'kernel_size': self._kernel_size,
+        'strides': self._strides,
+        'regularize_depthwise': self._regularize_depthwise,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'dilation_rate': self._dilation_rate,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(DepthwiseSeparableConvBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the depthwise conv, pointwise conv and their norm layers."""
+
+    self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        depth_multiplier=1,
+        dilation_rate=self._dilation_rate,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._depthsize_regularizer,
+        use_bias=False)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    super(DepthwiseSeparableConvBlock, self).build(input_shape)
+
+  def call(self, inputs, training=None):
+    """Applies depthwise conv -> BN -> act -> pointwise conv -> BN -> act.
+
+    Args:
+      inputs: The input `tf.Tensor`.
+      training: An optional `bool` forwarded to the normalization layers.
+
+    Returns:
+      The output `tf.Tensor` of the block.
+    """
+    x = self._dwconv0(inputs)
+    # Forward `training` explicitly so the normalization layers also honour
+    # it when `call` is invoked directly.
+    x = self._norm0(x, training=training)
+    x = self._activation_fn(x)
+
+    x = self._conv1(x)
+    x = self._norm1(x, training=training)
+    return self._activation_fn(x)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class TuckerConvBlock(tf.keras.layers.Layer):
+  """A Tucker block (generalized bottleneck).
+
+  The block is a 1x1 convolution that compresses the input channels, a
+  `kernel_size` convolution that produces the compressed output channels, and
+  a final 1x1 convolution that produces `out_filters` channels. Every
+  convolution is followed by batch normalization, and the first two are also
+  followed by the activation.
+  """
+
+  def __init__(self,
+               in_filters,
+               out_filters,
+               input_compression_ratio,
+               output_compression_ratio,
+               strides,
+               kernel_size=3,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               divisible_by=1,
+               use_residual=True,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """Initializes a Tucker convolution block with BN after convolutions.
+
+    Args:
+      in_filters: An `int` number of filters of the input tensor.
+      out_filters: An `int` number of filters of the output tensor.
+      input_compression_ratio: A `float` of compression ratio for
+        input filters.
+      output_compression_ratio: A `float` of compression ratio for
+        output filters.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: An `int` kernel_size of the middle convolution layer.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
+        this number.
+      use_residual: A `bool` of whether to include residual connection between
+        input and output.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(TuckerConvBlock, self).__init__(**kwargs)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._input_compression_ratio = input_compression_ratio
+    self._output_compression_ratio = output_compression_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._divisible_by = divisible_by
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._use_sync_bn = use_sync_bn
+    self._use_residual = use_residual
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+
+  def build(self, input_shape):
+    """Creates the three convolutions, their norm layers and the residual ops."""
+    # First 1x1 conv: compresses the input channels.
+    input_compressed_filters = nn_layers.make_divisible(
+        value=self._in_filters * self._input_compression_ratio,
+        divisor=self._divisible_by,
+        round_down_protect=False)
+
+    self._conv0 = tf.keras.layers.Conv2D(
+        filters=input_compressed_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    self._activation_layer0 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # Middle conv: carries the block stride and the `kernel_size` window.
+    output_compressed_filters = nn_layers.make_divisible(
+        value=self._out_filters * self._output_compression_ratio,
+        divisor=self._divisible_by,
+        round_down_protect=False)
+
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=output_compressed_filters,
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    self._activation_layer1 = tf_utils.get_activation(
+        self._activation, use_keras_layer=True)
+
+    # Last 1x1 conv.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+    self._add = tf.keras.layers.Add()
+
+    super(TuckerConvBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the config needed to re-create this block via `from_config`."""
+    config = {
+        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
+        'input_compression_ratio': self._input_compression_ratio,
+        'output_compression_ratio': self._output_compression_ratio,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'divisible_by': self._divisible_by,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'use_residual': self._use_residual,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(TuckerConvBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Runs the block and adds the residual connection when applicable.
+
+    Args:
+      inputs: The input `tf.Tensor`.
+      training: An optional `bool` forwarded to the normalization and
+        stochastic depth layers.
+
+    Returns:
+      The output `tf.Tensor` of the block.
+    """
+    shortcut = inputs
+
+    # Forward `training` explicitly so the normalization layers also honour
+    # it when `call` is invoked directly.
+    x = self._conv0(inputs)
+    x = self._norm0(x, training=training)
+    x = self._activation_layer0(x)
+
+    x = self._conv1(x)
+    x = self._norm1(x, training=training)
+    x = self._activation_layer1(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x, training=training)
+
+    # The residual is only valid when input and output shapes match, i.e. the
+    # channel count is unchanged and the block does not downsample.
+    if (self._use_residual and
+        self._in_filters == self._out_filters and
+        self._strides == 1):
+      if self._stochastic_depth:
+        x = self._stochastic_depth(x, training=training)
+      x = self._add([x, shortcut])
+
+    return x
--- a/official/vision/modeling/layers/nn_blocks_3d.py
+++ b/official/vision/modeling/layers/nn_blocks_3d.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common building blocks for 3D networks."""
+# Import libraries
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.modeling.layers import nn_layers
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SelfGating(tf.keras.layers.Layer):
+  """Feature gating as used in S3D-G.
+
+  This implements the S3D-G network from:
+  Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy.
+  Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video
+  Classification.
+  (https://arxiv.org/pdf/1712.04851.pdf)
+  """
+
+  def __init__(self, filters, **kwargs):
+    """Initializes a self-gating layer.
+
+    Args:
+      filters: An `int` number of filters for the convolutional layer.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(SelfGating, self).__init__(**kwargs)
+    self._filters = filters
+
+  def get_config(self):
+    """Returns the config needed to re-create this layer via `from_config`."""
+    config = {'filters': self._filters}
+    base_config = super(SelfGating, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the global pooling layer and the 1x1x1 gating convolution."""
+    self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D()
+
+    # No BN and activation after conv.
+    self._transformer_w = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[1, 1, 1],
+        use_bias=True,
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(
+            mean=0.0, stddev=0.01))
+
+    super(SelfGating, self).build(input_shape)
+
+  def call(self, inputs):
+    """Scales `inputs` by a sigmoid gate computed from its global average.
+
+    Args:
+      inputs: The input `tf.Tensor`.
+
+    Returns:
+      A `tf.Tensor` equal to `inputs` multiplied by the broadcast gate.
+    """
+    x = self._spatial_temporal_average(inputs)
+
+    # Re-insert three singleton axes after the batch axis so that the 1x1x1
+    # convolution can be applied and the gate broadcasts against `inputs`.
+    # NOTE(review): this assumes a channels_last layout -- confirm.
+    x = tf.expand_dims(x, 1)
+    x = tf.expand_dims(x, 2)
+    x = tf.expand_dims(x, 3)
+
+    x = self._transformer_w(x)
+    x = tf.nn.sigmoid(x)
+
+    return tf.math.multiply(x, inputs)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock3D(tf.keras.layers.Layer):
+  """Creates a 3D bottleneck block.
+
+  The main branch is a temporal convolution, a spatial 3x3 convolution and a
+  1x1x1 expand convolution, each followed by batch normalization. Optional
+  self-gating, squeeze-and-excitation and stochastic depth are applied before
+  the shortcut is added and the final activation is applied.
+  """
+
+  def __init__(self,
+               filters,
+               temporal_kernel_size,
+               temporal_strides,
+               spatial_strides,
+               stochastic_depth_drop_rate=0.0,
+               se_ratio=None,
+               use_self_gating=False,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """Initializes a 3D bottleneck block with BN after convolutions.
+
+    Args:
+      filters: An `int` number of filters for the temporal and spatial
+        convolutions. Note that the final expand convolution will use 4 times
+        as many filters.
+      temporal_kernel_size: An `int` of kernel size for the temporal
+        convolutional layer.
+      temporal_strides: An `int` of temporal stride for the temporal
+        convolutional layer.
+      spatial_strides: An `int` of spatial stride for the spatial convolutional
+        layer.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
+        the stochastic depth layer.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      use_self_gating: A `bool` of whether to apply self-gating module or not.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv3D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv3D.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
+    super(BottleneckBlock3D, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._temporal_kernel_size = temporal_kernel_size
+    self._spatial_strides = spatial_strides
+    self._temporal_strides = temporal_strides
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._use_self_gating = use_self_gating
+    self._se_ratio = se_ratio
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the shortcut layers, the three convolutions and optional ops."""
+    # Shortcut used when only the resolution changes: a 1x1x1 max pool that
+    # subsamples with the block strides.
+    self._shortcut_maxpool = tf.keras.layers.MaxPool3D(
+        pool_size=[1, 1, 1],
+        strides=[
+            self._temporal_strides, self._spatial_strides, self._spatial_strides
+        ])
+
+    # Shortcut used when the channel count changes: strided 1x1x1 projection.
+    self._shortcut_conv = tf.keras.layers.Conv3D(
+        filters=4 * self._filters,
+        kernel_size=1,
+        strides=[
+            self._temporal_strides, self._spatial_strides, self._spatial_strides
+        ],
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._temporal_conv = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[self._temporal_kernel_size, 1, 1],
+        strides=[self._temporal_strides, 1, 1],
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._spatial_conv = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[1, 3, 3],
+        strides=[1, self._spatial_strides, self._spatial_strides],
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._expand_conv = tf.keras.layers.Conv3D(
+        filters=4 * self._filters,
+        kernel_size=[1, 1, 1],
+        strides=[1, 1, 1],
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm3 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Squeeze-and-excitation is only enabled for a ratio in (0, 1].
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._filters * 4,
+          out_filters=self._filters * 4,
+          se_ratio=self._se_ratio,
+          use_3d_input=True,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+
+    if self._use_self_gating:
+      self._self_gating = SelfGating(filters=4 * self._filters)
+    else:
+      self._self_gating = None
+
+    super(BottleneckBlock3D, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the config needed to re-create this block via `from_config`."""
+    config = {
+        'filters': self._filters,
+        'temporal_kernel_size': self._temporal_kernel_size,
+        'temporal_strides': self._temporal_strides,
+        'spatial_strides': self._spatial_strides,
+        'use_self_gating': self._use_self_gating,
+        'se_ratio': self._se_ratio,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(BottleneckBlock3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Runs the bottleneck branch and adds it to the shortcut.
+
+    Args:
+      inputs: The input `tf.Tensor`.
+      training: An optional `bool` forwarded to the normalization and
+        stochastic depth layers.
+
+    Returns:
+      The output `tf.Tensor` of the block.
+    """
+    # Read the channel count from the same axis the normalization layers use,
+    # so the shortcut selection also works for a channels_first data format.
+    in_filters = inputs.shape.as_list()[self._bn_axis]
+    if in_filters == 4 * self._filters:
+      if self._temporal_strides == 1 and self._spatial_strides == 1:
+        shortcut = inputs
+      else:
+        shortcut = self._shortcut_maxpool(inputs)
+    else:
+      shortcut = self._shortcut_conv(inputs)
+      shortcut = self._norm0(shortcut, training=training)
+
+    # Forward `training` explicitly so the normalization layers also honour
+    # it when `call` is invoked directly.
+    x = self._temporal_conv(inputs)
+    x = self._norm1(x, training=training)
+    x = self._activation_fn(x)
+
+    x = self._spatial_conv(x)
+    x = self._norm2(x, training=training)
+    x = self._activation_fn(x)
+
+    x = self._expand_conv(x)
+    x = self._norm3(x, training=training)
+
+    # Apply self-gating, SE, stochastic depth.
+    if self._self_gating:
+      x = self._self_gating(x)
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    # Add the shortcut, then apply the final activation.
+    x = self._activation_fn(x + shortcut)
+
+    return x
--- a/official/vision/modeling/layers/nn_blocks_3d_test.py
+++ b/official/vision/modeling/layers/nn_blocks_3d_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for nn_blocks_3d."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.modeling.layers import nn_blocks_3d
+
+
+class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape tests for the layers in `nn_blocks_3d`."""
+
+  @parameterized.parameters(
+      (nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True, 0.2, 0.1),
+      (nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False, 0.0, 0.0),
+  )
+  def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size,
+                                     temporal_strides, spatial_strides,
+                                     use_self_gating, se_ratio,
+                                     stochastic_depth):
+    """Checks that the block downsamples by its strides and keeps channels."""
+    num_frames, frame_size, base_filters = 16, 128, 256
+    num_channels = base_filters * 4
+
+    block_inputs = tf.keras.Input(
+        shape=(num_frames, frame_size, frame_size, num_channels),
+        batch_size=1)
+    block_kwargs = dict(
+        filters=base_filters,
+        temporal_kernel_size=temporal_kernel_size,
+        temporal_strides=temporal_strides,
+        spatial_strides=spatial_strides,
+        use_self_gating=use_self_gating,
+        se_ratio=se_ratio,
+        stochastic_depth_drop_rate=stochastic_depth)
+    outputs = block_fn(**block_kwargs)(block_inputs)
+
+    # Time is reduced by the temporal stride, height/width by the spatial one.
+    out_frames = num_frames // temporal_strides
+    out_size = frame_size // spatial_strides
+    self.assertAllEqual([1, out_frames, out_size, out_size, num_channels],
+                        outputs.shape.as_list())
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()