ModelZoo / ResNet50_tensorflow

Commit 8a9a607c, authored Jan 31, 2022 by Yeqing Li; committed by A. Unique TensorFlower on Jan 31, 2022.

Internal change

PiperOrigin-RevId: 425419954
Parent: b7bb52f0

Showing 14 changed files with 2578 additions and 1 deletion (+2578, -1).
official/vision/modeling/layers/roi_aligner_test.py            +42   -0
official/vision/modeling/layers/roi_generator.py               +313  -0
official/vision/modeling/layers/roi_generator_test.py          +213  -0
official/vision/modeling/layers/roi_sampler.py                 +175  -0
official/vision/modeling/layers/roi_sampler_test.py            +77   -0
official/vision/modeling/maskrcnn_model.py                     +429  -0
official/vision/modeling/maskrcnn_model_test.py                +398  -0
official/vision/modeling/retinanet_model.py                    +216  -0
official/vision/modeling/retinanet_model_test.py               +314  -0
official/vision/modeling/segmentation_model.py                 +94   -0
official/vision/modeling/segmentation_model_test.py            +86   -0
official/vision/modeling/video_classification_model.py         +128  -0
official/vision/modeling/video_classification_model_test.py    +92   -0
official/vision/utils/object_detection/visualization_utils.py  +1    -1

official/vision/modeling/layers/roi_aligner_test.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_aligner.py."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
roi_aligner
class
MultilevelROIAlignerTest
(
tf
.
test
.
TestCase
):
def
test_serialize_deserialize
(
self
):
kwargs
=
dict
(
crop_size
=
7
,
sample_offset
=
0.5
,
)
aligner
=
roi_aligner
.
MultilevelROIAligner
(
**
kwargs
)
expected_config
=
dict
(
kwargs
)
self
.
assertEqual
(
aligner
.
get_config
(),
expected_config
)
new_aligner
=
roi_aligner
.
MultilevelROIAligner
.
from_config
(
aligner
.
get_config
())
self
.
assertAllEqual
(
aligner
.
get_config
(),
new_aligner
.
get_config
())
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
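
The layer under test, roi_aligner.py itself, is not part of this commit. As orientation, here is a minimal usage sketch (not part of the commit): it assumes the call signature `aligner(features, rois)` used by maskrcnn_model.py later in this diff, and the feature/box shapes are illustrative assumptions.

# Minimal usage sketch (not part of the commit). Assumes the call signature
# `aligner(features, rois)` seen in maskrcnn_model.py below; shapes are
# illustrative.
import tensorflow as tf

from official.vision.modeling.layers import roi_aligner

aligner = roi_aligner.MultilevelROIAligner(crop_size=7, sample_offset=0.5)

# One feature map per FPN level, plus boxes in (ymin, xmin, ymax, xmax)
# pixel coordinates of the scaled image.
features = {
    '2': tf.random.normal([2, 64, 64, 256]),
    '3': tf.random.normal([2, 32, 32, 256]),
}
rois = tf.constant([[[0., 0., 32., 32.], [8., 8., 24., 24.]],
                    [[4., 4., 20., 20.], [0., 0., 16., 16.]]])

# Expected to return per-ROI feature crops, roughly
# [batch, num_rois, crop_size, crop_size, channels].
roi_features = aligner(features, rois)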

official/vision/modeling/layers/roi_generator.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI generator."""
from
typing
import
Optional
,
Mapping
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
box_ops
from
official.vision.ops
import
nms
def
_multilevel_propose_rois
(
raw_boxes
:
Mapping
[
str
,
tf
.
Tensor
],
raw_scores
:
Mapping
[
str
,
tf
.
Tensor
],
anchor_boxes
:
Mapping
[
str
,
tf
.
Tensor
],
image_shape
:
tf
.
Tensor
,
pre_nms_top_k
:
int
=
2000
,
pre_nms_score_threshold
:
float
=
0.0
,
pre_nms_min_size_threshold
:
float
=
0.0
,
nms_iou_threshold
:
float
=
0.7
,
num_proposals
:
int
=
1000
,
use_batched_nms
:
bool
=
False
,
decode_boxes
:
bool
=
True
,
clip_boxes
:
bool
=
True
,
apply_sigmoid_to_score
:
bool
=
True
):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter small boxes and those fall outside image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tenors of shape
[batch_size, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch_size, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
before applying NMS. Default: 2000.
pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
box score to keep before applying NMS. This is often used as a
pre-filtering step for better performance. Default: 0, no filtering is
applied.
pre_nms_min_size_threshold: A `float` representing the minimal box size in
each side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. Default: 0, no
filtering is applied.
nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
num_proposals: An `int` of top scoring RPN proposals *in total* to keep
after applying NMS. Default: 1000.
use_batched_nms: A `bool` indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available in
CPU/GPU. Default is False.
decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
`anchor_boxes`. Default is True.
clip_boxes: A `bool` indicating whether boxes are first clipped to the
scaled image size before appliying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default is True.
apply_sigmoid_to_score: A `bool` indicating whether apply sigmoid to
`raw_scores` before applying NMS. Default is True.
Returns:
selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1],
representing the scores of the selected proposals.
"""
  with tf.name_scope('multilevel_propose_rois'):
    rois = []
    roi_scores = []
    image_shape = tf.expand_dims(image_shape, axis=1)
    for level in sorted(raw_scores.keys()):
      with tf.name_scope('level_%s' % level):
        _, feature_h, feature_w, num_anchors_per_location = (
            raw_scores[level].get_shape().as_list())

        num_boxes = feature_h * feature_w * num_anchors_per_location
        this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
        this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
        this_level_anchors = tf.cast(
            tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
            dtype=this_level_scores.dtype)

        if apply_sigmoid_to_score:
          this_level_scores = tf.sigmoid(this_level_scores)

        if decode_boxes:
          this_level_boxes = box_ops.decode_boxes(this_level_boxes,
                                                  this_level_anchors)
        if clip_boxes:
          this_level_boxes = box_ops.clip_boxes(this_level_boxes, image_shape)

        if pre_nms_min_size_threshold > 0.0:
          this_level_boxes, this_level_scores = box_ops.filter_boxes(
              this_level_boxes, this_level_scores, image_shape,
              pre_nms_min_size_threshold)

        this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
        this_level_post_nms_top_k = min(num_boxes, num_proposals)
        if nms_iou_threshold > 0.0:
          if use_batched_nms:
            this_level_rois, this_level_roi_scores, _, _ = (
                tf.image.combined_non_max_suppression(
                    tf.expand_dims(this_level_boxes, axis=2),
                    tf.expand_dims(this_level_scores, axis=-1),
                    max_output_size_per_class=this_level_pre_nms_top_k,
                    max_total_size=this_level_post_nms_top_k,
                    iou_threshold=nms_iou_threshold,
                    score_threshold=pre_nms_score_threshold,
                    pad_per_class=False,
                    clip_boxes=False))
          else:
            if pre_nms_score_threshold > 0.0:
              this_level_boxes, this_level_scores = (
                  box_ops.filter_boxes_by_scores(this_level_boxes,
                                                 this_level_scores,
                                                 pre_nms_score_threshold))
            this_level_boxes, this_level_scores = box_ops.top_k_boxes(
                this_level_boxes,
                this_level_scores,
                k=this_level_pre_nms_top_k)
            this_level_roi_scores, this_level_rois = (
                nms.sorted_non_max_suppression_padded(
                    this_level_scores,
                    this_level_boxes,
                    max_output_size=this_level_post_nms_top_k,
                    iou_threshold=nms_iou_threshold))
        else:
          this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
              this_level_boxes,
              this_level_scores,
              k=this_level_post_nms_top_k)

        rois.append(this_level_rois)
        roi_scores.append(this_level_roi_scores)

    all_rois = tf.concat(rois, axis=1)
    all_roi_scores = tf.concat(roi_scores, axis=1)

    with tf.name_scope('top_k_rois'):
      _, num_valid_rois = all_roi_scores.get_shape().as_list()
      overall_top_k = min(num_valid_rois, num_proposals)

      selected_rois, selected_roi_scores = box_ops.top_k_boxes(
          all_rois, all_roi_scores, k=overall_top_k)

  return selected_rois, selected_roi_scores


@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIGenerator(tf.keras.layers.Layer):
  """Proposes RoIs for the second stage processing."""

  def __init__(self,
               pre_nms_top_k: int = 2000,
               pre_nms_score_threshold: float = 0.0,
               pre_nms_min_size_threshold: float = 0.0,
               nms_iou_threshold: float = 0.7,
               num_proposals: int = 1000,
               test_pre_nms_top_k: int = 1000,
               test_pre_nms_score_threshold: float = 0.0,
               test_pre_nms_min_size_threshold: float = 0.0,
               test_nms_iou_threshold: float = 0.7,
               test_num_proposals: int = 1000,
               use_batched_nms: bool = False,
               **kwargs):
"""Initializes a ROI generator.
The ROI generator transforms the raw predictions from RPN to ROIs.
Args:
pre_nms_top_k: An `int` of the number of top scores proposals to be kept
before applying NMS.
pre_nms_score_threshold: A `float` of the score threshold to apply before
applying NMS. Proposals whose scores are below this threshold are
thrown away.
pre_nms_min_size_threshold: A `float` of the threshold of each side of the
box (w.r.t. the scaled image). Proposals whose sides are below this
threshold are thrown away.
nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
num_proposals: An `int` of the final number of proposals to generate.
test_pre_nms_top_k: An `int` of the number of top scores proposals to be
kept before applying NMS in testing.
test_pre_nms_score_threshold: A `float` of the score threshold to apply
before applying NMS in testing. Proposals whose scores are below this
threshold are thrown away.
test_pre_nms_min_size_threshold: A `float` of the threshold of each side
of the box (w.r.t. the scaled image) in testing. Proposals whose sides
are below this threshold are thrown away.
test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
testing.
test_num_proposals: An `int` of the final number of proposals to generate
in testing.
use_batched_nms: A `bool` of whether or not use
`tf.image.combined_non_max_suppression`.
**kwargs: Additional keyword arguments passed to Layer.
"""
    self._config_dict = {
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
        'nms_iou_threshold': nms_iou_threshold,
        'num_proposals': num_proposals,
        'test_pre_nms_top_k': test_pre_nms_top_k,
        'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
        'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
        'test_nms_iou_threshold': test_nms_iou_threshold,
        'test_num_proposals': test_num_proposals,
        'use_batched_nms': use_batched_nms,
    }
    super(MultilevelROIGenerator, self).__init__(**kwargs)

  def call(self,
           raw_boxes: Mapping[str, tf.Tensor],
           raw_scores: Mapping[str, tf.Tensor],
           anchor_boxes: Mapping[str, tf.Tensor],
           image_shape: tf.Tensor,
           training: Optional[bool] = None):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter small boxes and those fall outside image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tenors of shape
[batch, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
are [height, width] of the scaled image.
training: A `bool` that indicates whether it is in training mode.
Returns:
roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
ROIs in the scaled image coordinate.
roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
proposed ROIs.
"""
    roi_boxes, roi_scores = _multilevel_propose_rois(
        raw_boxes,
        raw_scores,
        anchor_boxes,
        image_shape,
        pre_nms_top_k=(
            self._config_dict['pre_nms_top_k']
            if training else self._config_dict['test_pre_nms_top_k']),
        pre_nms_score_threshold=(
            self._config_dict['pre_nms_score_threshold'] if training else
            self._config_dict['test_pre_nms_score_threshold']),
        pre_nms_min_size_threshold=(
            self._config_dict['pre_nms_min_size_threshold'] if training else
            self._config_dict['test_pre_nms_min_size_threshold']),
        nms_iou_threshold=(
            self._config_dict['nms_iou_threshold']
            if training else self._config_dict['test_nms_iou_threshold']),
        num_proposals=(
            self._config_dict['num_proposals']
            if training else self._config_dict['test_num_proposals']),
        use_batched_nms=self._config_dict['use_batched_nms'],
        decode_boxes=True,
        clip_boxes=True,
        apply_sigmoid_to_score=True)
    return roi_boxes, roi_scores

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
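
A minimal usage sketch (not part of the commit): driving the layer above with single-level dummy RPN outputs. Shapes follow the call() docstring; the tensor values are illustrative assumptions, and training=False selects the test_* thresholds.

# Minimal usage sketch (not part of the commit); values are illustrative.
import tensorflow as tf

from official.vision.modeling.layers import roi_generator

generator = roi_generator.MultilevelROIGenerator(
    test_pre_nms_top_k=4, test_nms_iou_threshold=0.5, test_num_proposals=4)

# [batch, feature_h, feature_w, num_anchors * 4] boxes, matching logits, and
# [batch, feature_h * feature_w * num_anchors, 4] anchors for one FPN level.
raw_boxes = {'2': tf.random.normal([2, 2, 2, 4])}
raw_scores = {'2': tf.random.normal([2, 2, 2, 1])}
anchor_boxes = {'2': tf.random.uniform([2, 4, 4], maxval=20.)}
image_shape = tf.constant([[20., 20.], [20., 20.]])

# training=False picks the test_* config values inside call().
roi_boxes, roi_scores = generator(
    raw_boxes, raw_scores, anchor_boxes, image_shape, training=False)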

official/vision/modeling/layers/roi_generator_test.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_generator.py."""
# Import libraries
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
roi_generator
class
MultilevelProposeRoisTest
(
tf
.
test
.
TestCase
):
def
test_multilevel_propose_rois_single_level
(
self
):
rpn_boxes_np
=
np
.
array
(
[[[[
0
,
0
,
10
,
10
],
[
0.01
,
0.01
,
9.9
,
9.9
]],
[[
5
,
5
,
10
,
10
],
[
2
,
2
,
8
,
8
]]],
[[[
2
,
2
,
4
,
4
],
[
3
,
3
,
6
,
6
]],
[[
3.1
,
3.1
,
6.1
,
6.1
],
[
1
,
1
,
8
,
8
]]]])
rpn_boxes
=
{
'2'
:
tf
.
constant
(
rpn_boxes_np
,
dtype
=
tf
.
float32
)
}
rpn_scores_np
=
np
.
array
(
[[[[
0.6
],
[
0.9
]],
[[
0.2
],
[
0.3
]]],
[[[
0.1
],
[
0.8
]],
[[
0.3
],
[
0.5
]]]])
rpn_scores
=
{
'2'
:
tf
.
constant
(
rpn_scores_np
,
dtype
=
tf
.
float32
)
}
anchor_boxes_np
=
np
.
array
(
[[[[
0
,
0
,
10
,
10
],
[
0.01
,
0.01
,
9.9
,
9.9
]],
[[
5
,
5
,
10
,
10
],
[
2
,
2
,
8
,
8
]]],
[[[
2
,
2
,
4
,
4
],
[
3
,
3
,
6
,
6
]],
[[
3.1
,
3.1
,
6.1
,
6.1
],
[
1
,
1
,
8
,
8
]]]])
anchor_boxes
=
{
'2'
:
tf
.
constant
(
anchor_boxes_np
,
dtype
=
tf
.
float32
)
}
image_shape
=
tf
.
constant
([[
20
,
20
],
[
20
,
20
]],
dtype
=
tf
.
int32
)
selected_rois_np
=
np
.
array
(
[[[
0.01
,
0.01
,
9.9
,
9.9
],
[
2
,
2
,
8
,
8
],
[
5
,
5
,
10
,
10
],
[
0
,
0
,
0
,
0
]],
[[
3
,
3
,
6
,
6
],
[
1
,
1
,
8
,
8
],
[
2
,
2
,
4
,
4
],
[
0
,
0
,
0
,
0
]]])
selected_roi_scores_np
=
np
.
array
(
[[
0.9
,
0.3
,
0.2
,
0
],
[
0.8
,
0.5
,
0.1
,
0
]])
# Runs on TPU.
strategy
=
tf
.
distribute
.
TPUStrategy
()
with
strategy
.
scope
():
selected_rois_tpu
,
selected_roi_scores_tpu
=
(
roi_generator
.
_multilevel_propose_rois
(
rpn_boxes
,
rpn_scores
,
anchor_boxes
=
anchor_boxes
,
image_shape
=
image_shape
,
pre_nms_top_k
=
4
,
pre_nms_score_threshold
=
0.0
,
pre_nms_min_size_threshold
=
0.0
,
nms_iou_threshold
=
0.5
,
num_proposals
=
4
,
use_batched_nms
=
False
,
decode_boxes
=
False
,
clip_boxes
=
False
,
apply_sigmoid_to_score
=
False
))
# Runs on CPU.
selected_rois_cpu
,
selected_roi_scores_cpu
=
(
roi_generator
.
_multilevel_propose_rois
(
rpn_boxes
,
rpn_scores
,
anchor_boxes
=
anchor_boxes
,
image_shape
=
image_shape
,
pre_nms_top_k
=
4
,
pre_nms_score_threshold
=
0.0
,
pre_nms_min_size_threshold
=
0.0
,
nms_iou_threshold
=
0.5
,
num_proposals
=
4
,
use_batched_nms
=
False
,
decode_boxes
=
False
,
clip_boxes
=
False
,
apply_sigmoid_to_score
=
False
))
self
.
assertNDArrayNear
(
selected_rois_tpu
.
numpy
(),
selected_rois_cpu
.
numpy
(),
1e-5
)
self
.
assertNDArrayNear
(
selected_roi_scores_tpu
.
numpy
(),
selected_roi_scores_cpu
.
numpy
(),
1e-5
)
self
.
assertNDArrayNear
(
selected_rois_tpu
.
numpy
(),
selected_rois_np
,
1e-5
)
self
.
assertNDArrayNear
(
selected_roi_scores_tpu
.
numpy
(),
selected_roi_scores_np
,
1e-5
)
def
test_multilevel_propose_rois_two_levels
(
self
):
rpn_boxes_1_np
=
np
.
array
(
[[[[
0
,
0
,
10
,
10
],
[
0.01
,
0.01
,
9.99
,
9.99
]],
[[
5
,
5
,
10
,
10
],
[
2
,
2
,
8
,
8
]]],
[[[
2
,
2
,
2.5
,
2.5
],
[
3
,
3
,
6
,
6
]],
[[
3.1
,
3.1
,
6.1
,
6.1
],
[
1
,
1
,
8
,
8
]]]])
rpn_boxes_2_np
=
np
.
array
(
[[[[
0
,
0
,
10.01
,
10.01
]]],
[[[
2
,
2
,
4.5
,
4.5
]]]])
rpn_boxes
=
{
'2'
:
tf
.
constant
(
rpn_boxes_1_np
,
dtype
=
tf
.
float32
),
'3'
:
tf
.
constant
(
rpn_boxes_2_np
,
dtype
=
tf
.
float32
),
}
rpn_scores_1_np
=
np
.
array
(
[[[[
0.6
],
[
0.9
]],
[[
0.2
],
[
0.3
]]],
[[[
0.1
],
[
0.8
]],
[[
0.3
],
[
0.5
]]]])
rpn_scores_2_np
=
np
.
array
([[[[
0.95
]]],
[[[
0.99
]]]])
rpn_scores
=
{
'2'
:
tf
.
constant
(
rpn_scores_1_np
,
dtype
=
tf
.
float32
),
'3'
:
tf
.
constant
(
rpn_scores_2_np
,
dtype
=
tf
.
float32
),
}
anchor_boxes_1_np
=
np
.
array
(
[[[[
0
,
0
,
10
,
10
],
[
0.01
,
0.01
,
9.99
,
9.99
]],
[[
5
,
5
,
10
,
10
],
[
2
,
2
,
8
,
8
]]],
[[[
2
,
2
,
2.5
,
2.5
],
[
3
,
3
,
6
,
6
]],
[[
3.1
,
3.1
,
6.1
,
6.1
],
[
1
,
1
,
8
,
8
]]]])
anchor_boxes_2_np
=
np
.
array
(
[[[[
0
,
0
,
10.01
,
10.01
]]],
[[[
2
,
2
,
4.5
,
4.5
]]]])
anchor_boxes
=
{
'2'
:
tf
.
constant
(
anchor_boxes_1_np
,
dtype
=
tf
.
float32
),
'3'
:
tf
.
constant
(
anchor_boxes_2_np
,
dtype
=
tf
.
float32
),
}
image_shape
=
tf
.
constant
([[
20
,
20
],
[
20
,
20
]],
dtype
=
tf
.
int32
)
selected_rois_np
=
np
.
array
(
[[[
0
,
0
,
10.01
,
10.01
],
[
0.01
,
0.01
,
9.99
,
9.99
]],
[[
2
,
2
,
4.5
,
4.5
],
[
3
,
3
,
6
,
6
]]])
selected_roi_scores_np
=
np
.
array
([[
0.95
,
0.9
],
[
0.99
,
0.8
]])
# Runs on TPU.
strategy
=
tf
.
distribute
.
TPUStrategy
()
with
strategy
.
scope
():
selected_rois_tpu
,
selected_roi_scores_tpu
=
(
roi_generator
.
_multilevel_propose_rois
(
rpn_boxes
,
rpn_scores
,
anchor_boxes
=
anchor_boxes
,
image_shape
=
image_shape
,
pre_nms_top_k
=
4
,
pre_nms_score_threshold
=
0.0
,
pre_nms_min_size_threshold
=
0.0
,
nms_iou_threshold
=
0.5
,
num_proposals
=
2
,
use_batched_nms
=
False
,
decode_boxes
=
False
,
clip_boxes
=
False
,
apply_sigmoid_to_score
=
False
))
# Runs on CPU.
selected_rois_cpu
,
selected_roi_scores_cpu
=
(
roi_generator
.
_multilevel_propose_rois
(
rpn_boxes
,
rpn_scores
,
anchor_boxes
=
anchor_boxes
,
image_shape
=
image_shape
,
pre_nms_top_k
=
4
,
pre_nms_score_threshold
=
0.0
,
pre_nms_min_size_threshold
=
0.0
,
nms_iou_threshold
=
0.5
,
num_proposals
=
2
,
use_batched_nms
=
False
,
decode_boxes
=
False
,
clip_boxes
=
False
,
apply_sigmoid_to_score
=
False
))
self
.
assertNDArrayNear
(
selected_rois_tpu
.
numpy
(),
selected_rois_cpu
.
numpy
(),
1e-5
)
self
.
assertNDArrayNear
(
selected_roi_scores_tpu
.
numpy
(),
selected_roi_scores_cpu
.
numpy
(),
1e-5
)
self
.
assertNDArrayNear
(
selected_rois_tpu
.
numpy
(),
selected_rois_np
,
1e-5
)
self
.
assertNDArrayNear
(
selected_roi_scores_tpu
.
numpy
(),
selected_roi_scores_np
,
1e-5
)
class
MultilevelROIGeneratorTest
(
tf
.
test
.
TestCase
):
def
test_serialize_deserialize
(
self
):
kwargs
=
dict
(
pre_nms_top_k
=
2000
,
pre_nms_score_threshold
=
0.0
,
pre_nms_min_size_threshold
=
0.0
,
nms_iou_threshold
=
0.7
,
num_proposals
=
1000
,
test_pre_nms_top_k
=
1000
,
test_pre_nms_score_threshold
=
0.0
,
test_pre_nms_min_size_threshold
=
0.0
,
test_nms_iou_threshold
=
0.7
,
test_num_proposals
=
1000
,
use_batched_nms
=
False
,
)
generator
=
roi_generator
.
MultilevelROIGenerator
(
**
kwargs
)
expected_config
=
dict
(
kwargs
)
self
.
assertEqual
(
generator
.
get_config
(),
expected_config
)
new_generator
=
roi_generator
.
MultilevelROIGenerator
.
from_config
(
generator
.
get_config
())
self
.
assertAllEqual
(
generator
.
get_config
(),
new_generator
.
get_config
())
if
__name__
==
'__main__'
:
tf
.
test
.
main
()

official/vision/modeling/layers/roi_sampler.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI sampler."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
box_sampler
from
official.vision.ops
import
box_matcher
from
official.vision.ops
import
iou_similarity
from
official.vision.ops
import
target_gather
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
'Vision'
)
class
ROISampler
(
tf
.
keras
.
layers
.
Layer
):
"""Samples ROIs and assigns targets to the sampled ROIs."""
def
__init__
(
self
,
mix_gt_boxes
:
bool
=
True
,
num_sampled_rois
:
int
=
512
,
foreground_fraction
:
float
=
0.25
,
foreground_iou_threshold
:
float
=
0.5
,
background_iou_high_threshold
:
float
=
0.5
,
background_iou_low_threshold
:
float
=
0
,
skip_subsampling
:
bool
=
False
,
**
kwargs
):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
proposed ROIs.
num_sampled_rois: An `int` of the number of sampled ROIs per image.
foreground_fraction: A `float` in [0, 1], what percentage of proposed ROIs
should be sampled from the foreground boxes.
foreground_iou_threshold: A `float` that represents the IoU threshold for
a box to be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
skip_subsampling: a bool that determines if we want to skip the sampling
procedure than balances the fg/bg classes. Used for upper frcnn layers
in cascade RCNN.
**kwargs: Additional keyword arguments passed to Layer.
"""
    self._config_dict = {
        'mix_gt_boxes': mix_gt_boxes,
        'num_sampled_rois': num_sampled_rois,
        'foreground_fraction': foreground_fraction,
        'foreground_iou_threshold': foreground_iou_threshold,
        'background_iou_high_threshold': background_iou_high_threshold,
        'background_iou_low_threshold': background_iou_low_threshold,
        'skip_subsampling': skip_subsampling,
    }

    self._sim_calc = iou_similarity.IouSimilarity()
    self._box_matcher = box_matcher.BoxMatcher(
        thresholds=[
            background_iou_low_threshold, background_iou_high_threshold,
            foreground_iou_threshold
        ],
        indicators=[-3, -1, -2, 1])
    self._target_gather = target_gather.TargetGather()
    self._sampler = box_sampler.BoxSampler(
        num_sampled_rois, foreground_fraction)
    super(ROISampler, self).__init__(**kwargs)

  def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor,
           gt_classes: tf.Tensor):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
boxes: A `tf.Tensor` of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
gt_boxes: A `tf.Tensor` of shape of [batch_size, MAX_NUM_INSTANCES, 4].
The coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
This tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: A `tf.Tensor` of shape of [batch_size, K, 4], representing
the coordinates of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: A `tf.Tensor` of shape of [batch_size, K, 4], storing
the box coordinates of the matched groundtruth boxes of the samples
RoIs.
sampled_gt_classes: A `tf.Tensor` of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: A `tf.Tensor` of shape of [batch_size, K], storing the
indices of the sampled groudntruth boxes in the original `gt_boxes`
tensor, i.e.,
gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
    gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
    if self._config_dict['mix_gt_boxes']:
      boxes = tf.concat([boxes, gt_boxes], axis=1)

    boxes_invalid_mask = tf.less(
        tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
    gt_invalid_mask = tf.less(
        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
    similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask,
                                       gt_invalid_mask)

    matched_gt_indices, match_indicators = self._box_matcher(
        similarity_matrix)

    positive_matches = tf.greater_equal(match_indicators, 0)
    negative_matches = tf.equal(match_indicators, -1)
    ignored_matches = tf.equal(match_indicators, -2)
    invalid_matches = tf.equal(match_indicators, -3)

    background_mask = tf.expand_dims(
        tf.logical_or(negative_matches, invalid_matches), -1)
    gt_classes = tf.expand_dims(gt_classes, axis=-1)
    matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices,
                                             background_mask)
    matched_gt_classes = tf.where(background_mask,
                                  tf.zeros_like(matched_gt_classes),
                                  matched_gt_classes)
    matched_gt_boxes = self._target_gather(
        gt_boxes, matched_gt_indices, tf.tile(background_mask, [1, 1, 4]))
    matched_gt_boxes = tf.where(background_mask,
                                tf.zeros_like(matched_gt_boxes),
                                matched_gt_boxes)
    matched_gt_indices = tf.where(
        tf.squeeze(background_mask, -1),
        -tf.ones_like(matched_gt_indices),
        matched_gt_indices)

    if self._config_dict['skip_subsampling']:
      return (boxes, matched_gt_boxes,
              tf.squeeze(matched_gt_classes, axis=-1), matched_gt_indices)

    sampled_indices = self._sampler(
        positive_matches, negative_matches, ignored_matches)

    sampled_rois = self._target_gather(boxes, sampled_indices)
    sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices)
    sampled_gt_classes = tf.squeeze(
        self._target_gather(matched_gt_classes, sampled_indices), axis=-1)
    sampled_gt_indices = tf.squeeze(
        self._target_gather(
            tf.expand_dims(matched_gt_indices, -1), sampled_indices),
        axis=-1)
    return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
            sampled_gt_indices)

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)
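
A minimal usage sketch (not part of the commit), mirroring the unit test that follows. With thresholds [0.0, 0.5, 0.5], the BoxMatcher indicators configured above label each proposal as 1 (foreground, IoU >= 0.5), -1 (background, IoU in [0.0, 0.5)), -2 (ignored), or -3 (invalid, IoU below 0.0), which is exactly how call() builds its positive/negative/ignored/invalid masks.

# Minimal usage sketch (not part of the commit); inputs mirror the unit test.
import tensorflow as tf

from official.vision.modeling.layers import roi_sampler

sampler = roi_sampler.ROISampler(
    mix_gt_boxes=True,
    num_sampled_rois=2,
    foreground_fraction=0.5,
    foreground_iou_threshold=0.5,
    background_iou_high_threshold=0.5,
    background_iou_low_threshold=0.0)

boxes = tf.constant([[[0., 0., 5., 5.], [2.5, 2.5, 7.5, 7.5],
                      [5., 5., 10., 10.], [7.5, 7.5, 12.5, 12.5]]])
gt_boxes = tf.constant([[[10., 10., 15., 15.], [2.5, 2.5, 7.5, 7.5],
                         [-1., -1., -1., -1.]]])  # -1 rows are padding.
gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32)

sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
    sampler(boxes, gt_boxes, gt_classes))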

official/vision/modeling/layers/roi_sampler_test.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_sampler.py."""
# Import libraries
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
roi_sampler
class
ROISamplerTest
(
tf
.
test
.
TestCase
):
def
test_roi_sampler
(
self
):
boxes_np
=
np
.
array
(
[[[
0
,
0
,
5
,
5
],
[
2.5
,
2.5
,
7.5
,
7.5
],
[
5
,
5
,
10
,
10
],
[
7.5
,
7.5
,
12.5
,
12.5
]]])
boxes
=
tf
.
constant
(
boxes_np
,
dtype
=
tf
.
float32
)
gt_boxes_np
=
np
.
array
(
[[[
10
,
10
,
15
,
15
],
[
2.5
,
2.5
,
7.5
,
7.5
],
[
-
1
,
-
1
,
-
1
,
-
1
]]])
gt_boxes
=
tf
.
constant
(
gt_boxes_np
,
dtype
=
tf
.
float32
)
gt_classes_np
=
np
.
array
([[
2
,
10
,
-
1
]])
gt_classes
=
tf
.
constant
(
gt_classes_np
,
dtype
=
tf
.
int32
)
generator
=
roi_sampler
.
ROISampler
(
mix_gt_boxes
=
True
,
num_sampled_rois
=
2
,
foreground_fraction
=
0.5
,
foreground_iou_threshold
=
0.5
,
background_iou_high_threshold
=
0.5
,
background_iou_low_threshold
=
0.0
)
# Runs on TPU.
strategy
=
tf
.
distribute
.
TPUStrategy
()
with
strategy
.
scope
():
_
=
generator
(
boxes
,
gt_boxes
,
gt_classes
)
# Runs on CPU.
_
=
generator
(
boxes
,
gt_boxes
,
gt_classes
)
def
test_serialize_deserialize
(
self
):
kwargs
=
dict
(
mix_gt_boxes
=
True
,
num_sampled_rois
=
512
,
foreground_fraction
=
0.25
,
foreground_iou_threshold
=
0.5
,
background_iou_high_threshold
=
0.5
,
background_iou_low_threshold
=
0.5
,
skip_subsampling
=
False
,
)
generator
=
roi_sampler
.
ROISampler
(
**
kwargs
)
expected_config
=
dict
(
kwargs
)
self
.
assertEqual
(
generator
.
get_config
(),
expected_config
)
new_generator
=
roi_sampler
.
ROISampler
.
from_config
(
generator
.
get_config
())
self
.
assertAllEqual
(
generator
.
get_config
(),
new_generator
.
get_config
())
if
__name__
==
'__main__'
:
tf
.
test
.
main
()

official/vision/modeling/maskrcnn_model.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from
typing
import
Any
,
List
,
Mapping
,
Optional
,
Tuple
,
Union
import
tensorflow
as
tf
from
official.vision.ops
import
anchor
from
official.vision.ops
import
box_ops
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
'Vision'
)
class
MaskRCNNModel
(
tf
.
keras
.
Model
):
"""The Mask R-CNN(-RS) and Cascade RCNN-RS models."""
def
__init__
(
self
,
backbone
:
tf
.
keras
.
Model
,
decoder
:
tf
.
keras
.
Model
,
rpn_head
:
tf
.
keras
.
layers
.
Layer
,
detection_head
:
Union
[
tf
.
keras
.
layers
.
Layer
,
List
[
tf
.
keras
.
layers
.
Layer
]],
roi_generator
:
tf
.
keras
.
layers
.
Layer
,
roi_sampler
:
Union
[
tf
.
keras
.
layers
.
Layer
,
List
[
tf
.
keras
.
layers
.
Layer
]],
roi_aligner
:
tf
.
keras
.
layers
.
Layer
,
detection_generator
:
tf
.
keras
.
layers
.
Layer
,
mask_head
:
Optional
[
tf
.
keras
.
layers
.
Layer
]
=
None
,
mask_sampler
:
Optional
[
tf
.
keras
.
layers
.
Layer
]
=
None
,
mask_roi_aligner
:
Optional
[
tf
.
keras
.
layers
.
Layer
]
=
None
,
class_agnostic_bbox_pred
:
bool
=
False
,
cascade_class_ensemble
:
bool
=
False
,
min_level
:
Optional
[
int
]
=
None
,
max_level
:
Optional
[
int
]
=
None
,
num_scales
:
Optional
[
int
]
=
None
,
aspect_ratios
:
Optional
[
List
[
float
]]
=
None
,
anchor_size
:
Optional
[
float
]
=
None
,
**
kwargs
):
"""Initializes the R-CNN(-RS) model.
Args:
backbone: `tf.keras.Model`, the backbone network.
decoder: `tf.keras.Model`, the decoder network.
rpn_head: the RPN head.
detection_head: the detection head or a list of heads.
roi_generator: the ROI generator.
roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
detection heads.
roi_aligner: the ROI aligner.
detection_generator: the detection generator.
mask_head: the mask head.
mask_sampler: the mask sampler.
mask_roi_aligner: the ROI alginer for mask prediction.
class_agnostic_bbox_pred: if True, perform class agnostic bounding box
prediction. Needs to be `True` for Cascade RCNN models.
cascade_class_ensemble: if True, ensemble classification scores over all
detection heads.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added on each level.
For instances, num_scales=2 adds one additional intermediate anchor
scales [2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect raito anchors added on each
level. The number indicates the ratio of width to height. For instances,
aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
anchor_size: A number representing the scale of size of the base anchor to
the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
    super(MaskRCNNModel, self).__init__(**kwargs)
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'rpn_head': rpn_head,
        'detection_head': detection_head,
        'roi_generator': roi_generator,
        'roi_sampler': roi_sampler,
        'roi_aligner': roi_aligner,
        'detection_generator': detection_generator,
        'mask_head': mask_head,
        'mask_sampler': mask_sampler,
        'mask_roi_aligner': mask_roi_aligner,
        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
        'cascade_class_ensemble': cascade_class_ensemble,
        'min_level': min_level,
        'max_level': max_level,
        'num_scales': num_scales,
        'aspect_ratios': aspect_ratios,
        'anchor_size': anchor_size,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.rpn_head = rpn_head
    if not isinstance(detection_head, (list, tuple)):
      self.detection_head = [detection_head]
    else:
      self.detection_head = detection_head
    self.roi_generator = roi_generator
    if not isinstance(roi_sampler, (list, tuple)):
      self.roi_sampler = [roi_sampler]
    else:
      self.roi_sampler = roi_sampler
    if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
      raise ValueError('`class_agnostic_bbox_pred` needs to be True if'
                       ' multiple detection heads are specified.')
    self.roi_aligner = roi_aligner
    self.detection_generator = detection_generator
    self._include_mask = mask_head is not None
    self.mask_head = mask_head
    if self._include_mask and mask_sampler is None:
      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
    self.mask_sampler = mask_sampler
    if self._include_mask and mask_roi_aligner is None:
      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
    self.mask_roi_aligner = mask_roi_aligner
    # Weights for the regression losses for each FRCNN layer.
    # TODO(xianzhi): Make the weights configurable.
    self._cascade_layer_to_weights = [
        [10.0, 10.0, 5.0, 5.0],
        [20.0, 20.0, 10.0, 10.0],
        [30.0, 30.0, 15.0, 15.0],
    ]

  def call(self,
           images: tf.Tensor,
           image_shape: tf.Tensor,
           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
           gt_boxes: Optional[tf.Tensor] = None,
           gt_classes: Optional[tf.Tensor] = None,
           gt_masks: Optional[tf.Tensor] = None,
           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    model_outputs, intermediate_outputs = self._call_box_outputs(
        images=images,
        image_shape=image_shape,
        anchor_boxes=anchor_boxes,
        gt_boxes=gt_boxes,
        gt_classes=gt_classes,
        training=training)
    if not self._include_mask:
      return model_outputs

    model_mask_outputs = self._call_mask_outputs(
        model_box_outputs=model_outputs,
        features=model_outputs['decoder_features'],
        current_rois=intermediate_outputs['current_rois'],
        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
        matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
        gt_masks=gt_masks,
        training=training)
    model_outputs.update(model_mask_outputs)
    return model_outputs

  def _get_backbone_and_decoder_features(self, images):
    backbone_features = self.backbone(images)
    if self.decoder:
      features = self.decoder(backbone_features)
    else:
      features = backbone_features
    return backbone_features, features

  def _call_box_outputs(
      self,
      images: tf.Tensor,
      image_shape: tf.Tensor,
      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
      gt_boxes: Optional[tf.Tensor] = None,
      gt_classes: Optional[tf.Tensor] = None,
      training: Optional[bool] = None
  ) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
    """Implementation of the Faster-RCNN logic for boxes."""
    model_outputs = {}

    # Feature extraction.
    (backbone_features,
     decoder_features) = self._get_backbone_and_decoder_features(images)

    # Region proposal network.
    rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

    model_outputs.update({
        'backbone_features': backbone_features,
        'decoder_features': decoder_features,
        'rpn_boxes': rpn_boxes,
        'rpn_scores': rpn_scores
    })

    # Generate anchor boxes for this batch if not provided.
    if anchor_boxes is None:
      _, image_height, image_width, _ = images.get_shape().as_list()
      anchor_boxes = anchor.Anchor(
          min_level=self._config_dict['min_level'],
          max_level=self._config_dict['max_level'],
          num_scales=self._config_dict['num_scales'],
          aspect_ratios=self._config_dict['aspect_ratios'],
          anchor_size=self._config_dict['anchor_size'],
          image_size=(image_height, image_width)).multilevel_boxes
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0),
            [tf.shape(images)[0], 1, 1, 1])

    # Generate RoIs.
    current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                         image_shape, training)

    next_rois = current_rois
    all_class_outputs = []
    for cascade_num in range(len(self.roi_sampler)):
      # In cascade RCNN we want the higher layers to have different regression
      # weights as the predicted deltas become smaller and smaller.
      regression_weights = self._cascade_layer_to_weights[cascade_num]
      current_rois = next_rois
      (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
       matched_gt_classes, matched_gt_indices,
       current_rois) = self._run_frcnn_head(
           features=decoder_features,
           rois=current_rois,
           gt_boxes=gt_boxes,
           gt_classes=gt_classes,
           training=training,
           model_outputs=model_outputs,
           cascade_num=cascade_num,
           regression_weights=regression_weights)
      all_class_outputs.append(class_outputs)

      # Generate ROIs for the next cascade head if there is any.
      if cascade_num < len(self.roi_sampler) - 1:
        next_rois = box_ops.decode_boxes(
            tf.cast(box_outputs, tf.float32),
            current_rois,
            weights=regression_weights)
        next_rois = box_ops.clip_boxes(next_rois,
                                       tf.expand_dims(image_shape, axis=1))

    if not training:
      if self._config_dict['cascade_class_ensemble']:
        class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

      detections = self.detection_generator(
          box_outputs,
          class_outputs,
          current_rois,
          image_shape,
          regression_weights,
          bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
      model_outputs.update({
          'cls_outputs': class_outputs,
          'box_outputs': box_outputs,
      })
      if self.detection_generator.get_config()['apply_nms']:
        model_outputs.update({
            'detection_boxes': detections['detection_boxes'],
            'detection_scores': detections['detection_scores'],
            'detection_classes': detections['detection_classes'],
            'num_detections': detections['num_detections']
        })
      else:
        model_outputs.update({
            'decoded_boxes': detections['decoded_boxes'],
            'decoded_box_scores': detections['decoded_box_scores']
        })

    intermediate_outputs = {
        'matched_gt_boxes': matched_gt_boxes,
        'matched_gt_indices': matched_gt_indices,
        'matched_gt_classes': matched_gt_classes,
        'current_rois': current_rois,
    }
    return (model_outputs, intermediate_outputs)

  def _call_mask_outputs(
      self,
      model_box_outputs: Mapping[str, tf.Tensor],
      features: tf.Tensor,
      current_rois: tf.Tensor,
      matched_gt_indices: tf.Tensor,
      matched_gt_boxes: tf.Tensor,
      matched_gt_classes: tf.Tensor,
      gt_masks: tf.Tensor,
      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    """Implementation of Mask-RCNN mask prediction logic."""
    model_outputs = dict(model_box_outputs)
    if training:
      current_rois, roi_classes, roi_masks = self.mask_sampler(
          current_rois, matched_gt_boxes, matched_gt_classes,
          matched_gt_indices, gt_masks)
      roi_masks = tf.stop_gradient(roi_masks)

      model_outputs.update({
          'mask_class_targets': roi_classes,
          'mask_targets': roi_masks,
      })
    else:
      current_rois = model_outputs['detection_boxes']
      roi_classes = model_outputs['detection_classes']

    mask_logits, mask_probs = self._features_to_mask_outputs(
        features, current_rois, roi_classes)

    if training:
      model_outputs.update({
          'mask_outputs': mask_logits,
      })
    else:
      model_outputs.update({
          'detection_masks': mask_probs,
      })
    return model_outputs

  def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
                      model_outputs, cascade_num, regression_weights):
"""Runs the frcnn head that does both class and box prediction.
Args:
features: `list` of features from the feature extractor.
rois: `list` of current rois that will be used to predict bbox refinement
and classes from.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
This tensor might have paddings with a negative value.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
training: `bool`, if model is training or being evaluated.
model_outputs: `dict`, used for storing outputs used for eval and losses.
cascade_num: `int`, the current frcnn layer in the cascade.
regression_weights: `list`, weights used for l1 loss in bounding box
regression.
Returns:
class_outputs: Class predictions for rois.
box_outputs: Box predictions for rois. These are formatted for the
regression loss and need to be converted before being used as rois
in the next stage.
model_outputs: Updated dict with predictions used for losses and eval.
matched_gt_boxes: If `is_training` is true, then these give the gt box
location of its positive match.
matched_gt_classes: If `is_training` is true, then these give the gt class
of the predicted box.
matched_gt_boxes: If `is_training` is true, then these give the box
location of its positive match.
matched_gt_indices: If `is_training` is true, then gives the index of
the positive box match. Used for mask prediction.
rois: The sampled rois used for this layer.
"""
    # Only used during training.
    matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
                                                                None)
    if training and gt_boxes is not None:
      rois = tf.stop_gradient(rois)

      current_roi_sampler = self.roi_sampler[cascade_num]
      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          current_roi_sampler(rois, gt_boxes, gt_classes))
      # Create bounding box training targets.
      box_targets = box_ops.encode_boxes(
          matched_gt_boxes, rois, weights=regression_weights)
      # If the target is background, the box target is set to all 0s.
      box_targets = tf.where(
          tf.tile(
              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
              [1, 1, 4]),
          tf.zeros_like(box_targets),
          box_targets)
      model_outputs.update({
          'class_targets_{}'.format(cascade_num)
          if cascade_num else 'class_targets': matched_gt_classes,
          'box_targets_{}'.format(cascade_num)
          if cascade_num else 'box_targets': box_targets,
      })

    # Get roi features.
    roi_features = self.roi_aligner(features, rois)

    # Run frcnn head to get class and bbox predictions.
    current_detection_head = self.detection_head[cascade_num]
    class_outputs, box_outputs = current_detection_head(roi_features)

    model_outputs.update({
        'class_outputs_{}'.format(cascade_num)
        if cascade_num else 'class_outputs': class_outputs,
        'box_outputs_{}'.format(cascade_num)
        if cascade_num else 'box_outputs': box_outputs,
    })
    return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
            matched_gt_classes, matched_gt_indices, rois)

  def _features_to_mask_outputs(self, features, rois, roi_classes):
    # Mask RoI align.
    mask_roi_features = self.mask_roi_aligner(features, rois)

    # Mask head.
    raw_masks = self.mask_head([mask_roi_features, roi_classes])

    return raw_masks, tf.nn.sigmoid(raw_masks)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(
        backbone=self.backbone,
        rpn_head=self.rpn_head,
        detection_head=self.detection_head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self._include_mask:
      items.update(mask_head=self.mask_head)

    return items

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)
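
A minimal sketch (not part of the commit) of one way `checkpoint_items` can be consumed: registering the sub-networks on a `tf.train.Checkpoint` so they can be saved or restored independently of the full model. The `model` argument is assumed to be a constructed MaskRCNNModel, e.g. as built in the test file below; the helper name is hypothetical.

# Minimal sketch (not part of the commit); `save_components` is a hypothetical
# helper, and `model` is assumed to be a constructed MaskRCNNModel.
import tensorflow as tf

def save_components(model: tf.keras.Model, ckpt_prefix: str) -> str:
  """Saves the model's additionally checkpointed sub-networks."""
  # checkpoint_items maps names to trackables (backbone, rpn_head, heads, ...).
  ckpt = tf.train.Checkpoint(**model.checkpoint_items)
  return ckpt.save(ckpt_prefix)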

official/vision/modeling/maskrcnn_model_test.py (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for maskrcnn_model.py."""
import
os
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.vision.modeling
import
maskrcnn_model
from
official.vision.modeling.backbones
import
resnet
from
official.vision.modeling.decoders
import
fpn
from
official.vision.modeling.heads
import
dense_prediction_heads
from
official.vision.modeling.heads
import
instance_heads
from
official.vision.modeling.layers
import
detection_generator
from
official.vision.modeling.layers
import
mask_sampler
from
official.vision.modeling.layers
import
roi_aligner
from
official.vision.modeling.layers
import
roi_generator
from
official.vision.modeling.layers
import
roi_sampler
from
official.vision.ops
import
anchor
class
MaskRCNNModelTest
(
parameterized
.
TestCase
,
tf
.
test
.
TestCase
):
@
combinations
.
generate
(
combinations
.
combine
(
include_mask
=
[
True
,
False
],
use_separable_conv
=
[
True
,
False
],
build_anchor_boxes
=
[
True
,
False
],
is_training
=
[
True
,
False
]))
def
test_build_model
(
self
,
include_mask
,
use_separable_conv
,
build_anchor_boxes
,
is_training
):
num_classes
=
3
min_level
=
3
max_level
=
7
num_scales
=
3
aspect_ratios
=
[
1.0
]
anchor_size
=
3
resnet_model_id
=
50
num_anchors_per_location
=
num_scales
*
len
(
aspect_ratios
)
image_size
=
384
images
=
np
.
random
.
rand
(
2
,
image_size
,
image_size
,
3
)
image_shape
=
np
.
array
([[
image_size
,
image_size
],
[
image_size
,
image_size
]])
if
build_anchor_boxes
:
anchor_boxes
=
anchor
.
Anchor
(
min_level
=
min_level
,
max_level
=
max_level
,
num_scales
=
num_scales
,
aspect_ratios
=
aspect_ratios
,
anchor_size
=
3
,
image_size
=
(
image_size
,
image_size
)).
multilevel_boxes
for
l
in
anchor_boxes
:
anchor_boxes
[
l
]
=
tf
.
tile
(
tf
.
expand_dims
(
anchor_boxes
[
l
],
axis
=
0
),
[
2
,
1
,
1
,
1
])
else
:
anchor_boxes
=
None
backbone
=
resnet
.
ResNet
(
model_id
=
resnet_model_id
)
decoder
=
fpn
.
FPN
(
input_specs
=
backbone
.
output_specs
,
min_level
=
min_level
,
max_level
=
max_level
,
use_separable_conv
=
use_separable_conv
)
rpn_head
=
dense_prediction_heads
.
RPNHead
(
min_level
=
min_level
,
max_level
=
max_level
,
num_anchors_per_location
=
num_anchors_per_location
,
num_convs
=
1
)
detection_head
=
instance_heads
.
DetectionHead
(
num_classes
=
num_classes
)
roi_generator_obj
=
roi_generator
.
MultilevelROIGenerator
()
roi_sampler_obj
=
roi_sampler
.
ROISampler
()
roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
()
detection_generator_obj
=
detection_generator
.
DetectionGenerator
()
if
include_mask
:
mask_head
=
instance_heads
.
MaskHead
(
num_classes
=
num_classes
,
upsample_factor
=
2
)
mask_sampler_obj
=
mask_sampler
.
MaskSampler
(
mask_target_size
=
28
,
num_sampled_masks
=
1
)
mask_roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
(
crop_size
=
14
)
else
:
mask_head
=
None
mask_sampler_obj
=
None
mask_roi_aligner_obj
=
None
model
=
maskrcnn_model
.
MaskRCNNModel
(
backbone
,
decoder
,
rpn_head
,
detection_head
,
roi_generator_obj
,
roi_sampler_obj
,
roi_aligner_obj
,
detection_generator_obj
,
mask_head
,
mask_sampler_obj
,
mask_roi_aligner_obj
,
min_level
=
min_level
,
max_level
=
max_level
,
num_scales
=
num_scales
,
aspect_ratios
=
aspect_ratios
,
anchor_size
=
anchor_size
)
gt_boxes
=
np
.
array
(
[[[
10
,
10
,
15
,
15
],
[
2.5
,
2.5
,
7.5
,
7.5
],
[
-
1
,
-
1
,
-
1
,
-
1
]],
[[
100
,
100
,
150
,
150
],
[
-
1
,
-
1
,
-
1
,
-
1
],
[
-
1
,
-
1
,
-
1
,
-
1
]]],
dtype
=
np
.
float32
)
gt_classes
=
np
.
array
([[
2
,
1
,
-
1
],
[
1
,
-
1
,
-
1
]],
dtype
=
np
.
int32
)
if
include_mask
:
gt_masks
=
np
.
ones
((
2
,
3
,
100
,
100
))
else
:
gt_masks
=
None
# Results will be checked in test_forward.
_
=
model
(
images
,
image_shape
,
anchor_boxes
,
gt_boxes
,
gt_classes
,
gt_masks
,
training
=
is_training
)
  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          include_mask=[True, False],
          build_anchor_boxes=[True, False],
          use_cascade_heads=[True, False],
          training=[True, False],
      ))
  def test_forward(self, strategy, include_mask, build_anchor_boxes,
                   training, use_cascade_heads):
    num_classes = 3
    min_level = 3
    max_level = 4
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    if use_cascade_heads:
      cascade_iou_thresholds = [0.6]
      class_agnostic_bbox_pred = True
      cascade_class_ensemble = True
    else:
      cascade_iou_thresholds = None
      class_agnostic_bbox_pred = False
      cascade_class_ensemble = False
    image_size = (256, 256)
    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array([[224, 100], [100, 224]])
    with strategy.scope():
      if build_anchor_boxes:
        anchor_boxes = anchor.Anchor(
            min_level=min_level,
            max_level=max_level,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size,
            image_size=image_size).multilevel_boxes
      else:
        anchor_boxes = None
      num_anchors_per_location = len(aspect_ratios) * num_scales

      input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
      backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
      decoder = fpn.FPN(
          min_level=min_level,
          max_level=max_level,
          input_specs=backbone.output_specs)
      rpn_head = dense_prediction_heads.RPNHead(
          min_level=min_level,
          max_level=max_level,
          num_anchors_per_location=num_anchors_per_location)
      detection_head = instance_heads.DetectionHead(
          num_classes=num_classes,
          class_agnostic_bbox_pred=class_agnostic_bbox_pred)
      roi_generator_obj = roi_generator.MultilevelROIGenerator()

      roi_sampler_cascade = []
      roi_sampler_obj = roi_sampler.ROISampler()
      roi_sampler_cascade.append(roi_sampler_obj)
      if cascade_iou_thresholds:
        for iou in cascade_iou_thresholds:
          roi_sampler_obj = roi_sampler.ROISampler(
              mix_gt_boxes=False,
              foreground_iou_threshold=iou,
              background_iou_high_threshold=iou,
              background_iou_low_threshold=0.0,
              skip_subsampling=True)
          roi_sampler_cascade.append(roi_sampler_obj)

      roi_aligner_obj = roi_aligner.MultilevelROIAligner()
      detection_generator_obj = detection_generator.DetectionGenerator()
      if include_mask:
        mask_head = instance_heads.MaskHead(
            num_classes=num_classes, upsample_factor=2)
        mask_sampler_obj = mask_sampler.MaskSampler(
            mask_target_size=28, num_sampled_masks=1)
        mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
      else:
        mask_head = None
        mask_sampler_obj = None
        mask_roi_aligner_obj = None
      model = maskrcnn_model.MaskRCNNModel(
          backbone,
          decoder,
          rpn_head,
          detection_head,
          roi_generator_obj,
          roi_sampler_obj,
          roi_aligner_obj,
          detection_generator_obj,
          mask_head,
          mask_sampler_obj,
          mask_roi_aligner_obj,
          class_agnostic_bbox_pred=class_agnostic_bbox_pred,
          cascade_class_ensemble=cascade_class_ensemble,
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size)

      gt_boxes = np.array(
          [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
           [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
          dtype=np.float32)
      gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
      if include_mask:
        gt_masks = np.ones((2, 3, 100, 100))
      else:
        gt_masks = None

      results = model(
          images,
          image_shape,
          anchor_boxes,
          gt_boxes,
          gt_classes,
          gt_masks,
          training=training)

    self.assertIn('rpn_boxes', results)
    self.assertIn('rpn_scores', results)
    if training:
      self.assertIn('class_targets', results)
      self.assertIn('box_targets', results)
      self.assertIn('class_outputs', results)
      self.assertIn('box_outputs', results)
      if include_mask:
        self.assertIn('mask_outputs', results)
    else:
      self.assertIn('detection_boxes', results)
      self.assertIn('detection_scores', results)
      self.assertIn('detection_classes', results)
      self.assertIn('num_detections', results)
      if include_mask:
        self.assertIn('detection_masks', results)

  @parameterized.parameters(
      (False,),
      (True,),
  )
  def test_serialize_deserialize(self, include_mask):
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=3, max_level=7, input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=3, max_level=7, num_anchors_per_location=3)
    detection_head = instance_heads.DetectionHead(num_classes=2)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if include_mask:
      mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
      mask_sampler_obj = mask_sampler.MaskSampler(
          mask_target_size=28, num_sampled_masks=1)
      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    else:
      mask_head = None
      mask_sampler_obj = None
      mask_roi_aligner_obj = None
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=3,
        max_level=7,
        num_scales=3,
        aspect_ratios=[1.0],
        anchor_size=3)

    config = model.get_config()
    new_model = maskrcnn_model.MaskRCNNModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())

  @parameterized.parameters(
      (False,),
      (True,),
  )
  def test_checkpoint(self, include_mask):
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=3, max_level=7, input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=3, max_level=7, num_anchors_per_location=3)
    detection_head = instance_heads.DetectionHead(num_classes=2)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if include_mask:
      mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
      mask_sampler_obj = mask_sampler.MaskSampler(
          mask_target_size=28, num_sampled_masks=1)
      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    else:
      mask_head = None
      mask_sampler_obj = None
      mask_roi_aligner_obj = None
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=3,
        max_level=7,
        num_scales=3,
        aspect_ratios=[1.0],
        anchor_size=3)
    expect_checkpoint_items = dict(
        backbone=backbone,
        decoder=decoder,
        rpn_head=rpn_head,
        detection_head=[detection_head])
    if include_mask:
      expect_checkpoint_items['mask_head'] = mask_head
    self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)

    # Test save and load checkpoints.
    ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
    save_dir = self.create_tempdir().full_path
    ckpt.save(os.path.join(save_dir, 'ckpt'))

    partial_ckpt = tf.train.Checkpoint(backbone=backbone)
    partial_ckpt.read(tf.train.latest_checkpoint(
        save_dir)).expect_partial().assert_existing_objects_matched()

    if include_mask:
      partial_ckpt_mask = tf.train.Checkpoint(
          backbone=backbone, mask_head=mask_head)
      partial_ckpt_mask.restore(tf.train.latest_checkpoint(
          save_dir)).expect_partial().assert_existing_objects_matched()


if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/retinanet_model.py
0 → 100644
View file @
8a9a607c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RetinaNet."""
from typing import Any, Mapping, List, Optional, Union

# Import libraries
import tensorflow as tf

from official.vision.ops import anchor


@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
  """The RetinaNet model class."""

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               head: tf.keras.layers.Layer,
               detection_generator: tf.keras.layers.Layer,
               min_level: Optional[int] = None,
               max_level: Optional[int] = None,
               num_scales: Optional[int] = None,
               aspect_ratios: Optional[List[float]] = None,
               anchor_size: Optional[float] = None,
               **kwargs):
    """Detection model initialization function.

    Args:
      backbone: `tf.keras.Model`, a backbone network.
      decoder: `tf.keras.Model`, a decoder network.
      head: `RetinaNetHead`, the RetinaNet head.
      detection_generator: the detection generator.
      min_level: Minimum level in output feature maps.
      max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratios of the anchors
        added on each level. Each number indicates the ratio of width to
        height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: A number representing the scale of the base anchor size
        relative to the feature stride 2^level.
      **kwargs: keyword arguments to be passed.
    """
    super(RetinaNetModel, self).__init__(**kwargs)
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'head': head,
        'detection_generator': detection_generator,
        'min_level': min_level,
        'max_level': max_level,
        'num_scales': num_scales,
        'aspect_ratios': aspect_ratios,
        'anchor_size': anchor_size,
    }
    self._backbone = backbone
    self._decoder = decoder
    self._head = head
    self._detection_generator = detection_generator

  def call(self,
           images: tf.Tensor,
           image_shape: Optional[tf.Tensor] = None,
           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
           output_intermediate_features: bool = False,
           training: bool = None) -> Mapping[str, tf.Tensor]:
    """Forward pass of the RetinaNet model.

    Args:
      images: `Tensor`, the input batched images, whose shape is
        [batch, height, width, 3].
      image_shape: `Tensor`, the actual shape of the input images, whose
        shape is [batch, 2] where the last dimension is [height, width].
        Note that this is the actual image shape excluding paddings. For
        example, images in the batch may be resized into different shapes
        before padding to the fixed size.
      anchor_boxes: a dict of tensors which includes multilevel anchors.
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the anchor coordinates of a particular feature
          level, whose shape is [height_l, width_l, num_anchors_per_location].
      output_intermediate_features: `bool` indicating whether to return the
        intermediate feature maps generated by backbone and decoder.
      training: `bool`, indicating whether it is in training mode.

    Returns:
      scores: a dict of tensors which includes scores of the predictions.
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the box scores predicted from a particular
          feature level, whose shape is
          [batch, height_l, width_l, num_classes * num_anchors_per_location].
      boxes: a dict of tensors which includes coordinates of the predictions.
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the box coordinates predicted from a particular
          feature level, whose shape is
          [batch, height_l, width_l, 4 * num_anchors_per_location].
      attributes: a dict of (attribute_name, attribute_predictions). Each
        attribute prediction is a dict that includes:
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the attribute predictions from a particular
          feature level, whose shape is
          [batch, height_l, width_l, att_size * num_anchors_per_location].
    """
    outputs = {}
    # Feature extraction.
    features = self.backbone(images)
    if output_intermediate_features:
      outputs.update(
          {'backbone_{}'.format(k): v for k, v in features.items()})
    if self.decoder:
      features = self.decoder(features)
    if output_intermediate_features:
      outputs.update(
          {'decoder_{}'.format(k): v for k, v in features.items()})

    # Dense prediction. `raw_attributes` can be empty.
    raw_scores, raw_boxes, raw_attributes = self.head(features)

    if training:
      outputs.update({
          'cls_outputs': raw_scores,
          'box_outputs': raw_boxes,
      })
      if raw_attributes:
        outputs.update({'attribute_outputs': raw_attributes})
      return outputs
    else:
      # Generate anchor boxes for this batch if not provided.
      if anchor_boxes is None:
        _, image_height, image_width, _ = images.get_shape().as_list()
        anchor_boxes = anchor.Anchor(
            min_level=self._config_dict['min_level'],
            max_level=self._config_dict['max_level'],
            num_scales=self._config_dict['num_scales'],
            aspect_ratios=self._config_dict['aspect_ratios'],
            anchor_size=self._config_dict['anchor_size'],
            image_size=(image_height, image_width)).multilevel_boxes
        for l in anchor_boxes:
          anchor_boxes[l] = tf.tile(
              tf.expand_dims(anchor_boxes[l], axis=0),
              [tf.shape(images)[0], 1, 1, 1])

      # Post-processing.
      final_results = self.detection_generator(raw_boxes, raw_scores,
                                               anchor_boxes, image_shape,
                                               raw_attributes)
      outputs.update({
          'cls_outputs': raw_scores,
          'box_outputs': raw_boxes,
      })
      if self.detection_generator.get_config()['apply_nms']:
        outputs.update({
            'detection_boxes': final_results['detection_boxes'],
            'detection_scores': final_results['detection_scores'],
            'detection_classes': final_results['detection_classes'],
            'num_detections': final_results['num_detections']
        })
      else:
        outputs.update({
            'decoded_boxes': final_results['decoded_boxes'],
            'decoded_box_scores': final_results['decoded_box_scores']
        })
      if raw_attributes:
        outputs.update({
            'attribute_outputs': raw_attributes,
            'detection_attributes': final_results['detection_attributes'],
        })
      return outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(backbone=self.backbone, head=self.head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    return items

  @property
  def backbone(self) -> tf.keras.Model:
    return self._backbone

  @property
  def decoder(self) -> tf.keras.Model:
    return self._decoder

  @property
  def head(self) -> tf.keras.layers.Layer:
    return self._head

  @property
  def detection_generator(self) -> tf.keras.layers.Layer:
    return self._detection_generator

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)
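
For reference, a minimal inference sketch for the class above. It reuses the component setup exercised in retinanet_model_test.py below (ResNet-50 backbone, FPN decoder, RetinaNetHead, MultilevelDetectionGenerator); the image size, class count, and anchor parameters are illustrative assumptions rather than recommended settings. Passing anchor_boxes=None exercises the on-the-fly anchor generation in call():

import numpy as np

from official.vision.modeling import retinanet_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.layers import detection_generator

backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(input_specs=backbone.output_specs, min_level=3, max_level=7)
head = dense_prediction_heads.RetinaNetHead(
    min_level=3, max_level=7, num_classes=3,
    num_anchors_per_location=3)  # 3 = num_scales * len(aspect_ratios).
generator = detection_generator.MultilevelDetectionGenerator(
    max_num_detections=10)
model = retinanet_model.RetinaNetModel(
    backbone=backbone, decoder=decoder, head=head,
    detection_generator=generator, min_level=3, max_level=7,
    num_scales=3, aspect_ratios=[1.0], anchor_size=3)

images = np.random.rand(2, 128, 128, 3)
image_shape = np.array([[128, 128], [128, 128]])
# With anchor_boxes=None, call() builds anchors from the config above.
outputs = model(images, image_shape, anchor_boxes=None, training=False)
print(outputs['detection_boxes'].shape)  # (2, 10, 4)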
official/vision/modeling/retinanet_model_test.py
0 → 100644
View file @
8a9a607c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import retinanet_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor


class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      {
          'use_separable_conv': True,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': False,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': False,
          'is_training': True,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': True,
          'has_att_heads': True
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': True
      },
  )
  def test_build_model(self, use_separable_conv, build_anchor_boxes,
                       is_training, has_att_heads):
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    fpn_num_filters = 256
    head_num_convs = 4
    head_num_filters = 256
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array(
        [[image_size, image_size], [image_size, image_size]])

    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
    else:
      anchor_boxes = None

    if has_att_heads:
      attribute_heads = [dict(name='depth', type='regression', size=1)]
    else:
      attribute_heads = None

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        attribute_heads=attribute_heads,
        num_anchors_per_location=num_anchors_per_location,
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    _ = model(images, image_shape, anchor_boxes, training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          image_size=[
              (128, 128),
          ],
          training=[True, False],
          has_att_heads=[True, False],
          output_intermediate_features=[True, False],
          soft_nms_sigma=[None, 0.0, 0.1],
      ))
  def test_forward(self, strategy, image_size, training, has_att_heads,
                   output_intermediate_features, soft_nms_sigma):
    """Test for creation of a R50-FPN RetinaNet."""
    tf.keras.backend.set_image_data_format('channels_last')
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array(
        [[image_size[0], image_size[1]], [image_size[0], image_size[1]]])

    with strategy.scope():
      anchor_gen = anchor.build_anchor_generator(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3)
      anchor_boxes = anchor_gen(image_size)
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])

      backbone = resnet.ResNet(model_id=50)
      decoder = fpn.FPN(
          input_specs=backbone.output_specs,
          min_level=min_level,
          max_level=max_level)

      if has_att_heads:
        attribute_heads = [dict(name='depth', type='regression', size=1)]
      else:
        attribute_heads = None
      head = dense_prediction_heads.RetinaNetHead(
          min_level=min_level,
          max_level=max_level,
          num_classes=num_classes,
          attribute_heads=attribute_heads,
          num_anchors_per_location=num_anchors_per_location)
      generator = detection_generator.MultilevelDetectionGenerator(
          max_num_detections=10,
          nms_version='v1',
          use_cpu_nms=soft_nms_sigma is not None,
          soft_nms_sigma=soft_nms_sigma)
      model = retinanet_model.RetinaNetModel(
          backbone=backbone,
          decoder=decoder,
          head=head,
          detection_generator=generator)

      model_outputs = model(
          images,
          image_shape,
          anchor_boxes,
          output_intermediate_features=output_intermediate_features,
          training=training)

    if training:
      cls_outputs = model_outputs['cls_outputs']
      box_outputs = model_outputs['box_outputs']
      for level in range(min_level, max_level + 1):
        self.assertIn(str(level), cls_outputs)
        self.assertIn(str(level), box_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**level,
            image_size[1] // 2**level,
            num_classes * num_anchors_per_location
        ], cls_outputs[str(level)].numpy().shape)
        self.assertAllEqual([
            2,
            image_size[0] // 2**level,
            image_size[1] // 2**level,
            4 * num_anchors_per_location
        ], box_outputs[str(level)].numpy().shape)
        if has_att_heads:
          att_outputs = model_outputs['attribute_outputs']
          for att in att_outputs.values():
            self.assertAllEqual([
                2,
                image_size[0] // 2**level,
                image_size[1] // 2**level,
                1 * num_anchors_per_location
            ], att[str(level)].numpy().shape)
    else:
      self.assertIn('detection_boxes', model_outputs)
      self.assertIn('detection_scores', model_outputs)
      self.assertIn('detection_classes', model_outputs)
      self.assertIn('num_detections', model_outputs)
      self.assertAllEqual([2, 10, 4],
                          model_outputs['detection_boxes'].numpy().shape)
      self.assertAllEqual([2, 10],
                          model_outputs['detection_scores'].numpy().shape)
      self.assertAllEqual([2, 10],
                          model_outputs['detection_classes'].numpy().shape)
      self.assertAllEqual([2,],
                          model_outputs['num_detections'].numpy().shape)
      if has_att_heads:
        self.assertIn('detection_attributes', model_outputs)
        self.assertAllEqual(
            [2, 10, 1],
            model_outputs['detection_attributes']['depth'].numpy().shape)
    if output_intermediate_features:
      for l in range(2, 6):
        self.assertIn('backbone_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**l,
            image_size[1] // 2**l,
            backbone.output_specs[str(l)].as_list()[-1]
        ], model_outputs['backbone_{}'.format(l)].numpy().shape)
      for l in range(min_level, max_level + 1):
        self.assertIn('decoder_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**l,
            image_size[1] // 2**l,
            decoder.output_specs[str(l)].as_list()[-1]
        ], model_outputs['decoder_{}'.format(l)].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)
    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        num_anchors_per_location=num_anchors_per_location)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3)

    config = model.get_config()
    new_model = retinanet_model.RetinaNetModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())


if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/segmentation_model.py
0 → 100644
View file @
8a9a607c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build segmentation models."""
from typing import Any, Mapping, Union, Optional, Dict

# Import libraries
import tensorflow as tf

layers = tf.keras.layers


@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationModel(tf.keras.Model):
  """A segmentation model class.

  Input images are passed through the backbone first. A decoder network is
  then applied, and finally the segmentation head is applied to the output of
  the decoder network. Layers such as ASPP should be part of the decoder. Any
  feature fusion is done as part of the segmentation head (i.e. deeplabv3+
  feature fusion is not part of the decoder; instead it is part of the
  segmentation head). This way, different feature fusion techniques can be
  combined with different backbones and decoders.
  """

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               head: tf.keras.layers.Layer,
               mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
               **kwargs):
    """Segmentation initialization function.

    Args:
      backbone: a backbone network.
      decoder: a decoder network, e.g. FPN.
      head: the segmentation head.
      mask_scoring_head: an optional mask scoring head.
      **kwargs: keyword arguments to be passed.
    """
    super(SegmentationModel, self).__init__(**kwargs)
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'head': head,
        'mask_scoring_head': mask_scoring_head,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.head = head
    self.mask_scoring_head = mask_scoring_head

  def call(self, inputs: tf.Tensor,
           training: bool = None) -> Dict[str, tf.Tensor]:
    backbone_features = self.backbone(inputs)

    if self.decoder:
      decoder_features = self.decoder(backbone_features)
    else:
      decoder_features = backbone_features

    logits = self.head((backbone_features, decoder_features))
    outputs = {'logits': logits}
    if self.mask_scoring_head:
      mask_scores = self.mask_scoring_head(logits)
      outputs.update({'mask_scores': mask_scores})
    return outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(backbone=self.backbone, head=self.head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self.mask_scoring_head is not None:
      items.update(mask_scoring_head=self.mask_scoring_head)
    return items

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
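
For reference, a minimal usage sketch for the class above, mirroring the component choices exercised in segmentation_model_test.py below; the input size, class count, and head level are illustrative assumptions. With level=3, the logits come out at an output stride of 2^3:

import numpy as np

from official.vision.modeling import backbones
from official.vision.modeling import segmentation_model
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import segmentation_heads

num_classes = 10
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(input_specs=backbone.output_specs, min_level=2, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=3)
model = segmentation_model.SegmentationModel(
    backbone=backbone, decoder=decoder, head=head)

inputs = np.random.rand(2, 128, 128, 3)
outputs = model(inputs)
# level=3 gives an output stride of 2**3 = 8: (2, 16, 16, 10).
print(outputs['logits'].shape)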
official/vision/modeling/segmentation_model_test.py
0 → 100644
View file @
8a9a607c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.modeling import backbones
from official.vision.modeling import segmentation_model
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import segmentation_heads


class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      (128, 2),
      (128, 3),
      (128, 4),
      (256, 2),
      (256, 3),
      (256, 4),
  )
  def test_segmentation_network_creation(self, input_size, level):
    """Test for creation of a segmentation network."""
    num_classes = 10
    inputs = np.random.rand(2, input_size, input_size, 3)
    tf.keras.backend.set_image_data_format('channels_last')
    backbone = backbones.ResNet(model_id=50)

    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=2, max_level=7)
    head = segmentation_heads.SegmentationHead(num_classes, level=level)

    model = segmentation_model.SegmentationModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        mask_scoring_head=None,
    )

    outputs = model(inputs)
    self.assertAllEqual(
        [2, input_size // (2**level), input_size // (2**level), num_classes],
        outputs['logits'].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    backbone = backbones.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=3, max_level=7)
    head = segmentation_heads.SegmentationHead(num_classes, level=3)
    model = segmentation_model.SegmentationModel(
        backbone=backbone, decoder=decoder, head=head)

    config = model.get_config()
    new_model = segmentation_model.SegmentationModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())


if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/video_classification_model.py
0 → 100644
View file @
8a9a607c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build video classification models."""
from typing import Any, Mapping, Optional, Union, List, Text

import tensorflow as tf

layers = tf.keras.layers


@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
  """A video classification class builder."""

  def __init__(
      self,
      backbone: tf.keras.Model,
      num_classes: int,
      input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
      dropout_rate: float = 0.0,
      aggregate_endpoints: bool = False,
      kernel_initializer: str = 'random_uniform',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      require_endpoints: Optional[List[Text]] = None,
      **kwargs):
    """Video classification initialization function.

    Args:
      backbone: a 3D backbone network.
      num_classes: `int` number of classes in the classification task.
      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
      dropout_rate: `float` rate for dropout regularization.
      aggregate_endpoints: `bool`, whether to aggregate all endpoints or only
        use the final endpoint.
      kernel_initializer: kernel initializer for the dense layer.
      kernel_regularizer: tf.keras.regularizers.Regularizer object. Defaults
        to None.
      bias_regularizer: tf.keras.regularizers.Regularizer object. Defaults to
        None.
      require_endpoints: the required endpoints for prediction. If None or
        empty, only the final endpoint is used.
      **kwargs: keyword arguments to be passed.
    """
    if not input_specs:
      input_specs = {
          'image': layers.InputSpec(shape=[None, None, None, None, 3])
      }
    self._self_setattr_tracking = False
    self._config_dict = {
        'backbone': backbone,
        'num_classes': num_classes,
        'input_specs': input_specs,
        'dropout_rate': dropout_rate,
        'aggregate_endpoints': aggregate_endpoints,
        'kernel_initializer': kernel_initializer,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
        'require_endpoints': require_endpoints,
    }
    self._input_specs = input_specs
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._backbone = backbone

    inputs = {
        k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items()
    }
    endpoints = backbone(inputs['image'])

    if aggregate_endpoints:
      pooled_feats = []
      for endpoint in endpoints.values():
        x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint)
        pooled_feats.append(x_pool)
      x = tf.concat(pooled_feats, axis=1)
    else:
      if not require_endpoints:
        # Uses the last endpoint for prediction.
        x = endpoints[max(endpoints.keys())]
        x = tf.keras.layers.GlobalAveragePooling3D()(x)
      else:
        # Concats all the required endpoints for prediction.
        outputs = []
        for name in require_endpoints:
          x = endpoints[name]
          x = tf.keras.layers.GlobalAveragePooling3D()(x)
          outputs.append(x)
        x = tf.concat(outputs, axis=1)

    x = tf.keras.layers.Dropout(dropout_rate)(x)
    x = tf.keras.layers.Dense(
        num_classes,
        kernel_initializer=kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)(x)

    super(VideoClassificationModel, self).__init__(
        inputs=inputs, outputs=x, **kwargs)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    return dict(backbone=self.backbone)

  @property
  def backbone(self) -> tf.keras.Model:
    return self._backbone

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
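
For reference, a minimal usage sketch for the class above, following the ResNet3D setup in video_classification_model_test.py below; the clip shape and temporal parameters are illustrative assumptions. With no input_specs and no require_endpoints, the default [None, None, None, None, 3] spec is used and only the final endpoint is pooled for prediction:

import numpy as np

from official.vision.modeling import backbones
from official.vision.modeling import video_classification_model

temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                         (1, 3, 1)]
backbone = backbones.ResNet3D(
    model_id=50,
    temporal_strides=temporal_strides,
    temporal_kernel_sizes=temporal_kernel_sizes)
model = video_classification_model.VideoClassificationModel(
    backbone=backbone, num_classes=1000, dropout_rate=0.2)

# Clips are [batch, time, height, width, channels].
clips = np.random.rand(2, 8, 112, 112, 3)
logits = model(clips)
print(logits.shape)  # (2, 1000)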
official/vision/modeling/video_classification_model_test.py
0 → 100644
View file @
8a9a607c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.modeling import backbones
from official.vision.modeling import video_classification_model


class VideoClassificationNetworkTest(parameterized.TestCase,
                                     tf.test.TestCase):

  @parameterized.parameters(
      (50, 8, 112, 'relu', False),
      (50, 8, 112, 'swish', True),
  )
  def test_resnet3d_network_creation(self, model_id, temporal_size,
                                     spatial_size, activation,
                                     aggregate_endpoints):
    """Test for creation of a ResNet3D-50 classifier."""
    input_specs = tf.keras.layers.InputSpec(
        shape=[None, temporal_size, spatial_size, spatial_size, 3])
    temporal_strides = [1, 1, 1, 1]
    temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                             (1, 3, 1)]

    tf.keras.backend.set_image_data_format('channels_last')

    backbone = backbones.ResNet3D(
        model_id=model_id,
        temporal_strides=temporal_strides,
        temporal_kernel_sizes=temporal_kernel_sizes,
        input_specs=input_specs,
        activation=activation)

    num_classes = 1000
    model = video_classification_model.VideoClassificationModel(
        backbone=backbone,
        num_classes=num_classes,
        input_specs={'image': input_specs},
        dropout_rate=0.2,
        aggregate_endpoints=aggregate_endpoints,
    )

    inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
    logits = model(inputs)
    self.assertAllEqual([2, num_classes], logits.numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the classification network can be serialized and deserialized."""
    model_id = 50
    temporal_strides = [1, 1, 1, 1]
    temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                             (1, 3, 1)]

    backbone = backbones.ResNet3D(
        model_id=model_id,
        temporal_strides=temporal_strides,
        temporal_kernel_sizes=temporal_kernel_sizes)

    model = video_classification_model.VideoClassificationModel(
        backbone=backbone, num_classes=1000)

    config = model.get_config()
    new_model = video_classification_model.VideoClassificationModel.from_config(
        config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())


if __name__ == '__main__':
  tf.test.main()
official/vision/utils/object_detection/visualization_utils.py
View file @
8a9a607c
...
@@ -34,7 +34,7 @@ import PIL.ImageFont as ImageFont
import six
import tensorflow as tf
-from official.vision.beta.ops import box_ops
+from official.vision.ops import box_ops
from official.vision.utils.object_detection import shape_utils

_TITLE_LEFT_MARGIN = 10
...