Commit 7785dec0 authored by Yeqing Li, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 425740068
parent 9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI generator."""
from typing import Optional, Mapping
# Import libraries
import tensorflow as tf
from official.vision.ops import box_ops
from official.vision.ops import nms
def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
pre_nms_top_k: int = 2000,
pre_nms_score_threshold: float = 0.0,
pre_nms_min_size_threshold: float = 0.0,
nms_iou_threshold: float = 0.7,
num_proposals: int = 1000,
use_batched_nms: bool = False,
decode_boxes: bool = True,
clip_boxes: bool = True,
apply_sigmoid_to_score: bool = True):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter out small boxes and those that fall outside the image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tensors of shape
[batch_size, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch_size, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
is [height, width] of the scaled image.
pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
before applying NMS. Default: 2000.
pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
box score to keep before applying NMS. This is often used as a
pre-filtering step for better performance. Default: 0, no filtering is
applied.
pre_nms_min_size_threshold: A `float` representing the minimal box size in
each side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. Default: 0, no
filtering is applied.
nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
num_proposals: An `int` of top scoring RPN proposals *in total* to keep
after applying NMS. Default: 1000.
use_batched_nms: A `bool` indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available on
CPU/GPU. Default is False.
decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
`anchor_boxes`. Default is True.
clip_boxes: A `bool` indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default is True.
apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
`raw_scores` before applying NMS. Default is True.
Returns:
selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals],
representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(raw_scores.keys()):
with tf.name_scope('level_%s' % level):
_, feature_h, feature_w, num_anchors_per_location = (
raw_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
this_level_boxes = box_ops.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_ops.clip_boxes(
this_level_boxes, image_shape)
if pre_nms_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_ops.filter_boxes(
this_level_boxes,
this_level_scores,
image_shape,
pre_nms_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, num_proposals)
if nms_iou_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=nms_iou_threshold,
score_threshold=pre_nms_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if pre_nms_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_ops.filter_boxes_by_scores(
this_level_boxes,
this_level_scores,
pre_nms_score_threshold))
this_level_boxes, this_level_scores = box_ops.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=nms_iou_threshold))
else:
this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
this_level_boxes,
this_level_scores,
k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, num_proposals)
selected_rois, selected_roi_scores = box_ops.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
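# A minimal usage sketch for `_multilevel_propose_rois` (illustrative only;
# the single-level toy shapes and values below are assumptions, not part of
# this module). One FPN level '2', a 2x2 feature map, one anchor per location:
#
#   rpn_scores = {'2': tf.random.uniform([1, 2, 2, 1])}
#   rpn_boxes = {'2': tf.random.uniform([1, 2, 2, 4])}
#   anchors = {'2': tf.random.uniform([1, 2 * 2 * 1, 4])}
#   image_shape = tf.constant([[128.0, 128.0]])
#   rois, roi_scores = _multilevel_propose_rois(
#       rpn_boxes, rpn_scores, anchors, image_shape,
#       pre_nms_top_k=4, num_proposals=4)
#   # rois: [1, 4, 4]; roi_scores: [1, 4]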
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIGenerator(tf.keras.layers.Layer):
"""Proposes RoIs for the second stage processing."""
def __init__(self,
pre_nms_top_k: int = 2000,
pre_nms_score_threshold: float = 0.0,
pre_nms_min_size_threshold: float = 0.0,
nms_iou_threshold: float = 0.7,
num_proposals: int = 1000,
test_pre_nms_top_k: int = 1000,
test_pre_nms_score_threshold: float = 0.0,
test_pre_nms_min_size_threshold: float = 0.0,
test_nms_iou_threshold: float = 0.7,
test_num_proposals: int = 1000,
use_batched_nms: bool = False,
**kwargs):
"""Initializes a ROI generator.
The ROI generator transforms the raw predictions from RPN to ROIs.
Args:
pre_nms_top_k: An `int` of the number of top-scoring proposals to be kept
before applying NMS.
pre_nms_score_threshold: A `float` of the score threshold to apply before
applying NMS. Proposals whose scores are below this threshold are
thrown away.
pre_nms_min_size_threshold: A `float` of the threshold of each side of the
box (w.r.t. the scaled image). Proposals whose sides are below this
threshold are thrown away.
nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
num_proposals: An `int` of the final number of proposals to generate.
test_pre_nms_top_k: An `int` of the number of top-scoring proposals to be
kept before applying NMS in testing.
test_pre_nms_score_threshold: A `float` of the score threshold to apply
before applying NMS in testing. Proposals whose scores are below this
threshold are thrown away.
test_pre_nms_min_size_threshold: A `float` of the threshold of each side
of the box (w.r.t. the scaled image) in testing. Proposals whose sides
are below this threshold are thrown away.
test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
testing.
test_num_proposals: An `int` of the final number of proposals to generate
in testing.
use_batched_nms: A `bool` of whether or not to use
`tf.image.combined_non_max_suppression`.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
'nms_iou_threshold': nms_iou_threshold,
'num_proposals': num_proposals,
'test_pre_nms_top_k': test_pre_nms_top_k,
'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
'test_nms_iou_threshold': test_nms_iou_threshold,
'test_num_proposals': test_num_proposals,
'use_batched_nms': use_batched_nms,
}
super(MultilevelROIGenerator, self).__init__(**kwargs)
def call(self,
raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
training: Optional[bool] = None):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter out small boxes and those that fall outside the image if
specified.
e. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tensors of shape
[batch, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
is [height, width] of the scaled image.
training: A `bool` that indicates whether it is in training mode.
Returns:
roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
ROIs in the scaled image coordinate.
roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
proposed ROIs.
"""
roi_boxes, roi_scores = _multilevel_propose_rois(
raw_boxes,
raw_scores,
anchor_boxes,
image_shape,
pre_nms_top_k=(
self._config_dict['pre_nms_top_k'] if training
else self._config_dict['test_pre_nms_top_k']),
pre_nms_score_threshold=(
self._config_dict['pre_nms_score_threshold'] if training
else self._config_dict['test_pre_nms_score_threshold']),
pre_nms_min_size_threshold=(
self._config_dict['pre_nms_min_size_threshold'] if training
else self._config_dict['test_pre_nms_min_size_threshold']),
nms_iou_threshold=(
self._config_dict['nms_iou_threshold'] if training
else self._config_dict['test_nms_iou_threshold']),
num_proposals=(
self._config_dict['num_proposals'] if training
else self._config_dict['test_num_proposals']),
use_batched_nms=self._config_dict['use_batched_nms'],
decode_boxes=True,
clip_boxes=True,
apply_sigmoid_to_score=True)
return roi_boxes, roi_scores
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
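# Example layer wiring (a sketch; the placeholder inputs and argument values
# are illustrative, not prescribed by this module):
#
#   generator = MultilevelROIGenerator(pre_nms_top_k=2000, num_proposals=1000)
#   rois, roi_scores = generator(
#       raw_boxes, raw_scores, anchor_boxes, image_shape, training=True)
#   # Training uses `pre_nms_top_k`/`num_proposals` etc.; evaluation switches
#   # to the `test_*` counterparts.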
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_generator.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.modeling.layers import roi_generator
class MultilevelProposeRoisTest(tf.test.TestCase):
def test_multilevel_propose_rois_single_level(self):
rpn_boxes_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.9, 9.9]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 4, 4], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
rpn_boxes = {
'2': tf.constant(rpn_boxes_np, dtype=tf.float32)
}
rpn_scores_np = np.array(
[[[[0.6], [0.9]], [[0.2], [0.3]]], [[[0.1], [0.8]], [[0.3], [0.5]]]])
rpn_scores = {
'2': tf.constant(rpn_scores_np, dtype=tf.float32)
}
anchor_boxes_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.9, 9.9]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 4, 4], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
anchor_boxes = {
'2': tf.constant(anchor_boxes_np, dtype=tf.float32)
}
image_shape = tf.constant([[20, 20], [20, 20]], dtype=tf.int32)
selected_rois_np = np.array(
[[[0.01, 0.01, 9.9, 9.9], [2, 2, 8, 8], [5, 5, 10, 10], [0, 0, 0, 0]],
[[3, 3, 6, 6], [1, 1, 8, 8], [2, 2, 4, 4], [0, 0, 0, 0]]])
selected_roi_scores_np = np.array(
[[0.9, 0.3, 0.2, 0], [0.8, 0.5, 0.1, 0]])
# Runs on TPU.
strategy = tf.distribute.TPUStrategy()
with strategy.scope():
selected_rois_tpu, selected_roi_scores_tpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=4,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
# Runs on CPU.
selected_rois_cpu, selected_roi_scores_cpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=4,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_np, 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_np, 1e-5)
def test_multilevel_propose_rois_two_levels(self):
rpn_boxes_1_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.99, 9.99]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 2.5, 2.5], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
rpn_boxes_2_np = np.array(
[[[[0, 0, 10.01, 10.01]]], [[[2, 2, 4.5, 4.5]]]])
rpn_boxes = {
'2': tf.constant(rpn_boxes_1_np, dtype=tf.float32),
'3': tf.constant(rpn_boxes_2_np, dtype=tf.float32),
}
rpn_scores_1_np = np.array(
[[[[0.6], [0.9]], [[0.2], [0.3]]], [[[0.1], [0.8]], [[0.3], [0.5]]]])
rpn_scores_2_np = np.array([[[[0.95]]], [[[0.99]]]])
rpn_scores = {
'2': tf.constant(rpn_scores_1_np, dtype=tf.float32),
'3': tf.constant(rpn_scores_2_np, dtype=tf.float32),
}
anchor_boxes_1_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.99, 9.99]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 2.5, 2.5], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
anchor_boxes_2_np = np.array(
[[[[0, 0, 10.01, 10.01]]], [[[2, 2, 4.5, 4.5]]]])
anchor_boxes = {
'2': tf.constant(anchor_boxes_1_np, dtype=tf.float32),
'3': tf.constant(anchor_boxes_2_np, dtype=tf.float32),
}
image_shape = tf.constant([[20, 20], [20, 20]], dtype=tf.int32)
selected_rois_np = np.array(
[[[0, 0, 10.01, 10.01], [0.01, 0.01, 9.99, 9.99]],
[[2, 2, 4.5, 4.5], [3, 3, 6, 6]]])
selected_roi_scores_np = np.array([[0.95, 0.9], [0.99, 0.8]])
# Runs on TPU.
strategy = tf.distribute.TPUStrategy()
with strategy.scope():
selected_rois_tpu, selected_roi_scores_tpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=2,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
# Runs on CPU.
selected_rois_cpu, selected_roi_scores_cpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=2,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_np, 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_np, 1e-5)
class MultilevelROIGeneratorTest(tf.test.TestCase):
def test_serialize_deserialize(self):
kwargs = dict(
pre_nms_top_k=2000,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.7,
num_proposals=1000,
test_pre_nms_top_k=1000,
test_pre_nms_score_threshold=0.0,
test_pre_nms_min_size_threshold=0.0,
test_nms_iou_threshold=0.7,
test_num_proposals=1000,
use_batched_nms=False,
)
generator = roi_generator.MultilevelROIGenerator(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(generator.get_config(), expected_config)
new_generator = roi_generator.MultilevelROIGenerator.from_config(
generator.get_config())
self.assertAllEqual(generator.get_config(), new_generator.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI sampler."""
# Import libraries
import tensorflow as tf
from official.vision.modeling.layers import box_sampler
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
@tf.keras.utils.register_keras_serializable(package='Vision')
class ROISampler(tf.keras.layers.Layer):
"""Samples ROIs and assigns targets to the sampled ROIs."""
def __init__(self,
mix_gt_boxes: bool = True,
num_sampled_rois: int = 512,
foreground_fraction: float = 0.25,
foreground_iou_threshold: float = 0.5,
background_iou_high_threshold: float = 0.5,
background_iou_low_threshold: float = 0,
skip_subsampling: bool = False,
**kwargs):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
proposed ROIs.
num_sampled_rois: An `int` of the number of sampled ROIs per image.
foreground_fraction: A `float` in [0, 1], the fraction of sampled ROIs
that should be drawn from the foreground boxes.
foreground_iou_threshold: A `float` that represents the IoU threshold for
a box to be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
skip_subsampling: A `bool` that determines whether to skip the sampling
procedure that balances the fg/bg classes. Used for upper frcnn layers
in cascade RCNN.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'mix_gt_boxes': mix_gt_boxes,
'num_sampled_rois': num_sampled_rois,
'foreground_fraction': foreground_fraction,
'foreground_iou_threshold': foreground_iou_threshold,
'background_iou_high_threshold': background_iou_high_threshold,
'background_iou_low_threshold': background_iou_low_threshold,
'skip_subsampling': skip_subsampling,
}
self._sim_calc = iou_similarity.IouSimilarity()
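# The three thresholds split IoU into four bins; the indicators tag each
# bin with the match category consumed in `call` below:
#   -3: invalid (IoU below `background_iou_low_threshold`)
#   -1: negative/background (IoU within the background range)
#   -2: ignored (IoU between the background high and foreground thresholds)
#    1: positive/foreground (IoU >= `foreground_iou_threshold`)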
self._box_matcher = box_matcher.BoxMatcher(
thresholds=[
background_iou_low_threshold, background_iou_high_threshold,
foreground_iou_threshold
],
indicators=[-3, -1, -2, 1])
self._target_gather = target_gather.TargetGather()
self._sampler = box_sampler.BoxSampler(
num_sampled_rois, foreground_fraction)
super(ROISampler, self).__init__(**kwargs)
def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor, gt_classes: tf.Tensor):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each groundtruth box.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
boxes: A `tf.Tensor` of shape [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
gt_boxes: A `tf.Tensor` of shape [batch_size, MAX_NUM_INSTANCES, 4].
The coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have paddings with values of -1 indicating the
invalid box coordinates.
gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
This tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: A `tf.Tensor` of shape [batch_size, K, 4], representing
the coordinates of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: A `tf.Tensor` of shape [batch_size, K, 4], storing
the box coordinates of the matched groundtruth boxes of the sampled
RoIs.
sampled_gt_classes: A `tf.Tensor` of shape [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: A `tf.Tensor` of shape [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e.,
gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
if self._config_dict['mix_gt_boxes']:
boxes = tf.concat([boxes, gt_boxes], axis=1)
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask,
gt_invalid_mask)
matched_gt_indices, match_indicators = self._box_matcher(similarity_matrix)
positive_matches = tf.greater_equal(match_indicators, 0)
negative_matches = tf.equal(match_indicators, -1)
ignored_matches = tf.equal(match_indicators, -2)
invalid_matches = tf.equal(match_indicators, -3)
background_mask = tf.expand_dims(
tf.logical_or(negative_matches, invalid_matches), -1)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices,
background_mask)
matched_gt_classes = tf.where(background_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_boxes = self._target_gather(gt_boxes, matched_gt_indices,
tf.tile(background_mask, [1, 1, 4]))
matched_gt_boxes = tf.where(background_mask,
tf.zeros_like(matched_gt_boxes),
matched_gt_boxes)
matched_gt_indices = tf.where(
tf.squeeze(background_mask, -1), -tf.ones_like(matched_gt_indices),
matched_gt_indices)
if self._config_dict['skip_subsampling']:
return (boxes, matched_gt_boxes, tf.squeeze(matched_gt_classes,
axis=-1), matched_gt_indices)
sampled_indices = self._sampler(
positive_matches, negative_matches, ignored_matches)
sampled_rois = self._target_gather(boxes, sampled_indices)
sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices)
sampled_gt_classes = tf.squeeze(self._target_gather(
matched_gt_classes, sampled_indices), axis=-1)
sampled_gt_indices = tf.squeeze(self._target_gather(
tf.expand_dims(matched_gt_indices, -1), sampled_indices), axis=-1)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
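# A minimal usage sketch (the toy input names and shapes are illustrative
# assumptions, not part of this module):
#
#   sampler = ROISampler(num_sampled_rois=512, foreground_fraction=0.25)
#   rois, gt_box_targets, gt_class_targets, gt_indices = sampler(
#       proposed_boxes,  # [batch, N, 4] in [ymin, xmin, ymax, xmax] pixels
#       gt_boxes,        # [batch, MAX_NUM_INSTANCES, 4], padded with -1
#       gt_classes)      # [batch, MAX_NUM_INSTANCES], padded with -1
#   # Each returned tensor carries `num_sampled_rois` entries along axis 1.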
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_sampler.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.modeling.layers import roi_sampler
class ROISamplerTest(tf.test.TestCase):
def test_roi_sampler(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5],
[-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[2, 10, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
generator = roi_sampler.ROISampler(
mix_gt_boxes=True,
num_sampled_rois=2,
foreground_fraction=0.5,
foreground_iou_threshold=0.5,
background_iou_high_threshold=0.5,
background_iou_low_threshold=0.0)
# Runs on TPU.
strategy = tf.distribute.TPUStrategy()
with strategy.scope():
_ = generator(boxes, gt_boxes, gt_classes)
# Runs on CPU.
_ = generator(boxes, gt_boxes, gt_classes)
def test_serialize_deserialize(self):
kwargs = dict(
mix_gt_boxes=True,
num_sampled_rois=512,
foreground_fraction=0.25,
foreground_iou_threshold=0.5,
background_iou_high_threshold=0.5,
background_iou_low_threshold=0.5,
skip_subsampling=False,
)
generator = roi_sampler.ROISampler(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(generator.get_config(), expected_config)
new_generator = roi_sampler.ROISampler.from_config(
generator.get_config())
self.assertAllEqual(generator.get_config(), new_generator.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from typing import Any, List, Mapping, Optional, Tuple, Union
import tensorflow as tf
from official.vision.ops import anchor
from official.vision.ops import box_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskRCNNModel(tf.keras.Model):
"""The Mask R-CNN(-RS) and Cascade RCNN-RS models."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
rpn_head: tf.keras.layers.Layer,
detection_head: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_generator: tf.keras.layers.Layer,
roi_sampler: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_aligner: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
mask_head: Optional[tf.keras.layers.Layer] = None,
mask_sampler: Optional[tf.keras.layers.Layer] = None,
mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
class_agnostic_bbox_pred: bool = False,
cascade_class_ensemble: bool = False,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Initializes the R-CNN(-RS) model.
Args:
backbone: `tf.keras.Model`, the backbone network.
decoder: `tf.keras.Model`, the decoder network.
rpn_head: the RPN head.
detection_head: the detection head or a list of heads.
roi_generator: the ROI generator.
roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
detection heads.
roi_aligner: the ROI aligner.
detection_generator: the detection generator.
mask_head: the mask head.
mask_sampler: the mask sampler.
mask_roi_aligner: the ROI aligner for mask prediction.
class_agnostic_bbox_pred: if True, perform class agnostic bounding box
prediction. Needs to be `True` for Cascade RCNN models.
cascade_class_ensemble: if True, ensemble classification scores over all
detection heads.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added on each level.
For instance, num_scales=2 adds one additional intermediate anchor scale,
yielding scales [2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect ratio anchors added on each
level. The number indicates the ratio of width to height. For instance,
aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
anchor_size: A number representing the scale of the base anchor size
relative to the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(MaskRCNNModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'rpn_head': rpn_head,
'detection_head': detection_head,
'roi_generator': roi_generator,
'roi_sampler': roi_sampler,
'roi_aligner': roi_aligner,
'detection_generator': detection_generator,
'mask_head': mask_head,
'mask_sampler': mask_sampler,
'mask_roi_aligner': mask_roi_aligner,
'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
'cascade_class_ensemble': cascade_class_ensemble,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self.backbone = backbone
self.decoder = decoder
self.rpn_head = rpn_head
if not isinstance(detection_head, (list, tuple)):
self.detection_head = [detection_head]
else:
self.detection_head = detection_head
self.roi_generator = roi_generator
if not isinstance(roi_sampler, (list, tuple)):
self.roi_sampler = [roi_sampler]
else:
self.roi_sampler = roi_sampler
if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
raise ValueError(
'`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.'
)
self.roi_aligner = roi_aligner
self.detection_generator = detection_generator
self._include_mask = mask_head is not None
self.mask_head = mask_head
if self._include_mask and mask_sampler is None:
raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
self.mask_sampler = mask_sampler
if self._include_mask and mask_roi_aligner is None:
raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
self.mask_roi_aligner = mask_roi_aligner
# Weights for the regression losses for each FRCNN layer.
# TODO(xianzhi): Make the weights configurable.
self._cascade_layer_to_weights = [
[10.0, 10.0, 5.0, 5.0],
[20.0, 20.0, 10.0, 10.0],
[30.0, 30.0, 15.0, 15.0],
]
def call(self,
images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
gt_masks: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
model_outputs, intermediate_outputs = self._call_box_outputs(
images=images, image_shape=image_shape, anchor_boxes=anchor_boxes,
gt_boxes=gt_boxes, gt_classes=gt_classes, training=training)
if not self._include_mask:
return model_outputs
model_mask_outputs = self._call_mask_outputs(
model_box_outputs=model_outputs,
features=model_outputs['decoder_features'],
current_rois=intermediate_outputs['current_rois'],
matched_gt_indices=intermediate_outputs['matched_gt_indices'],
matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
matched_gt_classes=intermediate_outputs['matched_gt_classes'],
gt_masks=gt_masks,
training=training)
model_outputs.update(model_mask_outputs)
return model_outputs
def _get_backbone_and_decoder_features(self, images):
backbone_features = self.backbone(images)
if self.decoder:
features = self.decoder(backbone_features)
else:
features = backbone_features
return backbone_features, features
def _call_box_outputs(
self, images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Tuple[
Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
"""Implementation of the Faster-RCNN logic for boxes."""
model_outputs = {}
# Feature extraction.
(backbone_features,
decoder_features) = self._get_backbone_and_decoder_features(images)
# Region proposal network.
rpn_scores, rpn_boxes = self.rpn_head(decoder_features)
model_outputs.update({
'backbone_features': backbone_features,
'decoder_features': decoder_features,
'rpn_boxes': rpn_boxes,
'rpn_scores': rpn_scores
})
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Generate RoIs.
current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
image_shape, training)
next_rois = current_rois
all_class_outputs = []
for cascade_num in range(len(self.roi_sampler)):
# In cascade RCNN we want the higher layers to have different regression
# weights as the predicted deltas become smaller and smaller.
regression_weights = self._cascade_layer_to_weights[cascade_num]
current_rois = next_rois
(class_outputs, box_outputs, model_outputs, matched_gt_boxes,
matched_gt_classes, matched_gt_indices,
current_rois) = self._run_frcnn_head(
features=decoder_features,
rois=current_rois,
gt_boxes=gt_boxes,
gt_classes=gt_classes,
training=training,
model_outputs=model_outputs,
cascade_num=cascade_num,
regression_weights=regression_weights)
all_class_outputs.append(class_outputs)
# Generate ROIs for the next cascade head if there is any.
if cascade_num < len(self.roi_sampler) - 1:
next_rois = box_ops.decode_boxes(
tf.cast(box_outputs, tf.float32),
current_rois,
weights=regression_weights)
next_rois = box_ops.clip_boxes(next_rois,
tf.expand_dims(image_shape, axis=1))
if not training:
if self._config_dict['cascade_class_ensemble']:
class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)
detections = self.detection_generator(
box_outputs,
class_outputs,
current_rois,
image_shape,
regression_weights,
bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
model_outputs.update({
'cls_outputs': class_outputs,
'box_outputs': box_outputs,
})
if self.detection_generator.get_config()['apply_nms']:
model_outputs.update({
'detection_boxes': detections['detection_boxes'],
'detection_scores': detections['detection_scores'],
'detection_classes': detections['detection_classes'],
'num_detections': detections['num_detections']
})
else:
model_outputs.update({
'decoded_boxes': detections['decoded_boxes'],
'decoded_box_scores': detections['decoded_box_scores']
})
intermediate_outputs = {
'matched_gt_boxes': matched_gt_boxes,
'matched_gt_indices': matched_gt_indices,
'matched_gt_classes': matched_gt_classes,
'current_rois': current_rois,
}
return (model_outputs, intermediate_outputs)
def _call_mask_outputs(
self,
model_box_outputs: Mapping[str, tf.Tensor],
features: tf.Tensor,
current_rois: tf.Tensor,
matched_gt_indices: tf.Tensor,
matched_gt_boxes: tf.Tensor,
matched_gt_classes: tf.Tensor,
gt_masks: tf.Tensor,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
"""Implementation of Mask-RCNN mask prediction logic."""
model_outputs = dict(model_box_outputs)
if training:
current_rois, roi_classes, roi_masks = self.mask_sampler(
current_rois, matched_gt_boxes, matched_gt_classes,
matched_gt_indices, gt_masks)
roi_masks = tf.stop_gradient(roi_masks)
model_outputs.update({
'mask_class_targets': roi_classes,
'mask_targets': roi_masks,
})
else:
current_rois = model_outputs['detection_boxes']
roi_classes = model_outputs['detection_classes']
mask_logits, mask_probs = self._features_to_mask_outputs(
features, current_rois, roi_classes)
if training:
model_outputs.update({
'mask_outputs': mask_logits,
})
else:
model_outputs.update({
'detection_masks': mask_probs,
})
return model_outputs
def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
model_outputs, cascade_num, regression_weights):
"""Runs the frcnn head that does both class and box prediction.
Args:
features: `list` of features from the feature extractor.
rois: `list` of current rois from which bbox refinements and classes will
be predicted.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
This tensor might have paddings with a negative value.
gt_classes: [batch_size, MAX_NUM_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
training: `bool`, if model is training or being evaluated.
model_outputs: `dict`, used for storing outputs used for eval and losses.
cascade_num: `int`, the current frcnn layer in the cascade.
regression_weights: `list`, weights used for l1 loss in bounding box
regression.
Returns:
class_outputs: Class predictions for rois.
box_outputs: Box predictions for rois. These are formatted for the
regression loss and need to be converted before being used as rois
in the next stage.
model_outputs: Updated dict with predictions used for losses and eval.
matched_gt_boxes: If `training` is true, the gt box location of each
positive match.
matched_gt_classes: If `training` is true, the gt class of each
positive match.
matched_gt_indices: If `training` is true, the index of each positive
box match. Used for mask prediction.
rois: The sampled rois used for this layer.
"""
# Only used during training.
matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
None)
if training and gt_boxes is not None:
rois = tf.stop_gradient(rois)
current_roi_sampler = self.roi_sampler[cascade_num]
rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
current_roi_sampler(rois, gt_boxes, gt_classes))
# Create bounding box training targets.
box_targets = box_ops.encode_boxes(
matched_gt_boxes, rois, weights=regression_weights)
# If the target is background, the box target is set to all 0s.
box_targets = tf.where(
tf.tile(
tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
[1, 1, 4]), tf.zeros_like(box_targets), box_targets)
model_outputs.update({
'class_targets_{}'.format(cascade_num)
if cascade_num else 'class_targets':
matched_gt_classes,
'box_targets_{}'.format(cascade_num)
if cascade_num else 'box_targets':
box_targets,
})
# Get roi features.
roi_features = self.roi_aligner(features, rois)
# Run frcnn head to get class and bbox predictions.
current_detection_head = self.detection_head[cascade_num]
class_outputs, box_outputs = current_detection_head(roi_features)
model_outputs.update({
'class_outputs_{}'.format(cascade_num)
if cascade_num else 'class_outputs':
class_outputs,
'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs':
box_outputs,
})
return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
matched_gt_classes, matched_gt_indices, rois)
def _features_to_mask_outputs(self, features, rois, roi_classes):
# Mask RoI align.
mask_roi_features = self.mask_roi_aligner(features, rois)
# Mask head.
raw_masks = self.mask_head([mask_roi_features, roi_classes])
return raw_masks, tf.nn.sigmoid(raw_masks)
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(
backbone=self.backbone,
rpn_head=self.rpn_head,
detection_head=self.detection_head)
if self.decoder is not None:
items.update(decoder=self.decoder)
if self._include_mask:
items.update(mask_head=self.mask_head)
return items
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
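# Construction sketch (mirrors the tests below; the component objects and
# hyperparameter values are illustrative, see MaskRCNNModelTest for a
# complete, runnable wiring):
#
#   model = MaskRCNNModel(
#       backbone, decoder, rpn_head, detection_head,
#       roi_generator_obj, roi_sampler_obj, roi_aligner_obj,
#       detection_generator_obj, mask_head, mask_sampler_obj,
#       mask_roi_aligner_obj,
#       min_level=3, max_level=7, num_scales=3,
#       aspect_ratios=[1.0], anchor_size=3)
#   outputs = model(images, image_shape, training=False)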
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for maskrcnn_model.py."""
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import maskrcnn_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.heads import instance_heads
from official.vision.modeling.layers import detection_generator
from official.vision.modeling.layers import mask_sampler
from official.vision.modeling.layers import roi_aligner
from official.vision.modeling.layers import roi_generator
from official.vision.modeling.layers import roi_sampler
from official.vision.ops import anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(
combinations.combine(
include_mask=[True, False],
use_separable_conv=[True, False],
build_anchor_boxes=[True, False],
is_training=[True, False]))
def test_build_model(self, include_mask, use_separable_conv,
build_anchor_boxes, is_training):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
resnet_model_id = 50
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=1)
detection_head = instance_heads.DetectionHead(num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
# Results will be checked in test_forward.
_ = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
include_mask=[True, False],
build_anchor_boxes=[True, False],
use_cascade_heads=[True, False],
training=[True, False],
))
def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
use_cascade_heads):
num_classes = 3
min_level = 3
max_level = 4
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
if use_cascade_heads:
cascade_iou_thresholds = [0.6]
class_agnostic_bbox_pred = True
cascade_class_ensemble = True
else:
cascade_iou_thresholds = None
class_agnostic_bbox_pred = False
cascade_class_ensemble = False
image_size = (256, 256)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array([[224, 100], [100, 224]])
with strategy.scope():
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=image_size).multilevel_boxes
else:
anchor_boxes = None
num_anchors_per_location = len(aspect_ratios) * num_scales
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=min_level,
max_level=max_level,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes,
class_agnostic_bbox_pred=class_agnostic_bbox_pred)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_cascade = []
roi_sampler_obj = roi_sampler.ROISampler()
roi_sampler_cascade.append(roi_sampler_obj)
if cascade_iou_thresholds:
for iou in cascade_iou_thresholds:
roi_sampler_obj = roi_sampler.ROISampler(
mix_gt_boxes=False,
foreground_iou_threshold=iou,
background_iou_high_threshold=iou,
background_iou_low_threshold=0.0,
skip_subsampling=True)
roi_sampler_cascade.append(roi_sampler_obj)
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
class_agnostic_bbox_pred=class_agnostic_bbox_pred,
cascade_class_ensemble=cascade_class_ensemble,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
results = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=training)
self.assertIn('rpn_boxes', results)
self.assertIn('rpn_scores', results)
if training:
self.assertIn('class_targets', results)
self.assertIn('box_targets', results)
self.assertIn('class_outputs', results)
self.assertIn('box_outputs', results)
if include_mask:
self.assertIn('mask_outputs', results)
else:
self.assertIn('detection_boxes', results)
self.assertIn('detection_scores', results)
self.assertIn('detection_classes', results)
self.assertIn('num_detections', results)
if include_mask:
self.assertIn('detection_masks', results)
@parameterized.parameters(
(False,),
(True,),
)
def test_serialize_deserialize(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
config = model.get_config()
new_model = maskrcnn_model.MaskRCNNModel.from_config(config)
# Validate that the config can be serialized to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
@parameterized.parameters(
(False,),
(True,),
)
def test_checkpoint(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
expect_checkpoint_items = dict(
backbone=backbone,
decoder=decoder,
rpn_head=rpn_head,
detection_head=[detection_head])
if include_mask:
expect_checkpoint_items['mask_head'] = mask_head
self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)
# Test save and load checkpoints.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
save_dir = self.create_tempdir().full_path
ckpt.save(os.path.join(save_dir, 'ckpt'))
partial_ckpt = tf.train.Checkpoint(backbone=backbone)
partial_ckpt.read(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if include_mask:
partial_ckpt_mask = tf.train.Checkpoint(
backbone=backbone, mask_head=mask_head)
partial_ckpt_mask.restore(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RetinaNet."""
from typing import Any, Mapping, List, Optional, Union
# Import libraries
import tensorflow as tf
from official.vision.ops import anchor
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
"""The RetinaNet model class."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Classification initialization function.
Args:
backbone: `tf.keras.Model` a backbone network.
decoder: `tf.keras.Model` a decoder network.
head: `RetinaNetHead`, the RetinaNet head.
detection_generator: the detection generator.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale, yielding [2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect ratio
anchors added on each level. The number indicates the ratio of width to
height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: A number representing the scale of the base anchor size
relative to the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(RetinaNetModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self._backbone = backbone
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
def call(self,
images: tf.Tensor,
image_shape: Optional[tf.Tensor] = None,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
output_intermediate_features: bool = False,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
"""Forward pass of the RetinaNet model.
Args:
images: `Tensor`, the input batched images, whose shape is
[batch, height, width, 3].
image_shape: `Tensor`, the actual shape of the input images, whose shape
is [batch, 2] where the last dimension is [height, width]. Note that
this is the actual image shape excluding paddings. For example, images
in the batch may be resized into different shapes before padding to the
fixed size.
anchor_boxes: a dict of tensors which includes multilevel anchors.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the anchor coordinates of a particular feature
level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
output_intermediate_features: `bool` indicating whether to return the
intermediate feature maps generated by backbone and decoder.
training: `bool`, indicating whether it is in training mode.
Returns:
scores: a dict of tensors which includes scores of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
boxes: a dict of tensors which includes coordinates of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box coordinates predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
attributes: a dict of (attribute_name, attribute_predictions). Each
attribute prediction is a dict that includes:
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the attribute predictions from a particular
feature level, whose shape is
[batch, height_l, width_l, att_size * num_anchors_per_location].
"""
outputs = {}
# Feature extraction.
features = self.backbone(images)
if output_intermediate_features:
outputs.update(
{'backbone_{}'.format(k): v for k, v in features.items()})
if self.decoder:
features = self.decoder(features)
if output_intermediate_features:
outputs.update(
{'decoder_{}'.format(k): v for k, v in features.items()})
# Dense prediction. `raw_attributes` can be empty.
raw_scores, raw_boxes, raw_attributes = self.head(features)
if training:
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if raw_attributes:
outputs.update({'attribute_outputs': raw_attributes})
return outputs
else:
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
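# Broadcast the per-level anchor boxes across the batch dimension so
# they match the batched predictions.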
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Post-processing.
final_results = self.detection_generator(raw_boxes, raw_scores,
anchor_boxes, image_shape,
raw_attributes)
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if self.detection_generator.get_config()['apply_nms']:
outputs.update({
'detection_boxes': final_results['detection_boxes'],
'detection_scores': final_results['detection_scores'],
'detection_classes': final_results['detection_classes'],
'num_detections': final_results['num_detections']
})
else:
outputs.update({
'decoded_boxes': final_results['decoded_boxes'],
'decoded_box_scores': final_results['decoded_box_scores']
})
if raw_attributes:
outputs.update({
'attribute_outputs': raw_attributes,
'detection_attributes': final_results['detection_attributes'],
})
return outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
return items
@property
def backbone(self) -> tf.keras.Model:
return self._backbone
@property
def decoder(self) -> tf.keras.Model:
return self._decoder
@property
def head(self) -> tf.keras.layers.Layer:
return self._head
@property
def detection_generator(self) -> tf.keras.layers.Layer:
return self._detection_generator
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
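# A minimal usage sketch, assuming the backbone/decoder/head modules used in
# the tests below; the hyperparameters here are illustrative, not required.
if __name__ == '__main__':
  import numpy as np
  from official.vision.modeling.backbones import resnet
  from official.vision.modeling.decoders import fpn
  from official.vision.modeling.heads import dense_prediction_heads
  from official.vision.modeling.layers import detection_generator

  backbone = resnet.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=3, max_level=7)
  head = dense_prediction_heads.RetinaNetHead(
      min_level=3,
      max_level=7,
      num_classes=3,
      num_anchors_per_location=3)  # num_scales * len(aspect_ratios)
  generator = detection_generator.MultilevelDetectionGenerator(
      max_num_detections=10)
  model = RetinaNetModel(
      backbone=backbone,
      decoder=decoder,
      head=head,
      detection_generator=generator,
      min_level=3,
      max_level=7,
      num_scales=3,
      aspect_ratios=[1.0],
      anchor_size=3)
  # In training mode only the raw multilevel logits and box deltas are
  # returned; anchor generation and NMS happen in inference mode.
  outputs = model(np.random.rand(2, 384, 384, 3), training=True)
  print(sorted(outputs.keys()))  # ['box_outputs', 'cls_outputs']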
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import retinanet_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
{
'use_separable_conv': True,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': False,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': False,
'is_training': True,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': True,
'has_att_heads': True
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': True
},
)
def test_build_model(self, use_separable_conv, build_anchor_boxes,
is_training, has_att_heads):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
fpn_num_filters = 256
head_num_convs = 4
head_num_filters = 256
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
num_filters=fpn_num_filters,
use_separable_conv=use_separable_conv)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
attribute_heads=attribute_heads,
num_anchors_per_location=num_anchors_per_location,
use_separable_conv=use_separable_conv,
num_convs=head_num_convs,
num_filters=head_num_filters)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
_ = model(images, image_shape, anchor_boxes, training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
image_size=[
(128, 128),
],
training=[True, False],
has_att_heads=[True, False],
output_intermediate_features=[True, False],
soft_nms_sigma=[None, 0.0, 0.1],
))
def test_forward(self, strategy, image_size, training, has_att_heads,
output_intermediate_features, soft_nms_sigma):
"""Test for creation of a R50-FPN RetinaNet."""
tf.keras.backend.set_image_data_format('channels_last')
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array(
[[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
with strategy.scope():
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
anchor_boxes = anchor_gen(image_size)
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
attribute_heads=attribute_heads,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10,
nms_version='v1',
use_cpu_nms=soft_nms_sigma is not None,
soft_nms_sigma=soft_nms_sigma)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
model_outputs = model(
images,
image_shape,
anchor_boxes,
output_intermediate_features=output_intermediate_features,
training=training)
if training:
cls_outputs = model_outputs['cls_outputs']
box_outputs = model_outputs['box_outputs']
for level in range(min_level, max_level + 1):
self.assertIn(str(level), cls_outputs)
self.assertIn(str(level), box_outputs)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
num_classes * num_anchors_per_location
], cls_outputs[str(level)].numpy().shape)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
4 * num_anchors_per_location
], box_outputs[str(level)].numpy().shape)
if has_att_heads:
att_outputs = model_outputs['attribute_outputs']
for att in att_outputs.values():
self.assertAllEqual([
2, image_size[0] // 2**level, image_size[1] // 2**level,
1 * num_anchors_per_location
], att[str(level)].numpy().shape)
else:
self.assertIn('detection_boxes', model_outputs)
self.assertIn('detection_scores', model_outputs)
self.assertIn('detection_classes', model_outputs)
self.assertIn('num_detections', model_outputs)
self.assertAllEqual(
[2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_scores'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_classes'].numpy().shape)
self.assertAllEqual(
[2,], model_outputs['num_detections'].numpy().shape)
if has_att_heads:
self.assertIn('detection_attributes', model_outputs)
self.assertAllEqual(
[2, 10, 1],
model_outputs['detection_attributes']['depth'].numpy().shape)
if output_intermediate_features:
for l in range(2, 6):
self.assertIn('backbone_{}'.format(l), model_outputs)
self.assertAllEqual([
2, image_size[0] // 2**l, image_size[1] // 2**l,
backbone.output_specs[str(l)].as_list()[-1]
], model_outputs['backbone_{}'.format(l)].numpy().shape)
for l in range(min_level, max_level + 1):
self.assertIn('decoder_{}'.format(l), model_outputs)
self.assertAllEqual([
2, image_size[0] // 2**l, image_size[1] // 2**l,
decoder.output_specs[str(l)].as_list()[-1]
], model_outputs['decoder_{}'.format(l)].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
config = model.get_config()
new_model = retinanet_model.RetinaNetModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build segmentation models."""
from typing import Any, Mapping, Union, Optional, Dict
# Import libraries
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationModel(tf.keras.Model):
"""A Segmentation class model.
Input images are passed through backbone first. Decoder network is then
applied, and finally, segmentation head is applied on the output of the
decoder network. Layers such as ASPP should be part of decoder. Any feature
fusion is done as part of the segmentation head (i.e. deeplabv3+ feature
fusion is not part of the decoder, instead it is part of the segmentation
head). This way, different feature fusion techniques can be combined with
different backbones, and decoders.
"""
def __init__(self, backbone: tf.keras.Model, decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
**kwargs):
"""Segmentation initialization function.
Args:
backbone: a backbone network.
decoder: a decoder network. E.g. FPN.
head: segmentation head.
mask_scoring_head: mask scoring head.
**kwargs: keyword arguments to be passed.
"""
super(SegmentationModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'mask_scoring_head': mask_scoring_head,
}
self.backbone = backbone
self.decoder = decoder
self.head = head
self.mask_scoring_head = mask_scoring_head
def call(self, inputs: tf.Tensor, training: Optional[bool] = None
) -> Dict[str, tf.Tensor]:
backbone_features = self.backbone(inputs)
if self.decoder:
decoder_features = self.decoder(backbone_features)
else:
decoder_features = backbone_features
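# The head receives both backbone and decoder features so that any
# feature fusion (e.g. deeplabv3+) can happen inside the head.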
logits = self.head((backbone_features, decoder_features))
outputs = {'logits': logits}
if self.mask_scoring_head:
mask_scores = self.mask_scoring_head(logits)
outputs.update({'mask_scores': mask_scores})
return outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
if self.mask_scoring_head is not None:
items.update(mask_scoring_head=self.mask_scoring_head)
return items
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
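# A minimal usage sketch, assuming the same building blocks as the tests
# below; the settings are illustrative.
if __name__ == '__main__':
  import numpy as np
  from official.vision.modeling import backbones
  from official.vision.modeling.decoders import fpn
  from official.vision.modeling.heads import segmentation_heads

  backbone = backbones.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=2, max_level=7)
  head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
  model = SegmentationModel(backbone=backbone, decoder=decoder, head=head)
  outputs = model(np.random.rand(2, 128, 128, 3))
  # Logits come out at the head's level, i.e. stride 2**3 of the input.
  print(outputs['logits'].shape)  # (2, 16, 16, 10)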
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import segmentation_model
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import segmentation_heads
class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(128, 2),
(128, 3),
(128, 4),
(256, 2),
(256, 3),
(256, 4),
)
def test_segmentation_network_creation(
self, input_size, level):
"""Test for creation of a segmentation network."""
num_classes = 10
inputs = np.random.rand(2, input_size, input_size, 3)
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs, min_level=2, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=level)
model = segmentation_model.SegmentationModel(
backbone=backbone,
decoder=decoder,
head=head,
mask_scoring_head=None,
)
outputs = model(inputs)
self.assertAllEqual(
[2, input_size // (2**level), input_size // (2**level), num_classes],
outputs['logits'].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs, min_level=3, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=3)
model = segmentation_model.SegmentationModel(
backbone=backbone,
decoder=decoder,
head=head
)
config = model.get_config()
new_model = segmentation_model.SegmentationModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build video classification models."""
from typing import Any, Mapping, Optional, Union, List, Text
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
"""A video classification class builder."""
def __init__(
self,
backbone: tf.keras.Model,
num_classes: int,
input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
dropout_rate: float = 0.0,
aggregate_endpoints: bool = False,
kernel_initializer: str = 'random_uniform',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
require_endpoints: Optional[List[Text]] = None,
**kwargs):
"""Video Classification initialization function.
Args:
backbone: a 3d backbone network.
num_classes: `int` number of classes in classification task.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
dropout_rate: `float` rate for dropout regularization.
aggregate_endpoints: `bool` whether to aggregate all endpoints or use
only the final endpoint.
kernel_initializer: kernel initializer for the dense layer.
kernel_regularizer: tf.keras.regularizers.Regularizer object. Defaults to
None.
bias_regularizer: tf.keras.regularizers.Regularizer object. Defaults to
None.
require_endpoints: the required endpoints for prediction. If None or
empty, then only the final endpoint is used.
**kwargs: keyword arguments to be passed.
"""
if not input_specs:
input_specs = {
'image': layers.InputSpec(shape=[None, None, None, None, 3])
}
self._self_setattr_tracking = False
self._config_dict = {
'backbone': backbone,
'num_classes': num_classes,
'input_specs': input_specs,
'dropout_rate': dropout_rate,
'aggregate_endpoints': aggregate_endpoints,
'kernel_initializer': kernel_initializer,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'require_endpoints': require_endpoints,
}
self._input_specs = input_specs
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._backbone = backbone
inputs = {
k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items()
}
endpoints = backbone(inputs['image'])
if aggregate_endpoints:
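# Pools every backbone endpoint and concatenates them for prediction.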
pooled_feats = []
for endpoint in endpoints.values():
x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint)
pooled_feats.append(x_pool)
x = tf.concat(pooled_feats, axis=1)
else:
if not require_endpoints:
# Uses the last endpoint for prediction.
x = endpoints[max(endpoints.keys())]
x = tf.keras.layers.GlobalAveragePooling3D()(x)
else:
# Concatenates all the required endpoints for prediction.
outputs = []
for name in require_endpoints:
x = endpoints[name]
x = tf.keras.layers.GlobalAveragePooling3D()(x)
outputs.append(x)
x = tf.concat(outputs, axis=1)
x = tf.keras.layers.Dropout(dropout_rate)(x)
x = tf.keras.layers.Dense(
num_classes, kernel_initializer=kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
super(VideoClassificationModel, self).__init__(
inputs=inputs, outputs=x, **kwargs)
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
return dict(backbone=self.backbone)
@property
def backbone(self) -> tf.keras.Model:
return self._backbone
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
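# A minimal usage sketch, assuming a 3D ResNet backbone configured as in the
# tests below; the temporal strides and kernel sizes are illustrative.
if __name__ == '__main__':
  import numpy as np
  from official.vision.modeling import backbones

  input_specs = layers.InputSpec(shape=[None, 8, 112, 112, 3])
  backbone = backbones.ResNet3D(
      model_id=50,
      temporal_strides=[1, 1, 1, 1],
      temporal_kernel_sizes=[(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                             (1, 3, 1)],
      input_specs=input_specs)
  model = VideoClassificationModel(
      backbone=backbone,
      num_classes=1000,
      input_specs={'image': input_specs},
      dropout_rate=0.2)
  logits = model(np.random.rand(2, 8, 112, 112, 3))
  print(logits.shape)  # (2, 1000)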
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.VideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
def test_serialize_deserialize(self):
"""Validate the classification network can be serialized and deserialized."""
model_id = 50
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes)
model = video_classification_model.VideoClassificationModel(
backbone=backbone, num_classes=1000)
config = model.get_config()
new_model = video_classification_model.VideoClassificationModel.from_config(
config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()